In [None]:
import sys
import os
import glob
import pandas as pd

import numpy as np
import sklearn
print (sklearn.__version__)

from sklearn import linear_model
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.externals import joblib
import seaborn as sns

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.svm import  SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.externals.six import StringIO

from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [None]:
# Input / output directories.
# NOTE(review): absolute, user-specific paths; adjust before running on another machine.
# Plain u"" literals are used (the paths contain no backslashes) so the cell parses
# on both Python 2 and Python 3; the former ur"" prefix is Python 2 only.
input_dir = u"C:/Users/mirait/wk/git/input/"
output_dir = u"C:/Users/mirait/wk/git/output/"

In [None]:
# 訓練データを読み込む
train_path = input_dir + ur"train.csv"
train_data = pd.read_csv(train_path, encoding="cp932",low_memory=False)
train_data.head()

In [None]:
# テストデータを読み込む
test_path = input_dir + ur"test.csv"
test_data = pd.read_csv(test_path, encoding="cp932",low_memory=False)
test_data.head()

In [None]:
print "訓練データ：レコード長", len(train_data)
print "訓練データ：カラム数",len(train_data.columns)
print "テストデータ：レコード長", len(test_data)
print "テストデータ：カラム数",len(test_data.columns)

In [None]:
# 訓練+テストを結合
merge_arr = [train_data, test_data]

In [None]:
# Per-column profile of a frame, used to guide the analysis
def feature(df):
    """Summarise every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns
        ``レコード数`` (number of rows in ``df``), ``カラム名`` (column name),
        ``v_count`` (number of distinct non-NA values) and ``NA件数``
        (number of NA values). A frame without columns yields an empty
        result that still carries these four headers.
    """
    header = [u'レコード数', u'カラム名', 'v_count', u'NA件数']
    record_cn = len(df)  # identical for every column, so computed once

    # Collect plain rows and build the result once instead of growing a
    # DataFrame with pd.concat on every iteration.
    rows = []
    for position, column_name in enumerate(df.columns):
        # Positional access keeps duplicated column names separate.
        column = df.iloc[:, position]
        rows.append([record_cn, column_name, column.nunique(), column.isnull().sum()])

    return pd.DataFrame(rows, columns=header)

In [None]:
# 訓練データの特徴量
feature_train = feature(train_data)
print len(feature_train)

In [None]:
output_feature_train = os.path.join(output_dir, "column_feature_train.csv")
feature_train.to_csv(output_feature_train, encoding="cp932",index=False)

In [None]:
# テストデータの特徴量
feature_test = feature(test_data)
print len(feature_test)

In [None]:
output_feature_test = os.path.join(output_dir, "column_feature_test.csv")
feature_test.to_csv(output_feature_test, encoding="cp932",index=False)

In [None]:
# Names of the training columns that contain at least one missing value
null_columns = train_data.columns[train_data.isnull().any().values].tolist()
null_columns

In [None]:
# Reduce "Cabin" to its first character; missing cabins become 'X'.
# Series.map keeps the frame's own index (building a fresh pd.Series from a list
# would re-index from 0 and could misalign on assignment), and the result is
# assigned back explicitly instead of relying on an in-place replace on a
# column selection.
train_data['Cabin'] = train_data['Cabin'].map(lambda c: c[0] if pd.notnull(c) else 'X')
# The test data contains no "T" cabin, so "T" is treated like a missing value ('X')
train_data['Cabin'] = train_data['Cabin'].replace('T', 'X')
test_data['Cabin'] = test_data['Cabin'].map(lambda c: c[0] if pd.notnull(c) else 'X')

In [None]:
# Distribution of the "Cabin" values in the training data
print(train_data.Cabin.value_counts())
# Pass the column as x= (as the other countplot cells do) rather than positionally.
sns.countplot(x='Cabin', data=train_data)

In [None]:
# Extract the honorific (title) from each passenger name
for data in merge_arr:
    # Take the text between the first ", " and the following ". ".
    # The period is escaped: a multi-character split pattern is interpreted as a
    # regular expression, where a bare "." would match any character.
    data['Title'] = data.Name.str.split(', ', expand=True)[1].str.split(r'\. ', expand=True)[0]
    # True for titles that occur fewer than 10 times in this frame
    title_cnt = data.Title.value_counts() < 10
    # Rare titles are pooled under 'Misc'
    data.Title = data.Title.apply(lambda x: 'Misc' if title_cnt[x] else x)

In [None]:
train_data.head()

In [None]:
print title_cnt.head()
print data.head()

In [None]:
# Filling missing ages
# A missing age is replaced by the median age of the passengers that share the
# same ticket class, title and sex, instead of one overall average.
# (The correlation between these columns and age should be checked beforehand.)
med_age = pd.DataFrame()
def fill_age(cols):
    """Return the age for one passenger row, imputing it when missing.

    Parameters
    ----------
    cols : sequence-like (e.g. a pandas Series row)
        Values in the positional order ``Pclass, Sex, Age, Title``.

    Returns
    -------
    scalar
        The row's own age when present; otherwise the ``Age`` value of the
        matching ``(Pclass, Title, Sex)`` row of the module-level ``med_age``
        table, or NaN when no row matches.

    Notes
    -----
    ``med_age`` must be filled by the caller before this function is applied.
    A scalar is always returned; the earlier version returned a one-element
    (or empty) Series for missing ages.
    """
    # Positional unpacking works for a Series row as well as a list/tuple.
    pclass, sex, age, title = tuple(cols)[:4]
    if pd.notnull(age):
        return age

    same_group = (med_age['Pclass'] == pclass) & (med_age['Title'] == title) & (med_age['Sex'] == sex)
    candidates = med_age.loc[same_group, 'Age']
    if len(candidates) == 0:
        return np.nan
    return candidates.iloc[0]

In [None]:
for dataset in merge_arr:
    # Median age per (Pclass, Title, Sex) group of this frame.
    # fill_age reads this table through the module-level name med_age.
    med_age = dataset.groupby(['Pclass','Title','Sex'])['Age'].median().reset_index()
    dataset['Age'] = dataset[['Pclass','Sex','Age','Title']].apply(fill_age,axis=1)

    # Embarked has only two missing values, so it is filled with the most frequent value.
    # Assigned back explicitly: an in-place fillna on a column selection may act on a copy.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    # Household-size features
    dataset['FamilySize'] = dataset.SibSp+dataset.Parch+1
    # 1 when the passenger travels alone, 0 when FamilySize > 1.
    # Computed in one step instead of a chained .loc assignment on a column selection.
    dataset['IsAlone'] = (~(dataset['FamilySize'] > 1)).astype('int64')
    # Fare split into four quantile bins
    # NOTE(review): bin edges are computed separately for each frame in merge_arr,
    # so the same label can cover different ranges in train and test.
    dataset['FareBin']=pd.qcut(dataset['Fare'],4,labels=[1,2,3,4])
    # Age split into five equal-width bins (same per-frame caveat as above)
    dataset['AgeBin']=pd.cut(dataset['Age'],5,labels=[1,2,3,4,5])

In [None]:
# 生存者数の総数を確認
print(train_data.Survived.value_counts())
sns.countplot(x='Survived',data=train_data)
plt.show()

In [None]:
# 性別-生存者数
print(train_data.groupby('Sex')['Survived'].sum())
sns.countplot(x='Survived',hue='Sex',data=train_data)
plt.show()

In [None]:
# チケットクラス-生存者数
print(train_data.groupby('Pclass')['Survived'].sum())
sns.countplot(x='Survived',hue='Pclass',data=train_data)
plt.show()

In [None]:
# 敬称-生存者数
print(train_data.groupby('Title')['Survived'].sum())
sns.countplot(x='Survived',hue='Title',data=train_data)
plt.show()

In [None]:
# 単身者-生存者数
print(train_data.groupby('IsAlone')['Survived'].sum())
sns.countplot(x='Survived',hue='IsAlone',data=train_data)
plt.show()

In [None]:
# 世帯人数-生存者数
print(train_data.groupby('FamilySize')['Survived'].sum())
sns.countplot(x='Survived',hue='FamilySize',data=train_data)
plt.legend(loc=1) #moving the legned to the right
plt.show()

In [None]:
train_data.Age=train_data.Age.astype(int)
# 年齢-生存者数でプロット
ageplt = sns.FacetGrid(train_data,hue='Survived',aspect=4)
ageplt.map(sns.kdeplot,'Age',shade=True)
ageplt.set(xlim=(0,train_data.Age.max()))
ageplt.add_legend()
plt.show()

In [None]:
# 賃金-生存者数でプロット
fareplt = sns.FacetGrid(train_data,hue='Survived',aspect=5)
fareplt.map(sns.kdeplot,'Fare',shade=True)
fareplt.set(xlim=(0,train_data.Fare.max()))
fareplt.add_legend()
plt.show()

In [None]:
train_data.corr()

In [None]:
# 一意の値と推測できる項目は削除　（'PassengerId','Ticket','Name'）
# 新規項目追加時に使用した項目は、新規項目で代替可能なため削除　（'Fare','Age','SibSp','Parch'）
train_data.drop(['PassengerId','Ticket','Name','Fare','Age','SibSp','Parch'],axis=1,inplace=True)
test_data.drop(['Ticket','Name','Fare','Age','Parch','SibSp'],axis=1,inplace=True)
train_data.head()

In [None]:
# ダミー変数化
train_dum = pd.get_dummies(train_data, columns=['Sex','Embarked','Pclass','Title','AgeBin','FareBin','Cabin'],drop_first=True)
test_dum = pd.get_dummies(test_data, columns=['Sex','Embarked','Pclass','Title','AgeBin','FareBin','Cabin'],drop_first=True)

In [None]:
train_dum.corr()['Survived']

In [None]:
train_dum.info()

In [None]:
# Remove rows containing missing or infinite values, reporting how many were dropped
tmp = train_dum.copy()
print(len(tmp))
tmp01 = tmp.dropna()
print("%s :NA行数  %s" % (len(tmp01), len(tmp) - len(tmp01)))
tmp02 = tmp01.replace([np.inf,-np.inf],np.nan)    # turn +/-inf into NaN so dropna removes those rows too
tmp03 = tmp02.dropna().reset_index(drop =True)
print("%s :inf行数  %s" % (len(tmp03), len(tmp02) - len(tmp03)))

In [None]:
# Keep the passenger ids for the output file, then remove them from the test features
PassengerId = test_data.loc[:, 'PassengerId']
test_dum.drop(columns=['PassengerId'], inplace=True)

In [None]:
# 説明変数、目的変数への分割
X = tmp03.drop(columns = {u"Survived"},axis =1)
y = tmp03.loc[:,[u"Survived"]]

print len(X.columns)
print len(X)

In [None]:
# データ分割
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=.20, random_state=1)

In [None]:
# Grid search
def grid(x, y, clf, grid_param, cv):
    """Run a grid search and report the score of every parameter combination.

    Parameters
    ----------
    x, y : training features and target, passed to ``GridSearchCV.fit``.
    clf : estimator to tune.
    grid_param : parameter grid (dict or list of dicts).
    cv : cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    (pandas.DataFrame, dict)
        A frame with the columns ``parameters`` and ``mean_validation_score``
        (one row per combination) and the best parameter dict, which is
        also printed.
    """
    # Parameter search
    gs = GridSearchCV(clf, grid_param, cv=cv)
    gs.fit(x, y)

    # The search results are read from cv_results_; the older grid_scores_
    # attribute no longer exists in current scikit-learn releases.
    # The column names follow the fields grid_scores_ used to expose.
    result = pd.DataFrame(
        {
            "parameters": list(gs.cv_results_["params"]),
            "mean_validation_score": gs.cv_results_["mean_test_score"],
        },
        columns=["parameters", "mean_validation_score"],
    )
    print(gs.best_params_)

    return result, gs.best_params_

In [None]:
# モデル構築

In [None]:
# SGD

In [None]:
# SGD classifier: fit, evaluate and report the coefficients
def sgd(X_train, y_train, X_test, y_test, param):
    """Fit an elastic-net SGD classifier and print its evaluation on the test split.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the column names label the coefficients.
    y_train : training target.
    X_test, y_test : features and target used for evaluation.
    param : dict
        Must provide ``'alpha'`` and ``'l1_ratio'``.

    Returns
    -------
    (clf, coeff_sort)
        The fitted classifier and a DataFrame with the columns ``col_name``,
        ``coef`` and ``coef_abs``, sorted by descending ``coef_abs``.

    Side effects
    ------------
    Prints the score, confusion matrix, coefficient counts, classification
    report and AUC, and draws the ROC curve on the current matplotlib axes
    (``plt.show()`` is not called).
    """
    # Build the model.
    # NOTE: no random_state is passed, so repeated runs can give different results.
    clf =linear_model.SGDClassifier(loss='log', penalty='elasticnet', class_weight='balanced',
                                    alpha = param['alpha'] , l1_ratio = param['l1_ratio'], max_iter=500)
    clf.fit(X_train,y_train)

    # Predict once and reuse the results below
    predicted = clf.predict(X_test)
    prob = clf.predict_proba(X_test)[:,1]

    # Model evaluation
    print("score: %s" % clf.score(X_test,y_test))
    print("confusion_matrix:")
    print(confusion_matrix(y_test, predicted))

    # Coefficients, largest absolute value first.
    # The builtin float is used for the cast; the np.float alias is gone from recent NumPy.
    coeff_df = pd.DataFrame(
        {"col_name": list(X_train.columns), "coef": np.asarray(clf.coef_[0], dtype=float)},
        columns=["col_name", "coef"],
    )
    coeff_df["coef_abs"] = coeff_df["coef"].abs()
    coeff_sort = coeff_df.sort_values(by="coef_abs", ascending=False).reset_index(drop=True)

    print("")
    print("回帰係数の総数 %s" % len(coeff_sort))
    print("回帰係数 0の数 %s" % len(coeff_sort.query('coef == 0')))
    print("回帰係数 0以外の数 %s" % len(coeff_sort.query('coef != 0')))

    # Precision / recall / f1 report (the blank lines keep the original output layout)
    print("")
    print("")
    print(classification_report(y_test, predicted))

    print("")
    fpr,tpr,thresholds = roc_curve(y_test,prob)
    print("auc %s" % round(auc(fpr,tpr),4))

    # ROC curve
    plt.plot(fpr, tpr)
    plt.title("ROC curve")
    plt.xlabel("False Positve Rate")
    plt.ylabel("True Positive Rate")

    return clf, coeff_sort

In [None]:
# 交差検定の実行回数
cv  = 10

In [None]:
# SGDのパラメータリスト
sgd_param = [{'alpha': [0.001, 0.0001, 0.00001],'l1_ratio': [0, 0.02, 0.04]}]

In [None]:
# グリッドサーチ用SGD
grid_sgd = linear_model.SGDClassifier(loss='log', penalty='elasticnet', random_state=0, class_weight='balanced',max_iter=50)

In [None]:
print grid_sgd

In [None]:
grid_result_SGD, best_param_SGD = grid(X_train, y_train, grid_sgd, sgd_param, cv)

In [None]:
# グリッドサーチの探索結果を出力
output_grid = os.path.join(output_dir, "grid_SGD.csv")
grid_result_SGD.to_csv(output_grid, encoding="cp932", index=False)

In [None]:
# 最適パラメータでモデル構築
clf_SGD, result = sgd(X_train, y_train, X_test, y_test, best_param_SGD)

In [None]:
print(clf_SGD.score(X_train,y_train))

In [None]:
# Cross-validation
# NOTE(review): the folds are drawn from the held-out split (X_test / y_test),
# not from the training data; confirm this is intended.
# y_test is a one-column DataFrame; its first column is passed as a 1-D array.
cross_scores = cross_val_score(clf_SGD, X_test, y_test.iloc[:,0].values, cv=10)
print(cross_scores)
print ("Accuracy: %0.2f" % (cross_scores.mean()))

In [None]:
# 決定木

In [None]:
def decisionTree (x_train, x_test, y_train, y_test):
    """Fit a depth-5 decision tree, print its evaluation and export it as Graphviz source.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; the column names label the tree nodes and importances.
    x_test : features used for evaluation.
    y_train, y_test : matching targets.

    Returns
    -------
    (clf, dot_file, column_importance_sort)
        The fitted classifier, the Graphviz source as a cp932-encoded byte
        string (font switched to meiryo), and a DataFrame with the columns
        ``col_name`` and ``feature_importances`` sorted by descending importance.
    """
    # Build the model (no random_state is passed).
    clf = DecisionTreeClassifier(max_depth=5, class_weight='balanced')
    # Fit on the x_train argument; the earlier code used the notebook-level X_train,
    # silently ignoring the data handed to this function.
    clf.fit(x_train, y_train)

    # out_file=None makes export_graphviz return the source as a string,
    # so no StringIO buffer is needed.
    dot_text = tree.export_graphviz(clf, out_file=None, feature_names=list(x_train.columns),
                                    filled=True, rounded=True, impurity=False)
    # Swap the font on the text first, then encode.
    dot_file = dot_text.replace("fontname=helvetica","fontname=meiryo").encode("cp932")

    predicted = clf.predict(x_test)
    print("accuracy_score: %s" % round(clf.score(x_test,y_test),4))
    print("confusion_matrix:")
    print(confusion_matrix(y_test,predicted))
    print("")
    prob = clf.predict_proba(x_test)[:,1]
    fpr,tpr,thresholds = roc_curve(y_test,prob)
    print("auc: %s" % round(auc(fpr,tpr),4))

    # Feature importances, most important first
    column_importance_sort = pd.DataFrame(
        {"col_name": list(x_train.columns), "feature_importances": clf.feature_importances_},
        columns=["col_name", "feature_importances"],
    ).sort_values(by="feature_importances", ascending=False).reset_index(drop=True)

    return clf, dot_file, column_importance_sort

In [None]:
clf_dt, dot_file, dt_importance = decisionTree(X_train, X_test, y_train, y_test)

In [None]:
print(clf_dt.score(X_train,y_train))

In [None]:
# Write the Graphviz source to disk.
# dot_file is the cp932-encoded byte string returned by decisionTree, so the file
# is opened in binary mode; the with-block closes it even if the write fails.
with open('decisionTree.dot','wb') as f:
    f.write(dot_file)

In [None]:
# ランダムフォレスト

In [None]:
def randomforest(x_train, x_test, y_train, y_test, param):
    """Fit a random forest and print its evaluation on the test split.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : features and target used for evaluation.
    param : dict
        Must provide ``'n_estimators'`` and ``'max_depth'``.

    Returns
    -------
    The fitted ``RandomForestClassifier``.

    Side effects
    ------------
    Prints accuracy, AUC and the confusion matrix, and draws the ROC curve on
    the current matplotlib axes (``plt.show()`` is not called).
    """
    clf = RandomForestClassifier(n_estimators = param['n_estimators'], max_depth = param['max_depth'],
                                 random_state=0, class_weight='balanced')
    clf.fit(x_train, y_train)
    prob = clf.predict_proba(x_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)

    # Single-argument print(...) gives the same text on Python 2 and Python 3.
    print("accuracy_score: %s" % round(clf.score(x_test,y_test),4))
    print("auc: %s" % round(auc(fpr,tpr),4))
    print("confusion_matrix:")
    print(confusion_matrix(y_test,clf.predict(x_test)))

    # ROC curve
    plt.plot(fpr, tpr)
    plt.title("ROC curve")
    plt.xlabel("False Positve Rate")
    plt.ylabel("True Positive Rate")

    return clf

In [None]:
# Rank the features by importance
def get_feature_importance(x, clf):
    """Pair each column of ``x`` with ``clf.feature_importances_`` and sort.

    Returns a DataFrame with the columns ``col_name`` and
    ``feature_importance``, ordered by descending importance and re-indexed
    from 0.
    """
    names = pd.Series(x.columns, name="col_name")
    scores = pd.Series(clf.feature_importances_, name="feature_importance")
    ranked = pd.concat([names, scores], axis=1).sort_values(
        by="feature_importance", ascending=False
    )
    return ranked.reset_index(drop=True)

In [None]:
# ランダムフォレストのパラメータリスト
depth_range = range(2, 13, 1)
rf_parameter = [{'n_estimators':[30, 50, 70], 'max_depth':depth_range}]

In [None]:
# グリッドサーチ用ランダムフォレスト
grid_rf = RandomForestClassifier(random_state=0, class_weight='balanced')

In [None]:
# パラメータ探索　（標準化の必要がないため、説明変数は標準化前のデータを使用）
grid_result_rf, best_param_rf = grid(X_train, y_train, grid_rf, rf_parameter, cv)

In [None]:
# グリッドサーチの探索結果を出力
output_grid = os.path.join(output_dir, "grid_rf.csv")
grid_result_rf.to_csv(output_grid, encoding="cp932", index=False)

In [None]:
# 最適パラメータでモデル構築
clf_rf = randomforest(X_train, X_test, y_train, y_test, best_param_rf)

In [None]:
print(clf_rf.score(X_train,y_train))

In [None]:
# 特徴量の順位を取得
feature_importance = get_feature_importance(X_train, clf_rf)
output_path = os.path.join(output_dir, "feature_importance.csv")
feature_importance.to_csv(output_path, encoding="cp932", index=False)

In [None]:
# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the training split; fit() returns the estimator itself
clf_log = LogisticRegression(max_iter=100).fit(X_train, y_train)
ypred = clf_log.predict(X_test)
# Score on the training split, then confusion matrix / report on the held-out split
print(clf_log.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

In [None]:
# GBDT

In [None]:
def gbdt(x_train, x_test, y_train, y_test, param):
    """Fit a gradient-boosting classifier and print its evaluation on the test split.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : features and target used for evaluation.
    param : dict
        Must provide ``'n_estimators'`` and ``'max_depth'``.

    Returns
    -------
    The fitted ``GradientBoostingClassifier``.

    Side effects
    ------------
    Prints accuracy, AUC and the confusion matrix, and draws the ROC curve on
    the current matplotlib axes (``plt.show()`` is not called).
    """
    clf = GradientBoostingClassifier(n_estimators = param['n_estimators'], max_depth = param['max_depth'],
                                 random_state=0)
    clf.fit(x_train, y_train)
    prob = clf.predict_proba(x_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)

    # Single-argument print(...) gives the same text on Python 2 and Python 3.
    print("acc: %s" % round(clf.score(x_test,y_test),4))
    print("auc: %s" % round(auc(fpr,tpr),4))
    print("confusion_matrix:")
    print(confusion_matrix(y_test,clf.predict(x_test)))

    # ROC curve
    plt.plot(fpr, tpr)
    plt.title("ROC curve")
    plt.xlabel("False Positve Rate")
    plt.ylabel("True Positive Rate")

    return clf

In [None]:
# GBDTのパラメータリスト
depth_range = range(2, 13, 1)
gbdt_parameter = [{'n_estimators':[10, 50, 100, 150], 'max_depth':depth_range}]

In [None]:
# グリッドサーチ用GBDT
# grid_gbdt = GradientBoostingClassifier(random_state=0)
grid_gbdt = GradientBoostingClassifier()

In [None]:
# パラメータ探索
grid_result_gbdt, best_param_gbdt = grid(X_train, y_train, grid_gbdt, gbdt_parameter, cv)

In [None]:
# グリッドサーチの探索結果を出力
output_grid = os.path.join(output_dir, "grid_gbdt.csv")
grid_result_gbdt.to_csv(output_grid, encoding="cp932", index=False)

In [None]:
# 最適パラメータでモデル構築
clf_gbdt = gbdt(X_train, X_test, y_train, y_test, best_param_gbdt)

In [None]:
print(clf_gbdt.score(X_train,y_train))

In [None]:
# モデルをエクスポート
joblib.dump(clf_gbdt, 'model_gbdt.pkl',compress=True)

In [None]:
# 学習済モデルを取り込み
clf_voting = VotingClassifier(estimators=[('sgd',clf_SGD),('dt',clf_dt),('rf',clf_rf),('log',clf_log),('gbdt',clf_gbdt)],voting='soft',n_jobs=5)

In [None]:
clf_voting.fit(X_train, y_train)

In [None]:
# モデルをエクスポート
joblib.dump(clf_voting, 'model_voting.pkl',compress=True)

In [None]:
gen_path = input_dir + ur"gender_submission.csv"
gen_data = pd.read_csv(gen_path, encoding="cp932",low_memory=False)

In [None]:
pred1 = clf_voting.predict(test_dum)
print(confusion_matrix(gen_data.Survived,pred1))
print(classification_report(gen_data.Survived,pred1))

In [None]:
# 乗客番号と予測結果を結合
y_re = pd.Series(pred1)
# y_re = y_predict.reshape(len(y_predict),1)
result = pd.concat([PassengerId, y_re], axis=1)
result_re = result.rename(columns={0: 'Survived'})

In [None]:
print result_re

In [None]:
# 目的変数の値を確認
print "値0", len(result_re.query('Survived == 0'))
print "値1", len(result_re.query('Survived == 1'))

In [None]:
# 予測結果を出力
output = os.path.join(output_dir, "survived_predict.csv")
result_re.to_csv(output, encoding="cp932", index=False)

In [None]:
prediction = clf_log.predict(test_dum)
print(confusion_matrix(gen_data.Survived,prediction))
print(classification_report(gen_data.Survived,prediction))

In [None]:
# 乗客番号と予測結果を結合
y_re2 = pd.Series(prediction)
# y_re = y_predict.reshape(len(y_predict),1)
result2 = pd.concat([PassengerId, y_re2], axis=1)
result_re2 = result2.rename(columns={0: 'Survived'})

In [None]:
# 予測結果を出力
output = os.path.join(output_dir, "survived_predict_2.csv")
result_re2.to_csv(output, encoding="cp932", index=False)