In [66]:
#pandasインポートとデータセット読み込み
import pandas as pd
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


In [67]:
#ダミー変数化処理用関数

def dummylize(df,n_item):
    dum = pd.get_dummies(df[n_item], drop_first = True)
    df = pd.concat((df, dum),axis = 1)
    df = df.drop(n_item,axis = 1)
    return df


In [68]:
#nullを0、それ以外を1とする関数
def booleanize(df,n_item):
    df_tmp = df[n_item].notnull() * 1
    df_tmp.name = n_item

    df = df.drop(n_item,axis = 1)
    df = pd.concat((df, df_tmp), axis = 1)
    return df


In [69]:
#性別、乗車場所をダミー変数化、CabinをBoolean化

df_train = dummylize(df_train,'Sex')
df_train = dummylize(df_train,'Embarked')
df_train = booleanize(df_train,'Cabin')


df_test = dummylize(df_test,'Sex')
df_test = dummylize(df_test,'Embarked')
df_test = booleanize(df_test,'Cabin')

df_train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,0,0,0,1
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,0,0,1,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1,0


In [73]:
#Title列、Title数値化列、FamilyName列、FamilyName頻度列を追加する処理関数


#Listを含まれるタイトルのみの文字列に変換するサブ関数

def titleCheck(L):
        if 'Miss' in L: return 'Miss'
        elif 'Mrs' in L: return 'Mrs'
        elif 'Master' in L: return 'Master'
        elif 'Mr' in L: return 'Mr'
        else: return ""


#処理関数メイン

def nameAnalysis(df):

    l_names = [x.replace(",","").replace(".","").split(" ") for x in df.Name.values.tolist()]
#Mr. Mrs. などを仕分け
    l_title = [titleCheck(x) for x in l_names]
    df_title = pd.Series(l_title, name='Titles')

#Mr. Mrs. などを数値化
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    num_title = le.fit_transform(l_title)
    df_n_title = pd.Series(num_title, name='TitlesNum')

#ファミリーネーム抜出し

    l_family = [x[0] for x in l_names]
    df_family = pd.Series(l_family, name = "Family")

#ファミリーネームの出現回数列を追加
    df_n_family = df_family.apply(lambda x: (df_family == x).sum())
    df_n_family.name = 'FamilyNum'

#作成した各分析列を追加しリターン
    df = pd.concat((df, df_title), axis = 1)
    df = pd.concat((df, df_n_title), axis = 1)
    df = pd.concat((df, df_family), axis = 1)
    df = pd.concat((df, df_n_family), axis = 1)
        
    return df

In [75]:
#名前処理関数をApply

df_train = nameAnalysis(df_train)

df_test = nameAnalysis(df_test)

df_test.head()


Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S,Cabin,Titles,TitlesNum,Family,FamilyNum,Titles.1,TitlesNum.1,Family.1,FamilyNum.1
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,1,1,0,0,Mr,3,Kelly,1,Mr,3,Kelly,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,0,0,1,0,Mrs,4,Wilkes,1,Mrs,4,Wilkes,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,1,1,0,0,Mr,3,Myles,1,Mr,3,Myles,1
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,1,0,1,0,Mr,3,Wirz,1,Mr,3,Wirz,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,0,0,1,0,Mrs,4,Hirvonen,1,Mrs,4,Hirvonen,1


In [76]:
#Mr. Mrs. 等の呼称ごとの年齢を推定

#欠損値の行を削除
df_fortest = df_train.dropna()

#年齢をInt化し100倍に
x = df_fortest.Age.apply(lambda x: int(x*100))
df_fortest = df_fortest.drop('Age',axis =1)
x.name = 'Age'
df_fortest = pd.concat((df_fortest, x), axis = 1)

#推定に使う要素Xを指定。数値化したタイトル列
l_pred = ['TitlesNum']


In [77]:
#機械学習モデル

def evaluate_models(df_fortest, l_pred):
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neural_network import MLPClassifier

    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    models = []

    models.append(("LogisticRegression", LogisticRegression()))
    models.append(("SVC", SVC()))
    models.append(("LinearSVC", LinearSVC()))
    models.append(("KNeighbors", KNeighborsClassifier()))
    models.append(("DecisionTree", DecisionTreeClassifier()))
    models.append(("RandomForest", RandomForestClassifier()))
    models.append(("MLPClassifier", MLPClassifier(solver='lbfgs', random_state=0)))


    results = []
    names = []

    X_train, X_test, y_train, y_test = train_test_split(df_fortest[l_pred], df_fortest.Age, test_size=0.25)

    print(X_train.dtypes)

    for name, model in models:
        model.fit(X_train, y_train)
        res_pred = model.predict(X_test)
        result = mean_squared_error(y_test, res_pred)

        names.append(name)
        results.append(result)

    return names, results

In [78]:
#各種機械学習モデルを使い年齢推定テスト

names = ['LogisticRegression',
  'SVC',
  'LinearSVC',
  'KNeighbors',
  'DecisionTree',
  'RandomForest',
  'MLPClassifier']

result_list = []
for i in range(0,10):
    n, r = evaluate_models(df_fortest, l_pred)
    for j, k in zip(n, r):
        result_list.append([j, k])

import statistics

for n in names:
    print(n)
    r = [i[1] for i in result_list if i[0] == n]
    print(sum(r)/len(r), statistics.median(r))

TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
TitlesNum    int64
TitlesNum    int64
dtype: object
LogisticRegression
2604245.08603 2434643.48324
SVC
2133512.90726 2083641.03073
LinearSVC
2406825.75642 2327916.10894
KNeighbors
2725606.00056 2666345.88827
DecisionTree
2227769.10838 2147675.94693
RandomForest
2136869.66704 2147675.94693
MLPClassifier
2110048.43799 2145350.9162


In [29]:
#ランダムフォレストのグリッドサーチ

def gscv_random_forest_for_age(df_fortest, l_pred):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    parameters = {
    'n_estimators': [5,10,20,30,50,100,300],
    'max_depth': [3,5,10,15,20,25,30,40,50,100],
    'random_state': [0]
    }
    gsc = GridSearchCV(RandomForestClassifier(), parameters, cv=3)
    gsc.fit(df_fortest[l_pred], df_fortest["Age"])


    return gsc

In [30]:
#ランダムフォレストモデルを学習させる

model_temp = gscv_random_forest_for_age(df_fortest, l_pred)




In [31]:
#学習したモデルを保存する

import pickle
filename = 'age_pred.sav'
pickle.dump(model_temp, open(filename, 'wb'))

In [32]:
#保存したモデルの読み込み

import pickle
filename = 'age_pred.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [33]:
#モデルを使って予測し結果をリストに保存→Seriesに変換
res_pred = loaded_model.predict(df_train[l_pred])
df_res_pred =pd.Series(res_pred)
df_res_pred.name = "prediction"
df_res_pred = df_res_pred.apply(lambda x: float(x/100))

In [71]:
#もともとのAge列の欠損値のみPredictの結果で置き換え　(もっといいやり方があるはず)

df_real_age = df_train['Age']
df_real_age = df_real_age.fillna(0)

l_mix = []

for i,v in enumerate(df_real_age):
    if v != 0:
        l_mix.append(v)
    else:
        l_mix.append(df_res_pred[i])

df_mix = pd.Series(l_mix, name='Age')


In [70]:
#Age列を差し替え
df_train = df_train.drop('Age',axis = 1)
df_train = pd.concat((df_train,df_mix),axis = 1)
df_train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,male,Q,S,Master,Miss,Mr,Mrs,Family,FamilyNum,KnowCabin,Age
0,1,0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,1,0,1,0,0,1,0,Braund,2,0,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,C85,0,0,0,0,0,0,1,Cumings,1,1,38.0
2,3,1,3,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,,0,0,1,0,1,0,0,Heikkinen,1,0,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,C123,0,0,1,0,0,0,1,Futrelle,2,1,35.0
4,5,0,3,"Allen, Mr. William Henry",0,0,373450,8.05,,1,0,1,0,0,1,0,Allen,2,0,35.0


In [113]:
#XGBoostモデル

def make_xgb_model3(X, Y):
    import xgboost as xgb

    import scipy.stats as st
    from sklearn.model_selection import RandomizedSearchCV

    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    params = {
        "n_estimators": st.randint(3, 40),
        "max_depth": st.randint(3, 40),
        "learning_rate": st.uniform(0.05, 0.4),
        "colsample_bytree": one_to_left,
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        'reg_alpha': from_zero_positive,
        "min_child_weight": from_zero_positive,
    }

    xgbreg = xgb.XGBRegressor(nthreads=-1)


    gs = RandomizedSearchCV(xgbreg, params, n_jobs=3)
    ret = gs.fit(X, Y)
    print(gs.best_params_)

    return ret


In [146]:
#学習・テスト用データセット整備

df_sample = df_train.fillna(0)
l_features = ['Pclass','male','Q','S','Master','Miss','Mr','Mrs','FamilyNum','KnowCabin','Age']
df_x_pack = df_sample[l_features]
df_y_pack = df_sample.Survived

df_x_pack.describe()


Unnamed: 0,Pclass,male,Q,S,Master,Miss,Mr,Mrs,FamilyNum,KnowCabin,Age
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,0.08642,0.722783,0.044893,0.204265,0.580247,0.140292,1.906846,0.228956,30.298732
std,0.836071,0.47799,0.281141,0.447876,0.207186,0.40339,0.493796,0.347485,1.497289,0.420397,13.357099
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.42
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,22.0
50%,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,30.0
75%,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,36.0
max,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,1.0,80.0


In [147]:
#モデル学習

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_x_pack, df_y_pack, test_size=0.25)
model_xgb = make_xgb_model3(X_train, y_train)


{'colsample_bytree': 0.91780106687610641, 'gamma': 6.0996955410441984, 'learning_rate': 0.44040810211859088, 'max_depth': 27, 'min_child_weight': 54.548634471660954, 'n_estimators': 32, 'reg_alpha': 5.1929375973936143, 'subsample': 0.99852627093259905}


In [148]:
#モデル保存

import pickle
filename = 'xgb_model.sav'
pickle.dump(model_xgb, open(filename, 'wb'))

loaded_m_xgb = pickle.load(open(filename, 'rb'))

In [179]:

from sklearn.metrics import mean_squared_error

res_xgb = loaded_m_xgb.predict(X_test)
result = mean_squared_error(y_test, res_xgb)

int_res_xgb = [int(round(x)) for x in res_xgb]

int_res_xgb

[0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1]