In [1]:
#pandasインポートとデータセット読み込み
import pandas as pd
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


In [2]:
#ダミー変数化処理用関数

def dummylize(df,n_item):
    dum = pd.get_dummies(df[n_item], drop_first = True)
    df = pd.concat((df, dum),axis = 1)
    df = df.drop(n_item,axis = 1)
    return df


#nullを0、それ以外を1とする関数
def booleanize(df,n_item):
    df_tmp = df[n_item].notnull() * 1
    df_tmp.name = n_item

    df = df.drop(n_item,axis = 1)
    df = pd.concat((df, df_tmp), axis = 1)
    return df


In [3]:
#性別、乗車場所をダミー変数化、CabinをBoolean化をApply

df_train = dummylize(df_train,'Sex')
df_train = dummylize(df_train,'Embarked')
df_train = booleanize(df_train,'Cabin')


df_test = dummylize(df_test,'Sex')
df_test = dummylize(df_test,'Embarked')
df_test = booleanize(df_test,'Cabin')


In [4]:
#Title列、Title数値化列、FamilyName列、FamilyName頻度列を追加する処理関数


#Listを含まれるタイトルのみの文字列に変換するサブ関数

def titleCheck(L):
        if 'Miss' in L: return 'Miss'
        elif 'Mrs' in L: return 'Mrs'
        elif 'Master' in L: return 'Master'
        elif 'Mr' in L: return 'Mr'
        else: return ""


#処理関数メイン

def nameAnalysis(df):

    l_names = [x.replace(",","").replace(".","").split(" ") for x in df.Name.values.tolist()]
#Mr. Mrs. などを仕分け
    l_title = [titleCheck(x) for x in l_names]
    df_title = pd.Series(l_title, name='Titles')

#Mr. Mrs. などを数値化
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    num_title = le.fit_transform(l_title)
    df_n_title = pd.Series(num_title, name='TitlesNum')

#ファミリーネーム抜出し

    l_family = [x[0] for x in l_names]
    df_family = pd.Series(l_family, name = "Family")

#ファミリーネームの出現回数列を追加
    df_n_family = df_family.apply(lambda x: (df_family == x).sum())
    df_n_family.name = 'FamilyNum'

#作成した各分析列を追加しリターン
    df = pd.concat((df, df_title), axis = 1)
    df = pd.concat((df, df_n_title), axis = 1)
    df = pd.concat((df, df_family), axis = 1)
    df = pd.concat((df, df_n_family), axis = 1)
        
    return df

In [5]:
#名前処理関数をApply

df_train = nameAnalysis(df_train)

df_test = nameAnalysis(df_test)


In [6]:
#年齢を世代区分け


def ageGroup(x):
        if 0 <= x and x < 10:
            return 0
        elif 10 <= x and x < 20:
            return 1
        elif 20 <= x and x < 40:
            return 2
        elif 40 <= x and x < 60:
            return 3
        elif 60 <= x:
            return 4


df_train.Age = df_train.Age.apply(lambda x: ageGroup(x))
df_test.Age = df_test.Age.apply(lambda x: ageGroup(x))

In [7]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S,Cabin,Titles,TitlesNum,Family,FamilyNum
0,892,3,"Kelly, Mr. James",2.0,0,0,330911,7.8292,1,1,0,0,Mr,3,Kelly,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",3.0,1,0,363272,7.0,0,0,1,0,Mrs,4,Wilkes,1
2,894,2,"Myles, Mr. Thomas Francis",4.0,0,0,240276,9.6875,1,1,0,0,Mr,3,Myles,1
3,895,3,"Wirz, Mr. Albert",2.0,0,0,315154,8.6625,1,0,1,0,Mr,3,Wirz,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2.0,1,1,3101298,12.2875,0,0,1,0,Mrs,4,Hirvonen,1


In [8]:
#年齢推定モデルに渡すためテスト用加工済みCSVを保存

df_train.to_csv('forAgePred.csv')



In [9]:
#別NoteBookの処理で作成・保存した年齢推定モデルの読み込み

import pickle
filename = 'age_pred.sav'
model_age_pred = pickle.load(open(filename, 'rb'))

In [10]:
#Age欠損部分を年齢推定モデルを使い補完する処理まとめ関数

def fillMissingAge(df):

    #モデルを使って予測し結果をリストに保存→Seriesに変換
    l_pred = ['TitlesNum']
    res_pred = model_age_pred.predict(df[l_pred])
    df_res_pred =pd.Series(res_pred, name='prediction')
    df_res_pred = df_res_pred.apply(lambda x: float(x/100))

    df_real_age = df['Age']
    df_real_age = df_real_age.fillna(0)


    #もともとのAge列の欠損値のみPredictの結果で置き換え　(もっといいやり方があるはず)

    df_real_age = df['Age']
    df_real_age = df_real_age.fillna(0)

    l_mix = []

    for i,v in enumerate(df_real_age):
        if v != 0:
            l_mix.append(v)
        else:
            l_mix.append(df_res_pred[i])

    df_mix = pd.Series(l_mix, name='Age')

    #Age列を差し替え
    df = df.drop('Age',axis = 1)
    df = pd.concat((df,df_mix),axis = 1)

    return df

In [11]:

#Age列欠損補完をApply

df_train = fillMissingAge(df_train)

df_test = fillMissingAge(df_test)


In [12]:
#学習・テスト用データセット整備

from sklearn.model_selection import train_test_split

df_train = df_train.fillna(0)
l_features = ['Pclass','male','Q','S','TitlesNum','FamilyNum','Cabin','Age','SibSp','Parch','Fare']
df_x_pack = df_train[l_features]
df_y_pack = df_train.Survived

X_train, X_test, y_train, y_test = train_test_split(df_x_pack, df_y_pack, test_size=0.25)



In [13]:
#XGBoostモデル

def get_xgb_model3(X, Y):
    import xgboost as xgb

    import scipy.stats as st
    from sklearn.model_selection import RandomizedSearchCV

    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    params = {
        "n_estimators": st.randint(3, 40),
        "max_depth": st.randint(3, 40),
        "learning_rate": st.uniform(0.05, 0.4),
        "colsample_bytree": one_to_left,
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        'reg_alpha': from_zero_positive,
        "min_child_weight": from_zero_positive,
    }

    xgbreg = xgb.XGBRegressor(nthreads=-1)


    gs = RandomizedSearchCV(xgbreg, params, n_jobs=3)
    res = gs.fit(X, Y)
    print(gs.best_params_)

    return res


In [14]:
#XGBモデル学習

model_xgb = get_xgb_model3(X_train, y_train)


{'colsample_bytree': 0.98118377227268017, 'gamma': 9.0238107012659405, 'learning_rate': 0.36279978490601211, 'max_depth': 20, 'min_child_weight': 17.783987514305519, 'n_estimators': 34, 'reg_alpha': 0.073455572469056207, 'subsample': 0.92640139774331876}


In [15]:
#Grid Search Cross Validation適用用関数

def applyGSCV(model, param, X, Y):
    from sklearn.model_selection import GridSearchCV

    res = GridSearchCV(model, param, cv=3)
    res.fit(X, Y)

    return res

In [16]:
#Random Forestモデル

from sklearn.ensemble import RandomForestClassifier

obj_param = {
'n_estimators': [5,10,20,30,50,100,300],
'max_depth': [3,5,10,15,20,25,30,40,50,100],
'random_state': [0]
}

model_randForest = applyGSCV(RandomForestClassifier(),obj_param,X_train, y_train)

print('done')

done


In [17]:
#MLPCモデル

from sklearn.neural_network import MLPClassifier
model_MLPC =  MLPClassifier(solver='lbfgs', random_state=0).fit(X_train, y_train)

In [19]:
N = len(l_features)
N

11

In [50]:
#Keras DLモデル

import keras.optimizers
from keras.models import Sequential
from keras.layers.core import Dense, Activation

model_NN = Sequential()
model_NN.add(Dense(2,input_dim=N, activation='sigmoid', kernel_initializer='uniform'))
model_NN.add(Dense(2,activation='softmax', kernel_initializer='uniform'))
sgd = keras.optimizers.SGD(lr = 0.5, momentum = 0.0,decay = 0.0, nesterov = False)
model_NN.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])


In [51]:
from keras.utils import to_categorical

y_train_bin = to_categorical(y_train)
y_test_bin = to_categorical(y_test)


history = model_NN.fit(X_train, y_train_bin, batch_size=100,epochs=1000,verbose=0,validation_data=(X_test, y_test_bin))


In [52]:
history.history['acc']

[0.58682635319447085,
 0.61676645921375939,
 0.61676646171215765,
 0.61676646456746997,
 0.61676646171215765,
 0.61676646742278229,
 0.61377244688079746,
 0.61676647384723504,
 0.64221555922559637,
 0.65718562874251496,
 0.65868264258264786,
 0.67065867870867613,
 0.65269461613215374,
 0.65868263330288279,
 0.66167663921139197,
 0.65269460899387288,
 0.65269461256301331,
 0.64670657647584018,
 0.6541916114127565,
 0.66017963536485225,
 0.66317364698398618,
 0.66317366233128983,
 0.66766465816669118,
 0.66616766788288506,
 0.66916166237014496,
 0.66616765896003405,
 0.67814370400891333,
 0.68413173759768819,
 0.66766467922461958,
 0.67065869584055005,
 0.68862276484152518,
 0.66467065957491989,
 0.64670657897423844,
 0.66317366518660215,
 0.66167664242361834,
 0.66017964428770326,
 0.66317366233128983,
 0.64970060058696544,
 0.66017963179571182,
 0.65568862418214713,
 0.65568862703745956,
 0.6721556979025195,
 0.66467064619064331,
 0.66017963143879776,
 0.66467066118103302,
 0.654191614

In [53]:
history.history['val_acc']

[0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.61434977204275776,
 0.65022421043549949,
 0.65919282431025139,
 0.61883409809103995,
 0.66367714233997155,
 0.6322869842897082,
 0.63677129430086621,
 0.63677128494587709,
 0.66816144299614055,
 0.61434977391375556,
 0.60986547005016178,
 0.65022422299791349,
 0.63228700005954697,
 0.63228700005954697,
 0.63228699685212209,
 0.63228699685212209,
 0.64125559816446009,
 0.64125559816446009,
 0.62331837522609357,
 0.64125560591573671,
 0.63228699685212209,
 0.62780269940337796,
 0.66367711240400651,
 0.64125560137188486,
 0.60986547005016178,
 0.63677132102940648,
 0.64573990977933049,
 0.66816144299614055,
 0.65919282431025139,
 0.62331839553978408,
 0.60986547005016178,
 0.62780269940337796,
 0.63677129430086621,
 0.64573992234174449,
 0.63228699685212209,
 0.62780269940337796,
 0.63228700005954697,
 0.6636771249664204,
 0.641255598

In [54]:
score = model_NN.evaluate(X_test, y_test_bin, verbose = 0)
print(score[0], score[1])

0.523651085092 0.744394616161


In [55]:
#model 比較

from sklearn.metrics import mean_squared_error

models =[]
models.append(model_xgb)
models.append(model_randForest)
models.append(model_MLPC)

for model in models:    
    res_tmp = model.predict(X_test)
    res_mean = mean_squared_error(y_test, res_tmp)
    print('mean squared error:{0} '.format(res_mean))


print(score[0])


mean squared error:0.16106102917539145 
mean squared error:0.21973094170403587 
mean squared error:0.19282511210762332 
0.523651085092


In [68]:

model_final = model_xgb


{'colsample_bytree': 0.74436301228265755, 'gamma': 6.1772715269608813, 'learning_rate': 0.15496312978223037, 'max_depth': 10, 'min_child_weight': 101.83755392360787, 'n_estimators': 12, 'reg_alpha': 0.87831901557616898, 'subsample': 0.64694260899176426}


In [61]:
#モデル保存

import pickle
filename = 'xgb_model.sav'
pickle.dump(model_final, open(filename, 'wb'))

loaded_m_xgb = pickle.load(open(filename, 'rb'))

In [62]:

from sklearn.metrics import mean_squared_error

res_xgb = loaded_m_xgb.predict(X_test)
result = mean_squared_error(y_test, res_xgb)


In [63]:
result

0.16175879616873748

In [64]:
dig_res_xgb = [0 if x <=0.5 else 1 for x in res_xgb]
dig_res_xgb

print(sum(dig_res_xgb == y_test) / len(dig_res_xgb))


0.793721973094


In [65]:
df_X_forSubmit = df_test[l_features]
Y_forSubmit = loaded_m_xgb.predict(df_X_forSubmit)
Y_forSubmit_int = [0 if x <=0.5 else 1 for x in Y_forSubmit]

df_Y = pd.Series(Y_forSubmit_int, name='Survived')
df_submit = df_test.PassengerId.copy()

df_submit = pd.concat((df_submit,df_Y),axis=1)


In [66]:
df_submit.head()
df_submit.to_csv('submission.csv', index = False)