In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

### データの準備

In [2]:
# Load the training set, the test set, and the sample submission file
# (the latter is reused later as the template for the submission CSV).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    )
)

In [3]:
# Show the (rows, columns) of the training frame, then its first five rows.
print(train.shape)
display(train.head(n=5))

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Show the (rows, columns) of the test frame, then its first five rows.
print(test.shape)
display(test.head(n=5))

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Show the (rows, columns) of the sample submission, then its first five rows.
print(sub.shape)
display(sub.head(n=5))

(418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### 欠損値の補完

In [6]:
# Stack train on top of test (row-wise) so statistics can be taken over both.
# The original row labels are kept on purpose: later cells slice this frame by
# position and re-attach the pieces to train/test by index label.
df_all = pd.concat([train, test], axis=0)

In [7]:
# Number of missing values per column in the training set
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Number of missing values per column in the test set
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
# Embarked: fill the missing embarkation ports in the training set with 'S'
# (NOTE(review): presumably chosen as the most frequent port - confirm).
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on the result of train['Embarked'] is chained
# assignment: it is deprecated in pandas 2.x and leaves `train` unchanged
# once Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna('S')

In [10]:
# Fare: fill the missing test-set fare with the mean fare over train + test.
fare_avg = df_all['Fare'].mean()
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment and does not update `test` under pandas Copy-on-Write.
test['Fare'] = test['Fare'].fillna(fare_avg)

In [11]:
# Age: fill missing ages in both sets with the mean age over train + test.
age_avg = df_all['Age'].mean()
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment and does not update the frames under pandas Copy-on-Write.
train['Age'] = train['Age'].fillna(age_avg)
test['Age'] = test['Age'].fillna(age_avg)

In [12]:
# Cabin has too many missing values, so drop it from both sets.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])

### データの選別

In [13]:
# PassengerId
# Just a passenger number, unrelated to survival, so drop it from both sets.
train = train.drop(columns=['PassengerId'])
test = test.drop(columns=['PassengerId'])

In [14]:
# Name
# Hard to turn into a numeric feature, so drop it from both sets.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

In [15]:
# Ticket
# Hard to turn into a numeric feature, so drop it from both sets.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

### カテゴリ変数の数値化

In [16]:
# One-hot encode Sex. Train and test are encoded together so both end up with
# the same dummy columns, then split back at the train/test boundary.
n_train = len(train)  # boundary derived from the data instead of a hard-coded 891
sex = pd.concat([train['Sex'], test['Sex']])
# Pin the dtype so the indicators are 0/1 integers on every pandas version
# (pandas 2.x would otherwise return bool columns).
sex_dummies = pd.get_dummies(sex, dtype=np.uint8)

# Positional split; each part keeps the row labels of the frame it came from,
# so the axis=1 concat below aligns row by row.
sex_dummies_train = sex_dummies.iloc[:n_train]
sex_dummies_test = sex_dummies.iloc[n_train:]

train = pd.concat([train, sex_dummies_train], axis=1)
test = pd.concat([test, sex_dummies_test], axis=1)

# The original categorical column is no longer needed.
train = train.drop(columns=['Sex'])
test = test.drop(columns=['Sex'])

In [17]:
# One-hot encode Embarked. Train and test are encoded together so both end up
# with the same dummy columns, then split back at the train/test boundary.
#
# The column is taken from the current train/test frames rather than df_all:
# df_all was built before the missing Embarked values were filled, so encoding
# df_all['Embarked'] would give the imputed rows all-zero dummies and silently
# discard the imputation.
n_train = len(train)  # boundary derived from the data instead of a hard-coded 891
embarked = pd.concat([train['Embarked'], test['Embarked']])
# Pin the dtype so the indicators are 0/1 integers on every pandas version
# (pandas 2.x would otherwise return bool columns).
embarked_dummies = pd.get_dummies(embarked, dtype=np.uint8)

# Positional split; each part keeps the row labels of the frame it came from,
# so the axis=1 concat below aligns row by row.
embarked_dummies_train = embarked_dummies.iloc[:n_train]
embarked_dummies_test = embarked_dummies.iloc[n_train:]

train = pd.concat([train, embarked_dummies_train], axis=1)
test = pd.concat([test, embarked_dummies_test], axis=1)

# The original categorical column is no longer needed.
train = train.drop(columns=['Embarked'])
test = test.drop(columns=['Embarked'])

### 加工後のデータの確認

In [18]:
# Check the processed training frame: (rows, columns), then the first five rows.
print(train.shape)
display(train.head(n=5))

(891, 11)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [19]:
# Check the processed test frame: (rows, columns), then the first five rows.
print(test.shape)
display(test.head(n=5))

(418, 10)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,34.5,0,0,7.8292,0,1,0,1,0
1,3,47.0,1,0,7.0,1,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,1,0
3,3,27.0,0,0,8.6625,0,1,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,1


### 学習の前準備

In [20]:
# Target vector: the Survived column of the training frame.
y = train['Survived'].values
# Feature matrix: every remaining training column.
X = train.drop(columns=['Survived']).values

# Features of the rows to predict for the submission.
X_test = test.values

In [21]:
# Hold out 30% of the labelled rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=71,
)

### モデル構築

In [22]:
# Candidate values the random search samples from.
max_depth_list = list(range(3, 10))
min_child_weight_list = list(range(1, 6))

In [23]:
# Random search
trials = 20
np.random.seed(71)
random_search_params = []

# Hyperparameter exploration: one model per randomly drawn parameter pair.
for i in range(trials):
    # Draw depth first, then child weight, so the seeded stream is consumed
    # in the same order on every run.
    max_depth = np.random.choice(max_depth_list)
    min_child_weight = np.random.choice(min_child_weight_list)
    random_search_params.append((max_depth, min_child_weight))

    model = XGBClassifier(
        booster='gbtree',
        random_state=71,
        learning_rate=0.05,
        objective='binary:logistic',
        n_estimators=100,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
    )
    model.fit(X_train, y_train)

    # Evaluate the model on the training and validation splits.
    train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)

    valid_pred = model.predict(X_valid)
    valid_acc = accuracy_score(y_valid, valid_pred)

    print('max_depth:', max_depth, 'min_child_weight:', min_child_weight)
    print('Train Score:', train_acc, 'Valid Score:', valid_acc)

max_depth: 6 min_child_weight: 4
Train Score: 0.8764044943820225 Valid Score: 0.8470149253731343
max_depth: 3 min_child_weight: 1
Train Score: 0.8475120385232745 Valid Score: 0.8432835820895522
max_depth: 5 min_child_weight: 2
Train Score: 0.8860353130016051 Valid Score: 0.8395522388059702
max_depth: 3 min_child_weight: 3
Train Score: 0.8426966292134831 Valid Score: 0.8432835820895522
max_depth: 9 min_child_weight: 2
Train Score: 0.9036918138041734 Valid Score: 0.8470149253731343
max_depth: 9 min_child_weight: 4
Train Score: 0.8812199036918138 Valid Score: 0.8432835820895522
max_depth: 4 min_child_weight: 1
Train Score: 0.8651685393258427 Valid Score: 0.8507462686567164
max_depth: 3 min_child_weight: 4
Train Score: 0.841091492776886 Valid Score: 0.8507462686567164
max_depth: 8 min_child_weight: 1
Train Score: 0.9213483146067416 Valid Score: 0.832089552238806
max_depth: 3 min_child_weight: 2
Train Score: 0.8523274478330658 Valid Score: 0.835820895522388
max_depth: 3 min_child_weight: 5


In [24]:
# Second pass with a shifted search range.
# The second list is bound to min_child_weight_list (matching max_depth_list
# and the name the random search reads); the original bound it to
# min_child_weight, which left the candidate list stale and overwrote the
# scalar of the same name.
max_depth_list = [2, 3, 4, 5, 6]
min_child_weight_list = [1, 2, 3, 4, 5, 6, 7]

In [25]:
# Grid search: every (max_depth, min_child_weight) pair with depth 2..6 and
# child weight 1..7, depth varying slowest.
grid_search_list = [
    (depth, weight)
    for depth in range(2, 7)
    for weight in range(1, 8)
]

for max_depth, min_child_weight in grid_search_list:
    model = XGBClassifier(
        booster='gbtree',
        random_state=71,
        learning_rate=0.05,
        objective='binary:logistic',
        n_estimators=100,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
    )
    model.fit(X_train, y_train)

    # Evaluate the model on the training and validation splits.
    train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)

    valid_pred = model.predict(X_valid)
    valid_acc = accuracy_score(y_valid, valid_pred)

    print('max_depth:', max_depth, 'min_child_weight:', min_child_weight)
    print('Train Score:', train_acc, 'Valid Score:', valid_acc)

max_depth: 2 min_child_weight: 1
Train Score: 0.8250401284109149 Valid Score: 0.835820895522388
max_depth: 2 min_child_weight: 2
Train Score: 0.8250401284109149 Valid Score: 0.835820895522388
max_depth: 2 min_child_weight: 3
Train Score: 0.8282504012841091 Valid Score: 0.8395522388059702
max_depth: 2 min_child_weight: 4
Train Score: 0.8282504012841091 Valid Score: 0.8395522388059702
max_depth: 2 min_child_weight: 5
Train Score: 0.8298555377207063 Valid Score: 0.8395522388059702
max_depth: 2 min_child_weight: 6
Train Score: 0.8346709470304976 Valid Score: 0.8507462686567164
max_depth: 2 min_child_weight: 7
Train Score: 0.8282504012841091 Valid Score: 0.835820895522388
max_depth: 3 min_child_weight: 1
Train Score: 0.8475120385232745 Valid Score: 0.8432835820895522
max_depth: 3 min_child_weight: 2
Train Score: 0.8523274478330658 Valid Score: 0.835820895522388
max_depth: 3 min_child_weight: 3
Train Score: 0.8426966292134831 Valid Score: 0.8432835820895522
max_depth: 3 min_child_weight: 4
T

In [26]:
# Pick the parameters that looked good in the searches above.
model = XGBClassifier(
    booster='gbtree',
    random_state=71,
    learning_rate=0.05,
    objective='binary:logistic',
    n_estimators=100,
    max_depth=3,
    min_child_weight=4,
)

### 学習

In [27]:
# Train the selected model on the training split; as the last expression of
# the cell, the fitted estimator is also displayed.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=71,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### モデルの評価

In [28]:
# Accuracy of the final model on the training split.
train_pred = model.predict(X_train)
train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)

# Accuracy of the final model on the held-out validation split.
valid_pred = model.predict(X_valid)
valid_acc = accuracy_score(y_true=y_valid, y_pred=valid_pred)

In [29]:
# Report both accuracies, training first.
for label, score in (('Train Score:', train_acc), ('Valid Score:', valid_acc)):
    print(label, score)

Train Score: 0.841091492776886
Valid Score: 0.8507462686567164


### 予測の提出

In [30]:
# Predict survival labels for the test-set feature matrix with the fitted model.
y_pred = model.predict(X_test)

In [31]:
# Overwrite the placeholder Survived column of the submission template with
# the model's predictions (column order is unchanged).
sub = sub.assign(Survived=y_pred)

In [32]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf='submission.csv', index=False)