In [80]:
import numpy as np
from numpy.core.umath_tests import inner1d
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
pd.get_option("display.max_columns")
pd.set_option('display.max_columns', 50)

In [3]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [5]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
train_PassengerId = train_df['PassengerId']
test_PassengerId = test_df['PassengerId']

In [7]:
test_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

どれを残すか、どのobject(文字列)をカテゴリに変換するか考える

PassengerIdは保管したので学習しないようにdrop 
Nameはいらないからdrop  
Ticket、Cabin、Embarkedは詳細を見てから  

In [8]:
train_df.drop(['PassengerId','Name'], axis=1, inplace=True)
test_df.drop(['PassengerId','Name'], axis=1, inplace=True)

In [11]:
train_df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

AgeとCabinとEmbarkedに欠損値がある。
Cabinは多すぎるのでdropさせることに決定

In [12]:
train_df.drop(['Cabin'], axis=1, inplace=True)
test_df.drop(['Cabin'], axis=1, inplace=True)

In [13]:
train_df['Ticket'].unique

<bound method Series.unique of 0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
5                330877
6                 17463
7                349909
8                347742
9                237736
10              PP 9549
11               113783
12            A/5. 2151
13               347082
14               350406
15               248706
16               382652
17               244373
18               345763
19                 2649
20               239865
21               248698
22               330923
23               113788
24               349909
25               347077
26                 2631
27                19950
28               330959
29               349216
             ...       
861               28134
862               17466
863            CA. 2343
864              233866
865              236852
866       SC/PARIS 2149
867            PC 17590
868              345777
869              347742
870      

Ticketはユニークなのでdropする

In [14]:
train_df.drop(['Ticket'], axis=1, inplace=True)
test_df.drop(['Ticket'], axis=1, inplace=True)

Sexをカテゴリ化してダミー変数化する

In [None]:
train_df['Sex'] = pd.Categorical(train_df['Sex'])
test_df['Sex'] = pd.Categorical(test_df['Sex'])

In [15]:
train_df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [16]:
dummy_vars_train = pd.get_dummies(train_df['Sex'], drop_first=False)
dummy_vars_test = pd.get_dummies(test_df['Sex'], drop_first=False)

In [17]:
train_df = pd.concat([train_df, dummy_vars_train], axis=1)
test_df = pd.concat([test_df, dummy_vars_test], axis=1)

Sexをダミー変数化できたので、Sexをdrop

In [19]:
train_df.drop(['Sex'], axis=1, inplace=True)
test_df.drop(['Sex'], axis=1, inplace=True)

In [20]:
train_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,female,male
0,0,3,22.0,1,0,7.2500,S,0,1
1,1,1,38.0,1,0,71.2833,C,1,0
2,1,3,26.0,0,0,7.9250,S,1,0
3,1,1,35.0,1,0,53.1000,S,1,0
4,0,3,35.0,0,0,8.0500,S,0,1
5,0,3,,0,0,8.4583,Q,0,1
6,0,1,54.0,0,0,51.8625,S,0,1
7,0,3,2.0,3,1,21.0750,S,0,1
8,1,3,27.0,0,2,11.1333,S,1,0
9,1,2,14.0,1,0,30.0708,C,1,0


Pclassが数値データになっているので、カテゴリデータ化してダミー変数化  
1 = 1st  
2 = 2nd  
3 = 3rd

In [28]:
train_df['Pclass'].replace({1:"1st", 2:"2nd", 3:"3rd"}, inplace=True)
test_df['Pclass'].replace({1:"1st", 2:"2nd", 3:"3rd"}, inplace=True)

In [31]:
dummy_vars_train = pd.get_dummies(train_df['Pclass'], drop_first=False)
dummy_vars_test = pd.get_dummies(test_df['Pclass'], drop_first=False)

In [33]:
train_df = pd.concat([train_df, dummy_vars_train], axis=1)
test_df = pd.concat([test_df, dummy_vars_test], axis=1)

In [35]:
train_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

In [36]:
train_df.dtypes

Survived      int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
female        uint8
male          uint8
1st           uint8
2nd           uint8
3rd           uint8
dtype: object

In [41]:
train_df['Embarked']

0      S
1      C
2      S
3      S
4      S
5      Q
6      S
7      S
8      S
9      C
10     S
11     S
12     S
13     S
14     S
15     S
16     Q
17     S
18     S
19     C
20     S
21     S
22     Q
23     S
24     S
25     S
26     C
27     S
28     Q
29     S
      ..
861    S
862    S
863    S
864    S
865    S
866    C
867    S
868    S
869    S
870    S
871    S
872    S
873    S
874    C
875    C
876    S
877    S
878    S
879    C
880    S
881    S
882    S
883    S
884    S
885    Q
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [42]:
train_df['Embarked'].replace({'C':"Cherbourg", 'Q':"Queenstown", 'S':"Southampton"}, inplace=True)
test_df['Embarked'].replace({'C':"Cherbourg", 'Q':"Queenstown", 'S':"Southampton"}, inplace=True)

In [44]:
dummy_vars_train = pd.get_dummies(train_df['Embarked'], drop_first=False)
dummy_vars_test = pd.get_dummies(test_df['Embarked'], drop_first=False)

In [45]:
train_df = pd.concat([train_df, dummy_vars_train], axis=1)
test_df = pd.concat([test_df, dummy_vars_test], axis=1)

In [47]:
train_df.drop(['Embarked'], axis=1, inplace=True)
test_df.drop(['Embarked'], axis=1, inplace=True)

In [52]:
train_df.isnull().sum()

Survived         0
Age            177
SibSp            0
Parch            0
Fare             0
female           0
male             0
1st              0
2nd              0
3rd              0
Cherbourg        0
Queenstown       0
Southampton      0
dtype: int64

In [53]:
train_df.isnull().sum()

Survived         0
Age            177
SibSp            0
Parch            0
Fare             0
female           0
male             0
1st              0
2nd              0
3rd              0
Cherbourg        0
Queenstown       0
Southampton      0
dtype: int64

In [60]:
pd.concat([train_df['Age'],test_df['Age']]).median()

28.0

Nanは中央値で埋める

In [61]:
train_df.fillna(pd.concat([train_df['Age'],test_df['Age']]).median(), inplace=True)
test_df.fillna(pd.concat([train_df['Age'],test_df['Age']]).median(), inplace=True)

In [62]:
test_df.fillna(pd.concat([train_df['Fare'],test_df['Fare']]).median(), inplace=True)

In [63]:
train_df.isnull().sum()

Survived       0
Age            0
SibSp          0
Parch          0
Fare           0
female         0
male           0
1st            0
2nd            0
3rd            0
Cherbourg      0
Queenstown     0
Southampton    0
dtype: int64

In [64]:
test_df.isnull().sum()

Age            0
SibSp          0
Parch          0
Fare           0
female         0
male           0
1st            0
2nd            0
3rd            0
Cherbourg      0
Queenstown     0
Southampton    0
dtype: int64

これで欠損値もなく、数値データになったので、学習に食わせてみる。  
RandomForestと線形回帰を試してみよう

In [76]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['Survived'],axis=1), train_df['Survived'], random_state=0)

#### ロジスティック回帰

In [126]:
LR = LogisticRegression(C=0.01)
LR = LR.fit(X_train, y_train)

In [127]:
print("Training set score: {:.3f}".format(LR.score(X_train, y_train)))
print("test set score: {:.3f}".format(LR.score(X_test, y_test)))

Training set score: 0.771
test set score: 0.794


K分割交差法

In [143]:
logreg = LogisticRegression()
scores = cross_val_score(logreg, X_train, y_train)

In [144]:
scores = cross_val_score(logreg, train_df.drop(['Survived'],axis=1), train_df['Survived'], cv=10)
LRscores = cross_val_score(LR, train_df.drop(['Survived'],axis=1), train_df['Survived'], cv=10)

In [145]:
scores.mean()

0.8003209056860742

#### RandomForest

In [139]:
forest = RandomForestClassifier(n_estimators=8, random_state=2)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

Accuracy on training set: 0.958
Accuracy on test set: 0.816


In [152]:
RFscores = cross_val_score(forest, train_df.drop(['Survived'],axis=1), train_df['Survived'], cv=10)

In [153]:
RFscores.mean()

0.8193723754397911

とりあえずランダムフォレストで予測をする

In [154]:
prediction = forest.predict(test_df)

In [160]:
prediction_df = pd.DataFrame(prediction)

In [162]:
test_PassengerId

0       892
1       893
2       894
3       895
4       896
5       897
6       898
7       899
8       900
9       901
10      902
11      903
12      904
13      905
14      906
15      907
16      908
17      909
18      910
19      911
20      912
21      913
22      914
23      915
24      916
25      917
26      918
27      919
28      920
29      921
       ... 
388    1280
389    1281
390    1282
391    1283
392    1284
393    1285
394    1286
395    1287
396    1288
397    1289
398    1290
399    1291
400    1292
401    1293
402    1294
403    1295
404    1296
405    1297
406    1298
407    1299
408    1300
409    1301
410    1302
411    1303
412    1304
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [163]:
submit_df = pd.concat([test_PassengerId,prediction_df],axis=1)

In [169]:
submit_df.rename(columns={0:'Survived'}, inplace=True)

In [177]:
submit_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [178]:
submit_df.to_csv("./result/submission.csv", index=False)