# Titanic

## ライブラリのインポートとデータの読み込み


In [713]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle
 
# Load the training and test CSV files (paths are relative to this notebook).
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Move the target column 'Survived' to the last position of the training frame.
col = [name for name in train.columns.tolist() if name != 'Survived']
col.append('Survived')
train = train[col]

# Show (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep="\n")

(891, 12)
(418, 11)


#### 各項目の説明
* PassengerId – 乗客識別ユニークID
* Survived – 生存フラグ（0=死亡、1=生存）
* Pclass – チケットクラス
* Name – 乗客の名前
* Sex – 性別（male=男性、female=女性）
* Age – 年齢
* SibSp – タイタニックに同乗している兄弟/配偶者の数
* Parch – タイタニックに同乗している親/子供の数
* Ticket – チケット番号
* Fare – 料金
* Cabin – 客室番号
* Embarked – 出港地（タイタニックへ乗った港）

##### Pclass = チケットクラス

* 1 = 上層クラス（お金持ち）
* 2 = 中級クラス（一般階級）
* 3 = 下層クラス（労働階級）

##### Embarked = 出港地（各値の定義は下記の通り）

* C = Cherbourg
* Q = Queenstown
* S = Southampton

In [714]:
# Preview the first two rows of the training frame.
train.iloc[:2]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [715]:
# Preview the first five rows of the test frame (head() defaults to n=5).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [716]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104,0.0
50%,446.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [717]:
# Pairwise correlation of the numeric columns.
# `train` still contains string columns at this point (Name, Sex, Ticket, ...).
# Older pandas dropped them silently, but pandas >= 2.0 raises ValueError,
# so restrict the frame to numeric dtypes explicitly.
train.select_dtypes(include=[np.number]).corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
PassengerId,1.0,-0.035144,0.036847,-0.057527,-0.001652,0.012658,-0.005007
Pclass,-0.035144,1.0,-0.369226,0.083081,0.018443,-0.5495,-0.338481
Age,0.036847,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.077221
SibSp,-0.057527,0.083081,-0.308247,1.0,0.414838,0.159651,-0.035322
Parch,-0.001652,0.018443,-0.189119,0.414838,1.0,0.216225,0.081629
Fare,0.012658,-0.5495,0.096067,0.159651,0.216225,1.0,0.257307
Survived,-0.005007,-0.338481,-0.077221,-0.035322,0.081629,0.257307,1.0


## 前処理

In [718]:
# Check missing values: print the per-column null count for each frame.
for banner, frame in (("------train------", train), ("------test-------", test)):
    print(banner)
    print(frame.isnull().sum())

------train------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64
------test-------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [719]:
# Frequency of each embarkation port in the training data (missing values are not counted).
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [720]:
# Fill missing values and drop the columns that are not used as features.
#
# The fill values are computed from the TRAINING data only and then applied to
# both frames, so train and test go through exactly the same transformation.
# (Filling a column with its own median leaves the median unchanged, so
# re-running this cell is harmless.)
age_fill = train["Age"].median()
fare_fill = train["Fare"].median()
embarked_fill = "S"  # most frequent port in the training data
unused_cols = ['Cabin', 'Name', 'Ticket']

train["Age"] = train["Age"].fillna(age_fill)
train["Embarked"] = train["Embarked"].fillna(embarked_fill)
# Reassign instead of inplace=True, and ignore already-dropped columns, so the
# cell can be re-run without a KeyError.
train = train.drop(unused_cols, axis=1, errors="ignore")

test["Age"] = test["Age"].fillna(age_fill)
test["Embarked"] = test["Embarked"].fillna(embarked_fill)
test["Fare"] = test["Fare"].fillna(fare_fill)
test = test.drop(unused_cols, axis=1, errors="ignore")

In [721]:
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,male,22.0,1,0,7.25,S,0
1,2,1,female,38.0,1,0,71.2833,C,1
2,3,3,female,26.0,0,0,7.925,S,1
3,4,1,female,35.0,1,0,53.1,S,1
4,5,3,male,35.0,0,0,8.05,S,0


In [722]:
# Integer-encode the categorical columns.
#
# Each encoder is fitted on the training column only and then reused for the
# test column, so a given category always maps to the same integer in both
# frames. A category that appears only in the test data raises a ValueError
# from `transform` instead of being silently mis-encoded.
target = ['Pclass', 'Sex', 'Embarked']  # addressed by name, not by position
encoders = {}  # fitted encoder per column, kept for inspection / inverse_transform
for name in target:
    le = LabelEncoder()
    # Assign by column name so the column is replaced and gets an integer dtype.
    train[name] = le.fit_transform(train[name])
    test[name] = le.transform(test[name])
    encoders[name] = le
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,2,1,22.0,1,0,7.25,2,0
1,2,0,0,38.0,1,0,71.2833,0,1
2,3,2,0,26.0,0,0,7.925,2,1
3,4,0,0,35.0,1,0,53.1,2,1
4,5,2,1,35.0,0,0,8.05,2,0


In [723]:
# One-hot encode the categorical features of the training data.
# Every column except the last one is passed in as the feature frame.
# NOTE(review): PassengerId stays in the feature set here; it is an identifier,
# not a predictor. Confirm whether it should be excluded from train and test together.
feature_frame = train.iloc[:, :-1]
dummy = pd.get_dummies(feature_frame, columns=['Sex', 'Pclass', 'Embarked'])
dummy.head(3)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_0,Pclass_1,Pclass_2,Embarked_0,Embarked_1,Embarked_2
0,1,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,2,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1,0,0,1


In [724]:
# One-hot encode the same categorical columns of the test data.
test = pd.get_dummies(test, columns=['Sex','Pclass','Embarked'])
# Align the test columns with the training feature columns (same set, same order).
# A dummy column that is absent from the test data is added and filled with 0,
# so the frame always matches what the model is fitted on.
test = test.reindex(columns=dummy.columns, fill_value=0)
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_0,Pclass_1,Pclass_2,Embarked_0,Embarked_1,Embarked_2
0,892,34.5,0,0,7.8292,0,1,0,0,1,0,1,0
1,893,47.0,1,0,7.0,1,0,0,0,1,0,0,1
2,894,62.0,0,0,9.6875,0,1,0,1,0,0,1,0
3,895,27.0,0,0,8.6625,0,1,0,0,1,0,0,1
4,896,22.0,1,1,12.2875,1,0,0,0,1,0,0,1


In [725]:
# Feature matrix (one-hot encoded training features) and target (last column of train).
X, y = dummy, train.iloc[:, -1]

## 学習

In [726]:
# Random forest used for cross-validation; a fixed random_state keeps runs repeatable.
rf_params = dict(n_estimators=1000, max_features=4, random_state=1, n_jobs=-1)
clf_rf = RandomForestClassifier(**rf_params)

In [727]:
# 10-fold stratified cross-validation: fit on each training split, score on the
# held-out split, and report the mean accuracy over the folds.
kfold = StratifiedKFold(n_splits=10)
tX, ty = X.values, y.values
fold_scores = []
for kf_train, kf_test in kfold.split(tX, ty):
    clf_rf.fit(tX[kf_train], ty[kf_train])
    score = clf_rf.score(tX[kf_test], ty[kf_test])
    fold_scores.append(score)
scores = np.array(fold_scores)
scores.mean()

0.82278033140392692

In [728]:
# Build a fresh forest with the same hyperparameters and fit it on the full training set.
clf_rf = RandomForestClassifier(
    n_estimators=1000,
    max_features=4,
    random_state=1,
    n_jobs=-1,
).fit(X, y)
clf_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [729]:
# Table pairing each feature column name with the importance assigned by the fitted forest.
importances = pd.DataFrame(
    {
        'features_label': X.columns,
        'importances': clf_rf.feature_importances_,
    },
    columns=['features_label', 'importances'],
)
importances

Unnamed: 0,features_label,importances
0,PassengerId,0.186767
1,Age,0.16558
2,SibSp,0.038197
3,Parch,0.028054
4,Fare,0.177928
5,Sex_0,0.140648
6,Sex_1,0.133984
7,Pclass_0,0.026558
8,Pclass_1,0.015029
9,Pclass_2,0.053592


## 後処理

In [730]:
# Predict a class label for every row of the prepared test frame and display the result.
clf_rf.predict(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [731]:
# Wrap the predicted labels in a single-column frame named 'Survived'.
predicted_labels = clf_rf.predict(test)
prediction = pd.DataFrame({'Survived': predicted_labels})

In [732]:
# Put the first column of the test frame next to the predictions and write
# the two-column table to CSV without the index.
passenger_ids = test.iloc[:, 0]
result = pd.concat([passenger_ids, prediction], axis=1)
result.to_csv("01.csv", index=False)