In [53]:
#Titanic号で実装

#モジュールをインポート

import pandas as pd
import numpy as  np
import seaborn as sns
import matplotlib.pyplot as plt
# % matplotlib inline

## ①ランダムフォレスト

In [54]:
# 最初の10行を表示
df = pd.read_csv('titanic_train.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [55]:
# 欠損値の数を確認
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##②前処理

- Sexをマッピング
- Embarkedを０で補完してからマッピング
- Nameを敬称ごとにマッピング
- Ageを補完する（敬称ごとの平均年齢）



In [56]:
#Sexをマッピング　male:0, female:1

Sex_mapping={"male":0,"female":1}
df["Sex"]=df["Sex"].map(Sex_mapping)

In [57]:
#Enbarkedをマッピング　S:0, C:1, Q:2

embarked_mapping = {"S":0, "C":1, "Q":2}
df["Embarked"] = df["Embarked"].map(embarked_mapping)  

In [58]:
#Embarkedの欠損値を0で補完
df['Embarked'] = df['Embarked'].fillna(0)

In [59]:
# 敬称のみを取り出す()

dataset_title = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]  

df["Title"] = pd.Series(dataset_title)  
    
df["Title"].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

In [60]:
df["Title"].unique()
  ## .unique()で、重複なしの要素を取り出す　

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [61]:
 #敬称をマッピング　→ 17個の名称に対して、0~16の値を振る

name_mapping = {name:i for (i,name) in enumerate(df["Title"].unique())}

name_mapping

{'Mr': 0,
 'Mrs': 1,
 'Miss': 2,
 'Master': 3,
 'Don': 4,
 'Rev': 5,
 'Dr': 6,
 'Mme': 7,
 'Ms': 8,
 'Major': 9,
 'Lady': 10,
 'Sir': 11,
 'Mlle': 12,
 'Col': 13,
 'Capt': 14,
 'the Countess': 15,
 'Jonkheer': 16}

In [62]:
df["Title"] = df["Title"].map(name_mapping)

In [63]:
df["Title"].head()

0    0
1    1
2    2
3    1
4    0
Name: Title, dtype: int64

In [64]:
#Ageを補完する
#敬称ごとの平均年齢


age_mean = df.groupby("Title")["Age"].mean()  


age_mean_map = {i:mean for (i,mean) in enumerate(age_mean)}
age_mean_map


{0: 32.368090452261306,
 1: 35.898148148148145,
 2: 21.773972602739725,
 3: 4.574166666666667,
 4: 40.0,
 5: 43.166666666666664,
 6: 42.0,
 7: 24.0,
 8: 28.0,
 9: 48.5,
 10: 48.0,
 11: 49.0,
 12: 24.0,
 13: 58.0,
 14: 70.0,
 15: 33.0,
 16: 38.0}

In [65]:
df["Age"] = df["Age"].fillna(-1)

for i in range(len(df)):
    if df["Age"][i] == -1:
        df["Age"][i] = age_mean_map[df["Title"][i]]
       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [66]:
#データフレームから説明変数・目的変数を取り出す
X = df.loc[:,["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].values

In [67]:
y = df.loc[:,["Survived"]].values

In [68]:
#ホールドアウト法による分割
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [69]:
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier

In [70]:
rf = RandomForestClassifier()

In [71]:
X_train

array([[ 1.        ,  0.        , 51.        , ...,  0.        ,
        26.55      ,  0.        ],
       [ 1.        ,  1.        , 49.        , ...,  0.        ,
        76.7292    ,  1.        ],
       [ 3.        ,  0.        ,  1.        , ...,  2.        ,
        46.9       ,  0.        ],
       ...,
       [ 3.        ,  0.        , 32.36809045, ...,  0.        ,
         7.7333    ,  2.        ],
       [ 3.        ,  1.        , 36.        , ...,  0.        ,
        17.4       ,  0.        ],
       [ 2.        ,  0.        , 60.        , ...,  1.        ,
        39.        ,  0.        ]])

In [72]:
rf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
rf.score(X_test, y_test)

0.8022388059701493

In [74]:
#グリッドサーチを用いて最適なパラメータを利用
param_grid = {'n_estimators': [i for i in range(400, 450, 10)],
              'max_depth':  [i for i in range(4, 5)]}

In [75]:
from sklearn.model_selection import GridSearchCV

forest = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

In [76]:
y_train = y_train.reshape(-1, 1)

In [77]:
forest.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [400, 410, 420, 430, 440], 'max_depth': [4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [78]:
forest.best_params_ 

{'max_depth': 4, 'n_estimators': 410}

In [79]:
#scoreの表示 
forest.score(X_test, y_test.reshape(1,-1)[0])

0.8134328358208955

## ②SVM

##前処理

- Sexをdummy変数化
- Embarkedをdummy変数化
- Nameをダミー変数化
- 年齢を補完

In [80]:
df2 = pd.read_csv("titanic_train.csv")

In [81]:
sex_dummies = pd.get_dummies(df2['Sex'])
df2 = pd.concat([df2, sex_dummies], axis=1)

In [82]:
embarked_dummies = pd.get_dummies(df2["Embarked"])
df2 = pd.concat([df2, embarked_dummies], axis=1)

In [83]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in df2["Name"]]
df2["Title"] = pd.Series(dataset_title)
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0,0,1,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,0,0,1,Mr


In [84]:
title_dummies = pd.get_dummies(df2["Title"])
df2 = pd.concat([df2, title_dummies], axis=1)

In [85]:
#Ageを補完
#敬称ごとに年齢の平均値で補完する
df2["Title"] = df2["Title"].map(name_mapping)
df2["Age"] = df2["Age"].fillna(-1)

for i in range(len(df2)):
    if df2["Age"][i] == -1:
        df2["Age"][i] = age_mean_map[df2["Title"][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [86]:
df2.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'female', 'male', 'C',
       'Q', 'S', 'Title', 'Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady',
       'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev',
       'Sir', 'the Countess'],
      dtype='object')

In [87]:

X2 = df2.loc[:,['Pclass', 'Age', 'SibSp','Parch', 'Fare','female', 'male', 'C',
       'Q', 'S', 'Title', 'Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady',
       'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev',
       'Sir', 'the Countess']].values

In [88]:
y2 =  df2.loc[:, ["Survived"]].values

In [89]:
from sklearn.preprocessing import StandardScaler

# 標準化を行う
scaler = StandardScaler()
scaler.fit(X2)
X2_std = scaler.transform(X2)

In [90]:
X2_std

array([[ 0.82737724, -0.58438725,  0.43279337, ..., -0.0823387 ,
        -0.03352008, -0.03352008],
       [-1.56610693,  0.62136484,  0.43279337, ..., -0.0823387 ,
        -0.03352008, -0.03352008],
       [ 0.82737724, -0.28294923, -0.4745452 , ..., -0.0823387 ,
        -0.03352008, -0.03352008],
       ...,
       [ 0.82737724, -0.60142056,  0.43279337, ..., -0.0823387 ,
        -0.03352008, -0.03352008],
       [-1.56610693, -0.28294923, -0.4745452 , ..., -0.0823387 ,
        -0.03352008, -0.03352008],
       [ 0.82737724,  0.1692078 , -0.4745452 , ..., -0.0823387 ,
        -0.03352008, -0.03352008]])

#ホールドアウト法による分割：テストデータの割合は3割、random_stateは0、変数は「X_train, X_test, y_train, y_test」を使用

In [98]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(X2_std, y2, test_size=0.3, random_state=0)

In [100]:
from sklearn.svm import SVC
svm=SVC()
svm.fit(X_train,y_train)
svm.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.7313432835820896