In [1]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split

df = pd.read_csv('Survived.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C


In [2]:
df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [3]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df.shape

(891, 11)

In [5]:
df['Age'] = df['Age'].fillna(df['Age'].mean()) # Age列を平均値で穴埋め

df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0]) # Embarked列を最頻値で穴埋め

In [6]:
col = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

x = df[col]
t = df['Survived']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, t, test_size = 0.2, random_state = 0)

x_train.shape

(712, 5)

In [8]:
model = tree.DecisionTreeClassifier(max_depth = 5, random_state = 0, class_weight = 'balanced')

model.fit(x_train, y_train) # 学習

In [10]:
model.score(X = x_test, y = y_test)

0.7374301675977654

In [14]:
# x:特徴量 t:正解データ depth:木の深さ
def learn(x, t, depth = 3):
    x_train, x_test, y_train, y_test = train_test_split(x, t, test_size = 0.2, random_state = 0)
    model = tree.DecisionTreeClassifier(max_depth = depth, random_state = 0, class_weight = 'balanced')
    model.fit(x_train, y_train) # 学習
    
    score = model.score(X = x_train, y = y_train)
    score2 = model.score(X = x_test, y = y_test)
    
    return round(score, 3), round(score2, 3), model # round():四捨五入

In [16]:
for j in range(1,15): # jは木の深さ
    train_score, test_score, model = learn(x, t, depth = j)
    sentence = '訓練データの正解率{}'
    sentence2 = 'テストデータの正解率{}'
    total_sentence = '深さ{}:'+sentence+sentence2
    print(total_sentence.format(j, train_score, test_score))

深さ1:訓練データの正解率0.659テストデータの正解率0.704
深さ2:訓練データの正解率0.699テストデータの正解率0.732
深さ3:訓練データの正解率0.704テストデータの正解率0.737
深さ4:訓練データの正解率0.698テストデータの正解率0.726
深さ5:訓練データの正解率0.722テストデータの正解率0.737
深さ6:訓練データの正解率0.77テストデータの正解率0.698
深さ7:訓練データの正解率0.771テストデータの正解率0.648
深さ8:訓練データの正解率0.781テストデータの正解率0.631
深さ9:訓練データの正解率0.83テストデータの正解率0.704
深さ10:訓練データの正解率0.851テストデータの正解率0.687
深さ11:訓練データの正解率0.878テストデータの正解率0.676
深さ12:訓練データの正解率0.892テストデータの正解率0.654
深さ13:訓練データの正解率0.909テストデータの正解率0.654
深さ14:訓練データの正解率0.92テストデータの正解率0.654


In [17]:
df2 = pd.read_csv('Survived.csv')

print(df2['Age'].mean())
print(df2['Age'].median())

29.69911764705882
28.0


In [18]:
df2.groupby('Survived').mean()['Age']

  df2.groupby('Survived').mean()['Age']


Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64

In [19]:
df2.groupby('Pclass').mean()['Age']

  df2.groupby('Pclass').mean()['Age']


Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [20]:
pd.pivot_table(df2, index = 'Survived', columns = 'Pclass', values = 'Age')

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,43.695312,33.544444,26.555556
1,35.368197,25.901566,20.646118


In [21]:
pd.pivot_table(df2, index = 'Survived', columns = 'Pclass', values = 'Age', aggfunc = max)

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,71.0,70.0,74.0
1,80.0,62.0,63.0


In [36]:
is_null = df2['Age'].isnull()
# Pclass1に関する埋め込み
df2.loc[(df2['Pclass'] == 1) & (df2['Survived'] == 0) &(is_null), 'Age'] = 43
df2.loc[(df2['Pclass'] == 1) & (df2['Survived'] == 1) &(is_null), 'Age'] = 35
# Pclass2に関する埋め込み
df2.loc[(df2['Pclass'] == 2) & (df2['Survived'] == 0) &(is_null), 'Age'] = 33
df2.loc[(df2['Pclass'] == 2) & (df2['Survived'] == 1) &(is_null), 'Age'] = 25
# Pclass3に関する埋め込み
df2.loc[(df2['Pclass'] == 3) & (df2['Survived'] == 0) &(is_null), 'Age'] = 26
df2.loc[(df2['Pclass'] == 3) & (df2['Survived'] == 1) &(is_null), 'Age'] = 20
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [37]:
col = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

x = df2[col]
t = df2['Survived']

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x, t, test_size = 0.2, random_state = 0)

x_train.shape

(712, 5)

In [39]:
model = tree.DecisionTreeClassifier(max_depth = 5, random_state = 0, class_weight = 'balanced')

model.fit(x_train, y_train) # 学習

In [40]:
model.score(X = x_test, y = y_test)

0.7262569832402235

In [41]:
for j in range(1,15): # jは木の深さ
    train_score, test_score, model = learn(x, t, depth = j)
    sentence = '訓練データの正解率{}'
    sentence2 = 'テストデータの正解率{}'
    total_sentence = '深さ{}:'+sentence+sentence2
    print(total_sentence.format(j, train_score, test_score))

深さ1:訓練データの正解率0.659テストデータの正解率0.704
深さ2:訓練データの正解率0.699テストデータの正解率0.67
深さ3:訓練データの正解率0.722テストデータの正解率0.715
深さ4:訓練データの正解率0.74テストデータの正解率0.704
深さ5:訓練データの正解率0.76テストデータの正解率0.726
深さ6:訓練データの正解率0.794テストデータの正解率0.793
深さ7:訓練データの正解率0.819テストデータの正解率0.749
深さ8:訓練データの正解率0.84テストデータの正解率0.749
深さ9:訓練データの正解率0.885テストデータの正解率0.743
深さ10:訓練データの正解率0.906テストデータの正解率0.732
深さ11:訓練データの正解率0.93テストデータの正解率0.726
深さ12:訓練データの正解率0.947テストデータの正解率0.737
深さ13:訓練データの正解率0.961テストデータの正解率0.732
深さ14:訓練データの正解率0.969テストデータの正解率0.721
