Pre-Processing

In [None]:
import pandas as pd
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every Python warning for the rest of the session. This also hides
# deprecation and data-conversion warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Data import: read the training set, the test set and the
# gender_submission.csv frame (reused further down as the submission table).
train, test, gender_submisson = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Sex - convert male / female into numeric codes.
# Both frames are encoded against one category list taken from train, so a
# given code always stands for the same value. For train this is exactly the
# alphabetical encoding that astype('category') produces; a test value that
# never occurs in train is encoded as -1.
sex_categories = sorted(train['Sex'].dropna().unique())
for frame in (train, test):
    frame['Sex_clean'] = pd.Categorical(frame['Sex'], categories=sex_categories).codes

In [None]:
# Embarked - the train data contains 2 NaN values -> they need to be filled in

train['Embarked'].isnull().sum()


2

In [None]:
# Count the missing Embarked values in the test frame.
test['Embarked'].isnull().sum()

0

In [None]:
# Frequency of each Embarked value in train (used to pick the fill value).
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# S is the most frequent value, so the NaN entries are filled with S.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train['Embarked'] modifies a column selection, which does not update the
# DataFrame when pandas Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna('S')

# Encode both frames against the category list taken from train so that a
# given code means the same port in train and test. A test value that never
# occurs in train (including NaN) is encoded as -1.
embarked_categories = sorted(train['Embarked'].dropna().unique())
for frame in (train, test):
    frame['Embarked_clean'] = pd.Categorical(
        frame['Embarked'], categories=embarked_categories
    ).codes

In [None]:
# Family
# Family = SibSp + Parch, plus one for the passenger themselves.

for frame in (train, test):
    frame['Family'] = frame['SibSp'] + frame['Parch'] + 1

In [None]:
# Solo - flag column telling apart passengers travelling alone (Family == 1)
# from those travelling with family.

for frame in (train, test):
    frame['Solo'] = frame['Family'].eq(1)

In [None]:
# Fare - simplified by binning into 5 quantile-based intervals.
# The bin edges are learned from the training fares only and then reused for
# the test set. Cutting the test fares at their own quantiles would make the
# same bin position cover a different fare range in each frame.
train['FareBin'], fare_edges = pd.qcut(train['Fare'], 5, retbins=True)

# Open the outermost edges so that test fares below or above the training
# range still fall into the first / last bin instead of becoming NaN.
# Missing test fares are left as NaN.
test_edges = fare_edges.copy()
test_edges[0], test_edges[-1] = float('-inf'), float('inf')
test['FareBin'] = pd.cut(test['Fare'], bins=test_edges)

train['FareBin'].value_counts()



(7.854, 10.5]        184
(21.679, 39.688]     180
(-0.001, 7.854]      179
(39.688, 512.329]    176
(10.5, 21.679]       172
Name: FareBin, dtype: int64

In [None]:
# After binning, turn each fare interval into its numeric category code.

for frame in (train, test):
    fare_categories = frame['FareBin'].astype('category')
    frame['Fare_clean'] = fare_categories.cat.codes

train['Fare_clean'].value_counts()

1    184
3    180
0    179
4    176
2    172
Name: Fare_clean, dtype: int64

In [None]:
# Title - take the first run of letters followed by a period out of Name, then
# merge the titles with few samples into a single 'other' value.
# The pattern is a raw string: inside a normal string literal '\.' is an
# invalid escape sequence, which Python reports with a warning.
title_pattern = r'([A-Za-z]+)\.'
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

for frame in (train, test):
    extracted = frame['Name'].str.extract(title_pattern, expand=False)
    frame['Title'] = extracted.replace(rare_titles, 'other')

train['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
other      23
Mlle        2
Mme         1
Ms          1
Name: Title, dtype: int64

In [None]:
# Merge Mlle, Ms and Mme into the main titles as well.

title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for frame in (train, test):
    frame['Title'] = frame['Title'].replace(title_aliases)


In [None]:
# Title frequencies in train after merging the variants.
train['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
other      23
Name: Title, dtype: int64

In [None]:
# Title frequencies in test after merging the variants.
test['Title'].value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
other       6
Name: Title, dtype: int64

In [None]:
# Title is converted to numeric codes as well.
# Both frames are encoded against the category list taken from train, so a
# code always refers to the same title. Encoding each frame on its own only
# agrees when both happen to contain exactly the same set of titles. A test
# title that is absent from train is encoded as -1.
title_categories = sorted(train['Title'].dropna().unique())
for frame in (train, test):
    frame['Title_clean'] = pd.Categorical(frame['Title'], categories=title_categories).codes

In [None]:
# Age - high importance
# Filling with the median Age grouped by Title was the strategy that gave the highest score
# NOTE(review): train.info() in the first cell reports 714 non-null Age values out of 891,
# so the 0 recorded below was presumably produced after the fill cell had already been
# executed - confirm by re-running from a fresh kernel.

train['Age'].isnull().sum()

0

In [None]:
# Count the missing Age values in the test frame.
test['Age'].isnull().sum()

0

In [None]:
# Fill missing ages with the median Age of the passengers sharing the same
# Title, computed within each frame.
# The filled Series is assigned back to the column: fillna(inplace=True) on
# frame['Age'] acts on a column selection, which does not update the DataFrame
# when pandas Copy-on-Write is active.
for frame in (train, test):
    title_median_age = frame.groupby("Title")["Age"].transform("median")
    frame['Age'] = frame['Age'].fillna(title_median_age)

In [None]:
# Age binning - right-closed intervals mapped to the codes 0..10:
#   <=10 -> 0, (10,16] -> 1, (16,20] -> 2, (20,26] -> 3, (26,30] -> 4,
#   (30,36] -> 5, (36,40] -> 6, (40,46] -> 7, (46,50] -> 8, (50,60] -> 9,
#   >60 -> 10
# Train and test go through the same pd.cut call. The hand-written version
# built the last test mask from train['Age'], so test rows were flagged using
# other passengers' ages and test ages above 60 were left without a code.
age_edges = [float('-inf'), 10, 16, 20, 26, 30, 36, 40, 46, 50, 60, float('inf')]

for frame in (train, test):
    # labels=False returns the position of the interval (0-10). The cast keeps
    # the column as float, as before; a missing Age stays NaN.
    frame['Age_clean'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(float)

In [None]:
# Cabin - take the leading letter, convert it to a numeric value, then apply the
# median grouped by Pclass to all rows

train['Cabin'].str[:1].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [None]:
# Leading cabin letter -> numeric code.
mapping = {
    'A':0,
    'B':1,
    'C':2, 
    'D':3,
    'E':4,
    'F':5, 
    'G':6,
    'T':7
}

for frame in (train, test):
    # Missing cabins, and letters that are not keys of `mapping`, become NaN.
    frame['Cabin_clean'] = frame['Cabin'].str[:1].map(mapping)
    # Apply the median code of the passenger's Pclass to every row. The result
    # is written to 'Cabin_clean' itself: it used to be stored under the
    # misspelled name 'Cabin_clena', which left the NaN values in 'Cabin_clean'.
    frame['Cabin_clean'] = frame.groupby('Pclass')['Cabin_clean'].transform('median')

In [None]:
# Distribution of the Cabin_clean codes in train (NaN is not counted).
train['Cabin_clean'].value_counts()

2.0    59
1.0    47
3.0    33
4.0    32
0.0    15
5.0    13
6.0     4
7.0     1
Name: Cabin_clean, dtype: int64

In [None]:
# Distribution of the Cabin_clean codes in test (NaN is not counted).
test['Cabin_clean'].value_counts()

2.0    35
1.0    18
3.0    13
4.0     9
5.0     8
0.0     7
6.0     1
Name: Cabin_clean, dtype: int64

In [None]:
# Definition of the feature columns and the label column.

label = ['Survived']

feature = [
    # columns taken from the data as-is
    'Pclass', 'SibSp', 'Parch',
    # columns created in the pre-processing cells
    'Sex_clean', 'Embarked_clean', 'Family', 'Solo',
    'Title_clean', 'Age_clean', 'Cabin_clean', 'Fare_clean',
]

In [None]:
#Hyperparameters - to be covered in depth in a separate post

In [None]:
#Cross Validation Score

from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

data = train[feature]
# scikit-learn expects a 1-D target. train[label] is a one-column DataFrame
# (label is a list), so it is flattened to shape (n_samples,).
target = train[label].values.ravel()

# 10 shuffled folds with a fixed seed so the score is reproducible.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

clf = RandomForestClassifier(n_estimators=50, max_depth=6, random_state=0)
# error_score='raise' makes a failing fold stop with its real exception
# instead of being scored as NaN, which would turn the mean into nan.
cross_val_score(clf, data, target, cv=k_fold, scoring='accuracy', error_score='raise').mean()

#Accuracy
#0.8271660424469414

nan

In [None]:
#Make Prediction

x_train = train[feature]
x_test = test[feature]
# Flatten the one-column label DataFrame to the 1-D target that fit() expects.
y_train = train[label].values.ravel()

clf = RandomForestClassifier(n_estimators=50, max_depth=6, random_state=0)
clf.fit(x_train, y_train)

# NOTE(review): the predictions are assigned positionally, which assumes the
# rows of gender_submission.csv are in the same order as test.csv - confirm.
gender_submisson['Survived'] = clf.predict(x_test)
gender_submisson.to_csv('titanic-submission.csv', index=False)

ValueError: ignored