# [Kaggle Titanic 데이터셋](https://www.kaggle.com/c/titanic/data)
- 각 컬럼 전처리 후 DT 모델 활용

In [1]:
import pandas as pd
import numpy as np
filepath='D:/downloads/titanic/'

In [2]:
train = pd.read_csv(filepath+'train.csv')
test = pd.read_csv(filepath+'test.csv')
train_test_data = [train,test]

In [3]:
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


__Title__

In [4]:
title_map = {
            'Mr':1,
            'Miss':2,
            'Mrs':3,
            'Master':4, 
}

def match_title(tag):
    try:
        return title_map[tag]
    except KeyError:
        return 5

for dataset in train_test_data:
        dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.')
        dataset.Title.replace(['Mlle','Ms'],'Miss',inplace=True)
        dataset.Title.replace(['Mme','Lady'],'Mrs',inplace=True)
        dataset.Title.replace(['Countess','Capt','Col','Don','Dr','Jonkheer','Major','Rev','Sir'],'Other',inplace=True)
        
        dataset.Title = dataset.Title.map(match_title)

train.Title.value_counts()

1    517
2    185
3    127
4     40
5     22
Name: Title, dtype: int64

__Sex__

In [5]:
for dataset in train_test_data:
    dataset.Sex = dataset.Sex.map({'female':1,'male':0})
train.Sex.value_counts()

0    577
1    314
Name: Sex, dtype: int64

__Embarked__

In [6]:
for dataset in train_test_data:
    null_cnt = dataset.Embarked.isnull().sum()
    rand = np.random.choice(dataset.Embarked.value_counts().index,
                            size=null_cnt,
                            replace=True,
                            p=dataset.Embarked.value_counts(normalize=True))
    
    dataset.Embarked[dataset.Embarked.isnull()]=rand
    dataset.Embarked = dataset.Embarked.map({'S':0,'C':1,'Q':2})

train.Embarked.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.Embarked[dataset.Embarked.isnull()]=rand


0    646
1    168
2     77
Name: Embarked, dtype: int64

__Fare__

In [7]:
for dataset in train_test_data:
    dataset.Fare = dataset.Fare.fillna(dataset.Fare.median())
    dataset.Fare = pd.qcut(dataset.Fare,4).values.codes

train.Fare.value_counts()

1    224
0    223
3    222
2    222
Name: Fare, dtype: int64

__IsAlone__

In [8]:
for dataset in train_test_data:
    familysize = dataset.SibSp+dataset.Parch+1
    dataset['IsAlone']=familysize.apply(lambda x : 0 if x>1 else 1)
    
train.IsAlone.value_counts()

1    537
0    354
Name: IsAlone, dtype: int64

# 1. Age 열 전처리 개선 : 수업 때는 표준편차 범위 내 무작위 수 생성

__Idea : 전체 데이터의 평균과 표준편차를 이용한 범위 대신 호칭(Title)별 신뢰범위를 설정하고 개별로 난수 생성하여 처리__

In [9]:
def match_randint(idx):
    global dataset,mas
    title = dataset.iloc[idx]['Title']
    return np.random.randint(mas.loc[title,'start'],mas.loc[title,'end'])

for dataset in train_test_data:
    mas = dataset.groupby(['Title'])['Age'].describe()[['mean','std']] # means and stds
    mas['start'] = mas['mean']-mas['std']
    mas['end'] = mas['mean']+mas['std']
    fill_array = pd.Series([match_randint(x) for x in dataset[dataset.Age.isnull()].index])
    dataset.Age = dataset.Age.fillna(fill_array)
    

In [10]:
mas

Unnamed: 0_level_0,mean,std,start,end
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,32.0,11.804497,20.195503,43.804497
2,21.774844,10.457716,11.317128,32.23256
3,38.903226,15.03407,23.869156,53.937296
4,7.406471,4.67247,2.734001,12.07894
5,43.833333,8.953584,34.879749,52.786917


In [11]:
for dataset in train_test_data:
    dataset.Age = pd.cut(dataset.Age,5).values.codes
    
train.Age.head(3)

0    1
1    2
2    1
Name: Age, dtype: int8

__1까지만 적용한 모델 평가 (기존 87.2% -> 87.8%)__

In [12]:
from sklearn.tree import DecisionTreeClassifier

features_drop=['Name','SibSp','Parch','Ticket','Cabin',]
xTrain = train.drop(features_drop+['Survived','PassengerId'],axis=1)
yLabel = train['Survived']
xTest = test.drop(features_drop+['PassengerId'],axis=1)

DT = DecisionTreeClassifier()
model = DT.fit(xTrain,yLabel)
prediction = model.predict(xTest)
test['Survived']=prediction
result = test[['PassengerId','Survived']]
result.to_csv('advancedDT.csv',index=False)
print('File Saved')
print(f'Model Accuracy : {model.score(xTrain,yLabel)}')

File Saved
Model Accuracy : 0.8787878787878788


# 2. Cabin 열 전처리 

### EDA

- __`대구분 컬럼 추가 (ex. E54 -> E)`__

In [13]:
train['Cabin_category'] = train.Cabin.dropna().str.extract('([A-Z])')
test['Cabin_category'] = test.Cabin.dropna().str.extract('([A-Z])')
train['Cabin_category'].head(4)

0    NaN
1      C
2    NaN
3      C
Name: Cabin_category, dtype: object

- __`8가지 분류 (단, test 데이터에는 T 분류 항목은 없음)`__

In [14]:
train.Cabin_category.unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [15]:
test.Cabin_category.unique()

array([nan, 'B', 'E', 'A', 'C', 'D', 'F', 'G'], dtype=object)

- __`각 분류 사이즈 확인`__
    1. A,F 가 비교적 적다.
    2. B~E 는 비교적 균등하다.

In [16]:
train.groupby(['Cabin_category']).size()

Cabin_category
A    15
B    47
C    59
D    33
E    32
F    13
G     4
T     1
dtype: int64

- __`룸 타입에 따른 성비`__
    1. A,F 는 비교적 여성 비율이 적다?
    2. G,T 는 남/여 전용 캐빈?

In [17]:
train.groupby(['Cabin_category'])['Sex'].value_counts(normalize=True)

Cabin_category  Sex
A               0      0.933333
                1      0.066667
B               1      0.574468
                0      0.425532
C               0      0.542373
                1      0.457627
D               1      0.545455
                0      0.454545
E               0      0.531250
                1      0.468750
F               0      0.615385
                1      0.384615
G               1      1.000000
T               0      1.000000
Name: Sex, dtype: float64

- __`각 타입별 생존률`__
    - 생존률이 다 높다.. 결측값이 워낙 많아 신뢰할만한 수치 없는듯

In [18]:
train.groupby(['Cabin_category'])['Survived'].mean()

Cabin_category
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
T    0.000000
Name: Survived, dtype: float64

- __`각 타입별+성별 생존률`__
    - 마찬가지로 별 소득은 없다. 이전과 동일하게 여성의 생존 비율이 월등이 높다는것은 알 수 있다.

In [19]:
train.groupby(['Cabin_category','Sex'])['Survived'].mean()

Cabin_category  Sex
A               0      0.428571
                1      1.000000
B               0      0.400000
                1      1.000000
C               0      0.343750
                1      0.888889
D               0      0.466667
                1      1.000000
E               0      0.588235
                1      0.933333
F               0      0.375000
                1      1.000000
G               1      0.500000
T               0      0.000000
Name: Survived, dtype: float64

- __`각 타입별 Fare 분류 비율`__
    - 많은 비용을 지불한 사람들이 각 방에 큰 부분을 차지.

In [20]:
train.groupby(['Cabin_category'])['Fare'].value_counts(normalize=True).unstack()

Fare,0,1,2,3
Cabin_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.066667,,0.266667,0.666667
B,0.06383,,0.106383,0.829787
C,,,0.220339,0.779661
D,,0.121212,0.151515,0.727273
E,,0.21875,0.25,0.53125
F,0.307692,0.230769,0.307692,0.153846
G,,0.5,0.5,
T,,,,1.0


- __`대분류를 숫자 변환 , 상관계수 확인`__
    - Pclass 와 Fare 두 피쳐와 가장 관계 있음

In [21]:
cabin_map = {'A':1,'B':2,'C':3,'D':4,
            'E':5,'F':6,'G':7,'T':8}

train['Cabin_enc'] = train.Cabin_category.map(cabin_map)
test['Cabin_enc'] = test.Cabin_category.map(cabin_map)

In [22]:
train.drop(['PassengerId'],axis=1).corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,IsAlone,Cabin_enc
Survived,1.0,-0.338481,0.543351,0.028792,-0.035322,0.081629,0.299357,0.106811,0.406954,-0.203367,0.018825
Pclass,-0.338481,1.0,-0.1319,-0.327433,0.083081,0.018443,-0.634271,0.045702,-0.171341,0.135207,0.598211
Sex,0.543351,-0.1319,1.0,-0.027237,0.114631,0.245489,0.243613,0.116569,0.502181,-0.303646,0.079704
Age,0.028792,-0.327433,-0.027237,1.0,-0.168004,-0.02388,0.150664,-0.211437,0.031388,0.005895,-0.070446
SibSp,-0.035322,0.083081,0.114631,-0.168004,1.0,0.414838,0.393025,-0.059961,0.269982,-0.584471,0.035571
Parch,0.081629,0.018443,0.245489,-0.02388,0.414838,1.0,0.393881,-0.078665,0.318353,-0.583398,0.023298
Fare,0.299357,-0.634271,0.243613,0.150664,0.393025,0.393881,1.0,-0.098161,0.338509,-0.560279,-0.336375
Embarked,0.106811,0.045702,0.116569,-0.211437,-0.059961,-0.078665,-0.098161,1.0,0.043418,0.017807,-0.191904
Title,0.406954,-0.171341,0.502181,0.031388,0.269982,0.318353,0.338509,0.043418,1.0,-0.405083,-0.023089
IsAlone,-0.203367,0.135207,-0.303646,0.005895,-0.584471,-0.583398,-0.560279,0.017807,-0.405083,1.0,-0.065847


- __`Pclass,Fare 와 상관계수가 높은것을 확인했으니 이에 따른 Cabin분류 분포 확인`__
    - 생존

In [23]:
train.groupby(['Pclass','Fare'])['Cabin_category'].value_counts(normalize=True).unstack()

Unnamed: 0_level_0,Cabin_category,A,B,C,D,E,F,G,T
Pclass,Fare,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,0.25,0.75,,,,,,
1,2,0.114286,0.142857,0.371429,0.142857,0.228571,,,
1,3,0.072993,0.284672,0.335766,0.175182,0.124088,,,0.007299
2,1,,,,0.363636,0.363636,0.272727,,
2,2,,,,,,1.0,,
2,3,,,,,,1.0,,
3,0,,,,,,1.0,,
3,1,,,,,0.6,,0.4,
3,2,,,,,,0.333333,0.666667,


### Preprocessing

- __`Cabin 컬럼 결측값을 Pclass, Fare 분류에 따라 확률을 적용하여 np.random.choice`__

In [24]:
def match_cabin(row):
    global cabin_df
    
    try:
        return np.random.choice(cabin_df.columns.values,1,p=cabin_df.loc[row.Pclass,row.Fare].values).item()
    except TypeError:
        return np.random.choice(cabin_df.columns.values,1).item()

for dataset in train_test_data:
    cabin_df = dataset.groupby(['Pclass','Fare'])['Cabin_category'].value_counts(normalize=True).unstack().fillna(0)
    null_set = dataset[dataset.Cabin.isnull()]
    dataset.Cabin_category = dataset.Cabin_category.fillna(null_set.apply(match_cabin,axis=1))

train.head(5)  # Cabin_category => NaN 값 채움

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,IsAlone,Cabin_category,Cabin_enc
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,1,0,F,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,3,0,C,3.0
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,2,1,E,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,3,0,C,3.0
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,1,1,E,


- __`결측값을 예측하여 채운 Cabin_category 컬럼을 맵핑하여 Cabin 컬럼 업데이트`__

In [25]:
for dataset in train_test_data:
    dataset.Cabin = dataset.Cabin_category.map(cabin_map)
    
train.Cabin.head(5)

0    6
1    3
2    5
3    3
4    5
Name: Cabin, dtype: int64

- __`2까지 적용한 모델 평가 (기존 87.8% -> 90.7%)`__

In [26]:
features_drop=['Name','SibSp','Parch','Ticket','PassengerId','Cabin_category','Cabin_enc']
xTrain = train.drop(features_drop+['Survived'],axis=1)
yLabel = train['Survived']
xTest = test.drop(features_drop+['Survived'],axis=1)

DT = DecisionTreeClassifier()
model = DT.fit(xTrain,yLabel)
prediction = model.predict(xTest)
test['Survived']=prediction
result = test[['PassengerId','Survived']]
result.to_csv('advancedDT2.csv',index=False)
print('File Saved')
print(f'Model Accuracy : {model.score(xTrain,yLabel)}')

File Saved
Model Accuracy : 0.9068462401795735
