# 作業 : (Kaggle)鐵達尼生存預測
https://www.kaggle.com/c/titanic

In [27]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression

# Read the train and test CSV files from the local data folder.
data_path = 'data/'
df_train, df_test = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('titanic_train.csv', 'titanic_test.csv')
)

# Set aside the 'Survived' column and the test-set passenger ids before
# those columns are dropped from the feature frames.
train_Y = df_train['Survived']
ids = df_test['PassengerId']

# Drop the id / label columns, then stack the training rows on top of the
# test rows so later transformations see both sets at once.
df_train = df_train.drop(columns=['PassengerId', 'Survived'])
df_test = df_test.drop(columns=['PassengerId'])
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Keep only the int64 / float64 (numeric) columns; their names go in num_features.
num_features = [
    column
    for column, column_dtype in zip(df.columns, df.dtypes)
    if column_dtype == 'int64' or column_dtype == 'float64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']



In [48]:
# Drop the text columns so that only the numeric columns remain.
df = df[num_features]

# Cast every int64 column to float64 with a single astype() call, which
# returns a new DataFrame. Assigning column by column into the frame that
# df[num_features] just produced is the chained-assignment pattern that can
# trigger pandas' SettingWithCopyWarning.
int_columns = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == 'int64']
df = df.astype({col: np.float64 for col in int_columns})

# train_Y holds one label per training row, so its length is the number of
# training rows in the stacked frame.
train_num = train_Y.shape[0]
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925
3,1.0,35.0,1.0,0.0,53.1
4,3.0,35.0,0.0,0.0,8.05


# 作業1
* 試著在補空值區塊, 替換並執行兩種以上的缺值填補方式, 看看何者比較好?

# 作業2
* 使用不同的標準化方式 ( 原值 / 最小最大化 / 標準化 )，搭配羅吉斯迴歸模型，何者效果最好?

In [130]:
def compute_score(estimator, filling, scaler=None, cv=5):
    """Return the mean cross-validation score for one imputation / scaling setup.

    Reads the notebook-level variables ``df`` (feature frame), ``train_num``
    (number of leading rows used for training) and ``train_Y`` (labels).

    Parameters
    ----------
    estimator
        Estimator object handed to ``cross_val_score``.
    filling
        Value handed to ``DataFrame.fillna`` to replace missing entries
        (for example a scalar, or a per-column Series such as ``df.mean()``).
    scaler
        Optional object exposing ``fit_transform``. ``None`` (the default)
        leaves the filled values unscaled.
    cv
        Forwarded to ``cross_val_score``. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score``.
    """
    df_temp = df.fillna(filling)

    # Identity comparison is the idiomatic (and safest) way to test for None.
    if scaler is not None:
        # NOTE(review): the scaler is fit on every row of df, not only on the
        # first train_num rows that are scored below -- confirm this is intended.
        df_temp = scaler.fit_transform(df_temp)

    # Slicing by position works whether df_temp is still a DataFrame or was
    # replaced by the scaler's output.
    train_X = df_temp[:train_num]

    return cross_val_score(estimator, train_X, train_Y, cv=cv).mean()

In [142]:
# Candidate models, missing-value fillings and scalers to compare.
# NOTE(review): cross_val_score is called without an explicit `scoring`, so each
# estimator's default scorer is used. The LogisticRegression and LinearRegression
# rows therefore presumably report different metrics (accuracy vs. R^2) and
# should not be ranked against each other -- confirm before drawing conclusions.
estimators = [LogisticRegression(solver='liblinear'), LinearRegression()]
fillings = {
    '-1': -1,
    'zero': 0,
    'mean': df.mean(),
    'max': df.max(),
    'min': df.min()
}
scalers = [None, MinMaxScaler(), StandardScaler()]

columns = ['Estimator', 'Filling', 'Scaler', 'Score']

# Collect one record per combination and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row copies the whole frame on every iteration.
records = []
for estimator in estimators:
    for filling_name, filling in fillings.items():
        for scaler in scalers:
            records.append([
                type(estimator).__name__,
                filling_name,
                # '-' marks the "no scaling" case.
                type(scaler).__name__ if scaler is not None else '-',
                compute_score(estimator, filling, scaler),
            ])
result = pd.DataFrame(records, columns=columns)

result.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Estimator,Filling,Scaler,Score
4,LogisticRegression,zero,MinMaxScaler,0.701629
13,LogisticRegression,min,MinMaxScaler,0.701629
10,LogisticRegression,max,MinMaxScaler,0.701623
7,LogisticRegression,mean,MinMaxScaler,0.69935
2,LogisticRegression,-1,StandardScaler,0.698258
5,LogisticRegression,zero,StandardScaler,0.698258
14,LogisticRegression,min,StandardScaler,0.698258
6,LogisticRegression,mean,-,0.698176
12,LogisticRegression,min,-,0.697154
3,LogisticRegression,zero,-,0.697154
