# 作業 : (Kaggle)鐵達尼生存預測

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察群聚編碼的效果

# [作業重點]
- 仿造範例, 完成自己挑選特徵的群聚編碼 (In[2], Out[2])
- 觀察群聚編碼, 搭配邏輯斯回歸, 看看有什麼影響 (In[5], Out[5], In[6], Out[6]) 

# 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Load the Titanic training set, keep the label aside, and strip the
# identifier and label columns from the feature frame.
data_path = 'data/'
raw_df = pd.read_csv(data_path + 'titanic_train.csv')

train_Y = raw_df['Survived']
df = raw_df.drop(columns=['PassengerId', 'Survived'])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Group-by encoding: pick one categorical column and one numeric column, then
# summarise the numeric column within each category.
# Here: statistics of Fare within each Ticket group.


def _first_mode(values):
    """Return the smallest most-frequent value, or NaN when the group has no non-null value."""
    modes = values.mode()
    # mode() is empty when every value in the group is missing; indexing [0]
    # on it would raise, so fall back to NaN instead.
    return modes.iloc[0] if not modes.empty else np.nan


df['Ticket'] = df['Ticket'].fillna('NaN')
fare_by_ticket = df.groupby('Ticket')['Fare']

# All statistics come from the same grouped object, so they share one index
# (the sorted Ticket values) and can be assembled without any merge.
# size() counts rows per ticket (including rows with a missing Fare), which
# matches value_counts() on the Ticket column and does not depend on the
# column names that value_counts().reset_index() produces in a given pandas
# version.
temp = pd.DataFrame({
    'Fare_Mean': fare_by_ticket.mean(),
    'Fare_Mode': fare_by_ticket.apply(_first_mode),
    'Fare_Median': fare_by_ticket.median(),
    'Fare_Max': fare_by_ticket.max(),
    'Ticket_counts': fare_by_ticket.size(),
}).rename_axis('Ticket').reset_index()
temp

Unnamed: 0,Ticket,Fare_Mean,Fare_Mode,Fare_Median,Fare_Max,Ticket_counts
0,110152,86.500,86.500,86.500,86.500,3
1,110413,79.650,79.650,79.650,79.650,3
2,110465,52.000,52.000,52.000,52.000,2
3,110564,26.550,26.550,26.550,26.550,1
4,110813,75.250,75.250,75.250,75.250,1
...,...,...,...,...,...,...
676,W./C. 6608,34.375,34.375,34.375,34.375,4
677,W./C. 6609,7.550,7.550,7.550,7.550,1
678,W.E.P. 5734,61.175,61.175,61.175,61.175,1
679,W/C 14208,10.500,10.500,10.500,10.500,1


In [3]:
# Collect the names of the int64 / float64 columns into num_features.
num_features = [
    column for column, kind in df.dtypes.items()
    if kind in ('float64', 'int64')
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# Keep only the numeric columns and mark missing values with -1.
df_num = df[num_features].fillna(-1)
MMEncoder = MinMaxScaler()
df_num.head(10)

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05
5,3,-1.0,0,0,8.4583
6,1,54.0,0,0,51.8625
7,3,2.0,3,1,21.075
8,3,27.0,0,2,11.1333
9,2,14.0,1,0,30.0708


# 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

> 加入群聚編碼的特徵(對"Ticket"做分群的"Fare"的統計量)後，5 折交叉驗證的平均準確率由約 0.6983 略升至約 0.7005，僅有些微改善。

In [4]:
# Baseline: logistic regression on the original numeric features only.
train_X = df_num
estimator = LogisticRegression(solver='newton-cg')
baseline_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(baseline_scores.mean())
df_num.head(10)

0.6982644788418415


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05
5,3,-1.0,0,0,8.4583
6,1,54.0,0,0,51.8625
7,3,2.0,3,1,21.075
8,3,27.0,0,2,11.1333
9,2,14.0,1,0,30.0708


In [5]:
# Merge the per-ticket statistics into the numeric features to build a new
# DataFrame.
# Work on a copy: adding the Ticket key to df_num itself would leave a string
# column in the baseline feature frame (train_X in the baseline cell is the
# same object), so re-running that cell afterwards would break.
keyed_num = df_num.copy()
keyed_num['Ticket'] = df['Ticket']

# validate='many_to_one' makes pandas raise if temp ever holds duplicate
# Ticket keys, which would otherwise add rows and misalign new_df with train_Y.
new_df = pd.merge(keyed_num, temp, how='left', on=['Ticket'],
                  validate='many_to_one')
new_df = new_df.drop(['Ticket'], axis=1)  # drop the non-numeric join key
new_df.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Fare_Mean,Fare_Mode,Fare_Median,Fare_Max,Ticket_counts
0,3,22.0,1,0,7.25,7.25,7.25,7.25,7.25,1
1,1,38.0,1,0,71.2833,71.2833,71.2833,71.2833,71.2833,1
2,3,26.0,0,0,7.925,7.925,7.925,7.925,7.925,1
3,1,35.0,1,0,53.1,53.1,53.1,53.1,53.1,2
4,3,35.0,0,0,8.05,8.05,8.05,8.05,8.05,1
5,3,-1.0,0,0,8.4583,8.4583,8.4583,8.4583,8.4583,1
6,1,54.0,0,0,51.8625,51.8625,51.8625,51.8625,51.8625,1
7,3,2.0,3,1,21.075,21.075,21.075,21.075,21.075,4
8,3,27.0,0,2,11.1333,11.1333,11.1333,11.1333,11.1333,3
9,2,14.0,1,0,30.0708,30.0708,30.0708,30.0708,30.0708,2


In [6]:
# Logistic regression on the original numeric features plus the group-encoded ones.
train_X = new_df
estimator = LogisticRegression(solver='newton-cg')
encoded_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(encoded_scores.mean())
new_df.head(10)

0.70049911571335


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Fare_Mean,Fare_Mode,Fare_Median,Fare_Max,Ticket_counts
0,3,22.0,1,0,7.25,7.25,7.25,7.25,7.25,1
1,1,38.0,1,0,71.2833,71.2833,71.2833,71.2833,71.2833,1
2,3,26.0,0,0,7.925,7.925,7.925,7.925,7.925,1
3,1,35.0,1,0,53.1,53.1,53.1,53.1,53.1,2
4,3,35.0,0,0,8.05,8.05,8.05,8.05,8.05,1
5,3,-1.0,0,0,8.4583,8.4583,8.4583,8.4583,8.4583,1
6,1,54.0,0,0,51.8625,51.8625,51.8625,51.8625,51.8625,1
7,3,2.0,3,1,21.075,21.075,21.075,21.075,21.075,4
8,3,27.0,0,2,11.1333,11.1333,11.1333,11.1333,11.1333,3
9,2,14.0,1,0,30.0708,30.0708,30.0708,30.0708,30.0708,2
