# [鐵達尼生存預測(Kaggle)](https://www.kaggle.com/c/titanic)
以下用鐵達尼生存預測資料, 觀察計數編碼與特徵雜湊的效果

In [48]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Load the Titanic train/test CSV files from the local data directory.
data_path = 'data/'
df_train, df_test = (
    pd.read_csv(data_path + file_name)
    for file_name in ('titanic_train.csv', 'titanic_test.csv')
)

# Set aside the target column and the test-set passenger ids before they are
# removed from the feature frames.
train_Y = df_train['Survived']
ids = df_test['PassengerId']

# Drop the identifier/target columns, then stack train on top of test so that
# both parts are encoded together later.
df_train = df_train.drop(columns=['PassengerId', 'Survived'])
df_test = df_test.drop(columns=['PassengerId'])
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
# Number of training rows; used later to slice the training part back out of
# the concatenated frame.
train_num = len(train_Y)
# Keep only the object-dtype columns and replace missing values with the
# literal string 'None' so they form their own category.
df = df.select_dtypes(include=['object']).fillna('None')
df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [61]:
# Build count encoding, feature hashing and one-hot encoding for comparison.
# Step 1: label-encode every categorical column into integer codes.
df_le = pd.DataFrame()
for i in df.columns:
    df_le[i] = LabelEncoder().fit_transform(df[i])

# Step 2: one-hot encode all label-encoded columns (dense array -> DataFrame).
df_enc = OneHotEncoder().fit_transform(df_le).toarray()
df_enc = pd.DataFrame(df_enc)

# Step 3: count encoding for 'Ticket' - how many rows share each ticket code.
# The former `.agg({'Ticket_Count': 'size'})` dict-renaming form was deprecated
# and later removed from pandas; `.size().reset_index(name=...)` produces the
# same two-column frame ('Ticket', 'Ticket_Count') on old and new versions.
count_df = df_le.groupby('Ticket')['Name'].size().reset_index(name='Ticket_Count')
df_le = pd.merge(df_le, count_df, on=['Ticket'], how='left')

# Step 4: feature hashing - bucket the 'Ticket' code into 10 bins.
# NOTE(review): 'Ticket' is already an integer label at this point, so
# hash(x) % 10 appears to reduce to x % 10 (the displayed sample rows agree);
# confirm whether hashing the raw ticket string was intended instead.
df_le['Ticket_Hash'] = df_le['Ticket'].map(lambda x: hash(x) % 10)

is deprecated and will be removed in a future version
  if __name__ == '__main__':


In [62]:
# Display the (rows, columns) shape of the one-hot encoded frame.
df_enc.shape

(1309, 2429)

In [63]:
# Preview the first rows of the label-encoded frame with the added columns.
df_le.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Count,Ticket_Hash
0,155,1,720,185,3,1,0
1,286,0,816,106,0,2,6
2,523,0,914,185,3,1,4
3,422,0,65,70,3,2,5
4,22,1,649,185,3,1,9


In [70]:
# Baseline: count encoding + logistic regression.
# Drop the raw 'Ticket' code and the hashed column so that only the count
# feature represents the ticket.
df_count = df_le.drop(columns=['Ticket', 'Ticket_Hash'])
train_X = df_count.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))
df_count.head()

0.7766528819375363


Unnamed: 0,Name,Sex,Cabin,Embarked,Ticket_Count
0,155,1,185,3,1
1,286,0,106,0,2
2,523,0,185,3,1
3,422,0,70,3,2
4,22,1,185,3,1


In [71]:
# Feature hashing + logistic regression.
# Keep the hashed ticket column; remove the raw code and the count feature.
df_ha = df_le.drop(columns=['Ticket', 'Ticket_Count'])
train_X = df_ha.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))
df_ha.head()

0.7800237393817333


Unnamed: 0,Name,Sex,Cabin,Embarked,Ticket_Hash
0,155,1,185,3,0
1,286,0,106,0,6
2,523,0,185,3,4
3,422,0,70,3,5
4,22,1,185,3,9


In [72]:
# One-hot encoding + logistic regression.
# The first train_num rows of the encoded frame are the training samples.
train_X = df_enc.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))
df_enc.head()

0.8013346043513216


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2419,2420,2421,2422,2423,2424,2425,2426,2427,2428
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# 作業1
* 參考範例，將鐵達尼的艙位代碼( 'Cabin' )欄位使用特徵雜湊 / 標籤編碼 / 計數編碼三種轉換後， 與其他類別型欄位一起預估生存機率

# 作業2
* 承上題，三者比較效果何者最好?

In [73]:
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Reload the Titanic train/test CSV files for the homework section.
data_path = 'data/'
df_train, df_test = (
    pd.read_csv(data_path + file_name)
    for file_name in ('titanic_train.csv', 'titanic_test.csv')
)

# Keep the target and the test passenger ids before dropping those columns.
train_Y = df_train['Survived']
ids = df_test['PassengerId']

# Remove identifier/target columns and stack train on top of test.
df_train = df_train.drop(columns=['PassengerId', 'Survived'])
df_test = df_test.drop(columns=['PassengerId'])
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [74]:
# Number of training rows, used to slice the training part out later.
train_num = len(train_Y)
# Keep only object-dtype columns; missing values become the string 'None'.
df = df.select_dtypes(include='object').fillna('None')

In [75]:
# Label encoding: fit a separate LabelEncoder on each categorical column and
# collect the resulting integer codes into a new frame.
df_le = pd.DataFrame(
    {col: LabelEncoder().fit_transform(df[col]) for col in df.columns}
)
df_le.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [76]:
# Count encoding: number of rows sharing each 'Cabin' code.
# The former `.agg({'Cabin_Count': 'size'})` dict-renaming form was deprecated
# and later removed from pandas; `.size().reset_index(name=...)` produces the
# same two-column frame ('Cabin', 'Cabin_Count') on old and new versions.
count_df = df_le.groupby('Cabin')['Cabin'].size().reset_index(name='Cabin_Count')
df_le = df_le.merge(count_df, how='left', on='Cabin')

# Feature hashing: bucket the 'Cabin' code into 10 bins.
# NOTE(review): 'Cabin' is already an integer label at this point, so
# hash(x) % 10 appears to reduce to x % 10 (the displayed sample rows agree);
# confirm whether hashing the raw cabin string was intended instead.
df_le['Cabin_Hash'] = df_le['Cabin'].map(lambda x: hash(x) % 10)

is deprecated and will be removed in a future version
  


In [77]:
# Preview the first 10 rows, including the new Cabin_Count / Cabin_Hash columns.
df_le.head(10)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_Count,Cabin_Hash
0,155,1,720,185,3,1014,5
1,286,0,816,106,0,2,6
2,523,0,914,185,3,1014,5
3,422,0,65,70,3,2,0
4,22,1,649,185,3,1014,5
5,818,1,373,185,2,1014,5
6,767,1,109,163,3,2,3
7,914,1,541,185,3,1014,5
8,605,0,477,185,3,1014,5
9,847,0,174,185,0,1014,5


In [78]:
# Label encoding + logistic regression.
# Use the plain 'Cabin' label code; drop the count and hash variants.
df_le2 = df_le.drop(columns=['Cabin_Count', 'Cabin_Hash'])
train_X = df_le2.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.780004837244799


In [79]:
# Count encoding + logistic regression.
# Represent 'Cabin' by its count only; drop the label code and hash column.
df_count = df_le.drop(columns=['Cabin', 'Cabin_Hash'])
train_X = df_count.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.7755670907057873


In [80]:
# Feature hashing + logistic regression.
# Represent 'Cabin' by its hash bucket only; drop the label code and count.
df_ha = df_le.drop(columns=['Cabin', 'Cabin_Count'])
train_X = df_ha.iloc[:train_num]
estimator = LogisticRegression()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.7856039835632975


In [81]:
# Label encoding + gradient boosting.
# Use the plain 'Cabin' label code; drop the count and hash variants.
df_le2 = df_le.drop(columns=['Cabin_Count', 'Cabin_Hash'])
train_X = df_le2.iloc[:train_num]
estimator = GradientBoostingClassifier()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.7991313528291322


In [82]:
# Count encoding + gradient boosting.
# Represent 'Cabin' by its count only; drop the label code and hash column.
df_count = df_le.drop(columns=['Cabin', 'Cabin_Hash'])
train_X = df_count.iloc[:train_num]
estimator = GradientBoostingClassifier()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.7957355289601662


In [83]:
# Feature hashing + gradient boosting.
# Represent 'Cabin' by its hash bucket only; drop the label code and count.
df_ha = df_le.drop(columns=['Cabin', 'Cabin_Count'])
train_X = df_ha.iloc[:train_num]
estimator = GradientBoostingClassifier()
# Mean accuracy over 5-fold cross-validation.
print(np.mean(cross_val_score(estimator, train_X, train_Y, cv=5)))

0.8081393027203827
