# 作業 : (Kaggle)鐵達尼生存預測
***
- 分數以網站評分結果為準, 請同學實際將提交檔(*.csv)上傳試試看  
https://www.kaggle.com/c/titanic/submit

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀查堆疊泛化 (Stacking) 的寫法與效果

# [作業重點]
- 完成堆疊泛化的寫作, 看看提交結果, 想想看 : 分類與回歸的堆疊泛化, 是不是也與混合泛化一樣有所不同呢?(In[14])  
如果可能不同, 應該怎麼改寫會有較好的結果?  
- Hint : 請參考 mlxtrend 官方網站 StackingClassifier 的頁面說明 : Using Probabilities as Meta-Features
http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/

In [21]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

data_path = '../data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio
Cabin,77.463713
Age,20.091673
Embarked,0.152788
Fare,0.076394


In [23]:
# 以下 In[3]~In[10] 只是鐵達尼預測中的一組特徵工程, 並以此組特徵工程跑參數, 若更換其他特徵工程, In[10]的參數需要重新跑
# Sex : 直接轉男 0 女 1
df["Sex"] = df["Sex"].map({"male": 0, "female":1})
# Fare : 用 log 去偏態, 0 則直接取 0
df["Fare"] = df["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
# Age : 缺值用中位數補
df["Age"] = df["Age"].fillna(df['Age'].median())

In [24]:
# Title 的 特徵工程 : 將各種頭銜按照類型分類, 最後取 One Hot
df_title = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]
df["Title"] = pd.Series(df_title)
df["Title"] = df["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df["Title"] = df["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
df["Title"] = df["Title"].astype(int)
df = pd.get_dummies(df, columns = ["Title"])

In [25]:
# 新建:家庭大小 (Fsize)特徵, 並依照大小分別建獨立欄位
df["Fsize"] = df["SibSp"] + df["Parch"] + 1
df['Single'] = df['Fsize'].map(lambda s: 1 if s == 1 else 0)
df['SmallF'] = df['Fsize'].map(lambda s: 1 if  s == 2  else 0)
df['MedF'] = df['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
df['LargeF'] = df['Fsize'].map(lambda s: 1 if s >= 5 else 0)

In [26]:
# Ticket : 如果不只是數字-取第一個空白之前的字串(去除'.'與'/'), 如果只是數字-設為'X', 最後再取 One Hot
Ticket = []
for i in list(df.Ticket):
    if not i.isdigit() :
        Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0])
    else:
        Ticket.append("X")        
df["Ticket"] = Ticket
df = pd.get_dummies(df, columns = ["Ticket"], prefix="T")

In [27]:
# Cabib 依照第一碼分類, 再取 One Hot
df["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in df['Cabin'] ])
df = pd.get_dummies(df, columns = ["Cabin"], prefix="Cabin")

In [28]:
# Embarked, Pclass 取 One Hot
df = pd.get_dummies(df, columns = ["Embarked"], prefix="Em")
df["Pclass"] = df["Pclass"].astype("category")
df = pd.get_dummies(df, columns = ["Pclass"], prefix="Pc")

# 捨棄 Name 欄位
df.drop(labels = ["Name"], axis = 1, inplace = True)

In [29]:
na_check(df)
df.head()

Unnamed: 0,Missing Ratio


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_0,Title_1,Title_2,Title_3,Fsize,...,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Em_C,Em_Q,Em_S,Pc_1,Pc_2,Pc_3
0,0,22.0,1,0,1.981001,0,0,1,0,2,...,0,0,0,1,0,0,1,0,0,1
1,1,38.0,1,0,4.266662,0,1,0,0,2,...,0,0,0,0,1,0,0,1,0,0
2,1,26.0,0,0,2.070022,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,1
3,1,35.0,1,0,3.972177,0,1,0,0,2,...,0,0,0,0,0,0,1,1,0,0
4,0,35.0,0,0,2.085672,0,0,1,0,1,...,0,0,0,1,0,0,1,0,0,1


In [30]:
# 將資料最大最小化
df = MinMaxScaler().fit_transform(df)

# 將前述轉換完畢資料 df , 重新切成 train_X, test_X
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

# 使用三種模型 : 邏輯斯迴歸 / 梯度提升機 / 隨機森林, 參數使用 Random Search 尋找
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250, max_features=20,
                                  max_depth=6, learning_rate=0.03)
rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)

In [31]:
# 線性迴歸預測檔 (結果有部分隨機, 請以 Kaggle 計算的得分為準, 以下模型同理)
lr.fit(train_X, train_Y)
lr_pred = lr.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': lr_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_lr.csv', index=False) 

In [32]:
# 梯度提升機預測檔 
gdbt.fit(train_X, train_Y)
gdbt_pred = gdbt.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': gdbt_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_gdbt.csv', index=False)

In [33]:
# 隨機森林預測檔
rf.fit(train_X, train_Y)
rf_pred = rf.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': rf_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_rf.csv', index=False)

# 作業
* 分類預測的集成泛化, 也與回歸的很不一樣  
既然分類的 Blending 要變成機率, 才比較容易集成,
那麼分類的 Stacking 要讓第一層的模型輸出機率當特徵, 應該要怎麼寫呢?

In [34]:
from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.70, n_estimators=50, 
                                           max_features='sqrt', max_depth=4, learning_rate=0.3)
"""
Your Code Here
"""
stacking = StackingClassifier(classifiers=[lr_pred,gdbt_pred,rf_pred],meta_classifier=meta_estimator,use_probas = True, average_probas=False)
#stacking = StackingRegressor(regressors=[linear, gdbt, rf], meta_regressor=meta_estimator)

In [35]:
stacking.fit(train_X, train_Y)
stacking_pred = stacking.predict(test_X)
sub = pd.DataFrame({'PassengerId': ids, 'Survived': stacking_pred})
sub.to_csv('titanic_stacking.csv', index=False)

TypeError: Cannot clone object 'array([0.08340212, 0.55613262, 0.51722066, 0.42358863, 0.20582838,
       0.08574485, 0.57523186, 0.81273568, 0.75486179, 0.29563639,
       0.30794037, 0.71764869, 0.52867297, 0.09034345, 0.74618295,
       0.8707411 , 0.82933417, 0.11534467, 0.54398364, 0.6705422 ,
       0.2005181 , 0.29630429, 0.86368801, 0.4639579 , 0.55439105,
       0.20784934, 0.65092898, 0.15140479, 0.66888451, 0.10934534,
       0.18057656, 0.77329309, 0.61256251, 0.09628687, 0.3188428 ,
       0.1193255 , 0.19393324, 0.21359196, 0.3551398 , 0.42456065,
       0.37733557, 0.70601994, 0.0633258 , 0.81499054, 0.83558462,
       0.29531923, 0.25743243, 0.40951282, 0.5789232 , 0.62105185,
       0.86321053, 0.25567982, 0.93488358, 0.52347266, 0.43879016,
       0.01986074, 0.31325346, 0.07103334, 0.3006215 , 0.9749136 ,
       0.08000609, 0.74871765, 0.10673938, 0.87652636, 0.08133186,
       0.954442  , 0.83002346, 0.22104243, 0.79034589, 0.09949577,
       0.25819962, 0.36116651, 0.19108811, 0.30587617, 0.70996505,
       0.58992975, 0.06867362, 0.38790272, 0.79447692, 0.70639523,
       0.1463632 , 0.14183415, 0.63417875, 0.06843917, 0.64776174,
       0.40573201, 0.25007645, 0.16520781, 0.76010558, 0.23265402,
       0.17785693, 0.06825281, 0.77935622, 0.06867362, 0.37005959,
       0.07127019, 0.48544033, 0.55417529, 0.65172188, 0.05623545,
       0.89564311, 0.14710411, 0.24036197, 0.07022888, 0.3005512 ,
       0.15521796, 0.43611866, 0.09118367, 0.06964004, 0.61691997,
       0.27804092, 0.69366848, 0.56794454, 0.72121096, 0.79851978,
       0.10325174, 0.10279896, 0.36540493, 0.57839952, 0.81163034,
       0.46879939, 0.06905939, 0.5858504 , 0.6812939 , 0.24036197,
       0.83512694, 0.07437538, 0.25548672, 0.67456742, 0.07270128,
       0.06630799, 0.28012748, 0.19547066, 0.36930985, 0.05515382,
       0.07238679, 0.72317372, 0.22528377, 0.22605912, 0.01736202,
       0.17807458, 0.9099252 , 0.23803563, 0.1868108 , 0.21894862,
       0.05771651, 0.28291383, 0.34889781, 0.39068514, 0.24330561,
       0.75553596, 0.5354877 , 0.02927092, 0.18198365, 0.0094723 ,
       0.03007115, 0.90027804, 0.20543471, 0.21894862, 0.85522608,
       0.24655246, 0.5385585 , 0.40563839, 0.06699607, 0.7655718 ,
       0.84349069, 0.91546262, 0.03335522, 0.64949048, 0.21210599,
       0.06181058, 0.70181804, 0.3109736 , 0.10484752, 0.01440618,
       0.48842422, 0.94777003, 0.75678392, 0.39279668, 0.49826341,
       0.5744859 , 0.30171793, 0.95662834, 0.78561462, 0.93321289,
       0.24984497, 0.63594822, 0.04979936, 0.01701588, 0.14351098,
       0.51710998, 0.25709364, 0.3673572 , 0.85754001, 0.83092615,
       0.51494946, 0.50924945, 0.21790265, 0.59389967, 0.62483062,
       0.27188169, 0.10842475, 0.26786707, 0.4801325 , 0.17025214,
       0.71635595, 0.22723345, 0.17056046, 0.90268457, 0.1281579 ,
       0.14734524, 0.35850861, 0.1712338 , 0.2533432 , 0.01605614,
       0.86981951, 0.69418346, 0.22217881, 0.97464795, 0.06867362,
       0.43798116, 0.07586374, 0.22780792, 0.07543781, 0.57987049,
       0.29847497, 0.07332674, 0.24662853, 0.06498916, 0.55185921,
       0.76632075, 0.55792873, 0.05506492, 0.41027197, 0.24915407,
       0.36447916, 0.2261575 , 0.47785103, 0.40989236, 0.50438838,
       0.89436498, 0.79854291, 0.34875833, 0.06842644, 0.03162192,
       0.34752906, 0.8891595 , 0.54791616, 0.63251056, 0.339042  ,
       0.49681668, 0.31361846, 0.44700605, 0.07460543, 0.32926484,
       0.33783016, 0.41526467, 0.535229  , 0.83908307, 0.36049153,
       0.05249028, 0.61789913, 0.74882396, 0.44650687, 0.68839548,
       0.06843917, 0.15714755, 0.06873622, 0.7045412 , 0.46139228,
       0.34248937, 0.09118367, 0.92968042, 0.27880199, 0.44736824,
       0.93177484, 0.60044227, 0.12381239, 0.76030483, 0.38460794,
       0.20848655, 0.11224038, 0.24662853, 0.34823227, 0.42517586,
       0.05284024, 0.0751726 , 0.25291864, 0.10484752, 0.30780909,
       0.66951943, 0.82925741, 0.26974517, 0.64764279, 0.06306421,
       0.07042022, 0.55061332, 0.54301537, 0.34323364, 0.53024892,
       0.32259245, 0.25133022, 0.05238329, 0.70158189, 0.24662853,
       0.94254002, 0.73320821, 0.46736436, 0.20915674, 0.81848121,
       0.45758446, 0.66725353, 0.34034306, 0.22189399, 0.5642005 ,
       0.73274437, 0.66617922, 0.26110489, 0.41905512, 0.86945039,
       0.07022888, 0.10924357, 0.58940481, 0.68986144, 0.5867553 ,
       0.45796519, 0.45147433, 0.89965153, 0.5362181 , 0.77786583,
       0.79945493, 0.34734762, 0.14088592, 0.31560102, 0.34022388,
       0.25128068, 0.20963484, 0.73873019, 0.20176583, 0.1368991 ,
       0.89296973, 0.39889319, 0.0082101 , 0.55628773, 0.06123555,
       0.78510479, 0.58940481, 0.69382706, 0.81019561, 0.50272722,
       0.50716316, 0.29429313, 0.16906895, 0.14380172, 0.27451357,
       0.19806481, 0.95263033, 0.33670153, 0.40951282, 0.57468448,
       0.00648308, 0.55802645, 0.82801512, 0.07056829, 0.6206749 ,
       0.02629948, 0.68286886, 0.71166231, 0.89986345, 0.83668085,
       0.40931308, 0.70487359, 0.11903728, 0.13613636, 0.85519654,
       0.92201802, 0.61137957, 0.23473781, 0.20571346, 0.01069517,
       0.40951282, 0.41771097, 0.21199472, 0.6168596 , 0.16847565,
       0.44567166, 0.60652316, 0.47158407, 0.10047632, 0.07679339,
       0.51838952, 0.34599615, 0.09767568, 0.76146233, 0.04911924,
       0.53284365, 0.42133107, 0.60146284, 0.11333851, 0.39834034,
       0.60859687, 0.12274474, 0.92746219, 0.31438745, 0.6975281 ,
       0.27356441, 0.13373973, 0.91451099, 0.24649542, 0.73367023,
       0.24662853, 0.55784279, 0.69818772, 0.06044517, 0.61345708,
       0.32354695, 0.33761132, 0.47633091])' (type <type 'numpy.ndarray'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.