In [7]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore')
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

import copy

# NOTE: machine-specific absolute path kept from the original run; point this
# at your local copy of the Titanic training CSV.
DATA_PATH = '/Users/max/Downloads/titanic_train.csv'

df = pd.read_csv(DATA_PATH)
train_Y = df['Survived']
df = df.drop(['PassengerId', 'Survived'], axis=1)

# Both categorical and numeric features are needed, so apply the simplest
# possible feature engineering: fill missing values with -1, label-encode the
# non-numeric columns, then min-max scale every column.
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for c in df.columns:
    col = df[c]
    if pd.api.types.is_numeric_dtype(col):
        col = col.fillna(-1)
    else:
        # Testing "not numeric" rather than dtype == 'object' also catches
        # string columns that pandas did not store as object dtype; the cast
        # to object lets the -1 sentinel be stored next to the strings.
        col = col.astype(object).fillna(-1)
        # A plain list is passed (as in the original) -- presumably so the
        # mixed str / -1 values are coerced to one uniform array for encoding.
        col = LEncoder.fit_transform(list(col.values))
    # MinMaxScaler expects a 2-D (n_samples, 1) input; flatten the result so a
    # 1-D array is written back into the column.
    scaled = MMEncoder.fit_transform(np.asarray(col, dtype=float).reshape(-1, 1))
    df[c] = scaled.ravel()
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.121348,1.0,0.283951,0.125,0.0,0.769118,0.014151,0.0,1.0
1,0.0,0.213483,0.0,0.481481,0.125,0.0,0.876471,0.139136,0.557823,0.333333
2,1.0,0.396629,0.0,0.333333,0.0,0.0,0.983824,0.015469,0.0,1.0
3,0.0,0.305618,0.0,0.444444,0.125,0.0,0.072059,0.103644,0.380952,1.0
4,1.0,0.016854,1.0,0.444444,0.0,0.0,0.694118,0.015713,0.0,1.0


In [8]:
# Fit gradient-boosted trees, then rank the features by importance, highest first.
# (note: in the D27 assignment 'Ticket' ranked first and 'Age' was the
# highest-ranked numeric feature)
# random_state is fixed so the importance ranking and the cross-validation
# scores that reuse this estimator are reproducible across kernel restarts.
estimator = GradientBoostingClassifier(random_state=42)
estimator.fit(df.values, train_Y)
feats = pd.Series(data=estimator.feature_importances_, index=df.columns)
feats = feats.sort_values(ascending=False)
feats

Sex         0.444443
Pclass      0.111911
Ticket      0.098938
Fare        0.087641
Name        0.082386
Age         0.072261
Cabin       0.070895
Embarked    0.012220
Parch       0.011543
SibSp       0.007760
dtype: float64

In [11]:
# Baseline: all columns currently in df, scored with the gradient-boosting model
# using 5-fold cross-validation.
train_X = df.to_numpy()
baseline_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
baseline_scores.mean()

0.8215972695810002

In [10]:
# High-importance features + gradient boosting: keep only the top half of the
# importance ranking stored in feats, rescale, and cross-validate.
n_keep = len(feats) // 2
high_feature = feats.index[:n_keep]
train_X = MMEncoder.fit_transform(df[high_feature])
high_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
high_scores.mean()

0.811465936966161

In [12]:
# New-feature experiment 1: arithmetic combinations of 'Ticket' and 'Name'.
# The features are built on a new frame (DataFrame.assign returns a new object)
# so df itself is not modified; cells that read df directly therefore give the
# same result no matter when they are re-run.
# The "+ 1" in the denominators avoids division by zero, since the scaled
# columns can contain 0.
df_ticket_name = df.assign(
    Add_char=df['Ticket'] + df['Name'],
    Multi_char=df['Ticket'] * df['Name'],
    GO_div1p=df['Ticket'] / (df['Name'] + 1),
    OG_div1p=df['Name'] / (df['Ticket'] + 1),
)
print(df_ticket_name.shape)
train_X = MMEncoder.fit_transform(df_ticket_name)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

(891, 14)


0.8181885369355912

In [13]:
# New-feature experiment 2: arithmetic combinations of 'Sex' and 'Fare'.
# The features are built on a new frame (DataFrame.assign returns a new object)
# so df itself is not modified; any same-named columns already present in df
# are replaced only in the copy.
# The "+ 1" in the denominators avoids division by zero, since the scaled
# columns can contain 0.
df_sex_fare = df.assign(
    Add_char=df['Sex'] + df['Fare'],
    Multi_char=df['Sex'] * df['Fare'],
    GO_div1p=df['Sex'] / (df['Fare'] + 1),
    OG_div1p=df['Fare'] / (df['Sex'] + 1),
)
print(df_sex_fare.shape)
train_X = MMEncoder.fit_transform(df_sex_fare)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

(891, 14)


0.8204800220725892