<br>

## 1. Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib import rc
import missingno as msno


rc('font', family='Malgun Gothic') # Windows OS 
# rc('font', family='AppleGothic') # os x(mac os)  

%matplotlib inline
plt.style.use("ggplot") # 그래프를 격자 스타일로 (숫자 범위가 눈에 잘 띄도록 ggplot 스타일 사용.)
mpl.rcParams["axes.unicode_minus"] = False # 그래프에서 마이너스 폰트 깨지는 문제 해결을 위해

In [2]:
titanic_df = pd.read_csv("titanic.csv")


del titanic_df['Cabin'] # 너무 많은 결측치가 존재
del titanic_df['PassengerId'] # Passenger 번호는 큰 의미를 갖고있지 않은 일련번호
del titanic_df['Ticket'] # ticket 번호에서 패턴이 확인되지 않음


titanic_df['Title'] = titanic_df['Name'].str.extract('([A-Za-z]+)\.', expand=False)

rare_title = []
for title in set(titanic_df['Title']):
    if list(titanic_df['Title']).count(title) < 10:
        rare_title.append(title)

titanic_df['Title'] = titanic_df['Title'].replace('Mlle', 'Miss') # Mademoiselle
titanic_df['Title'] = titanic_df['Title'].replace('Ms', 'Miss') 
titanic_df['Title'] = titanic_df['Title'].replace('Mme', 'Mrs') # Madame
titanic_df['Title'] = titanic_df['Title'].replace(rare_title, 'Rare')

del titanic_df['Name'] # 호칭만 남김

title_mapping = {"Master":1, "Miss":2, "Mr":3, "Mrs":4, "Rare":5 }

titanic_df['Title'] = titanic_df['Title'].map(title_mapping)
titanic_df['Title'] = titanic_df['Title'].fillna(0)
titanic_df['Title'].astype(int)


sex_mapping = {"male": 0 , "female":1} 
titanic_df['Sex'] = titanic_df['Sex'].map(sex_mapping)


titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

mapping_data ={"S":0, "Q":1, "C":2}
titanic_df["Embarked"] = titanic_df["Embarked"].map(mapping_data)


# titanic_df["Fareband"] = pd.cut(titanic_df["Fare"], 5) # Bin values into discrete intervals.
# titanic_df[['Fareband','Survived']].groupby('Fareband').mean().sort_values(by='Survived', ascending=False)

# del titanic_df['Fareband']

# titanic_df.loc[ titanic_df['Fare'] <= 102, 'Fare'] = 0,
# titanic_df.loc[(titanic_df['Fare'] > 102) & (titanic_df['Fare'] <= 204), 'Fare'] = 1,
# titanic_df.loc[(titanic_df['Fare'] > 204) & (titanic_df['Fare'] <= 307), 'Fare'] = 2,
# titanic_df.loc[ titanic_df['Fare'] > 307, 'Fare'] = 4


titanic_df["FamilySize"] = titanic_df["SibSp"] + titanic_df["Parch"] +1

titanic_df['isAlone'] = 0
titanic_df.loc[titanic_df['FamilySize'] == 1, 'isAlone'] = 1

del titanic_df['SibSp']
del titanic_df['Parch']

# family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
# titanic_df['FamilySize'] = titanic_df['FamilySize'].map(family_mapping)


titanic_df["Age"].fillna(titanic_df.groupby("Title")["Age"].transform("median"), inplace=True)

## 결측치를 별도의 머신러닝 모델을 만들어 채워넣을 수도 있음
# from sklearn.ensemble import RandomForestRegressor

# titanic_df["Age"].fillna(0, inplace=True)
# f_columns = ['Pclass', 'Sex', 'Fare', 'Embarked', 'Title', 'FamilySize']
# AgeNull = titanic_df[titanic_df['Age']==0]
# AgeNotNull = titanic_df[titanic_df['Age']!=0]
# X_train = AgeNotNull[f_columns]

# random_forest = RandomForestRegressor(n_estimators=200)
# random_forest.fit(X_train, AgeNotNull["Age"]) # 나이 정보가 있는 열로 나이 열을 맞추기 위한 모델을 학습

# train_pred = random_forest.predict(AgeNull[f_columns])
# AgeNull["Age"]= train_pred

# titanic_df = AgeNotNull.append(AgeNull)


# titanic_df['AgeBand'] = pd.cut(titanic_df['Age'], 5)
# titanic_df[['AgeBand', 'Survived']].groupby('AgeBand', as_index=False).mean().sort_values(by='AgeBand', ascending=True)

# del titanic_df['AgeBand']

# titanic_df.loc[ titanic_df['Age'] <= 16, 'Age'] = 0,
# titanic_df.loc[(titanic_df['Age'] > 16) & (titanic_df['Age'] <= 32), 'Age'] = 1,
# titanic_df.loc[(titanic_df['Age'] > 32) & (titanic_df['Age'] <= 48), 'Age'] = 2,
# titanic_df.loc[(titanic_df['Age'] > 48) & (titanic_df['Age'] <= 64), 'Age'] = 3,
# titanic_df.loc[ titanic_df['Age'] > 64, 'Age'] = 4

titanic_df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,isAlone
0,0,3,0,22.0,7.25,0,3,2,0
1,1,1,1,38.0,71.2833,2,4,2,0
2,1,3,1,26.0,7.925,0,2,1,1


### - Age : 구간화 제외됨
### - Fare : 구간화 제외됨
### - FamilySize : Re-scaling 제외됨

<br>
<br>

## 2. X-Y Split & Train-Test Split

In [3]:
titanic_target = titanic_df['Survived'].copy()
titanic_data = titanic_df.copy()
del titanic_data['Survived']


from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(titanic_data, titanic_target, 
                                                    test_size=0.3,
                                                    random_state=0)

<br>
<br>

## 3. Make Pipeline for feature-transformer (StandardScaler & OneHotEncoder)

In [4]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
numeric_features = ['Age', 'Fare']
numeric_transformer = StandardScaler()

categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'FamilySize', 'isAlone']
categorical_transformer = OneHotEncoder(categories='auto') # categories='auto' : just for ignoring warning messages

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

<br>
<br>

## 4. Training with single model

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', GradientBoostingClassifier(n_estimators=200))])

In [8]:
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print("model score:", round(accuracy, 4))

model score: 0.8284


<br>
<br>

## 5. Training with multiple models

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [10]:
models = [LogisticRegression(solver='lbfgs'),
          KNeighborsClassifier(n_neighbors=20),
          DecisionTreeClassifier(),
          ExtraTreeClassifier(),
          AdaBoostClassifier(),
          RandomForestClassifier(n_estimators=200),
          GradientBoostingClassifier(n_estimators=200),
          SVC(gamma='auto')] 

In [11]:
# You can make following codes as a function.
# You can show the result knittly as a DataFrame.

for model in models:
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', model)])
    
    model_pipeline.fit(x_train, y_train)
    accuracy = model_pipeline.score(x_test, y_test)
    
    print("model name:", model.__str__()[:10], '...')
    print("model score:", round(accuracy, 4))
    print()

model name: LogisticRe ...
model score: 0.8284

model name: KNeighbors ...
model score: 0.806

model name: DecisionTr ...
model score: 0.7873

model name: ExtraTreeC ...
model score: 0.7799

model name: AdaBoostCl ...
model score: 0.8209

model name: RandomFore ...
model score: 0.7985

model name: GradientBo ...
model score: 0.8284

model name: SVC(gamma= ...
model score: 0.8209



<br>
<br>

## 6. Go deep with Hyper-params

In [12]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', GradientBoostingClassifier(n_estimators=200, random_state=0))])

In [13]:
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print("model score:", round(accuracy, 4))

model score: 0.8284


In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', GradientBoostingClassifier())])

In [16]:
model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__copy', 'preprocessor__num__with_mean', 'preprocessor__num__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__sparse', 'classifier__ccp_alpha', 'classifier__criterion', 'classifier__init', 'classifier__learning_rate', 'classifier__loss', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_iter_no_change', 'classifier__random_state', 

In [17]:
param_grid = {
    'classifier__loss': ['deviance', 'exponential'], # you can exclude "deviance" 
    'classifier__learning_rate': [0.01, 0.001], # you can exclude "0.001" 
    'classifier__n_estimators': [200, 500], # 500, 1000, 1500
    'classifier__min_samples_split': [2, 4, 6],
    'classifier__max_depth': [2, 4, 6],
    'classifier__random_state': [0]
}

grid_search = GridSearchCV(model, param_grid, 
                           refit=True, cv=3, n_jobs=1, verbose=1, scoring= 'accuracy')

grid_search.fit(x_train, y_train)

print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best params: {'classifier__learning_rate': 0.01, 'classifier__loss': 'exponential', 'classifier__max_depth': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200, 'classifier__random_state': 0}


In [18]:
grid_search.refit_time_ # Seconds used for refitting the best model on the whole dataset.

0.2732710838317871

In [19]:
accuracy = grid_search.score(x_test, y_test)
print("model score:", round(accuracy, 4))

model score: 0.8284


In [199]:
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', GradientBoostingClassifier(learning_rate=0.01, 
#                                                                   loss='exponential', 
#                                                                   max_depth=4, 
#                                                                   min_samples_split=2, 
#                                                                   n_estimators=200, 
#                                                                   random_state=0))])

# model.fit(x_train, y_train)

# accuracy = model.score(x_test, y_test)
# print("model score:", round(accuracy, 4))