In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
df_titan = pd.read_csv('titanic_train.csv')

In [None]:
titan_y = df_titan.Survived
titan_x = df_titan.drop(columns=['Survived', 'PassengerId', 'Cabin','Ticket'])

In [None]:
titan_x.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split  
X_train, X_test, Y_train, Y_test = train_test_split(
    titan_x, titan_y, test_size=0.3, random_state=1)  

## Feature Engineering: Title Extraction

In [None]:
def AddTitle(df):
    df['noble'] = df['Name'].str.extract(r'\b(\w+)\.')
    df.loc[~df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 1
    df.loc[df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 0
    df = df.drop(columns=['Name'])
    return df

In [None]:
X_train = AddTitle(X_train)

In [None]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,noble
114,3,female,17.0,0,0,14.4583,C,0
874,2,female,28.0,1,0,24.0,C,0
76,3,male,,0,0,7.8958,S,0
876,3,male,20.0,0,0,9.8458,S,0
674,2,male,,0,0,0.0,S,0


In [None]:
def AddFamily(df):
    df['Family'] = df['Parch'] + df['SibSp']
    df.loc[df['Family']>0, 'Family'] = 1
    df.loc[df['Family']==0, 'Family'] = 0
    df = df.drop(columns=['SibSp', 'Parch'])
    return df


In [None]:
X_train = AddFamily(X_train)


In [None]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,noble,Family
114,3,female,17.0,14.4583,C,0,0
874,2,female,28.0,24.0,C,0,1
76,3,male,,7.8958,S,0,0
876,3,male,20.0,9.8458,S,0,0
674,2,male,,0.0,S,0,0


## ColumnTransformer

### ColumnTransformer for Imputation

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
imp1 = SimpleImputer(strategy='mean')
imp2 = SimpleImputer(strategy='most_frequent')
tf = ColumnTransformer([('imp_age', imp1, [2,3]),('imp_embark', imp2, [4])], remainder='passthrough')
tf = tf.fit(X_train)

In [None]:
X_train = tf.transform(X_train)

In [None]:
X_train[0]

array([17.0, 14.4583, 'C', 3, 'female', 0, 0], dtype=object)

### ColumnTransformer for Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

In [None]:
tf1 = ColumnTransformer([('enca', enc, [2,3,4])], remainder='passthrough')
tf1 = tf1.fit(X_train)

In [None]:
X_train = tf1.transform(X_train)
X_train[0]

array([1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 17.0, 14.4583, 0, 0],
      dtype=object)

### ColumnTransformer for Scaling


In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
tf2 = ColumnTransformer([('scaler', scaler, [8,9])], remainder='passthrough')
tf2 = tf2.fit(X_train)

In [None]:
X_train = tf2.transform(X_train)
X_train[0]

array([0.20584898525148115, 0.028220722145058292, 1.0, 0.0, 0.0, 0.0, 0.0,
       1.0, 1.0, 0.0, 0, 0], dtype=object)

## Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=8)

In [None]:
pca = pca.fit(X_train)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
X_train = pca.transform(X_train)
X_train[0]

array([ 0.63768284, -0.57559412,  1.02450343,  0.598545  ,  0.06882799,
       -0.55713137,  0.26674571, -0.08707042])

## Build Basic Trees

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier 
clf_dt = DecisionTreeClassifier()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()

### AdaBoost Tree

In [None]:
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier()

## Making Pipeline

In [2]:
df_titan = pd.read_csv('titanic_train.csv')
titan_y = df_titan.Survived
titan_x = df_titan.drop(columns=['Survived', 'PassengerId','Ticket','Cabin'])

In [3]:
from sklearn.model_selection import train_test_split  
X_train, X_test, Y_train, Y_test = train_test_split(
    titan_x, titan_y, test_size=0.3, random_state=1) 

In [4]:
def AddFeature(df):
    df['noble'] = df['Name'].str.extract(r'\b(\w+)\.')
    df.loc[~df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 1
    df.loc[df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 0
    df = df.drop(columns=['Name'])
    df['Family'] = df['Parch'] + df['SibSp']
    df.loc[df['Family']>0, 'Family'] = 1
    df.loc[df['Family']==0, 'Family'] = 0
    df = df.drop(columns=['SibSp', 'Parch'])
    return df

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
imp1 = SimpleImputer(strategy='mean')
imp2 = SimpleImputer(strategy='most_frequent')
enc = OneHotEncoder(sparse=False)
scaler = MinMaxScaler()
af = FunctionTransformer(AddFeature)
tf = ColumnTransformer([('impa', imp1, [2,3]),('impb', imp2, [4])], remainder='passthrough')
tf1 = ColumnTransformer([('enca', enc, [2,3,4])], remainder='passthrough')
tf2 = ColumnTransformer([('scalera', scaler, [8,9])], remainder='passthrough')
pca = PCA(n_components=8)

In [6]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
clf_dt = DecisionTreeClassifier()
clf_rf = RandomForestClassifier(max_depth=5, n_estimators=200)
clf_ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)

In [7]:
from sklearn.pipeline import Pipeline
steps = [('add', af),
         ('tf', tf),
         ('tf1', tf1),
         ('tf2', tf2), 
         ('pca', pca),        
         ('c1f', clf_rf)]
pipe = Pipeline(steps)

In [8]:
pipe = pipe.fit(X_train, Y_train)

In [9]:
pipe.score(X_test, Y_test)

0.7835820895522388

# Evaluate the Model

In [10]:
Y_predict = pipe.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_predict)

array([[140,  13],
       [ 45,  70]])

In [12]:
from sklearn.metrics import recall_score
recall_score(Y_test, Y_predict)

0.6086956521739131

In [13]:
from sklearn.metrics import precision_score
precision_score(Y_test, Y_predict)

0.8433734939759037

In [14]:
from sklearn.metrics import f1_score
f1_score(Y_test, Y_predict)

0.7070707070707071

In [None]:
prob = pipe.predict_proba(X_test)

In [20]:
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
fpr, tpr, thresholds = roc_curve(Y_test, prob[:,1])

In [21]:
figure = RocCurveDisplay(fpr=fpr, tpr=tpr)

In [None]:
figure.plot()

# Cross Validation

## K-Fold

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict

In [None]:
kf = KFold()
cross_val_score(pipe, titan_x, titan_y, cv=kf )

array([0.83798883, 0.80337079, 0.82022472, 0.78651685, 0.83146067])

In [None]:
skf = StratifiedKFold()
cross_val_score(pipe, titan_x, titan_y, cv=skf )

array([0.82122905, 0.81460674, 0.82022472, 0.79775281, 0.83707865])

## GridSearch with Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'tf__impa__strategy': ['mean','median'],    
    'c1f__n_estimators': [200,300],
    'c1f__criterion': ['gini','entropy'],
    'c1f__max_depth': [4,5,6],
    'c1f__max_features': ['auto','log2']
}

In [None]:
pipe.get_params().keys()

In [None]:
search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=12)

#no need to split train/test

In [None]:
search = search.fit(titan_x, titan_y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
search.best_score_

0.8204255853367648

In [None]:
search.best_params_

{'c1f__criterion': 'entropy',
 'c1f__max_depth': 5,
 'c1f__max_features': 'auto',
 'c1f__n_estimators': 300,
 'tf__impa__strategy': 'median'}

In [None]:
titan_test = pd.read_csv('titanic_test.csv')
titan_test = titan_test.drop(columns=['PassengerId','Ticket','Cabin'])

In [None]:
titan_test.isnull().sum()

Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [None]:
pred = search.predict(titan_test)

In [None]:
np.savetxt("test.csv", pred, delimiter=",")