In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from category_encoders.one_hot import OneHotEncoder


sns.set(style='darkgrid')

titanic = pd.read_csv("data/titanic_train.csv")
titanic.drop(['Ticket', 'PassengerId'], axis=1, inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [5]:
titanic.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [6]:
titanic['Age'] = titanic['Age'].fillna(-99)

# Feature Set 1: Label Encoder

In [7]:
def eng_cabin1(df, col = 'Cabin'):
    df = df.copy()
    df[col] = df[col].str[0]
    df[col] = df[col].fillna('Missing')
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    return df

def eng_name1(df, col = 'Name', newcol = 'title'):
    df = df.copy()
    df[newcol] = df[col].str.extract('(Mrs|Mr|Miss)', expand = False).fillna("other")
    df.drop(col, axis=1, inplace=True)
    enc = LabelEncoder()
    df[newcol] = enc.fit_transform(df[newcol])
    return df

def eng_embarked1(df, col = 'Embarked'):
    df = df.copy()
    most_frequent_value = df[col].value_counts().head(1).index[0]
    df[col].fillna(most_frequent_value, inplace=True)
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    return df

def eng_sex1(df, col = 'Sex'):
    df = df.copy()
    enc = df[col].value_counts() / len(titanic)
    df[col] = df[col].map(enc)
    return df

In [8]:
X1 = titanic.pipe(eng_cabin1) \
            .pipe(eng_name1) \
            .pipe(eng_embarked1) \
            .pipe(eng_sex1)
X1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title
0,0,3,0.647587,22.0,1,0,7.25,7,2,1
1,1,1,0.352413,38.0,1,0,71.2833,2,0,2
2,1,3,0.352413,26.0,0,0,7.925,7,2,0
3,1,1,0.352413,35.0,1,0,53.1,2,2,2
4,0,3,0.647587,35.0,0,0,8.05,7,2,1


# Feature Set 2: OneHotEncoder

In [167]:
def eng_cabin2(df, col = 'Cabin'):
    df = df.copy()
    df[col] = df[col].str[0]
    df[col] = df[col].fillna('Missing')
    enc = OneHotEncoder(cols=col, return_df=True, use_cat_names=True)
    df = enc.fit_transform(df)
    return df

def eng_name2(df, col = 'Name', newcol = 'title'):
    df = df.copy()
    df[newcol] = df[col].str.extract('(Mrs|Mr|Miss)', expand = False).fillna("other")
    df.drop(col, axis=1, inplace=True)
    enc = OneHotEncoder(cols=newcol, return_df=True, use_cat_names=True)
    df = enc.fit_transform(df)
    return df

def eng_embarked2(df, col = 'Embarked'):
    df = df.copy()
    most_frequent_value = df[col].value_counts().head(1).index[0]
    df[col].fillna(most_frequent_value, inplace=True)
    enc = OneHotEncoder(cols=col, return_df=True, use_cat_names=True)
    df = enc.fit_transform(df)
    return df

def eng_sex2(df, col = 'Sex'):
    df = df.copy()
    enc = df[col].value_counts() / len(titanic)
    df[col] = df[col].map(enc)
    return df

In [168]:
X2 = titanic.pipe(eng_cabin2) \
            .pipe(eng_name2) \
            .pipe(eng_embarked2) \
            .pipe(eng_sex2)
X2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Missing,Cabin_C,Cabin_E,...,Cabin_B,Cabin_F,Cabin_T,Embarked_S,Embarked_C,Embarked_Q,title_Mr,title_Mrs,title_Miss,title_other
0,0,3,0.647587,22.0,1,0,7.25,1,0,0,...,0,0,0,1,0,0,1,0,0,0
1,1,1,0.352413,38.0,1,0,71.2833,0,1,0,...,0,0,0,0,1,0,0,1,0,0
2,1,3,0.352413,26.0,0,0,7.925,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,1,0.352413,35.0,1,0,53.1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
4,0,3,0.647587,35.0,0,0,8.05,1,0,0,...,0,0,0,1,0,0,1,0,0,0


In [169]:
X1.shape, X2.shape

((891, 10), (891, 23))

# Random Forest

In [170]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

kf = KFold(n_splits=10, shuffle=True, random_state=28)
rf = RandomForestClassifier(n_estimators=500, random_state=42)
gbm = GradientBoostingClassifier(n_estimators=500, random_state=42)

rf_mean_accuracy = cross_val_score(rf, X1.drop('Survived', axis=1), X1['Survived'], cv=kf, scoring='accuracy', n_jobs=-1).mean()
gbm_mean_accuracy = cross_val_score(gbm, X1.drop('Survived', axis=1), X1['Survived'], cv=kf, scoring='accuracy', n_jobs=-1).mean()
print(f"RF Accuracy -- X1: {rf_mean_accuracy}")
print(f"GBM Accuracy -- X1: {gbm_mean_accuracy}")

RF Accuracy -- X1: 0.8114856429463171
GBM Accuracy -- X1: 0.8080898876404495


In [171]:
rf_mean_accuracy = cross_val_score(rf, X2.drop('Survived', axis=1), X2['Survived'], cv=kf, scoring='accuracy', n_jobs=-1).mean()
gbm_mean_accuracy = cross_val_score(gbm, X2.drop('Survived', axis=1), X2['Survived'], cv=kf, scoring='accuracy', n_jobs=-1).mean()
print(f"RF Accuracy -- X1: {rf_mean_accuracy}")
print(f"GBM Accuracy -- X1: {gbm_mean_accuracy}")

RF Accuracy -- X1: 0.8013732833957553
GBM Accuracy -- X1: 0.8036204744069912


# Stack

In [172]:
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

In [173]:
from itertools import product
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

kf_out = KFold(n_splits=10, shuffle=True, random_state=28)
kf_in = KFold(n_splits=10, shuffle=True, random_state=28)

In [142]:
cv_mean = []
for fold, (tr, ts) in enumerate(kf_out.split(X, y)):
    X1_train, X1_test = X1.iloc[tr], X1.iloc[ts]
    X2_train, X2_test = X2.iloc[tr], X2.iloc[ts]
    y_train, y_test = y.iloc[tr], y.iloc[ts]
    
    models = [rf, gbm]
    feature_sets = [(X1_train, X1_test), (X2_train, X2_test)]
    
    predictions_cv = []
    predictions_test = []
    
    for model, feature_set in product(models, feature_sets):
        predictions_cv.append(cross_val_predict(model, feature_set[0], y_train, cv=kf_in, n_jobs=-1).reshape(-1,1))
        model.fit(feature_set[0], y_train)
        ptest = model.predict(feature_set[1])
        predictions_test.append(ptest.reshape(-1,1))
        
    predictions_cv = np.concatenate(predictions_cv, axis=1)
    predictions_test = np.concatenate(predictions_test, axis=1)
    
    stacker = LogisticRegression(solver='lbfgs')
    stacker.fit(predictions_cv, y_train)
    y_pred = stacker.predict(predictions_test)
    
    error = accuracy_score(y_test, y_pred)
    cv_mean.append(error)
    print(f"Fold {fold} - Accuracy: {error}")
    
print(f'Mean Accuracy CV10 {np.mean(cv_mean)}')

Fold 0 - Accuracy: 1.0
Fold 1 - Accuracy: 1.0
Fold 2 - Accuracy: 1.0
Fold 3 - Accuracy: 1.0
Fold 4 - Accuracy: 1.0
Fold 5 - Accuracy: 1.0
Fold 6 - Accuracy: 1.0
Fold 7 - Accuracy: 1.0
Fold 8 - Accuracy: 1.0
Fold 9 - Accuracy: 1.0
Mean Accuracy CV10 1.0


# Predicting on the Test Set

In [174]:
titanic_test = pd.read_csv("data/titanic_test.csv")

titanic_test.drop('Ticket', axis=1, inplace=True)
titanic_test['Age'] = titanic_test['Age'].fillna(-99)
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())

In [175]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S


In [176]:
passenger_id = titanic_test['PassengerId']
titanic_test.drop('PassengerId', axis=1, inplace=True)

X1 = titanic_test.pipe(eng_cabin1).pipe(eng_name1).pipe(eng_embarked1).pipe(eng_sex1)
X2 = titanic_test.pipe(eng_cabin2).pipe(eng_name2).pipe(eng_embarked2).pipe(eng_sex2)

In [None]:
cv_mean = []

models = [rf, gbm]
feature_sets = [X1, X2]

predictions_cv = []
predictions_test = []

for model, feature_set in product(models, feature_sets):
    predictions_cv.append(cross_val_predict(model, feature_set[0], y_train, cv=kf_in, n_jobs=-1).reshape(-1,1))
    model.fit(feature_set[0], y_train)
    ptest = model.predict(feature_set[1])
    predictions_test.append(ptest.reshape(-1,1))

predictions_cv = np.concatenate(predictions_cv, axis=1)
predictions_test = np.concatenate(predictions_test, axis=1)

stacker = LogisticRegression(solver='lbfgs')
stacker.fit(predictions_cv, y_train)
y_pred = stacker.predict(predictions_test)

error = accuracy_score(y_test, y_pred)
cv_mean.append(error)
print(f"Fold {fold} - Accuracy: {error}")
    
print(f'Mean Accuracy CV10 {np.mean(cv_mean)}')