# Title: Kaggle Titanic Competition
## Author: Kazutoki Matsui

This notebook explores Kaggle Competition on Titanic passenger dataset (https://www.kaggle.com/c/titanic/submissions).
My model seems well-performing on training set, but the public leadership board is much lower, indicating I am overfitting the training set.

In [154]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import sklearn.model_selection as skmods
import sklearn.preprocessing as skprep
import sklearn.neural_network as sknn
import sklearn.ensemble as skens
import sklearn.metrics as skmet
import sklearn.impute as skimp
import sklearn.tree as sktree
import sklearn.svm as sksvm

%matplotlib inline

In [147]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [148]:
full_data = pd.concat([train, test], ignore_index=True)
full_data.iloc[890:900,]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
890,891,0.0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q
891,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
892,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
893,894,,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
894,895,,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
895,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
896,897,,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
897,898,,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
898,899,,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
899,900,,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C


In [195]:
## A function to parse a title from a person's name
def parse_titles(df):
  return df['Name'].map(lambda s: s[s.index(',')+1:s.index('.')]).str.lstrip()

def is_alone(df):
    if df['SibSp']+ df['Parch']==0:
        return 1
    else:
        return 0

def clean_and_transform(df):
    '''(a) Filling in missing values with column mean'''
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])
    
    '''(b)-1 Mapping categorical variables into ordinals'''
    df['Embarked'] = df['Embarked'].map({
        'Q': 0,
        'C': 1,
        'S': 2
    })
    
    '''(b)-2 Sex to Gender One-hot'''
    gender = []
    for sex in df['Sex']:
        if sex == 'female':
            gender.append(1)
        else:
            gender.append(0)
    df['Gender'] = gender
    
    '''(c) Creating Title feature'''
    df['Title'] = parse_titles(df)
    df['Title'] = df['Title'].replace(['Capt','Don','Jonkheer','Rev'], 'Doomed')
    df['Title'] = df['Title'].replace(['Dr','Sir','Lady','Dona','Col','Major','the Countess'], 'Others')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['Title'] = df['Title'].replace('Miss', 'Ms')
    df['Title'] = df['Title'].replace('Mlle', 'Ms')
    df['Title'] = df['Title'].fillna(df['Title'].mode().iloc[0])
    df['Title'] = df['Title'].map({
        'Doomed': 0,
        'Mr': 1,
        'Master': 2,
        'Others': 3,
        'Ms': 4,
        'Mrs': 5,
    }) #Mapping the Title categories into ordinals
    
    '''(d) Creating is_alone feature'''
    df['Alone'] = df.apply(is_alone, axis=1)
    
    '''(e) Create a PartySize column'''
    df['PartySize'] = df['SibSp'] + df['Parch'] + 1
    
    '''(f) Calculate Survival Rate in a family'''
    df['Ticket_num'] = df['Ticket'].apply(lambda x: x.split(' ')[-1])
    temp = pd.DataFrame({'SurviveRate': df.groupby('Ticket_num').sum()['Survived']}).reset_index()
    df = pd.merge(df,temp,on='Ticket_num',how='left')
    df['SurviveRate'] = df['SurviveRate'] / df['PartySize']
    
    '''(g) Binning Fares'''
    fares = []
    for fare in df['Fare']:
        if fare <= df['Fare'].quantile(q = 0.20):
            fares.append(0)
        elif fare <= df['Fare'].quantile(q = 0.4):
            fares.append(1)
        elif fare <= df['Fare'].quantile(q = 0.6):
            fares.append(2)
        elif fare <= df['Fare'].quantile(q = 0.8):
            fares.append(3)
        else:
            fares.append(4)
    df['Fare_binned'] = fares
    
    '''(h) Binning Age'''
    ages = []
    for age in df['Age']:
        if age <= df['Age'].quantile(q = 0.2):
            ages.append(0)
        elif age <= df['Age'].quantile(q = 0.4):
            ages.append(1)
        elif age <= df['Age'].quantile(q = 0.6):
            ages.append(2)
        elif age <= df['Age'].quantile(q = 0.8):
            ages.append(3)
        else:
            ages.append(4)
    df['Age_binned'] = ages
    
    '''(i) Creating Company Score'''
    comps = []
    for sex,party in zip(df.Sex, df.PartySize):
        if sex == 'male': # Male, regardless of PartySize
            comps.append(0)
        elif (sex == 'female') & (party == 1): # Female and Alone
            comps.append(1)
        elif (sex == 'female') & (party >= 2):
            comps.append(2)
    df['CompanyScore'] = comps
    
    X = df.drop(columns=['PassengerId','Survived','Name','Sex','Ticket','Ticket_num','Cabin','Age', 'SibSp', 'Parch','Fare'])
    X_train = X.iloc[:891,:]
    X_test = X.iloc[891:,:]
    
    cat_cols = []
    con_cols = ['Pclass','PartySize','SurviveRate','CompanyScore', 'Fare_binned', 'Age_binned','Embarked', 'Title']
    preprocessor_ohe = ColumnTransformer(transformers=[('cat', OneHotEncoder(categories='auto'), cat_cols),
                                                   ('cont', StandardScaler(), con_cols)])
    X_train_processed = preprocessor_ohe.fit_transform(X_train)
    X_test_processed = preprocessor_ohe.fit_transform(X_test)
    return X_train_processed, X_test_processed

In [196]:
y_train = full_data['Survived'][:891]
X_train_processed, X_test_processed = clean_and_transform(full_data)

y_train.shape, X_train_processed.shape, X_test_processed.shape

((891,), (891, 8), (418, 8))

In [197]:
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

param_grid = {'min_samples_split': range(2,10),
              'min_samples_leaf': range(1,10)}

model_rf = GridSearchCV(ensemble.RandomForestClassifier(n_estimators = 10,random_state=42), param_grid, cv=3)
model_rf.fit(X_train_processed, y_train)

best_index = np.argmax(model_rf.cv_results_["mean_test_score"])

print("Best parameter values:", model_rf.cv_results_["params"][best_index])
print("Best Mean cross-validated test accuracy:", model_rf.cv_results_["mean_test_score"][best_index])

Best parameter values: {'min_samples_leaf': 2, 'min_samples_split': 9}
Best Mean cross-validated test accuracy: 0.9696969696969697


In [198]:
import sklearn.ensemble as skens
best_model = None
best_score = float('-inf')
cross_val_scores = []
random_state = 42
models = {
    'RandomForestClassifier': {
    'model': skens.RandomForestClassifier(random_state=random_state),
    'params': {
      'n_estimators': [100, 200],
      'criterion': ['gini']
    }
  },
  'AdaBoost': {
    'model': skens.AdaBoostClassifier(random_state=random_state),
    'params': {
      'n_estimators': [100, 200],
    }
  },
  'Bagging': {
    'model': skens.BaggingClassifier(random_state=random_state),
    'params': {
      'n_estimators': [100, 200],
    }
  },
  'GradientBoosting': {
    'model': skens.GradientBoostingClassifier(random_state=random_state),
    'params': {
      'n_estimators': [100, 200],
    }
  },
    'MLPClassifier': {
        'model': sknn.MLPClassifier(random_state=42, max_iter=100000),
        'params': {
            'hidden_layer_sizes': [(), (10)],
            'activation': ['logistic'],
            'solver': ['lbfgs']
        }
    },
}

for name, d in models.items():
  grid_search = skmods.GridSearchCV(d['model'], d['params'], scoring='accuracy', cv=7)
  grid_search.fit(X_train_processed, y_train)
  print(f'{name}:', grid_search.best_score_)

RandomForestClassifier: 0.9584505061867267
AdaBoost: 0.9663157339707537
Bagging: 0.9584417182227222
GradientBoosting: 0.9640572272215974
MLPClassifier: 0.9595841535433072


In [199]:
best_models = {}
for name, d in models.items():
  grid_search = skmods.GridSearchCV(d['model'], d['params'], scoring='accuracy', cv=7)
  grid_search.fit(X_train_processed, y_train)
  best_models[f'{name}'] = grid_search.best_estimator_

In [200]:
from sklearn.ensemble import VotingClassifier
eclf1 = VotingClassifier(estimators=[('RF',best_models['RandomForestClassifier']),
                                     ('AB', best_models['AdaBoost']),
                                     ('Bag', best_models['Bagging']),
                                     ('GB', best_models['GradientBoosting']), 
                                     ('MLP', best_models['MLPClassifier'])], voting='hard')

eclf1 = eclf1.fit(X_train_processed, y_train)

In [201]:
pd.DataFrame({
  "PassengerId": test["PassengerId"],
  "Survived": eclf1.predict(X_test_processed).astype(int)
}).to_csv('submission_upd2.csv', index=False)