In [1]:
## Extract files

import os
from zipfile import ZipFile

if 'train.csv' not in os.listdir():
    ZipFile('titanic.zip','r').extractall()

In [2]:
## Load data

import pandas as pd

df_train = pd.read_csv('train.csv', index_col='PassengerId')
display(df_train.head(2))
X_train, y_train = df_train.loc[:, df_train.columns != 'Survived'], df_train['Survived']

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [105]:
## Transformation pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class TitlesAttribute(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.title_dictionary = {
            "Capt": "Officer",
            "Col": "Officer",
            "Major": "Officer",
            "Jonkheer": "Royalty",
            "Don": "Royalty",
            "Sir" : "Royalty",
            "Dr": "Officer",
            "Rev": "Officer",
            "the Countess":"Royalty",
            "Mme": "Mrs",
            "Mlle": "Miss",
            "Ms": "Mrs",
            "Mr" : "Mr",
            "Mrs" : "Mrs",
            "Miss" : "Miss",
            "Master" : "Master",
            "Lady" : "Royalty"
        }
    def fit(self, X, y=None):
        return self
    def get_titles(self, row):
        try:
            return self.title_dictionary[row['Name'].split(', ')[1].split('.')[0]]
        except:
            if row['Sex'] == 'male':
                return "Mr"
            else:
                return "Miss"
    def transform(self, X, y=None):
        X['Titles'] = X.apply(self.get_titles, axis=1)
        return X

class AgesImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.median_by_titles = X.groupby('Titles')['Age'].median()
        self.median_by_gender = X.groupby('Sex')['Age'].median()
        self.median_by_pclass = X.groupby('Pclass')['Age'].median()
        return self
    def estimate_age(self, df):
            return pd.Series(data=((self.median_by_titles[df['Titles']].values + self.median_by_gender[df['Sex']].values + self.median_by_pclass[df['Pclass']].values)//3), index=df.index)
    def transform(self, X, y=None):
        X['Age'].fillna(self.estimate_age(X), axis=0, inplace=True)
        return X

class TicketNumberAttribute(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def fill_ticket_n(self,row):
            import regex as re
            try:
                return int(re.sub("[^0-9]","",row['Ticket']))
            except:
                return int(0)
    def transform(self, X, y=None):
        X['Ticket_n'] = X.apply(self.fill_ticket_n,axis=1)
        return X

class SurnameFrequencyAttribute(BaseEstimator, TransformerMixin): 
    def fit(self, X, y = None):
        X['Surname'] = X['Name'].apply(lambda x:x.split(', ')[0])
        self.surnames_n_repeated = X['Surname'].value_counts()
        return self # nothing else to do
    def check_surname(self, row):
        try:
            return self.surnames_n_repeated[row]
        except:
            return 0
    def transform(self, X):
        X['Surname'] = X['Name'].apply(lambda x:x.split(', ')[0])
        X['Surname_frequency'] = X['Surname'].apply(self.check_surname)
        return X

class FamilySizeAttribute(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['Family_size'] = X['SibSp'] + X['Parch'] + 1
        X['Family_size_cat'] = X['Family_size'].replace({1:'alone', 2:'small_family', 3:'small_family', 4:'small_family'
                                                        ,5:'large_family', 6:'large_family', 7:'large_family'
                                                        ,8:'large_family', 9:'large_family', 10:'large_family', 
                                                        11:'large_family'})
        return X

class TicketFrequencyAttribute(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['Ticket_frequency'] =X['Ticket'].map(X['Ticket'].value_counts(dropna=False))
        return X

class SurvivalRatesAttribute(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.X_temp = pd.concat([X[['Surname','Ticket']],y],axis=1)
        return self
    def transform(self, X, y=None):
        X['Family_survival_rate'] = X['Surname'].map(self.X_temp.groupby(['Surname'])['Survived'].median())
        X['Ticket_group_survival_rate'] = X['Ticket'].map(self.X_temp.groupby(['Ticket'])['Survived'].median())
        return X

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")), # Get rid of possible null values
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ("imputer", MostFrequentImputer()),
    ("encoder", OneHotEncoder(drop='first',sparse=False))
])

combined_pipeline = ColumnTransformer([
    ("num", num_pipeline, ['Age','Fare','Ticket_n','Surname_frequency','Ticket_frequency','Family_survival_rate','Ticket_group_survival_rate','Family_size']),
    ("cat", cat_pipeline, ['Pclass','Sex','Embarked','Titles','Family_size_cat']),
])

prep_pipeline = Pipeline([
    ("titles", TitlesAttribute()),
    ("ages_imputer", AgesImputer()),
    ("ticket_n", TicketNumberAttribute()),
    ("surname_frequency", SurnameFrequencyAttribute()),
    ("family_size", FamilySizeAttribute()),
    ("ticket_frequency", TicketFrequencyAttribute()),
    ("survival_rates", SurvivalRatesAttribute()),
    ("combined", combined_pipeline),
])

X_train_prep = prep_pipeline.fit_transform(X_train,y_train)
X_train_prep[0]

array([-0.55097434, -0.50244517, -0.06821981,  0.0792407 , -0.57916179,
       -0.83492139, -0.85024275,  0.05915988,  0.        ,  1.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ])

In [106]:
## Model building and fine tuning: Grid Search

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
# from sklearn.model_selection import RandomizedSearchCV

def execute_pipeline(features,labels, search_space=[
                    {"classifier": [LogisticRegression(random_state=42)],
                    "classifier__penalty": ['l2','l1'],
                    "classifier__C": np.logspace(0, 4, 10)
                    },
                    {"classifier": [LogisticRegression(random_state=42)],
                    "classifier__penalty": ['l2'],
                    "classifier__C": np.logspace(0, 4, 10),
                    "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                    },
                    {"classifier": [RandomForestClassifier(random_state=42)],
                    "classifier__n_estimators": [10,100,500,1000],
                    "classifier__max_depth":[5,8,15,25,30,None],
                    "classifier__min_samples_leaf":[1,2,5,10,15,100],
                    "classifier__max_leaf_nodes": [8,16,32]}
                 ], 
                 cv=5, verbose=0, n_jobs=-1):
    
    pipe = Pipeline([("classifier", RandomForestClassifier())])
    
    gridsearch = GridSearchCV(pipe, search_space, cv=cv, verbose=verbose,n_jobs=n_jobs) # Fit grid search
    best_model = gridsearch.fit(features, labels)
    print(best_model.best_estimator_)
    print("The mean accuracy of the model is:",best_model.score(features, labels))

    return best_model

best_estimator = execute_pipeline(X_train_prep,y_train)

Pipeline(memory=None,
         steps=[('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=8, max_features='auto',
                                        max_leaf_nodes=32, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=False))],
         verbose=False)
The mean accuracy of the model is: 0.9955106621773289


In [31]:
## Check dummies

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_prep, y_train)
predictions_dummy = dummy_clf.predict(X_train_prep)

print(dummy_clf.score(X_train_prep,y_train))
print(confusion_matrix(y_train, predictions_dummy,labels=[1,0]))
print('Precision: ', precision_score(y_train, predictions_dummy))
print('Recall: ', recall_score(y_train, predictions_dummy))
print('F1: ', f1_score(y_train, predictions_dummy))


0.6165730337078652
[[  0 273]
 [  0 439]]
Precision:  0.0
Recall:  0.0
F1:  0.0


In [107]:
import pandas as pd

X_test = pd.read_csv('test.csv', index_col='PassengerId')

X_test_prep = prep_pipeline.transform(X_test)
X_test['Survived'] = best_estimator.predict(X_test_prep)
display(X_test)

X_test[['PassengerId','Survived']].to_csv('submission.csv',index=False)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titles,Ticket_n,Surname,Surname_frequency,Family_size,Family_size_cat,Ticket_frequency,Family_survival_rate,Ticket_group_survival_rate,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,330911,Kelly,4,1,alone,1,1.0,,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,Mrs,363272,Wilkes,0,2,small_family,1,,,0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,240276,Myles,0,1,alone,1,,,0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,315154,Wirz,0,1,alone,1,,,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,3101298,Hirvonen,1,3,small_family,1,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,27.0,0,0,A.5. 3236,8.0500,,S,Mr,53236,Spector,0,1,alone,1,,,0
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Miss,17758,Oliva y Ocana,0,1,alone,1,,0.5,1
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr,3101262,Saether,0,1,alone,1,,,0
1308,3,"Ware, Mr. Frederick",male,27.0,0,0,359309,8.0500,,S,Mr,359309,Ware,0,1,alone,1,,,0


KeyError: "['PassengerId'] not in index"