# Titanic survivors prediction

- importing libraries:

In [31]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder_module import CategoricalEncoder
from sklearn.linear_model import LogisticRegression
# Direct download doesn't work because one needs to log in to Kaggle first:
#DOWNLOAD_URL="https://www.kaggle.com/c/titanic/download/train.csv"
#urllib.request.urlretrieve(DOWNLOAD_URL,"train.csv")

 ##  Data

In [32]:
# Load the training set; "train.csv" must already be in the working directory
# (see the note in the imports cell: it cannot be downloaded without a Kaggle login).
rawdata=pd.read_csv("train.csv") #reading data
# Preview the first rows (up to 50) as the cell output.
rawdata.head(50)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Preprocessing Data
Creating custom Transformers:
- NumCatSorter:
    - NumCatSorter for sorting out numerical (VarType='num') or categorical (VarType='cat') data;
    - Age and Fare are numerical; Pclass, Sex, Embarked and CabinBin are categorical.
    - CabinBin is new variable, False if Cabin is NaN, True else
    - New variables SibSpBin and ParchBin are True if SibSp and Parch, respectively, are non-zero, and False otherwise.
    - SibSpType decides whether SibSp enters the dataset as a numerical variable (SibSpType='num') or as the categorical SibSpBin (SibSpType='cat'). The same applies to Parch/ParchBin.
- Dummycoder: 
    - creates dummies (one-hot encoding) from all columns. To be applied only to the categorical dataset. For N categories this encoder creates N-1 columns, in contrast to the future scikit-learn CategoricalEncoder.
- NaDropper:
    - drops rows that have a missing value (NaN) in any of ["Pclass", "Sex", "Embarked", "SibSp", "Parch"]
    - used before the actual preprocessing pipeline

In [19]:
def first_digit(x):
    """Collapse an all-digit string to its first character.

    Parameters
    ----------
    x : object
        Usually a ticket prefix string. Anything that is not a string
        (e.g. None or NaN from a missing value) is passed through untouched
        instead of raising, mirroring how first_letter tolerates non-strings.

    Returns
    -------
    object
        ``x[0]`` when ``x`` is a non-empty string made only of digits,
        otherwise ``x`` itself.
    """
    # isinstance guard first: str.isdigit() on a float/None raises TypeError.
    if isinstance(x, str) and x.isdigit():
        return x[0]
    return x
    
def first_letter(x):
    """Return the first character of a string, or '0' for any non-string value.

    Non-strings (such as the NaN of a missing Cabin entry) all map to the
    single placeholder category '0'.
    """
    return x[0] if isinstance(x,str) else '0'

In [20]:
class NumSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numerical columns of the passenger frame."""

    def __init__(self):
        # Columns treated as numerical features.
        self.features=["Age", "Fare", "SibSp", "Parch"]

    def fit(self,X,y=None):
        """Nothing is learned; returns self so the object can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame holding just the columns in ``self.features``."""
        frame=X.copy()
        numerical=frame[self.features]
        return numerical
    
class CatSelector(BaseEstimator, TransformerMixin):
    """Transformer that derives the categorical features and returns only those.

    Besides the raw Pclass, Sex and Embarked columns it builds Title,
    TicketCat, SibSpBin, ParchBin, CabinCat and AgeBin from the input frame.
    """

    def __init__(self):
        # Output columns, in the order they are handed to the encoder.
        self.features=["Pclass", "Sex", "Embarked", "CabinCat",
                       "Title", "TicketCat", "SibSpBin", "ParchBin", "AgeBin"]

    def fit(self,X,y=None):
        """Nothing is learned; returns self so the object can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the columns listed in ``self.features``."""
        derived=X.copy()

        # Title: text between the first ',' and the following '.',
        # e.g. "Braund, Mr. Owen Harris" -> " Mr" (the leading space is kept).
        after_surname=derived['Name'].str.split(',').map(lambda parts: parts[1])
        derived['Title']=after_surname.str.split('.').map(lambda parts: parts[0])

        # TicketCat: first space-separated token, cut at the first '/', with '.'
        # removed; purely numeric results are reduced to their first digit.
        leading_token=derived['Ticket'].str.split(' ').map(lambda parts: parts[0])
        before_slash=leading_token.str.split('/').map(lambda parts: parts[0])
        derived['TicketCat']=before_slash.str.replace('.','').map(first_digit)

        # Travelling with siblings/spouse or parents/children at all?
        derived["SibSpBin"]=X["SibSp"]>0
        derived["ParchBin"]=X["Parch"]>0

        # Cabin reduced to its first character; first_letter maps missing cabins to '0'.
        derived["CabinCat"]=X["Cabin"].map(first_letter)

        # AgeBin is True where Age is missing (NaN).
        derived["AgeBin"]=np.isnan(X["Age"])

        return derived[self.features]
    
class NaDroper(BaseEstimator, TransformerMixin):
    """Transformer that discards rows with a missing value in a required column."""

    def fit(self,X,y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return X without the rows that have NaN in any required column."""
        required=["Pclass", "Sex", "Embarked", "SibSp", "Parch"]
        # axis=0 / how="any" are the dropna defaults, spelled out for the reader.
        return X.dropna(axis=0, how="any", subset=required)

class LogL1Selector(BaseEstimator, TransformerMixin):
    """Feature selector driven by an L1-penalised logistic regression.

    ``fit`` trains the regression and remembers which input columns received a
    non-zero coefficient; ``transform`` keeps only those columns.

    Parameters
    ----------
    C : float, default=1
        Passed unchanged to ``LogisticRegression(C=...)``.

    Attributes set by ``fit``
    -------------------------
    lgr : the fitted LogisticRegression.
    features : 1-D integer array with the indices of the kept columns.
    """
    def __init__(self, C=1):
        self.C=C
    def fit(self,X,y):
        """Fit the L1 model on (X, y) and record the columns it uses. Returns self."""
        # The solver is named explicitly: an L1 penalty is not supported by the
        # lbfgs solver that newer scikit-learn releases use by default, and
        # releases in between warn when the solver is left unspecified.
        self.lgr=LogisticRegression(C=self.C, penalty="l1", solver="liblinear",
                                    fit_intercept=True, tol=1e-5)
        self.lgr.fit(X,y)
        # coef_ is 2-D (one row per fitted decision function). A column is kept
        # when any row has a non-zero weight for it, so each index appears once
        # and in ascending order even if there is more than one row.
        self.features=np.flatnonzero(np.any(self.lgr.coef_!=0, axis=0))
        return self
    def transform(self, X):
        """Return the columns of the 2-D array X selected during ``fit``."""
        return X[:,self.features]

- We create the pipeline for data preprocessing:

In [21]:
# Row dropper; kept outside the pipelines below and applied to the raw frame separately.
nadroper=NaDroper()

# Categorical branch: derive/select the categorical columns, then one-hot encode
# them into a dense array.
# NOTE(review): handle_unknown='ignore' presumably makes unseen categories encode
# without raising — confirm in CategoricalEncoder_module.
cat_pipeline=Pipeline(steps=[
    ('catselector',CatSelector()),
    ('categoricalencoder',CategoricalEncoder(encoding='onehot-dense', handle_unknown='ignore')),
])

# Numerical branch: select the numerical columns, fill gaps with the column
# median, then standardise.
num_pipeline=Pipeline(steps=[
    ('numselector',NumSelector()),
    ('imputer',Imputer(strategy='median')),
    ('standardscaler',StandardScaler()),
])

# Both branches side by side: numerical columns first, encoded categorical ones after.
preprocess_pipeline0=FeatureUnion([
    ('numpip',num_pipeline),
    ('catpip',cat_pipeline),
])

# Full preprocessing: combined features -> polynomial expansion -> L1-based column
# selection. degree and C set here are starting values; the grid search overrides them.
preprocess_pipeline1=Pipeline(steps=[
    ('ppip0', preprocess_pipeline0),
    ('polyf', PolynomialFeatures(degree=1)),
    ('logl1selector',LogL1Selector(C=1)),
])

In [22]:
from sklearn.base import clone
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Seed for the classifiers that draw random numbers (SVC's probability
# calibration, the random forest), so that reruns give the same scores.
RANDOM_STATE=42

# Final classifiers to compare. The order matters: it fixes the positions in
# full_pipeline and therefore in the per-model result lists built from it.
classifiers=[
    SVC(kernel='rbf',probability=True,random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=100, max_depth=3, random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    LogisticRegression(),
]

# One pipeline per classifier: shared preprocessing recipe + the classifier.
# Each pipeline gets its own clone of the preprocessing so that fitting or
# re-parameterising one of them can never leak into the others.
full_pipeline=[Pipeline([('ppip1', clone(preprocess_pipeline1)),
                         ('algo', classifier)])
               for classifier in classifiers]

# Hyper-parameters searched for every pipeline (2 x 1 x 14 = 28 candidates):
# polynomial degree, interaction terms only, and the C of the L1 feature selector.
param_grid ={'ppip1__polyf__degree':[ 1, 2],
         'ppip1__polyf__interaction_only':[True],
         'ppip1__logl1selector__C':[0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.5,5,7.5,10,15],
            }

In [23]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

# Remove rows with missing values in the required columns, then take the target.
# data_cleaned still contains the Survived column when it is passed as X below;
# NumSelector / CatSelector pick their columns by name, so it never reaches a model.
data_cleaned=nadroper.fit_transform(rawdata)
y=data_cleaned['Survived']

# Fold count used for both the search and the follow-up scoring. It is passed
# explicitly because the default of cross_val_score differs between
# scikit-learn releases (3 folds in older ones, 5 in newer ones).
N_FOLDS=3

gscv=[]       # fitted GridSearchCV objects, same order as full_pipeline
top_estim=[]  # best (refitted) pipeline of each search, same order
for pipeline in full_pipeline:
    search=GridSearchCV(pipeline, param_grid, cv=N_FOLDS, scoring='accuracy',n_jobs=-1,
                        verbose=1)
    search.fit(data_cleaned,y)
    print(search.best_params_)
    gscv.append(search)
    top_estim.append(search.best_estimator_)
    # Per-fold accuracy of the winning configuration. The configuration was
    # chosen on these same rows, so the figures are on the optimistic side.
    print(cross_val_score(search.best_estimator_, data_cleaned,y, cv=N_FOLDS))

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:   11.0s finished


{'ppip1__logl1selector__C': 1, 'ppip1__polyf__degree': 1, 'ppip1__polyf__interaction_only': True}
[ 0.82491582  0.84459459  0.84121622]
Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:    9.8s finished


{'ppip1__logl1selector__C': 0.5, 'ppip1__polyf__degree': 2, 'ppip1__polyf__interaction_only': True}
[ 0.81481481  0.83108108  0.81418919]
Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:    7.6s finished


{'ppip1__logl1selector__C': 1.5, 'ppip1__polyf__degree': 1, 'ppip1__polyf__interaction_only': True}
[ 0.77441077  0.83783784  0.8277027 ]
Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:    7.5s finished


{'ppip1__logl1selector__C': 1, 'ppip1__polyf__degree': 2, 'ppip1__polyf__interaction_only': True}
[ 0.77441077  0.84459459  0.85472973]


In [24]:
# (name, estimator) pairs for the ensemble: 'est0', 'est1', ... in search order.
estimators=[('est{}'.format(position), tuned) for position, tuned in enumerate(top_estim)]

In [25]:
from sklearn.ensemble import VotingClassifier
# Soft voting combines the class probabilities predicted by the tuned pipelines
# (the SVC pipeline was built with probability=True for this reason).
votc=VotingClassifier(estimators=estimators, n_jobs=-1, voting="soft")
votc.fit(data_cleaned,data_cleaned["Survived"])
# Per-fold accuracy of the ensemble. cv is given explicitly because the default
# fold count differs between scikit-learn releases; 3 matches the grid search.
cross_val_score(votc, data_cleaned,data_cleaned["Survived"], cv=3)

array([ 0.82154882,  0.84797297,  0.84121622])

In [26]:
# 3-fold cross-validated accuracy of the voting ensemble; cv is explicit because
# the default fold count differs between scikit-learn releases.
cross_val_score(votc, data_cleaned,data_cleaned["Survived"], cv=3)

array([ 0.82154882,  0.84797297,  0.84121622])

In [27]:
# Load the test set; "test.csv" must already be in the working directory.
testdata=pd.read_csv("test.csv") 

In [28]:
# Predict with the fitted voting ensemble. testdata is passed exactly as loaded
# (NaDroper is not applied), so no test row is removed before prediction.
y_test_prediction=votc.predict(testdata)

In [29]:
# Two-column submission table: PassengerId taken from the test set, Survived
# filled with the ensemble's predictions (row order unchanged).
submission02 = testdata[["PassengerId"]].assign(Survived=y_test_prediction)
# index=False keeps the DataFrame's row index out of the file.
submission02.to_csv('submission02.csv', index=False)

0.80382 !!!

In [30]:
# Share of correct predictions on data_cleaned, i.e. on the very rows votc was
# fitted on — a training accuracy, not an estimate of generalisation.
(votc.predict(data_cleaned)==data_cleaned["Survived"]).mean()

0.85489313835770531