In [None]:
import pandas as pd
import numpy as np
import sklearn 
import xgboost

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn "whitegrid" style globally so every figure below shares it.
sns.set(style="whitegrid", color_codes=True)

# Load the data

In [None]:
# Read the training table; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/train.csv')

**Dataset Description**

Variable | Definition | Key 
------|----|----
survival | Survival | 0 = No, 1 = Yes 
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd 
sex | Sex |
age |Age in years |
sibsp | # of siblings / spouses aboard the Titanic | 
parch | # of parents / children aboard the Titanic | 
ticket | Ticket number |
fare | Passenger fare | 
cabin | Cabin number |
embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
# Size of the table, followed by its first rows (displayed as the cell's last expression).
print('(rows, columns):', data.shape)
data.head()

Describe continuous variables

In [None]:
# Summary statistics; with mixed dtypes, describe() reports the numeric columns only.
data.describe()

Describe categorical variables:

In [None]:
# Cast the selected columns to `category` so describe() reports
# count / unique / top / freq instead of numeric statistics.
(
    data
    .loc[:, ['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
    .astype('category')
    .describe()
)

Distribution of categorical variables (with fewer than 10 categories)

In [None]:
# Keep only the columns with fewer than 10 distinct (non-null) values,
# then print the frequency of every value in each of them.
categorical_data = data.loc[:, data.nunique() < 10]
for col in categorical_data.columns:
    counts = categorical_data[col].value_counts()
    print(col, '\t', dict(counts))

In [None]:
# Count the missing entries in every column.
print('Number of empty fields per feature')
data.isna().sum()

## Simple visualisations of data distribution

In [None]:
# Bar height is the mean of the 0/1 `Survived` flag per sex, i.e. each group's survival rate.
sns.barplot(x="Sex", y="Survived", data=data)
plt.title("Proportion of male/female that survived");

In [None]:
# Age distribution per sex, with a separate violin for each `Survived` value
# (so passengers who did not survive are shown as well).
sns.violinplot(x="Sex", y="Age", hue="Survived", data=data)
plt.title('Ages of males/females that survived');

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(8, 3))
# Same survival-rate plot in both panels, split by a different passenger attribute each time.
for panel, hue_col in zip(ax, ("Pclass", "Embarked")):
    sns.barplot(x="Sex", y="Survived", hue=hue_col, data=data, ax=panel)

plt.suptitle("Proportion of male/female that survived");

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(8, 3))
# Left panel: siblings/spouses aboard; right panel: parents/children aboard.
for panel, family_col in zip(ax, ("SibSp", "Parch")):
    sns.barplot(x=family_col, y="Survived", hue="Sex", data=data, ax=panel)

plt.suptitle("Survivors by numbers of siblings/spouses and parents/children");

## Summary of exploration findings

* Target class is imbalanced `Survived 	 {0: 549, 1: 342}`
* Sex appears to be a highly predictive variable at first glance
* Cabin missing many values - discard
* Age missing approx 1/4 of values - could impute using nearest neighbours method

## Feature engineering

In [None]:
# The title is the first word after the comma, e.g. "Braund, Mr. Owen" -> "Mr."
salutation = data.Name.map(
    lambda full_name: full_name.split(',')[1].split()[0]
)
salutation_counts = salutation.value_counts()
print(dict(salutation_counts))

# Data Preparation

In [None]:
import fancyimpute

In [None]:
from sklearn import preprocessing
# from sklearn.impute import MICEImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

Convert categorical variables to integer representation

In [None]:
# Columns treated as categories by the Titanic pipeline (label-encoded, then one-hot encoded).
# 'Salutation' is not a csv column: Titanic._preprocess derives it from 'Name'.
categorical_variables = [
    'Pclass',
    'Sex',
    'Embarked',
    'Salutation'
]

In [None]:
# Columns treated as continuous by the Titanic pipeline (standard-scaled).
# 'Age_imp' is not a csv column: it is 'Age' with missing values filled in,
# added by Titanic._transform_imputer.
continuous_variables = [
    'Age_imp',
    'SibSp',
    'Parch',
    'Fare'
]

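The data are transformed as follows:
* Categorical variables are first encoded to numerical labels, then one-hot encoded (OHE)
* The missing Age values are imputed with the median (`SimpleImputer(strategy='median')`); the MICE technique was the original plan, but its import is commented out below
* Continuous variables are standard-scaled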

In [None]:
# Hold out 20% of the passengers for evaluation; stratifying on the target
# keeps the survival ratio the same in both splits, and the seed fixes the split.
_Xtrain, _Xtest, Ytrain, Ytest = train_test_split(
    data.drop(columns='Survived'),
    data['Survived'],
    test_size=0.2,
    random_state=0,
    stratify=data['Survived'],
)

In [None]:
class Titanic:
    '''
    Preprocessing pipeline for the Titanic passenger table.

    Construction fits three things on the given frame: one label encoder per
    categorical column, a median imputer (used to fill missing `Age`) and a
    standard scaler for the continuous columns. `transform` then turns a raw
    frame into a numeric feature frame (one-hot categories + scaled
    continuous values).

    NOTE: `_preprocess` drops rows with a missing non-imputed continuous
    value, and the transformed output always carries a fresh 0..n-1 index, so
    callers must keep their target vector aligned themselves.
    '''

    def __init__(self, data, categorical, continuous):
        '''
        data: pd.DataFrame
            raw input from given csv
        categorical: list 
            categorical variables (col names)
        continuous: list
            continuous variables (col names)
        '''

        self.categorical = categorical
        self.continuous = continuous

        self.cat_encoder, self.imputer, self.scaler = self.fit(data)

    def fit(self, data):
        '''
        Fit the encoders, imputer and scaler on `data`.

        Returns the tuple (cat_encoder, imputer, scaler); the input frame is
        not modified.
        '''
        data = self._preprocess(data)

        cat_encoder = self._fit_encoder(data)
        df_cat = self.transform_categories(data, cat_encoder)

        # imputation must come after cat encoder as it uses the ohe variables
        imputer = self._fit_imputer(data, df_cat)
        df_cts = self._transform_imputer(data, df_cat, imputer)

        # scaler must come after imputer as it uses imputed cts values
        scaler = self._fit_scaler(df_cts)

        return cat_encoder, imputer, scaler

    def _preprocess(self, data):
        '''
        Return a new frame without rows that miss a non-imputed continuous
        value, and with a `Salutation` column derived from `Name`.
        '''
        required = [c for c in self.continuous if 'imp' not in c]
        data = data.dropna(axis=0, subset=required, how='any')
        # the title is the first word after the comma: "Braund, Mr. Owen" -> "Mr."
        # assign() builds a new frame, so nothing is written into a filtered view
        return data.assign(
            Salutation=data['Name'].apply(lambda x: x.split(',')[1].split()[0])
        )

    def _fit_encoder(self, data):
        '''Fit one LabelEncoder per categorical column; returns {column: encoder}.'''

        def _fit_safe_encoder(var):
            # a hack to deal with unseen values in test dataset: the encoder
            # knows every observed non-null value (as a string) plus one
            # extra 'NA' class for unknown or null items
            enc = preprocessing.LabelEncoder()
            classes = data[var][data[var].notnull()].apply(str).unique()
            classes = np.append(classes, 'NA')
            enc.fit(classes)
            return enc

        return {
            var: _fit_safe_encoder(var)
            for var in self.categorical
        }

    def _encode_cat(self, data, cat_encoder):
        '''
        Turn categories into numerical labels 0, 1, 2...

        Returns a frame with one integer column per encoder and a fresh
        0..n-1 index.
        '''
        def _transform_safe_encode(var):
            # a hack to deal with unseen values in test dataset
            enc = cat_encoder[var]
            # str() turns nulls into plain text (e.g. 'nan'); like any other
            # value that was not seen during fitting they are mapped to 'NA'
            x = data[var].apply(str)
            x[~x.isin(enc.classes_)] = 'NA'
            return enc.transform(x)

        return pd.DataFrame({var: _transform_safe_encode(var) for var in cat_encoder.keys()})

    def _ohe(self, data):
        '''One-hot encode every column of `data`; columns are named `<col>_<label>`.'''
        return pd.get_dummies(data, columns=data.columns)

    def transform_categories(self, data, cat_encoder):
        '''
        One-hot encode the categorical columns of `data`.

        The result has exactly one column per fitted class (`<col>_<label>`),
        always in the same order, whichever categories occur in `data`.
        '''
        df_ohe = self._ohe(self._encode_cat(data, cat_encoder))
        all_cats = [
            f'{var}_{i}'
            for var in cat_encoder.keys()
            for i in range(len(cat_encoder[var].classes_))
        ]
        # add missing OHE categories
        for cat in [c for c in all_cats if c not in df_ohe.columns]:
            df_ohe[cat] = 0
        # fixed column order: the imputer matrix is built positionally, so the
        # layout must be identical at fit time and at transform time
        return df_ohe[all_cats]

    def _prepare_impute_df(self, data, df_cat):
        '''
        Build the matrix handed to the imputer: one-hot columns, then
        SibSp/Parch/Fare, then Age as the LAST column. Column labels are
        replaced by positions; both inputs must share the same index.
        '''
        return pd.concat([df_cat, data[['SibSp','Parch','Fare']], data['Age']], axis=1, ignore_index=True)

    def _fit_imputer(self, data, df_cat):
        '''Fit a median imputer on the imputation matrix and return it.'''
        imp = SimpleImputer(strategy='median')
        imp.fit(self._prepare_impute_df(data.reset_index(drop=True), df_cat.reset_index(drop=True)))
        return imp

    def _transform_imputer(self, data, df_cat, imputer):
        '''
        Return a copy of `data` with an extra `Age_imp` column holding the
        imputed ages. `data` itself is left untouched.
        '''
        _d = self._prepare_impute_df(data.reset_index(drop=True), df_cat.reset_index(drop=True))
        # Age is the last column of the imputation matrix; values are assigned by position
        return data.assign(Age_imp=imputer.transform(_d)[:, -1])

    def _fit_scaler(self, data):
        '''Fit a StandardScaler on the continuous columns of `data` and return it.'''
        scaler = preprocessing.StandardScaler()
        scaler.fit(data[self.continuous])
        return scaler

    def transform_continuous(self, data, df_cat, imputer, scaler):
        '''
        Impute `Age`, then scale the continuous columns.

        Returns a frame with the continuous columns only and a fresh 0..n-1
        index.
        '''
        df_cts = self._transform_imputer(data, df_cat, imputer)
        return pd.DataFrame(scaler.transform(df_cts[self.continuous]), columns=self.continuous)

    def transform(self, data):
        '''
        Apply the fitted pipeline to a raw frame.

        Returns the one-hot and scaled continuous features side by side, with
        the columns sorted by name and a fresh 0..n-1 index.
        '''
        data = self._preprocess(data)
        df_cat = self.transform_categories(data, self.cat_encoder)

        df_cts = self.transform_continuous(data, df_cat, self.imputer, self.scaler)
        df = pd.concat([df_cat, df_cts], axis=1)
        return df[sorted(df.columns)]

In [None]:
# Fit the encoders, imputer and scaler on the training split only (the constructor calls fit).
titanic = Titanic(_Xtrain, categorical_variables, continuous_variables)
# Classes known to the 'Sex' encoder, including the extra 'NA' class added for unseen/null values.
titanic.cat_encoder['Sex'].classes_

In [None]:
# Numeric feature matrix for the training split (one-hot categories + scaled continuous columns).
Xtrain = titanic.transform(_Xtrain)
Xtrain.head()

In [None]:
# Same transformation for the held-out split, using the parameters fitted on the training split.
Xtest = titanic.transform(_Xtest)
# transform() already returns its columns sorted by name, so this re-sort only affects the display.
Xtest[sorted(Xtest.columns)].head()

In [None]:
# Undo the standard scaling so the continuous columns are back in their original units.
Xtrain_unscaled, Xtest_unscaled = (
    pd.DataFrame(
        titanic.scaler.inverse_transform(scaled[continuous_variables]),
        columns=continuous_variables,
    )
    for scaled in (Xtrain, Xtest)
)
Xtrain_unscaled.head()

In [None]:
fig, ax = plt.subplots(1,2, figsize=(8,3))
plt_kwargs = dict(bins=np.linspace(0,100,10), kde=False, norm_hist=True)

# One panel per split: ages given in the csv versus ages filled in by the imputer.
panels = [('train', _Xtrain, Xtrain_unscaled), ('test', _Xtest, Xtest_unscaled)]
for axis, (split_name, raw, unscaled) in zip(ax, panels):
    # positional mask: the unscaled frames carry a fresh 0..n-1 index, not the csv index
    age_missing = raw.Age.isnull().to_numpy()
    sns.distplot(raw.Age[~age_missing], label='Age', ax=axis, **plt_kwargs)
    sns.distplot(unscaled.Age_imp.values[age_missing], label='Imputed Age', ax=axis, **plt_kwargs)
    axis.set(xlabel='Age', title=f'Given and imputed ages ({split_name})')
    # a legend per axes; plt.legend() alone would only label the last panel
    axis.legend()

# Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
# Weight for the positive (survived) class, passed to XGBoost as scale_pos_weight:
# ratio of negative to positive training examples (the value XGBoost recommends).
n_positive = Ytrain.sum()
pos_weight = (len(Ytrain) - n_positive) / n_positive
pos_weight

Random Forest Classifier (scikit-learn)

In [None]:
# Baseline model: random forest with default hyper-parameters; the seed makes re-runs reproducible.
clf = RandomForestClassifier(random_state=0)
clf.fit(Xtrain, Ytrain)
Ypred = clf.predict(Xtest)
Yprob_rf = clf.predict_proba(Xtest)
print(metrics.classification_report(Ytest, Ypred))
print(metrics.accuracy_score(Ytest, Ypred))
# confusion_matrix(y_true, y_pred): rows (heatmap y-axis) are the true labels,
# columns (heatmap x-axis) are the predicted labels
ax = sns.heatmap(metrics.confusion_matrix(Ytest, Ypred), annot=True, fmt='.0f', cmap='Blues')
ax.set(xlabel='Predicted', ylabel='True', title='Confusion matrix');

In [None]:
# Feature names ordered from the highest to the lowest random-forest importance.
Xtrain.columns[np.argsort(clf.feature_importances_)[::-1]]

XGBoost BDT Classifier

In [None]:
# Gradient-boosted trees; scale_pos_weight re-weights the positive (survived) class.
xgb = xgboost.XGBClassifier(scale_pos_weight=pos_weight, n_estimators=500, max_depth=3, colsample_bytree=0.9)
xgb.fit(Xtrain, Ytrain)
Ypred = xgb.predict(Xtest)
Yprob_xgb = xgb.predict_proba(Xtest)
print(metrics.classification_report(Ytest, Ypred))
print(metrics.accuracy_score(Ytest, Ypred))
# confusion_matrix(y_true, y_pred): rows (heatmap y-axis) are the true labels,
# columns (heatmap x-axis) are the predicted labels
ax = sns.heatmap(metrics.confusion_matrix(Ytest, Ypred), annot=True, fmt='.0f', cmap='Blues')
ax.set(xlabel='Predicted', ylabel='True', title='Confusion matrix');