# Titanic random-forest classifier - Pipeline

Questions for Aldous:

1. Why do your own train/test split instead of sklearn's test_train_split
2. Understand the use of 'class' object
3. How to interpret 'contributions'?

# Install packages

In [4]:
#install as trusted host
#!pip install --index-url=http://pypi.python.org/simple/ --trusted-host pypi.python.org lime
#!pip install --index-url=http://pypi.python.org/simple/ --trusted-host pypi.python.org treeinterpreter

Collecting lime
  Downloading lime-0.1.1.19.tar.gz (249kB)
[K    100% |████████████████████████████████| 256kB 4.0MB/s ta 0:00:01
Building wheels for collected packages: lime
  Running setup.py bdist_wheel for lime ... [?25l- \ done
[?25h  Stored in directory: /Users/joaeechew/Library/Caches/pip/wheels/ba/f0/48/ed6de3efda30be193f2546a3193a0017878b590577f945fc94
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.1.1.19


# Import libraries

In [10]:
# Import libraries
import numpy as np
import pandas as pd
import pylab as pl
import datetime

from treeinterpreter import treeinterpreter as ti

import lime
import lime.lime_tabular

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# include plots inline in the notebook (ipython specific)
%matplotlib inline

# Input cell

In [16]:
path = 'titanic-train.csv' #Enter data path file here
keep = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] #Keep needed features only

one_hot_encoding = True #one-hot encoding for categorical values
dropna = True #Do you want to drop missing values?
impute = False #Do you want to impute missing values?
missing_values = 'NaN' #Make sure missing data has been replaced with NaN first
impute_strategy = 'most_frequent' # 'most_frequent', 'mean', 'median'
scale_features = True #Do you want to scale numerical figures?

target = 'Survived' #Target variable

seed = 42 #For random state
test_size = 0.4 #For train_test_split

# Read data set

In [34]:
# Read datasets from excel files
df = pd.read_csv(path)
print("Dataset has {} rows, {} columns".format(*df.shape))
df.head(5)

Dataset has 891 rows, 12 columns


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data pre-processing

In [35]:
#### Standardisable pre-processing

#Keep needed features only
df = df[keep]

#Dummy encoding
if one_hot_encoding:
    df = pd.get_dummies(df, drop_first=True)

#Drop missing values
if dropna:
    df = df.dropna(how='any') #'any', 'all'
    
if impute:
    imp = Imputer(missing_values=missing_values, strategy=impute_strategy, axis=0)
    imp.fit_transform(df)
    
#Scaling of numerical data
if scale_features:
    scaler = StandardScaler()
    scaler.fit_transform(df)
    
print("Dataset has {} rows, {} columns".format(*df.shape))

Dataset has 714 rows, 9 columns


In [36]:
#Strings needs to be converted to integers:
# 1. Name - This can be removed
# 2. Sex - Converted into 1/0
# 3. Ticket - Probably insignificant, remove for now.
# 4. Cabin - Might represent where they are in the ship, remove for now.
# 5. Embarked - 'Matrix conversion' as no levels inherent in number

#Remove floats - value too large for random forest
df['Age'] = df['Age'].astype(int)
df['Fare'] = df['Fare'].astype(int)

In [37]:
#Create response and target variable
X = df.drop(target, axis=1)
y = df[target]

## Machine Learning
Here we train and fit the model.

In [38]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [51]:
#Tuning the model
param_grid = { "n_estimators"      : [100, 150],
           "criterion"         : ["gini"],
           "max_features"      : ['auto'], #auto, sqrt, log2, int/n_feature
           "max_depth"         : [5, 20],
           "min_samples_split" : [2, 4] ,
           "bootstrap": [True]}

rf = RandomForestClassifier(random_state=seed)

rf_cv = GridSearchCV(rf, param_grid, cv=5)
rf_cv.fit(X_train, y_train)

rf_best = rf_cv.best_estimator_

# Print the tuned parameters
print("Tuned Parameters: {}".format(rf_cv.best_params_)) 

Tuned Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}


In [40]:
# Make prediction
y_pred = rf_cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(rf_cv.score(X_test, y_test)))
print("Best score is {}".format(rf_cv.best_score_))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(rf_cv.best_params_))

Accuracy: 0.7762237762237763
Best score is 0.8481308411214953
             precision    recall  f1-score   support

          0       0.75      0.91      0.82       165
          1       0.83      0.60      0.69       121

avg / total       0.78      0.78      0.77       286

Tuned Model Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}


## Feature importance
What are the most important drivers of store sales according to the model

In [52]:
#Display feature importance
def feature_importance(model, trainData, display_n_rows):
    """Display feature importance & weighting for tree based model"""
    fi = model.feature_importances_*100
    feat_imp = pd.DataFrame(list(zip(fi,trainData.columns.values)))
    feat_imp = feat_imp.sort_values(by=0, axis=0, ascending=False)
    feat_imp.columns = ['importance %', 'feature']
    print(feat_imp[:display_n_rows])

In [53]:
#Display features & weighting
feature_importance(rf_best, X_train, 10)

   importance %     feature
5     41.920456    Sex_male
1     16.694158         Age
4     16.309330        Fare
0     14.161379      Pclass
3      4.039234       Parch
2      3.621181       SibSp
7      2.631645  Embarked_S
6      0.622617  Embarked_Q


## Tree interpreter
Decomposing random forest predictions

In [138]:
#Print for all instances 

#Run tree interpreter
prediction, bias, contributions = ti.predict(rf_best, X_test)

#Create df
survived_contributions = contributions[:, :, 0]
df_contributions = pd.DataFrame(survived_contributions, columns=X.columns)
df_contributions['ti_bias'] = bias[:,0]
df_contributions['ti_prediction'] = prediction[:,0]
df_contributions['rf_survived_proba'] = y_pred_proba[:,0]
df_contributions.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,ti_bias,ti_prediction,rf_survived_proba
0,0.021701,0.02249,0.002395,0.01068,0.030871,0.160321,0.000219,0.01311,0.603341,0.865129,0.865129
1,-0.029073,-0.339382,-0.035626,-0.047075,-0.047013,0.149743,-0.000814,0.011243,0.603341,0.265344,0.265344
2,-0.078849,-0.006369,0.007616,0.002298,-0.005866,-0.303161,0.000532,0.015112,0.603341,0.234655,0.234655
3,-0.092888,-0.019648,-0.00333,-0.002585,-0.098342,-0.280361,-0.002093,-0.038413,0.603341,0.06568,0.06568
4,0.053182,0.033623,-0.001204,0.007158,0.070065,0.119525,0.001923,0.01375,0.603341,0.901362,0.901362


In [184]:
#Create function to show instance alognside contributions

def instance(i):
    """Actual test example alongside contributions"""
    i_contribution = df_contributions.iloc[i]
    i_data = X_test.iloc[i]
    i_actual = y_test.iloc[i]

    instance = pd.DataFrame(list(zip(X_test.columns.values, i_data, i_contribution)), 
                            columns = ['Feature', 'Instance', 'Contribution'])
    print("Instance {} prediction:".format(i), rf_best.predict(instances.values.reshape(1,-1)))
    print("Instance {} actual:".format(i), i_actual)
    print(instance)

In [140]:
#Write contributions to df
df_contributions.to_csv('contributions.csv')

In [191]:
#Output for example instance
instance(2)

Instance 2 prediction: [0]
Instance 2 actual: 1
      Feature  Instance  Contribution
0      Pclass         2     -0.078849
1         Age        29     -0.006369
2       SibSp         1      0.007616
3       Parch         0      0.002298
4        Fare        26     -0.005866
5    Sex_male         0     -0.303161
6  Embarked_Q         0      0.000532
7  Embarked_S         1      0.015112
