# Titanic random-forest classifier - Pipeline

Questions for Aldous:

1. Why do your own train/test split instead of sklearn's test_train_split
2. Understand the use of 'class' object
3. How to interpret 'contributions'?
4. After tuning - do we need to re-fit with best_params?

# Install packages

In [4]:
#install as trusted host
#!pip install --index-url=http://pypi.python.org/simple/ --trusted-host pypi.python.org lime
#!pip install --index-url=http://pypi.python.org/simple/ --trusted-host pypi.python.org treeinterpreter

Collecting lime
  Downloading lime-0.1.1.19.tar.gz (249kB)
[K    100% |████████████████████████████████| 256kB 4.0MB/s ta 0:00:01
Building wheels for collected packages: lime
  Running setup.py bdist_wheel for lime ... [?25l- \ done
[?25h  Stored in directory: /Users/joaeechew/Library/Caches/pip/wheels/ba/f0/48/ed6de3efda30be193f2546a3193a0017878b590577f945fc94
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.1.1.19


# Import libraries

In [10]:
# Import libraries
import numpy as np
import pandas as pd
import pylab as pl
import datetime

from treeinterpreter import treeinterpreter as ti

import lime
import lime.lime_tabular

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# include plots inline in the notebook (ipython specific)
%matplotlib inline

# Input cell

In [16]:
path = 'titanic-train.csv' #Enter data path file here
keep = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] #Keep needed features only

one_hot_encoding = True #one-hot encoding for categorical values
dropna = True #Do you want to drop missing values?
impute = False #Do you want to impute missing values?
missing_values = 'NaN' #Make sure missing data has been replaced with NaN first
impute_strategy = 'most_frequent' # 'most_frequent', 'mean', 'median'
scale_features = True #Do you want to scale numerical figures?

target = 'Survived' #Target variable

seed = 42 #For random state
test_size = 0.4 #For train_test_split

# Read data set

In [34]:
# Read datasets from excel files
df = pd.read_csv(path)
print("Dataset has {} rows, {} columns".format(*df.shape))
df.head(5)

Dataset has 891 rows, 12 columns


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data pre-processing

In [35]:
#### Standardisable pre-processing

#Keep needed features only
df = df[keep]

#Dummy encoding
if one_hot_encoding:
    df = pd.get_dummies(df, drop_first=True)

#Drop missing values
if dropna:
    df = df.dropna(how='any') #'any', 'all'
    
if impute:
    imp = Imputer(missing_values=missing_values, strategy=impute_strategy, axis=0)
    imp.fit_transform(df)
    
#Scaling of numerical data
if scale_features:
    scaler = StandardScaler()
    scaler.fit_transform(df)
    
print("Dataset has {} rows, {} columns".format(*df.shape))

Dataset has 714 rows, 9 columns


In [36]:
#Strings needs to be converted to integers:
# 1. Name - This can be removed
# 2. Sex - Converted into 1/0
# 3. Ticket - Probably insignificant, remove for now.
# 4. Cabin - Might represent where they are in the ship, remove for now.
# 5. Embarked - 'Matrix conversion' as no levels inherent in number

#Remove floats - value too large for random forest
df['Age'] = df['Age'].astype(int)
df['Fare'] = df['Fare'].astype(int)

In [37]:
#Create response and target variable
X = df.drop(target, axis=1)
y = df[target]

## Machine Learning
Here we train and fit the model.

In [38]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [51]:
#Tuning the model
param_grid = { "n_estimators"      : [100, 150],
           "criterion"         : ["gini"],
           "max_features"      : ['auto'], #auto, sqrt, log2, int/n_feature
           "max_depth"         : [5, 20],
           "min_samples_split" : [2, 4] ,
           "bootstrap": [True]}

rf = RandomForestClassifier(random_state=seed)

rf_cv = GridSearchCV(rf, param_grid, cv=5)
rf_cv.fit(X_train, y_train)

rf_best = rf_cv.best_estimator_

# Print the tuned parameters
print("Tuned Parameters: {}".format(rf_cv.best_params_)) 

Tuned Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}


In [40]:
# Make prediction
y_pred = rf_cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(rf_cv.score(X_test, y_test)))
print("Best score is {}".format(rf_cv.best_score_))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(rf_cv.best_params_))

Accuracy: 0.7762237762237763
Best score is 0.8481308411214953
             precision    recall  f1-score   support

          0       0.75      0.91      0.82       165
          1       0.83      0.60      0.69       121

avg / total       0.78      0.78      0.77       286

Tuned Model Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}


## Feature importance
What are the most important drivers of store sales according to the model

In [52]:
#Display feature importance
def feature_importance(model, trainData, display_n_rows):
    """Display feature importance & weighting for tree based model"""
    fi = model.feature_importances_*100
    feat_imp = pd.DataFrame(list(zip(fi,trainData.columns.values)))
    feat_imp = feat_imp.sort_values(by=0, axis=0, ascending=False)
    feat_imp.columns = ['importance %', 'feature']
    print(feat_imp[:display_n_rows])

In [53]:
#Display features & weighting
feature_importance(rf_best, X_train, 10)

   importance %     feature
5     41.920456    Sex_male
1     16.694158         Age
4     16.309330        Fare
0     14.161379      Pclass
3      4.039234       Parch
2      3.621181       SibSp
7      2.631645  Embarked_S
6      0.622617  Embarked_Q


## Tree interpreter
Decomposing random forest predictions

In [55]:
#Select single row of data
i = 203
instances = X.iloc[[i]]

#Make prediction
print("Instance {} prediction:".format(i), rf_best.predict(instances.values.reshape(1,-1)))

#Run tree interpreter
prediction, bias, contributions = ti.predict(rf_best, instances)

feat_contributions = pd.DataFrame(list(zip(contributions[0][:,1],X.columns.values)))
feat_contributions = feat_contributions.sort_values(by=0, axis=0, ascending=False)
feat_contributions.columns = ['contribution', 'feature']

print(feat_contributions)

Instance 203 prediction: [0]
   contribution     feature
0      0.131180      Pclass
4      0.022597        Fare
6      0.000398  Embarked_Q
3     -0.004655       Parch
2     -0.005616       SibSp
7     -0.010283  Embarked_S
1     -0.129622         Age
5     -0.165670    Sex_male


In [95]:
contributions[2:,1][:,1]

array([  1.78588163e-01,   2.44655506e-02,  -8.91710476e-02,
        -2.27609937e-01,   2.77130786e-01,   1.55335750e-03,
         5.28946829e-02,   1.93047818e-01,  -2.71635127e-01,
        -1.52049632e-01,  -7.32661254e-02,  -2.32913336e-01,
        -1.79950482e-01,   2.06209324e-01,  -8.77942052e-02,
        -4.68171585e-02,   2.77476896e-01,   1.91761592e-01,
         3.98385375e-01,  -1.49245953e-02,   2.58961016e-01,
        -4.16874017e-02,  -1.71912544e-02,   1.53657251e-01,
         5.12568511e-02,   2.72279485e-02,  -1.47206674e-01,
        -6.49632806e-02,   1.22422846e-01,  -2.09290363e-01,
        -4.52448374e-01,   6.91218327e-02,   5.00721772e-02,
        -1.13439600e-01,   1.10077509e-01,  -1.71784674e-02,
         9.64193108e-03,   1.62937203e-01,  -1.47169113e-01,
         3.46694519e-02,   2.29002662e-02,   7.37685791e-02,
         2.37326159e-02,   1.40656178e-01,   8.99615932e-03,
        -2.03010147e-01,   8.83018093e-02,   9.51072994e-02,
         6.40629908e-02,

In [107]:
pd.DataFrame(data=contributions[:,1][:,1])

Unnamed: 0,0
0,0.019701
1,0.009642
2,0.178588
3,0.024466
4,-0.089171
5,-0.227610
6,0.277131
7,0.001553
8,0.052895
9,0.193048


In [None]:
df_feat_contributions = pd.DataFrame(list(zip(contributions,X.columns.values)))
df_feat_contributions = feat_contributions.sort_values(by=0, axis=0, ascending=False)
df_feat_contributions.columns = ['contribution', 'feature']

In [48]:
#Print for all instances and with features as header

instances = X

#Run tree interpreter
prediction, bias, contributions = ti.predict(rf, instances)

df_feat_contributions = pd.DataFrame(list(zip(contributions[0][:,1],X.columns.values)))
df_feat_contributions = feat_contributions.sort_values(by=0, axis=0, ascending=False)
df_feat_contributions.columns = ['contribution', 'feature']

#print(feat_contributions)
contributions


array([[[ 0.03704237, -0.03704237],
        [-0.01970131,  0.01970131],
        [-0.11893289,  0.11893289],
        ..., 
        [ 0.00833191, -0.00833191],
        [-0.00048392,  0.00048392],
        [-0.00124174,  0.00124174]],

       [[-0.09039026,  0.09039026],
        [-0.00964193,  0.00964193],
        [-0.00999184,  0.00999184],
        ..., 
        [-0.04061603,  0.04061603],
        [-0.00514825,  0.00514825],
        [-0.01627394,  0.01627394]],

       [[ 0.17023272, -0.17023272],
        [-0.17858816,  0.17858816],
        [-0.07808077,  0.07808077],
        ..., 
        [ 0.01539781, -0.01539781],
        [-0.00422497,  0.00422497],
        [ 0.01758792, -0.01758792]],

       ..., 
       [[-0.16405851,  0.16405851],
        [-0.0134143 ,  0.0134143 ],
        [-0.00564899,  0.00564899],
        ..., 
        [ 0.00831817, -0.00831817],
        [-0.0015949 ,  0.0015949 ],
        [ 0.00771645, -0.00771645]],

       [[-0.16156707,  0.16156707],
        [-0.23997534,  

## LIME explainer
Decomposing random forest predictions alternative package

In [30]:
feature_names = X.columns
categorical_features = [5,6,7,8,9]
class_names = ['Did Not Survive', 'Survived']

data = X.values

categorical_names = {}
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    le.fit(data[:, feature])
    data[:, feature] = le.transform(data[:, feature])
    categorical_names[feature] = le.classes_

In [44]:
explainer_cont = lime.lime_tabular.LimeTabularExplainer(X, feature_names=feature_names,
                                                        class_names=class_names, discretize_continuous=True)

TypeError: unhashable type: 'slice'

In [33]:
explainer = lime.lime_tabular.LimeTabularExplainer(data,feature_names = feature_names,class_names=class_names,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3)



In [42]:
i = 203
predict_fn = lambda x: rf.predict_proba(encoder.transform(x)).astype(float)
exp = explainer.explain_instance(X[i], predict_fn, num_features=5)
exp.show_in_notebook(show_all=False)

KeyError: 203