# Setup ------------------------------------------------------------------------------------
### Libraries, Functions, and Data

In [1]:
import sklearn as sk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from pylab import *
import sys

print ('Python version:', sys.version)
print ('scikit-learn version:', sk.__version__)

Python version: 3.6.2 |Anaconda custom (64-bit)| (default, Jul 20 2017, 13:51:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
scikit-learn version: 0.19.1


### Build a Data Quality Report

In [2]:
#input a dataframe
#returns a dataframe with helpful statistics on the dataframe and its columns

def dqr(d):
    """Build a data quality report for a DataFrame.

    Prints the number of records in ``d`` and returns a summary with one row
    per column of ``d``.

    Parameters
    ----------
    d : pandas.DataFrame
        The frame to profile.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``d``, with these columns in order:
        'Data Type', 'Unique Values' (non-null distinct values),
        '% Missing', 'Mode' (smallest mode when there are ties, NaN when the
        column has no non-null value), '% Mode' (share of all rows equal to
        the mode), followed by every statistic reported by
        ``Series.describe()`` for any column, except 'count'. Statistics
        that do not apply to a column are NaN.
    """
    n_records = len(d)

    def first_mode(series):
        # Series.mode() is empty for an all-null (or zero-length) column.
        modes = series.mode()
        return modes.iloc[0] if len(modes) else np.nan

    #data types
    dqr_data_types = d.dtypes.to_frame('Data Type')

    #percent missing
    dqr_percent_missing = (100 * d.isnull().mean()).round(1).to_frame('% Missing')

    #unique values
    dqr_unique_values = d.nunique().astype(int).to_frame('Unique Values')

    #mode
    # Built per column and named via to_frame(): handing a *named* Series to
    # pd.DataFrame(..., columns=[other_name]) selects a non-existent column
    # and produces all-NaN values.
    mode_values = pd.Series([first_mode(d[c]) for c in d], index=d.columns)
    dqr_mode = mode_values.to_frame('Mode')

    #% mode
    count_mode = pd.Series(
        [d[c].eq(m).sum() for c, m in zip(d.columns, mode_values)],
        index=d.columns,
        dtype=float,
    )
    if n_records:
        percent_mode = (100.0 * count_mode / n_records).round(1)
    else:
        percent_mode = pd.Series(np.nan, index=d.columns)
    dqr_percent_mode = percent_mode.to_frame('% Mode')

    #distribution stats
    # Concatenate the per-column summaries so the result holds the union of
    # the reported statistics, independent of which column happens to be first.
    if len(d.columns):
        dqr_stats = pd.concat([d[c].describe() for c in d], axis=1).transpose()
        dqr_stats = dqr_stats.drop('count', axis=1)
    else:
        dqr_stats = pd.DataFrame(index=d.columns)

    print("# of records: ", n_records)

    return (
        dqr_data_types
        .join(dqr_unique_values)
        .join(dqr_percent_missing)
        .join(dqr_mode)
        .join(dqr_percent_mode)
        .join(dqr_stats)
    )


### Build a Cross Validation Function

In [3]:
from sklearn import cross_validation



In [4]:
# input a list of models, the type of score to be used with cv, and k
# each element in the list of models should have two items: the model object and the name you want to use for that 
# model object
# returns a dataframe with the names you entered and the mean of the cv scores across all k folds

def cv_fun(models, score, k, inputs=None, target=None):
    """Cross-validate several models and rank them by mean score.

    Parameters
    ----------
    models : list
        Each element holds two items: the model object followed by the name
        to report for it.
    score : str
        Scoring name passed to ``cross_val_score`` (e.g. 'roc_auc'); also
        used as the name of the result column.
    k : int
        Value passed as ``cv`` to ``cross_val_score``.
    inputs, target : array-like, optional
        Feature matrix and labels. When omitted, the notebook-level
        ``titanic_inputs`` / ``titanic_target`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per model name with the mean score over the folds, sorted
        from best to worst. Empty when ``models`` is empty.
    """
    # sklearn.cross_validation was removed from scikit-learn; model_selection
    # provides the same function.
    from sklearn.model_selection import cross_val_score

    if inputs is None:
        inputs = titanic_inputs
    if target is None:
        target = titanic_target

    names = []
    mean_scores = []
    for entry in models:
        fold_scores = cross_val_score(entry[0], inputs, target, scoring=score, cv=k)
        mean_scores.append(fold_scores.mean())
        names.append(entry[1])

    return pd.DataFrame(mean_scores, index=names, columns=[score]).sort_values(by=score, ascending=False)

## Import Data

In [5]:
# Parse the file once: titanic_raw stays untouched for later comparison,
# titanic is an independent working copy that the prep cells modify.
titanic_raw = pd.read_csv('train.csv', header=0, index_col=0)
titanic = titanic_raw.copy()
titanic.head()

FileNotFoundError: File b'train.csv' does not exist


# Data Prep --------------------------------------------------------

### Run the DQR

In [None]:
# Data quality report on the working frame before the prep cells below change it.
dqr(titanic)

### Drop Ticket

In [None]:
# Remove the Ticket column from the working frame.
titanic = titanic.drop('Ticket', axis=1)

### Change Sex to a dummy field called Male

In [None]:
# Encode Sex as a 0/1 integer flag named Male, then discard the text column.
titanic = (
    titanic
    .assign(Male=titanic['Sex'].map({'female': 0, 'male': 1}).astype(int))
    .drop('Sex', axis=1)
)

### Create FamilySize from SibSp and Parch

In [None]:
# FamilySize is the sum of the SibSp and Parch counts; the two source
# columns are removed once combined.
titanic = (
    titanic
    .assign(FamilySize=titanic['SibSp'].add(titanic['Parch']))
    .drop(['SibSp', 'Parch'], axis=1)
)

### Explore Cabin and create CabinFill

In [None]:
# Most frequent full Cabin values.
titanic['Cabin'].value_counts().head()

In [None]:
# Frequency of the first character of Cabin.
titanic['Cabin'].str[0].value_counts()

In [None]:
# CabinFill keeps only the first character of Cabin; missing cabins get the
# placeholder "NULL". The original column is then dropped.
titanic['CabinFill'] = titanic['Cabin'].str[0].fillna("NULL")
titanic = titanic.drop('Cabin', axis=1)

### Fill nulls for Embarked

In [None]:
# Missing Embarked values become the placeholder category "NULL".
titanic['Embarked'] = titanic['Embarked'].fillna("NULL")

### Create Title

In [None]:
# Derive Title from Name. Names containing none of the listed prefixes keep
# "NONE". Prefixes are applied in order, so a later match overwrites an
# earlier one.
titanic['Title'] = "NONE"
for prefix in ("Mr", "Mrs", "Miss", "Master", "Rev", "Mme"):
    # regex=False: the "." must match a literal period, not any character.
    # na=False: a missing Name simply does not match.
    titanic.loc[titanic['Name'].str.contains(prefix + ". ", regex=False, na=False), 'Title'] = prefix
titanic = titanic.drop(['Name'], axis=1)

### Create dummy fields for the remaining categorical fields

In [None]:
# One indicator frame per remaining categorical column: (source column, prefix).
d1, d2, d3, d4 = (
    pd.get_dummies(titanic[source], prefix=prefix)
    for source, prefix in [
        ('Title', 'Title'),
        ('CabinFill', 'Cabin'),
        ('Embarked', 'Emb'),
        ('Pclass', 'Pclass'),
    ]
)
# Append the indicator columns, then remove the columns they were built from.
titanic = pd.concat([titanic, d1, d2, d3, d4], axis=1)
titanic = titanic.drop(['Title', 'CabinFill', 'Embarked', 'Pclass'], axis=1)

### Use a linear model to impute Age when it is missing

In [None]:
# Rows with a known Age. Taken as an explicit copy so that adding columns to
# trainage later neither warns (SettingWithCopyWarning) nor touches titanic.
trainage = titanic[titanic['Age'].notnull()].copy()

In [None]:
from sklearn.linear_model import LinearRegression

# Predictors are every column of the working frame except Age itself.
columns = titanic.columns[titanic.columns != 'Age']
trainX = trainage[list(columns)].values
target = trainage['Age'].values
lr = LinearRegression()

In [None]:
# Fit the Age regression on the rows where Age is known.
lr.fit(trainX,target)

In [None]:
# In-sample predictions, plotted against the actual ages on fixed 0-90 axes.
trainage.loc[:, 'AgePred'] = lr.predict(trainX)
plt.axis([0.0, 90.0, 0.0, 90.0])
plt.gca().set_autoscale_on(False)
plt.scatter(trainage['Age'], trainage['AgePred'])

In [None]:
# Predict an age for every row and attach it as a temporary 'Age Pred' column.
pAge = pd.DataFrame({'Age Pred': lr.predict(titanic[columns])}, index=titanic.index)
titanic = pd.concat([titanic, pAge], axis=1)
titanic.head()

In [None]:
# Use the predicted age only where Age is missing, then drop the helper column.
titanic['Age'] = titanic['Age'].fillna(titanic['Age Pred'])
titanic = titanic.drop('Age Pred', axis=1)
titanic.head()

In [None]:
# Data quality report on the working frame after the prep steps above.
dqr(titanic)


# Predictive Modeling Approach --------------------------------------------------------

#### The data definitely has some correlated inputs (e.g. Age and FamilySize)
#### Ignore those for now, as that does not affect the ability to predict
#### Many of the categorical fields are almost always zero making the input space cluttered
#### The ensemble methods will not be hampered by those


### Train and Test Sets

In [None]:
# Reproducible 70% sample for training, kept in index order.
titanic_train = titanic.sample(frac=.70, random_state=0).sort_index()
# The test set is every row whose index label was not sampled. Dropping by
# label keeps the original dtypes and does not discard rows that merely
# contain a NaN (which the previous isin()-mask followed by dropna() did).
titanic_test = titanic.drop(titanic_train.index)

### Begin Model Building

In [None]:
# Model inputs: all columns except the target and one indicator column from
# each of the Title / Cabin / Emb / Pclass groups.
columns = list(titanic.columns)
for excluded in ('Survived', 'Title_NONE', 'Cabin_NULL', 'Emb_NULL', 'Pclass_1'):
    columns.remove(excluded)

# Plain arrays for the estimators, built the same way for both splits.
titanic_inputs = titanic_train[columns].values
titanic_target = titanic_train["Survived"].values
titanic_test_inputs = titanic_test[columns].values
titanic_test_target = titanic_test["Survived"].values

In [None]:
from sklearn import tree

# Two gini trees of different depth / leaf size; fit() returns the estimator,
# so construction and fitting are chained.
dt1 = tree.DecisionTreeClassifier(
    criterion='gini', max_depth=5, min_samples_leaf=10
).fit(titanic_inputs, titanic_target)
dt1prb = dt1.predict_proba(titanic_inputs)

dt2 = tree.DecisionTreeClassifier(
    criterion='gini', max_depth=3, min_samples_leaf=5
).fit(titanic_inputs, titanic_target)
dt2prb = dt2.predict_proba(titanic_inputs)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, plus its training-set probabilities.
nb1 = GaussianNB().fit(titanic_inputs, titanic_target)
nb1prb = nb1.predict_proba(titanic_inputs)

In [None]:
from sklearn import svm

# Three kernels. probability=True makes SVC fit an internal cross-validated
# calibration that involves random shuffling, so random_state is fixed to
# make the probabilities repeatable between runs.
sv1 = svm.SVC(kernel='linear', probability=True, random_state=0)
sv1 = sv1.fit(titanic_inputs, titanic_target)
sv1prb = sv1.predict_proba(titanic_inputs)

sv2 = svm.SVC(kernel='rbf', probability=True, random_state=0)
sv2 = sv2.fit(titanic_inputs, titanic_target)
sv2prb = sv2.predict_proba(titanic_inputs)

sv3 = svm.SVC(kernel='sigmoid', probability=True, random_state=0)
sv3 = sv3.fit(titanic_inputs, titanic_target)
sv3prb = sv3.predict_proba(titanic_inputs)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifiers with 1 and 5 neighbours.
kn1 = KNeighborsClassifier(n_neighbors=1).fit(titanic_inputs, titanic_target)
kn1prb = kn1.predict_proba(titanic_inputs)

kn2 = KNeighborsClassifier(n_neighbors=5).fit(titanic_inputs, titanic_target)
kn2prb = kn2.predict_proba(titanic_inputs)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 1000 boosting rounds; random_state fixed so reruns give the same model.
ad1 = AdaBoostClassifier(n_estimators=1000, random_state=0)
ad1 = ad1.fit(titanic_inputs, titanic_target)
ad1prb = ad1.predict_proba(titanic_inputs)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forests of 10 and 100 trees. Random forests are stochastic, so the seed is
# fixed to make the fitted models (and everything derived from them below)
# reproducible.
rf1 = RandomForestClassifier(n_estimators=10, random_state=0)
rf1 = rf1.fit(titanic_inputs, titanic_target)
rf1prb = rf1.predict_proba(titanic_inputs)
rf1pclass = rf1.predict(titanic_inputs)

rf2 = RandomForestClassifier(n_estimators=100, random_state=0)
rf2 = rf2.fit(titanic_inputs, titanic_target)
rf2prb = rf2.predict_proba(titanic_inputs)
rf2pclass = rf2.predict(titanic_inputs)

### ROC Curves

#### On all the training data

In [None]:
from sklearn import metrics

# ROC points for each model, computed from the positive-class probability
# (column 1) on the training data.
dt1fpr,dt1tpr,dt1thresholds = metrics.roc_curve(titanic_target,dt1prb[:,1])
dt2fpr,dt2tpr,dt2thresholds = metrics.roc_curve(titanic_target,dt2prb[:,1])
rf1fpr,rf1tpr,rf1thresholds = metrics.roc_curve(titanic_target,rf1prb[:,1])
rf2fpr,rf2tpr,rf2thresholds = metrics.roc_curve(titanic_target,rf2prb[:,1])
sv1fpr,sv1tpr,sv1thresholds = metrics.roc_curve(titanic_target,sv1prb[:,1])
sv2fpr,sv2tpr,sv2thresholds = metrics.roc_curve(titanic_target,sv2prb[:,1])
sv3fpr,sv3tpr,sv3thresholds = metrics.roc_curve(titanic_target,sv3prb[:,1])
kn1fpr,kn1tpr,kn1thresholds = metrics.roc_curve(titanic_target,kn1prb[:,1])
kn2fpr,kn2tpr,kn2thresholds = metrics.roc_curve(titanic_target,kn2prb[:,1])
ad1fpr,ad1tpr,ad1thresholds = metrics.roc_curve(titanic_target,ad1prb[:,1])
nb1fpr,nb1tpr,nb1thresholds = metrics.roc_curve(titanic_target,nb1prb[:,1])

# One labelled line per model. Every model has its own colour (kn1 used to
# share purple with rf2) and a legend entry, so the curves can be told apart.
plt.clf()
for roc_label, roc_fpr, roc_tpr, roc_color in [
    ('dt1', dt1fpr, dt1tpr, 'red'),
    ('dt2', dt2fpr, dt2tpr, 'yellow'),
    ('rf1', rf1fpr, rf1tpr, 'blue'),
    ('rf2', rf2fpr, rf2tpr, 'purple'),
    ('sv1', sv1fpr, sv1tpr, 'green'),
    ('sv2', sv2fpr, sv2tpr, 'cyan'),
    ('sv3', sv3fpr, sv3tpr, 'grey'),
    ('kn2', kn2fpr, kn2tpr, 'black'),
    ('kn1', kn1fpr, kn1tpr, 'orange'),
    ('ad1', ad1fpr, ad1tpr, 'magenta'),
    ('nb1', nb1fpr, nb1tpr, 'brown'),
]:
    plt.plot(roc_fpr, roc_tpr, color=roc_color, label=roc_label)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curves on the training data')
plt.legend(loc='lower right');

### Run 5-fold cv to estimate predictive power

In [None]:
# use the cv function found up under the Setup section
# enter a list with each entry holding the model object followed by a text name you want to give the model

# [model object, display name] pairs, in the order they are cross-validated.
input_models = [
    [rf1, 'rf1'],
    [rf2, 'rf2'],
    [sv1, 'sv1'],
    [sv2, 'sv2'],
    [sv3, 'sv3'],
    [nb1, 'nb1'],
    [ad1, 'ad1'],
    [kn1, 'kn1'],
    [kn2, 'kn2'],
    [dt1, 'dt1'],
    [dt2, 'dt2'],
]

# Mean 5-fold ROC AUC per model, best first.
cv_roc = cv_fun(input_models, 'roc_auc', 5)
cv_roc

### Evaluate on the test data

In [None]:
#build a dataframe with predictive results

#dataframe index
# Row labels of the result table; every list below follows this order.
auc_test_names = ['dt1', 'dt2', 'rf1', 'rf2', 'sv1', 'sv2', 'sv3', 'kn1', 'kn2', 'ad1', 'nb1']

# ROC points on the held-out test set, from the positive-class probability.
dt1fpr_test, dt1tpr_test, dt1thresholds_test = metrics.roc_curve(
    titanic_test_target, dt1.predict_proba(titanic_test_inputs)[:, 1])
dt2fpr_test, dt2tpr_test, dt2thresholds_test = metrics.roc_curve(
    titanic_test_target, dt2.predict_proba(titanic_test_inputs)[:, 1])
rf1fpr_test, rf1tpr_test, rf1thresholds_test = metrics.roc_curve(
    titanic_test_target, rf1.predict_proba(titanic_test_inputs)[:, 1])
rf2fpr_test, rf2tpr_test, rf2thresholds_test = metrics.roc_curve(
    titanic_test_target, rf2.predict_proba(titanic_test_inputs)[:, 1])
sv1fpr_test, sv1tpr_test, sv1thresholds_test = metrics.roc_curve(
    titanic_test_target, sv1.predict_proba(titanic_test_inputs)[:, 1])
sv2fpr_test, sv2tpr_test, sv2thresholds_test = metrics.roc_curve(
    titanic_test_target, sv2.predict_proba(titanic_test_inputs)[:, 1])
sv3fpr_test, sv3tpr_test, sv3thresholds_test = metrics.roc_curve(
    titanic_test_target, sv3.predict_proba(titanic_test_inputs)[:, 1])
kn1fpr_test, kn1tpr_test, kn1thresholds_test = metrics.roc_curve(
    titanic_test_target, kn1.predict_proba(titanic_test_inputs)[:, 1])
kn2fpr_test, kn2tpr_test, kn2thresholds_test = metrics.roc_curve(
    titanic_test_target, kn2.predict_proba(titanic_test_inputs)[:, 1])
ad1fpr_test, ad1tpr_test, ad1thresholds_test = metrics.roc_curve(
    titanic_test_target, ad1.predict_proba(titanic_test_inputs)[:, 1])
nb1fpr_test, nb1tpr_test, nb1thresholds_test = metrics.roc_curve(
    titanic_test_target, nb1.predict_proba(titanic_test_inputs)[:, 1])

# Area under each test ROC curve.
auc_test = [
    metrics.auc(dt1fpr_test, dt1tpr_test),
    metrics.auc(dt2fpr_test, dt2tpr_test),
    metrics.auc(rf1fpr_test, rf1tpr_test),
    metrics.auc(rf2fpr_test, rf2tpr_test),
    metrics.auc(sv1fpr_test, sv1tpr_test),
    metrics.auc(sv2fpr_test, sv2tpr_test),
    metrics.auc(sv3fpr_test, sv3tpr_test),
    metrics.auc(kn1fpr_test, kn1tpr_test),
    metrics.auc(kn2fpr_test, kn2tpr_test),
    metrics.auc(ad1fpr_test, ad1tpr_test),
    metrics.auc(nb1fpr_test, nb1tpr_test),
]

# Accuracy of the hard class predictions on the test set.
acc_test = [
    metrics.accuracy_score(titanic_test_target, fitted.predict(titanic_test_inputs))
    for fitted in (dt1, dt2, rf1, rf2, sv1, sv2, sv3, kn1, kn2, ad1, nb1)
]

# Side-by-side table, best AUC first.
auc_test_df = pd.DataFrame(auc_test, index=auc_test_names, columns=['AUC'])
acc_test_df = pd.DataFrame(acc_test, index=auc_test_names, columns=['Accuracy'])
auc_test_df.join(acc_test_df).sort_values(by='AUC', ascending=False)

# Interpretable Modeling Approach ---------------------------------------------

### Run DQR again

In [None]:
# Data quality report on the prepared (encoded and imputed) working frame.
dqr(titanic)

In [None]:
# Data quality report on the frame as read from train.csv, for comparison.
dqr(titanic_raw)

In [None]:
# Mean of Survived and row count for each Pclass value.
temp2 = titanic_raw.groupby(['Pclass'])['Survived'].mean().to_frame()
temp3 = titanic_raw.groupby(['Pclass'])['Survived'].count().to_frame()
temp2.rename(columns={'Survived': 'mean'}).join(temp3.rename(columns={'Survived': 'count'}))

In [None]:
# Mean of Survived and row count for each Sex value.
temp2 = titanic_raw.groupby(['Sex'])['Survived'].mean().to_frame()
temp3 = titanic_raw.groupby(['Sex'])['Survived'].count().to_frame()
temp2.rename(columns={'Survived': 'mean'}).join(temp3.rename(columns={'Survived': 'count'}))

In [None]:
# First character of Cabin next to Survived; missing cabins become "NULL".
temp = titanic_raw['Cabin'].str[0].to_frame().join(titanic_raw['Survived'])
temp['Cabin'] = temp['Cabin'].fillna("NULL")

# Mean of Survived and row count for each cabin letter.
temp2 = temp.groupby(['Cabin'])['Survived'].mean().to_frame()
temp3 = temp.groupby(['Cabin'])['Survived'].count().to_frame()
temp2.rename(columns={'Survived': 'mean'}).join(temp3.rename(columns={'Survived': 'count'}))

In [None]:
# Embarked next to Survived; missing values become "NULL".
temp = titanic_raw[['Embarked', 'Survived']].copy()
temp['Embarked'] = temp['Embarked'].fillna("NULL")

# Mean of Survived and row count for each Embarked value.
temp2 = temp.groupby(['Embarked'])['Survived'].mean().to_frame()
temp3 = temp.groupby(['Embarked'])['Survived'].count().to_frame()
temp2.rename(columns={'Survived': 'mean'}).join(temp3.rename(columns={'Survived': 'count'}))

In [None]:
# Name, Survived and Age from the raw data, plus a Title derived from Name.
temp = pd.DataFrame(titanic_raw['Name']).join(titanic_raw['Survived']).join(titanic_raw['Age'])
temp['Title'] = "NONE"
for prefix in ("Mr", "Mrs", "Miss", "Master", "Rev", "Mme"):
    # regex=False: the "." must match a literal period, not any character.
    # na=False: a missing Name simply does not match.
    temp.loc[temp['Name'].str.contains(prefix + ". ", regex=False, na=False), 'Title'] = prefix

# Per-title mean of Survived, row count, and youngest / oldest recorded Age.
# Each summary keeps its own groupby index and is joined on it, instead of
# pairing one frame's values with another frame's index.
temp2 = pd.DataFrame(temp.groupby(['Title'])['Survived'].mean())
temp3 = pd.DataFrame(temp.groupby(['Title'])['Survived'].count())
temp4 = pd.DataFrame(temp.groupby(['Title'])['Age'].min())
temp5 = pd.DataFrame(temp.groupby(['Title'])['Age'].max())
temp2.rename(columns={'Survived': 'mean'}) \
    .join(temp3.rename(columns={'Survived': 'count'})) \
    .join(temp4.rename(columns={'Age': 'min_age'})) \
    .join(temp5.rename(columns={'Age': 'max_age'}))

In [None]:
# Rebuild the scratch frame with only Name, Survived and Age from the raw data.
temp = pd.DataFrame(titanic_raw['Name']).join(titanic_raw['Survived']).join(titanic_raw['Age'])

### Check for multicollinearity

In [None]:
# scatter_matrix lives in pandas.plotting; pandas.tools.plotting is the older
# location and no longer exists in current pandas, so it is only a fallback.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the continuous inputs, with density estimates on
# the diagonal.
scatter_matrix(titanic[['Age', 'Fare', 'FamilySize']], alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()

## LIME

In [None]:
!pip install lime

In [None]:
import lime
from lime import lime_tabular
from sklearn.cross_validation import train_test_split

In [None]:
# sklearn.cross_validation has been removed from scikit-learn; the same
# function is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 80/20 split of the training arrays. The seed is fixed because later cells
# refer to a hard-coded row position of cv_test, which would otherwise be a
# different passenger on every run.
cv_train, cv_test, cv_labels_train, cv_labels_test = train_test_split(
    titanic_inputs, titanic_target, train_size=0.80, test_size=0.20, random_state=0)

In [None]:
# Tabular LIME explainer built on the cv_train rows, labelling class 0 as
# 'Died' and class 1 as 'Survived'.
explainer = lime_tabular.LimeTabularExplainer(
    cv_train,
    mode="classification",
    feature_names=columns,
    class_names=['Died', 'Survived'],
    discretize_continuous=True,
    verbose=True,
)

In [None]:
# Draw a random row position within cv_test ...
i = np.random.randint(0, cv_test.shape[0])
# ... which is immediately overridden: the cells below always use row 100.
# NOTE(review): assumes cv_test has more than 100 rows - confirm.
i = 100

In [None]:
# Feature values of the selected test row.
cv_test[i]

In [None]:
# Actual Survived label of the selected row.
print('True Survived Label', cv_labels_test[i])

In [None]:
# rf2's class-1 probability for the selected row; cv_test[[i]] keeps the row
# two-dimensional, as predict_proba expects.
print('Predicted Probability of Surviving', rf2.predict_proba(cv_test[[i]])[0, 1])

In [None]:
# LIME explanation of rf2's prediction for the selected row.
exp = explainer.explain_instance(cv_test[i], rf2.predict_proba)

In [None]:
# Render the explanation inline.
exp.show_in_notebook(show_table=True, show_all=True)

In [None]:
# Explanation restricted to five features, shown as a bar chart. The printed
# label now describes this model's output; the previous text ("Couples
# probability of staying together") belonged to a different analysis.
exp = explainer.explain_instance(cv_test[i], rf2.predict_proba, num_features=5)
print('Predicted probability of surviving:', exp.predict_proba[1])
exp.as_pyplot_figure()