In [97]:
#Import all required libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math
import seaborn as sns

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

#Not a new window
%matplotlib inline
plt.rcParams['xtick.labelsize'] = 25
plt.rcParams['ytick.labelsize'] = 25

In [98]:
# Read the semicolon-separated CSV into a DataFrame and preview its first rows
csv_path = '../hackathon/data/bank-additional-full.csv'
data = pd.read_csv(csv_path, sep=';')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [99]:
# Split the frame into model inputs (every column except 'y') and the target column 'y'
X = data.drop(['y'], axis=1)
y = data['y']
# print(...) with one parenthesised argument produces the same text on Python 2
# and Python 3; the bare print statement is a SyntaxError on Python 3.
print(X.head())
print(y.head())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  duration  campaign  pdays  previous     poutcome  \
0   may         mon       261         1    999         0  nonexistent   
1   may         mon       149         1    999         0  nonexistent   
2   may         mon       226         1    999         0  nonexistent   
3   may         mon       151         1    999         0  nonexistent   
4   may         mon       307         1    999         0  nonexistent   

   emp.var.rate  cons.price.idx  cons.conf.idx  euribor3m  nr.employed  
0           1.1          93.994

In [100]:
#Describe the data to identify the missing data
#Describe categorical data
def describe_categorical(X):
    """Show summary statistics for the object-dtype columns of ``X``.

    Picks every column whose dtype compares equal to 'object', runs
    ``describe()`` on that subset and renders the result as an HTML table
    through IPython's display machinery.

    Returns None; the table is displayed, not returned.
    """
    from IPython.display import HTML, display

    is_object = X.dtypes == 'object'
    object_columns = X.columns[is_object]
    summary = X[object_columns].describe()
    display(HTML(summary.to_html()))
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; printing the call appended a stray "None" to the output.
data.info()
print(data.describe())
# describe_categorical() displays its table and has no return value, so it is
# called rather than printed (printing it emitted another "None").
describe_categorical(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
count,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188
unique,12,4,8,3,3,3,2,10,5,3,2
top,admin.,married,university.degree,no,yes,no,cellular,may,thu,nonexistent,no
freq,10422,24928,12168,32588,21576,33950,26144,13769,8623,35563,36548


None


In [101]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [102]:
# Quick-model inputs: `numeric` lists only the numeric columns of the training frame.
# select_dtypes('number') names the numeric dtypes explicitly instead of treating
# "anything that is not object" as numeric.
# NOTE(review): this list includes `duration`; the dataset documentation appears
# to warn that it is only known after the call ends, which would inflate the
# scores below -- confirm whether it should be excluded.
numeric = list(X_train.select_dtypes(include=['number']).columns)
# Example:
print(X_train[numeric].head())
print(X_train[numeric].describe())

# Encode the target as 1 ('yes') / 0 ('no') for y_train and y_test.
# The labels are re-derived from the untouched `y` through each split's row
# index, so re-running this cell gives the same result; mapping y_train/y_test
# onto themselves turned the already-encoded values into NaN on a second run.
target_codes = {'yes': 1, 'no': 0}
y_train = y.loc[X_train.index].map(target_codes)
y_test = y.loc[X_test.index].map(target_codes)


       age  duration  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
38651   22       358         1    999         0          -3.4          92.649   
12733   52       262         1    999         0           1.4          93.918   
35807   47       953         3    999         0          -1.8          92.893   
1423    38       403         2    999         0           1.1          93.994   
52      60       253         1    999         0           1.1          93.994   

       cons.conf.idx  euribor3m  nr.employed  
38651          -30.1      0.720       5017.5  
12733          -42.7      4.962       5228.1  
35807          -46.2      1.244       5099.1  
1423           -36.4      4.855       5191.0  
52             -36.4      4.857       5191.0  
                age      duration      campaign         pdays      previous  \
count  37069.000000  37069.000000  37069.000000  37069.000000  37069.000000   
mean      40.029297    258.147023      2.571933    962.783512      0.1731

In [103]:
# First-cut model on the continuous inputs only, with a fixed baseline configuration:
# 50 trees, out-of-bag scoring switched on, random seed 42
baseline_params = dict(n_estimators=50, oob_score=True, random_state=42)
model = RandomForestRegressor(**baseline_params)
model.fit(X_train[numeric], y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=True, random_state=42,
           verbose=0, warm_start=False)

In [104]:
# c-stat (ROC AUC) of the model's out-of-bag predictions against the training labels
y_oob = model.oob_prediction_
# A single pre-formatted argument prints the same line under Python 2 and
# Python 3; the two-argument print statement is a SyntaxError on Python 3.
print("c-stat:  {}".format(roc_auc_score(y_train, y_oob)))

c-stat:  0.924121438788


In [105]:
# Score the held-out test rows and report the c-stat (ROC AUC) against the test labels
y_pred = model.predict(X_test[numeric])
# A single pre-formatted argument prints the same line under Python 2 and
# Python 3; the two-argument print statement is a SyntaxError on Python 3.
print("c-stat:  {}".format(roc_auc_score(y_test, y_pred)))


c-stat:  0.921785237823


In [None]:
#Feature engineering of continuous variables
