In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_sas('./data/rata_m_srs_an.sas7bdat', encoding='unicode_escape')

In [3]:
df['NUM_RATE_SCAD'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 12., 13.,
       14., 23., 11., 15., 16., 17., 22., 18., 19., 20., 21., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
       52., 53., 54., 55., 56., 57., 58., 60.])

In [4]:
df.dropna(inplace=True)

In [5]:
y = np.where(df['NUM_RATE_SCAD'].values>0, 1, 0)

In [6]:
df.head()

Unnamed: 0,COD_STT_OPE,COD_FASE_OPE,DAT_RIF_M,NUM_RATE_RSD,IMP_RATE_RSD,NUM_RATE_SCAD,IMP_RATE_SCAD,Contract_ID
10,F,EX,2008-01-31,0.0,0.0,5.0,42166.0,S_000057
11,F,EX,2008-02-29,0.0,0.0,6.0,42534.0,S_000057
12,F,EX,2008-03-31,0.0,0.0,2.0,25818.0,S_000057
13,F,EX,2008-04-30,0.0,0.0,2.0,23966.0,S_000057
14,F,EX,2008-05-31,0.0,0.0,3.0,24182.0,S_000057


In [7]:
data = df[['COD_FASE_OPE','NUM_RATE_RSD', 'IMP_RATE_RSD', 'IMP_RATE_SCAD']]

In [8]:
data.head()

Unnamed: 0,COD_FASE_OPE,NUM_RATE_RSD,IMP_RATE_RSD,IMP_RATE_SCAD
10,EX,0.0,0.0,42166.0
11,EX,0.0,0.0,42534.0
12,EX,0.0,0.0,25818.0
13,EX,0.0,0.0,23966.0
14,EX,0.0,0.0,24182.0


In [9]:
data['COD_FASE_OPE'].unique()

array(['EX', 'UP', 'UK'], dtype=object)

In [10]:
dummies = pd.get_dummies(data['COD_FASE_OPE'])

In [11]:
new_data = pd.concat([data, dummies], join='inner', axis=1)

In [12]:
del new_data['COD_FASE_OPE']

In [13]:
X = new_data.values

In [14]:
new_data.columns.values

array(['NUM_RATE_RSD', 'IMP_RATE_RSD', 'IMP_RATE_SCAD', 'EX', 'UK', 'UP'],
      dtype=object)

In [15]:
X[0]

array([0.0000e+00, 0.0000e+00, 4.2166e+04, 1.0000e+00, 0.0000e+00,
       0.0000e+00])

In [16]:
y[0]

1

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123
                                                    , train_size=0.8, test_size=0.2)

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

In [20]:
pipe = make_pipeline(
    StandardScaler(), 
    LogisticRegressionCV(cv=5)
)

In [21]:
sum(np.isnan(X_train))

array([0, 0, 0, 0, 0, 0])

In [22]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregressioncv', LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [23]:
pipe.score(X_test, y_test)

0.9898132427843803

In [24]:
from sklearn.metrics import confusion_matrix

In [25]:
y_pred = pipe.predict(X_test)

In [26]:
confusion_matrix(y_test, y_pred)

array([[ 33,   5],
       [  1, 550]], dtype=int64)

In [27]:
from sklearn.feature_selection import f_classif

In [28]:
y_pred_train = pipe.predict(X_train)
F, pvals = f_classif(X_train, y_pred_train)

In [29]:
anova = pd.DataFrame(index=new_data.columns.values)

In [30]:
anova['F-values'] = F
anova['P-values'] = pvals

In [31]:
anova

Unnamed: 0,F-values,P-values
NUM_RATE_RSD,65.709546,8.314027e-16
IMP_RATE_RSD,9.502794,0.002075401
IMP_RATE_SCAD,65.087565,1.130233e-15
EX,5467.215357,0.0
UK,1869.540014,4.788594e-301
UP,767.434692,2.018744e-146


## Decision Trees

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
tree = DecisionTreeClassifier(max_depth=2)

In [47]:
X_train_new = np.hstack([X_train[:,:2],X_train[:,4:]]) # Excluding the highly correlated column
X_test_new = np.hstack([X_test[:,:2],X_test[:,4:]])

In [48]:
tree.fit(X_train_new, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [49]:
tree.score(X_test_new, y_test)

0.9779286926994907

In [50]:
from sklearn.tree import export_graphviz

In [51]:
# NEW VERSION:
# from sklearn.tree import plot_tree

In [52]:
export_graphviz(tree, out_file='my_tree.dot')

In [53]:
X_train[:,2]

array([114613., 224355.,  17276., ...,  36113., 126388.,  45001.])

In [54]:
y_train

array([1, 1, 1, ..., 1, 1, 1])

## Hyperparameter search: Grid-search  cross-validation

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
param_grid = {'max_depth':[2,4,6,8], 'criterion':['gini', 'entropy']}

In [57]:
grid = GridSearchCV(
    DecisionTreeClassifier(), 
    param_grid = param_grid, 
    cv=5,# 3/5 = prototype/PoC, cv=10 for higher confidence
    return_train_score=True
)

In [58]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [59]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.0,0.0,0.0,0.0,gini,2,"{'criterion': 'gini', 'max_depth': 2}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,gini,4,"{'criterion': 'gini', 'max_depth': 4}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.003126,0.006252,0.0,0.0,gini,6,"{'criterion': 'gini', 'max_depth': 6}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,gini,8,"{'criterion': 'gini', 'max_depth': 8}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,4e-05,8e-05,0.0,0.0,entropy,2,"{'criterion': 'entropy', 'max_depth': 2}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.0,0.0,0.0,0.0,entropy,4,"{'criterion': 'entropy', 'max_depth': 4}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.0,0.0,0.0,0.0,entropy,6,"{'criterion': 'entropy', 'max_depth': 6}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,0.0,0.0,0.0,0.0,entropy,8,"{'criterion': 'entropy', 'max_depth': 8}",1.0,1.0,1.0,...,1.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [62]:
df[['NUM_RATE_RSD','IMP_RATE_RSD','NUM_RATE_SCAD', 'IMP_RATE_SCAD']].corr()

Unnamed: 0,NUM_RATE_RSD,IMP_RATE_RSD,NUM_RATE_SCAD,IMP_RATE_SCAD
NUM_RATE_RSD,1.0,0.720017,-0.047444,0.036025
IMP_RATE_RSD,0.720017,1.0,-0.043223,0.141133
NUM_RATE_SCAD,-0.047444,-0.043223,1.0,0.153453
IMP_RATE_SCAD,0.036025,0.141133,0.153453,1.0


## Model persistance

In [64]:
import pickle

In [65]:
best_model = grid.best_estimator_

In [66]:
best_model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [67]:
pickle.dump(best_model, open('greatest_tree.pkl', 'wb'))