In [1]:
import numpy as np
import pandas as pd

In [2]:
# TODO : install pydot and port code for python3

# This function creates images of tree models using pydot
import pydot
from StringIO import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image

def print_tree(estimator, features, class_names=None, filled=True):
    tree = estimator
    names = features
    color = filled
    classn = class_names
    
    dot_data = StringIO()
    export_graphviz(estimator, out_file=dot_data, feature_names=features, class_names=classn, filled=filled)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return(graph)

ModuleNotFoundError: No module named 'pydot'

In [22]:
'''If we wish to visualize the decision tree, we should run :'''
#graph = print_tree(model, predictor_names)
#Image(graph.create_png())

In [10]:
stevens = pd.read_csv('../data/stevens.csv')
#stevens.info()
stevens.head()

Unnamed: 0,Docket,Term,Circuit,Issue,Petitioner,Respondent,LowerCourt,Unconst,Reverse
0,93-1408,1994,2nd,EconomicActivity,BUSINESS,BUSINESS,liberal,0,1
1,93-1577,1994,9th,EconomicActivity,BUSINESS,BUSINESS,liberal,0,1
2,93-1612,1994,5th,EconomicActivity,BUSINESS,BUSINESS,liberal,0,1
3,94-623,1994,1st,EconomicActivity,BUSINESS,BUSINESS,conser,0,1
4,94-1175,1995,7th,JudicialPower,BUSINESS,BUSINESS,conser,0,1


In an effort to achieve better accuracy on our models,
we convert all non-numerical variables into categorical ones; for any with multiple values, 
we will create a new variable for each factor using **pd.get_dummies()**.
- Should one use the `drop_first=True` arg in pd.get_dummies(), so as to avoid multi-collinearity issues ?
- Need to look into dimensionality reduction et al (PCA).

In [11]:
# split predictors & response 
y_pre = stevens.iloc[:,-1]
X_pre = stevens.iloc[:,2:8]
# turn LowerCourt column into binary values
X_pre.LowerCourt = X_pre.LowerCourt.factorize()[0]

In [8]:
# count how many distinct variables this wil produce
count = 0
for i in X_pre.columns:
    count += len(X_pre[i].unique())
count

52

In [12]:
# explode each categorical valriable into distinct dummy variables
X_post = pd.get_dummies(data=X_pre, columns=X_pre.columns[:-2]).astype(int)

# merge response & dummy variables to create new dataset
stevens = pd.concat([y_pre, X_post],axis=1)

In [13]:
# investigate possible 1-1 correlations between explanatory & responce variables
cm = stevens.corr()

cm[((cm < -0.6) | (cm > 0.6)) & (cm != 1)].any().value_counts()

False    51
dtype: int64

In [14]:
# investigate possible  multi-collinearity between explanatory variables
cm = stevens.iloc[:,1:].corr()
cm[((cm < -0.6) | (cm > 0.6)) & (cm != 1)].any().value_counts()

False    50
dtype: int64

### Use Cross-Validation to evaluate model accuracy

- **K** can be any number, but K=10 is generally recommended.
- For **classification** problems, `stratified sampling` (via **StratifiedKFold**) is recommended for creating the folds.
  - Each response class should be represented with equal proportions in each of the K folds.
  - **scikit-learn**'s `cross_val_score` function does this by default.
      - When the **cv** argument is an integer, `cross_val_score` uses the **KFold or StratifiedKFold** strategies by default (depending on the absence or presence of the target array).



In [15]:
# re-assign predictor/response variables

y = stevens.iloc[:,0]
X = stevens.iloc[:,1:]

In [16]:
from sklearn.model_selection import cross_val_score


In [18]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import cross_val_score

decTree = DTC(max_depth=2)
scores = cross_val_score(decTree, X, y, cv=10, scoring='accuracy')
#print scores

# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())

0.593961038961


In [27]:
# "Gini importance" of each feature: 
# the (normalized) total reduction of error brought by that feature
decTree.fit(X,y)
feat_imp = pd.DataFrame({'feature':X.columns, 'importance':decTree.feature_importances_})
# sort & show 5 most important features 
feat_imp.sort_values(by='importance', ascending=False).head(5)

Unnamed: 0,feature,importance
0,LowerCourt,0.812121
48,Respondent_STATE,0.094145
12,Circuit_9th,0.093734
47,Respondent_POLITICIAN,0.0
46,Respondent_OTHER,0.0


In [42]:
np.arange(0.5,0.9, 0.1)

array([ 0.5,  0.6,  0.7,  0.8])

In [53]:
# search for an optimal value for one of the model's parameter, 
# e.g. DecisionTreeClassifier(min_sample_split)
p_range = np.arange(0.1,0.6, 0.1)
p_scores = []
for p in p_range:
    decTree = DTC(min_samples_leaf=p)
    scores = cross_val_score(decTree, X, y, cv=10, scoring='accuracy')
    p_scores.append(scores.mean())
p_scores

[0.61673672818409664,
 0.668368079289132,
 0.668368079289132,
 0.668368079289132,
 0.5459614946457052]

In [55]:
# RandomForestClassifier(n_estimators)
from sklearn.ensemble import RandomForestClassifier as RFC

param_range = range(2,10)
p_scores = []
for p in param_range:
    rfc = RFC(min_samples_leaf=0.1, max_depth=p)
    scores = cross_val_score(rfc, X, y, cv=10, scoring='accuracy')
    p_scores.append(scores.mean())
p_scores

[0.58675096832991569,
 0.57250113921166546,
 0.61297277284119389,
 0.59766005923900667,
 0.58111300979722036,
 0.58835440874914569,
 0.63957051720209612,
 0.57995727956254273]

In [66]:
# KNN (n_neighbors)
from sklearn.neighbors import KNeighborsClassifier

param_range = range(25,51,5)
p_scores = []
for p in param_range:
    knn = KNeighborsClassifier(weights='distance', n_neighbors=p)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    p_scores.append(scores.mean())
p_scores

[0.54305365686944629,
 0.54308612440191384,
 0.56244702665755297,
 0.56949589883800411,
 0.59236557302346771,
 0.58525290498974702]

---

### Use Cross-Validation to evaluate the accuracies of different models 

In [69]:
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

0.537413419913


In [86]:
# 10-fold cross-validation with the best KNearestNeighbors model
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50, weights='distance')
cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()

0.58525290498974702

In [103]:
# 10-fold cross-validation with Random Forests
from sklearn.ensemble import RandomForestClassifier as RFC
rfc = RFC(min_samples_leaf=0.1)
cross_val_score(rfc, X, y, cv=10, scoring='accuracy').mean()

0.60286056049213943

In [132]:
# 10-fold cross-validation with Decision Trees with manually tuned features
from sklearn.tree import DecisionTreeClassifier as DTC
regTree = DTC(min_samples_leaf=0.2)
cross_val_score(regTree, X, y, cv=10, scoring='accuracy').mean()

0.668368079289132

---

### Feature Importance
- This is approximate, use with iterative approach + mean_score for more consistent results.

In [133]:
# "Gini importance" of each feature: 
# the (normalized) total reduction of error brought by that feature
regTree.fit(X,y)
feat_imp = pd.DataFrame({'feature':X.columns, 'importance':regTree.feature_importances_})
# sort & show 5 most important features 
feat_imp.sort_values(by='importance', ascending=False).head(5)

Unnamed: 0,feature,importance
0,LowerCourt,1.0
37,Petitioner_US,0.0
27,Petitioner_BUSINESS,0.0
28,Petitioner_CITY,0.0
29,Petitioner_CRIMINAL.DEFENDENT,0.0


---

### Alternate approach using Support Vector Machine

In [113]:
# 10-fold cross-validation with SVM Classifier

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svc = SVC(C=0.5, kernel='rbf', class_weight='balanced', gamma=0.05,)
scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
#print scores

# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())

0.661225222146


---

### Using Cross-Validation & GridSearch for Efficient Discovery of optimal tuning parameters

In [135]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

#from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC

In [136]:
'''create a parameter grid ; this will be a Dict which maps the parameter names to the values to be searched. '''
# various param lists
range1 = range(1,10)
range2 = range(1,20,2)
range3 = range(5,30,2)
range4 = range(5,50,5)
range5 = [1,2]
range6 = range(2,32,2)

RFC_grid = dict(n_estimators=range3, min_samples_split=range6, min_samples_leaf=range4)
DTC_grid = dict(min_samples_split=range6, min_samples_leaf=range4, class_weight=['balanced', None])

In [137]:
# instantiate the models & corresponding grids ; specify n_jobs=-1 for parallel computation
rfc = RFC(class_weight='balanced')
dtc = DTC()

dtc_grid = GridSearchCV(dtc, DTC_grid, cv=10, scoring='accuracy', n_jobs =-1)
rfc_grid = GridSearchCV(rfc, RFC_grid, cv=10, scoring='accuracy', n_jobs = -1)

In [138]:
# fit the DTC grid with data
dtc_grid.fit(X, y)

# examine the best model
print(dtc_grid.best_score_)
print(dtc_grid.best_params_)
print(dtc_grid.best_estimator_)

0.666077738516
{'class_weight': 'balanced', 'min_samples_leaf': 35, 'min_samples_split': 2}
DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=35,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [None]:
# fit the RFC grid with data
rfc_grid.fit(X, y)

# examine the best model
print(rfc_grid.best_score_)
print(rfc_grid.best_params_)

---

### Bagging

In [35]:
from sklearn.tree import DecisionTreeClassifier as DTC
#from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

regTree = DTC(class_weight='balanced',min_samples_leaf=35, min_samples_split=2)
#rf = RFC(n_estimators=7, min_samples_leaf=35, min_samples_split=14)

bag = BaggingClassifier(base_estimator=regTree,
                        n_estimators=100, 
                        max_samples=0.7, 
                        max_features=0.7,
                        oob_score=True,
                        n_jobs=-1, 
                        random_state=1)

cross_val_score(bag, X, y, cv=7, scoring='accuracy').mean()


0.65397363100615125

### Out-of-Bag score
- compute the out-of-bag **R-squared** score (not MSE, unfortunately!) for given **n_estimators**.
- When **n_estimators** is sufficiently large, the **out-of-bag error** is an accurate estimate of **out-of-sample error**.

In [36]:
bag.fit(X,y)
bag.oob_score_

0.66784452296819785

---

### Extra Trees ensemble Model

In [65]:
from sklearn.ensemble import ExtraTreesClassifier as ETC

ETC_grid = {'max_features':[0.25, 0.35, 0.5]}
etc_grid = GridSearchCV(ETC(n_estimators=13, criterion='entropy', min_samples_leaf=35, min_samples_split=4), ETC_grid, cv=10, scoring='accuracy', n_jobs = -1)

etc_grid.fit(X,y)
# examine the best model
print(etc_grid.best_score_)
print(etc_grid.best_params_)
print(etc_grid.best_estimator_)

0.646643109541
{'max_features': 0.35}
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.35, max_leaf_nodes=None,
           min_samples_leaf=35, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


---

### Gradient Boosting

 - Rather than looking at 200 (say) parallel estimators, We construct a chain of 200 estimators which iteratively refine the results of the previous estimator. 
 - The idea is that by sequentially applying very fast, simple models, we can get a total model error which is better than any of the individual pieces.


In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier as GBC

gbc = GBC()
params = {'n_estimators':range(10,21,2), 'min_samples_leaf':range(15,51,5), 'min_samples_split':range(10,32,2)}
gbc_grid = GridSearchCV(gbc, params, cv=10, scoring='accuracy', n_jobs = -1)
gbc_grid.fit(X,y)

print(gbc_grid.best_score_)
print(gbc_grid.best_params_)
print(gbc_grid.best_estimator_)

0.648409893993
{'min_samples_split': 10, 'n_estimators': 10, 'min_samples_leaf': 35}
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=35, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


---

### Reducing computational expense using RandomizedSearchCV
- Searching many different parameters at once may be computationally infeasible.
- **RandomizedSearchCV** searches a subset of the parameters, and you control the computational "budget".


In [67]:
from sklearn.model_selection import RandomizedSearchCV

# specify "parameter distributions" rather than a "parameter grid"
range3 = range(5,30,2)
range4 = range(5,50,5)
range6 = range(2,32,2)

RFC_grid = dict(n_estimators=range3, min_samples_split=range6, min_samples_leaf=range4)

# instantiate the models & corresponding grids ; specify n_jobs=-1 for parallel computation
rfc = RFC(class_weight='balanced', criterion='entropy')

# n_iter controls the number of searches
rand = RandomizedSearchCV(rfc, RFC_grid, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)

# examine the best model
print(rand.best_score_)
print(rand.best_params_)
print(rand.best_estimator_)

0.648409893993
{'n_estimators': 15, 'min_samples_split': 10, 'min_samples_leaf': 30}
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=30, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


---

### Closing Remarks

There is a problem with using the `grid.best_score_` score for evaluation, however.

You might be making what is called a **multiple hypothesis testing error**. If you try very many parameter settings, some of them will work better just by chance, and the score that you obtained might not reflect how your model would perform on new unseen data. 

Therefore, it is good to **split off a separate test-set before performing grid-search**. 

This pattern can be seen as a **training-validation-test split**, and is common in machine learning:

```python
# use from sklearn.model_selection import StratifiedShuffleSplit
# to create train/test sets X_train, X_test, y_train, y_test

# run param grid search on training set
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
cv = KFold(n=len(X_train), n_folds=10, shuffle=True)

grid = GridSearchCV(SVR(), param_grid=param_grid, cv=cv)

# obtain accuracy on test set
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
```
