# Decision Tree Experimentation
First, we import all relevant packages

scikit-learn's `train_test_split()` splits the data into a training set and a test set. This is a convenient first step before we do any further processing:
We should preprocess the data by partitioning it with the same percentages for the training, cross-validation and test sets.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D

## Dataset Directories

In [2]:
# Training data prior to feature selection.
input_data_before_fs = pd.read_csv('processed_train.csv', index_col=0)

# Training data once feature selection has been applied.
input_data_after_fs = pd.read_csv('processed_train_after_feature.csv', index_col=0)

# Placeholders for resampled variants (nothing is loaded for them yet):
#   - upsampling without feature selection
#   - upsampling with feature selection
#   - downsampling without feature selection
#   - downsampling with feature selection (presumed; TODO confirm)

# Registry of every dataset to evaluate, keyed by a descriptive name.
input_all = {}
input_all["normal_before_fs"] = input_data_before_fs
# input_all["normal_after_fs"] = input_data_after_fs

In [3]:
# for input in all_input:
#     print ("Dataset Length:: ", len(input))
#     print ("Dataset Shape: ", input.shape)
#     input_data.info()
#     input_data.head(5)

## General Functions

In [4]:
def preprocessing(data):
    """Keep the typed columns of ``data`` and make a stratified train/test split.

    Only ``int64``, ``float64`` and ``bool`` columns are kept (in that order);
    columns of any other dtype are dropped. The ``Target`` column is the label
    (cast to int); every other kept column is a feature.

    Returns
    -------
    x_train, x_test, y_train, y_test
        Result of ``train_test_split`` with a 25% test share,
        ``random_state=100`` and stratification on the label.
    """
    # Column names grouped by dtype.
    integer_cols = list(data.select_dtypes(['int64']))
    float_cols = list(data.select_dtypes(['float64']))
    boolean_cols = list(data.select_dtypes(['bool']))

    selected = data[integer_cols + float_cols + boolean_cols]

    # Everything except the label column is a feature.
    is_feature = selected.columns != 'Target'
    features = selected.loc[:, is_feature].values
    labels = selected['Target'].values.astype('int')

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size = 0.25, random_state = 100, stratify = labels)

    return x_train, x_test, y_train, y_test

In [5]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate

def tenfold(model, x, y, metric='accuracy'):
    """Cross-validate ``model`` on ``(x, y)`` with shuffled, stratified 10 folds.

    ``metric`` is forwarded as the ``scoring`` argument of ``cross_validate``.
    Returns the dictionary produced by ``cross_validate``, including the
    training scores.
    """
    return cross_validate(
        model, x, y,
        cv=StratifiedKFold(n_splits=10, random_state=100, shuffle=True),
        scoring=metric,
        return_train_score=True,
    )

# accuracy_mean = scores['test_score'].mean()
# accuracy_std = scores['train_score'].std()

In [7]:
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """ROC AUC for class-label predictions.

    A ``LabelBinarizer`` is fitted on the true labels ``y_test`` and used to
    binarize both the true and the predicted labels; the two indicator
    arrays are then passed to ``roc_auc_score`` with the given ``average``.
    """
    binarizer = LabelBinarizer().fit(y_test)
    truth_indicator = binarizer.transform(y_test)
    pred_indicator = binarizer.transform(y_pred)
    return roc_auc_score(truth_indicator, pred_indicator, average=average)

def metric_consolidation(input_all, classifier, method = "cross_validation"):
    """Evaluate ``classifier`` on every dataset in ``input_all`` and print the scores.

    Three metrics are reported: accuracy, weighted ROC AUC and weighted F1.

    Parameters
    ----------
    input_all : dict
        Maps a dataset name to the data handed to ``preprocessing``.
    classifier : estimator
        Object exposing ``fit`` / ``predict``. It is fitted on the training
        split of each dataset.
    method : {"cross_validation", "test"}
        ``"cross_validation"`` prints mean +/- std of the 10-fold scores
        returned by ``tenfold`` on the training split.
        ``"test"`` prints the scores on the held-out test split.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported values (previously an
        unknown value silently printed nothing).
    """
    if method not in ("cross_validation", "test"):
        raise ValueError(
            "method must be 'cross_validation' or 'test', got %r" % (method,))

    # Reporting order of the metrics.
    metric_names = ('accuracy', 'roc_auc', 'f1_weighted')

    for input_name, input_data in input_all.items():
        # split the data
        x_train, x_test, y_train, y_test = preprocessing(input_data)

        # fit the classifier to the training data
        classifier.fit(x_train, y_train)

        if method == "cross_validation":
            # Scorers are only needed for cross-validation.
            scorers = {'accuracy': 'accuracy',
                       'roc_auc': make_scorer(multiclass_roc_auc_score, average='weighted'),
                       'f1_weighted': 'f1_weighted'
                      }
            scores = tenfold(classifier, x_train, y_train, metric = scorers)
            print ("Metrics for %s: \n" %input_name)
            for metric in metric_names:
                test_score = scores["test_" + metric]
                print ("%s Test Score: %0.2f +/- %0.2f" %(metric, test_score.mean()*100,
                                               test_score.std()*100))
            print ("\n")
        else:
            y_pred = classifier.predict(x_test)
            metric_values = {'accuracy': accuracy_score(y_test, y_pred),
                             'roc_auc': multiclass_roc_auc_score(y_test, y_pred, average='weighted'),
                             'f1_weighted': f1_score(y_test, y_pred, average='weighted')
                            }
            for metric in metric_names:
                # Each value is a single scalar. Calling .mean()/.std() on it only
                # works for NumPy scalars and fails for plain Python floats, so
                # convert explicitly. A single hold-out evaluation has no spread;
                # 0.00 is printed to keep the output format unchanged.
                test_score = float(metric_values[metric])
                print ("%s Test Score: %0.2f +/- %0.2f" %(metric, test_score*100, 0.0))

In [16]:
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Search through parameters to fill up the values
min_sample_split_values = [2, 50, 100]
min_sample_leaf_values = [1, 50, 100]
criterion_values = ['gini', 'entropy']

# Exhaustive sweep: one cross-validated report per parameter combination.
for criterion in criterion_values:
    for min_sample_split in min_sample_split_values:
        for min_sample_leaf in min_sample_leaf_values:
            # `presort` is no longer passed: False was already its default
            # behaviour and newer scikit-learn releases reject the argument.
            decision_tree = DecisionTreeClassifier(class_weight=None, criterion=criterion, max_depth=None,
                                         max_features=None, max_leaf_nodes=None, min_samples_leaf=min_sample_leaf,
                                         min_samples_split=min_sample_split, min_weight_fraction_leaf=0.0,
                                         random_state=100, splitter='best')

            print ("For decision tree with: \n criterion: %s \n min split: %s \n min leaf: %s \n"
                  %(criterion, min_sample_split, min_sample_leaf))
            metric_consolidation(input_all, decision_tree)


For decision tree with: 
 criterion: gini 
 min split: 2 
 min leaf: 1 

Metrics for normal_before_fs: 

accuracy Test Score: 58.82 +/- 3.26
f1_weighted Test Score: 58.78 +/- 2.69
roc_auc Test Score: 63.61 +/- 2.11


For decision tree with: 
 criterion: gini 
 min split: 2 
 min leaf: 50 

Metrics for normal_before_fs: 

accuracy Test Score: 65.64 +/- 1.80
f1_weighted Test Score: 59.25 +/- 1.84
roc_auc Test Score: 60.15 +/- 2.03


For decision tree with: 
 criterion: gini 
 min split: 2 
 min leaf: 100 

Metrics for normal_before_fs: 

accuracy Test Score: 66.98 +/- 1.75
f1_weighted Test Score: 58.63 +/- 1.87
roc_auc Test Score: 59.70 +/- 3.32


For decision tree with: 
 criterion: gini 
 min split: 50 
 min leaf: 1 

Metrics for normal_before_fs: 

accuracy Test Score: 64.20 +/- 1.80
f1_weighted Test Score: 61.40 +/- 2.90
roc_auc Test Score: 64.06 +/- 3.50


For decision tree with: 
 criterion: gini 
 min split: 50 
 min leaf: 50 

Metrics for normal_before_fs: 

accuracy Test Score: 

In [8]:
# Test Values for Decision Tree
# Hold-out evaluation of a configuration from the parameter search above
# (gini, min_samples_split=50, min_samples_leaf=1).

# `presort` is no longer passed: False was already its default behaviour and
# newer scikit-learn releases reject the argument.
decision_tree = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                         max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
                                         min_samples_split=50, min_weight_fraction_leaf=0.0,
                                         random_state=100, splitter='best')

metric_consolidation(input_all, decision_tree,method='test')

accuracy Test Score: 61.16 +/- 0.00
roc_auc Test Score: 60.69 +/- 0.00
f1_weighted Test Score: 58.39 +/- 0.00


In [34]:
def _sweep_tree_parameter(x_train, y_train, criterion, param_name, values, xlabel, best_label):
    """Cross-validate one decision tree per value of a single hyper-parameter.

    For each entry of ``values`` a ``DecisionTreeClassifier`` is built with
    ``criterion``, ``random_state=100`` and ``param_name`` set to that value
    (all other parameters stay at their defaults) and scored with ``tenfold``.
    The mean train/test accuracies are plotted against ``values`` and the
    value with the highest mean test accuracy is printed.

    Returns the tuple ``(best_value, best_accuracy)``.
    """
    train_results = []
    test_results = []
    for value in values:
        params = {'criterion': criterion, 'random_state': 100, param_name: value}
        dt = DecisionTreeClassifier(**params)
        scores = tenfold(dt, x_train, y_train)
        train_results.append(scores['train_score'].mean())
        test_results.append(scores['test_score'].mean())

    line1, = plt.plot(values, train_results, 'b', label="Average CV Train Accuracy")
    line2, = plt.plot(values, test_results, 'r', label="Average CV Test Accuracy")
    plt.legend(handler_map={line2: HandlerLine2D(numpoints=2)})
    plt.ylabel("Average Accuracy score")
    plt.xlabel(xlabel)
    plt.show()

    # Finding the best score and parameter to use (first maximum wins on ties)
    best_accuracy_score = max(test_results)
    best_value = values[test_results.index(best_accuracy_score)]
    print ('Best %s Value:' % best_label, best_value)
    print ('Corresponding Accuracy Value:', best_accuracy_score)
    return best_value, best_accuracy_score


def graphing_decisiontree(input_data, criterion='entropy'):
    """Plot cross-validated accuracy against four tree hyper-parameters.

    ``input_data`` is split with ``preprocessing``; only the training split is
    used. ``max_depth``, ``min_samples_split``, ``min_samples_leaf`` and
    ``max_features`` are swept one at a time, each with the other parameters
    at their defaults. Each sweep shows a train/test accuracy plot and prints
    the best value found.

    All swept values are integers. ``max_depth`` must be an integer count, and
    a float ``min_samples_split`` / ``min_samples_leaf`` would be read by
    scikit-learn as a fraction of the samples. The printed best values are
    therefore integers (e.g. ``3`` rather than ``3.0``).
    """
    x_train, x_test, y_train, y_test = preprocessing(input_data)

    ### Max Depth
    _sweep_tree_parameter(x_train, y_train, criterion, 'max_depth',
                          list(range(1, 33)), "Tree depth", "Max Depth")

    ### Min Sample Splits
    _sweep_tree_parameter(x_train, y_train, criterion, 'min_samples_split',
                          list(range(2, 101)), "min samples split", "Min Sample Split")

    ### Min Samples Leaf
    _sweep_tree_parameter(x_train, y_train, criterion, 'min_samples_leaf',
                          list(range(1, 81)), "min samples leaf", "Min Samples Leaf")

    ### Max Features
    # Bound the sweep by the number of features the model actually sees.
    # `preprocessing` drops the target and any column of another dtype, so the
    # raw frame width can exceed it, and max_features may not exceed n_features.
    n_features = x_train.shape[1]
    _sweep_tree_parameter(x_train, y_train, criterion, 'max_features',
                          list(range(1, n_features + 1)), "max_features", "Max Feature")

## Decision Tree Modelling

### Baseline Decision Trees

In [22]:
# Baseline trees: default parameters, differing only in the split criterion.
# `presort` is no longer passed: False was already its default behaviour and
# newer scikit-learn releases reject the argument.

# Gini
clf_gini = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
                                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                                     random_state=100, splitter='best')
#Entropy
clf_entropy = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                                     max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
                                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                                     random_state=100, splitter='best')

### Performance Metric For Dataset before Feature Selection
1. Accuracy Score
2. F1 Score
3. Confusion Matrix

In [None]:
# Predicting using Dataset before Feature Selection
# The split and the fit are done here so the cell works on a fresh kernel:
# the baseline classifiers are only constructed above, never fitted, and no
# earlier cell defines the test split at notebook level.
X_train, X_test, Y_train, Y_test = preprocessing(input_data_before_fs)

clf_entropy.fit(X_train, Y_train)
clf_gini.fit(X_train, Y_train)

Y_predict_entropy_initial = clf_entropy.predict(X_test)
Y_predict_gini_initial = clf_gini.predict(X_test)

In [None]:
# Assess predict score based on Accuracy Score for Dataset before Feature Selection
# Relies on Y_test and the Y_predict_*_initial arrays existing in the kernel.
# NOTE(review): predictions are passed as the first argument and true labels as
# the second; the other metric cells in this notebook pass the true labels first.

print ('Testing acc for entropy before feature selection is %f' %accuracy_score(Y_predict_entropy_initial, Y_test))
print ('Testing acc for gini before feature selection is %f' %accuracy_score(Y_predict_gini_initial, Y_test))

In [None]:
# Assess predict score based on F1 Score for Dataset before Feature Selection
# Weighted F1 restricted to the labels 1-4.
# Relies on Y_test and the Y_predict_*_initial arrays existing in the kernel.

print ('Testing f1 score for entropy before feature selection is %f' %f1_score(Y_test, Y_predict_entropy_initial, labels=[1,2,3,4], average='weighted'))
print ('Testing f1 score for gini before feature selection is %f' %f1_score(Y_test, Y_predict_gini_initial, labels=[1,2,3,4], average='weighted'))

In [None]:
# Assess predict score based on Confusion Matrix for Dataset before Feature Selection
# Confusion matrices are computed over the labels 1-4, in that order.
# Relies on Y_test and the Y_predict_*_initial arrays existing in the kernel.

print ('Testing confusion matrix for entropy before feature selection is \n', confusion_matrix(Y_test, Y_predict_entropy_initial, labels=[1,2,3,4]))
print ('Testing confusion matrix for gini before feature selection is \n', confusion_matrix(Y_test, Y_predict_gini_initial, labels=[1,2,3,4]))

In [None]:
# Per-dataset evaluation of the entropy baseline.
# TODO: only the accuracy branch is implemented so far.
metrics = ['accuracy', 'f1_score', 'confusion_matrix']

# Iterate over (name, frame) pairs: iterating the dict directly yields only the
# key strings, which `preprocessing` cannot handle.
for input_name, input_data in input_all.items():
    x_train, x_test, y_train, y_test = preprocessing(input_data)

    # Score on this dataset's own split rather than on arrays left over from
    # other cells.
    clf_entropy.fit(x_train, y_train)
    y_pred_entropy = clf_entropy.predict(x_test)

    for metric in metrics:
        if metric == "accuracy":
            print ('Testing acc for entropy before feature selection is %f' %accuracy_score(y_test, y_pred_entropy))

### Performance Metric For Dataset after Feature Selection
1. Accuracy Score
2. F1 Score
3. Confusion Matrix

In [None]:
# Predicting using Dataset after Feature Selection
# No visible cell defines the `_fs` split or the `_fs` classifiers, so they are
# built here: same split procedure and same baseline settings (default tree,
# random_state=100) as the before-feature-selection baseline.
# NOTE(review): confirm these are the intended settings.
X_train_fs, X_test_fs, Y_train_fs, Y_test_fs = preprocessing(input_data_after_fs)

clf_entropy_fs = DecisionTreeClassifier(criterion='entropy', random_state=100)
clf_gini_fs = DecisionTreeClassifier(criterion='gini', random_state=100)
clf_entropy_fs.fit(X_train_fs, Y_train_fs)
clf_gini_fs.fit(X_train_fs, Y_train_fs)

Y_predict_entropy_initial_fs = clf_entropy_fs.predict(X_test_fs)
Y_predict_gini_initial_fs = clf_gini_fs.predict(X_test_fs)

In [None]:
# Assess predict score based on Accuracy Score for Dataset after Feature Selection

# print ('Testing acc for entropy after feature selection is %f' %accuracy_score(Y_test_fs, Y_predict_entropy_initial_fs))
# print ('Testing acc for gini after feature selection is %f' %accuracy_score(Y_predict_gini_initial_fs, Y_test_fs))

In [None]:
# 10-fold cross-validated accuracy (via `tenfold`) of the after-feature-selection classifiers.
# NOTE(review): `X` and `Y` are not defined in any visible cell -- confirm which
# arrays they should be before running.
# NOTE(review): the figure labelled "Std dv" is printed as 2 * std, not std.
cv_entropy_results = tenfold(clf_entropy_fs, X, Y)
print ('Testing acc for entropy after feature selection is %f. Std dv is (+/-) %f.' %(cv_entropy_results['test_score'].mean(), cv_entropy_results['test_score'].std()*2))

cv_gini_results = tenfold(clf_gini_fs, X, Y)
print ('Testing acc for gini after feature selection is %f. Std dv is (+/-) %f.' %(cv_gini_results['test_score'].mean(), cv_gini_results['test_score'].std()*2))

In [None]:
# Assess predict score based on F1 Score for Dataset after Feature Selection
# Weighted F1 restricted to the labels 1-4.
# Relies on Y_test_fs and the Y_predict_*_initial_fs arrays existing in the kernel.

print ('Testing f1 score for entropy after feature selection is %f' %f1_score(Y_test_fs, Y_predict_entropy_initial_fs, labels=[1,2,3,4], average='weighted'))
print ('Testing f1 score for gini after feature selection is %f' %f1_score(Y_test_fs, Y_predict_gini_initial_fs, labels=[1,2,3,4], average='weighted'))

In [None]:
# Assess predict score based on Confusion Matrix for Dataset after Feature Selection
# Confusion matrices are computed over the labels 1-4, in that order.
# Relies on Y_test_fs and the Y_predict_*_initial_fs arrays existing in the kernel.

print ('Testing confusion matrix for entropy after feature selection is \n', confusion_matrix(Y_test_fs, Y_predict_entropy_initial_fs, labels=[1,2,3,4]))
print ('Testing confusion matrix for gini after feature selection is \n', confusion_matrix(Y_test_fs, Y_predict_gini_initial_fs, labels=[1,2,3,4]))

### Initial Classification Results

In [None]:
# look at classification report for the initial modelling.
# Uses the before-feature-selection predictions (Y_predict_*_initial) against Y_test;
# both must already exist in the kernel.
print ("Classification report for Gini: \n", classification_report(Y_test, Y_predict_gini_initial))
print ("Classification report for Entropy: \n", classification_report(Y_test, Y_predict_entropy_initial))

## Graphing Parameters

We are going to plot each parameter on a graph, using the accuracy score as the performance metric.

In [None]:
# Dataset used for the parameter graphs: the one registered before feature selection.
input_data = input_all['normal_before_fs']

In [None]:
# Parameter sweeps with the Gini criterion.
graphing_decisiontree(input_data, criterion='gini')

In [None]:
# Parameter sweeps with the entropy criterion.
graphing_decisiontree(input_data, criterion='entropy')

## Graphing Parameters (Combining Parameters)

We are going to plot each parameter on a graph, using the accuracy score as the performance metric, optimizing one parameter after another.

In [None]:
# Write function for this instead

# Final Tuning of Parameters
Let's try combining them together first.

In [None]:
# Tuned entropy tree, fitted on the after-feature-selection training split.
# - max_depth is an integer count of levels (was the float 3.0).
# - `min_impurity_split` and `presort` are no longer passed: newer scikit-learn
#   releases reject both arguments.
# NOTE(review): max_features=30 requires the training data to have at least 30
# features -- confirm for the feature-selected dataset.
clf_entropy_final = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                                     max_features=30, max_leaf_nodes=None,
                                     min_samples_leaf=1,
                                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                                     random_state=100, splitter='best')
clf_entropy_final.fit(X_train_fs, Y_train_fs)

In [None]:
# predict( ) will do the model prediction, predict y based on the input x
# clf_entropy_final was fitted on the feature-selected training split, so it is
# evaluated on the matching feature-selected test split.
Y_predict_entropy_final = clf_entropy_final.predict(X_test_fs)
print ('testing acc for entropy is %f' %accuracy_score(Y_test_fs, Y_predict_entropy_final))

Let's try the other model.

In [None]:
# Tuned Gini tree (min_samples_leaf=50), fitted on the training split without
# feature selection.
# `min_impurity_split` and `presort` are no longer passed: newer scikit-learn
# releases reject both arguments.
clf_gini_final = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=50,
                                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                                     random_state=100, splitter='best')
clf_gini_final.fit(X_train, Y_train)

In [None]:
# predict( ) will do the model prediction, predict y based on the input x
# clf_gini_final was fitted on (X_train, Y_train) and predicts on X_test, so the
# matching ground truth is Y_test rather than the feature-selected Y_test_fs.
Y_predict_gini_final = clf_gini_final.predict(X_test)
print ('testing acc for gini is %f' %accuracy_score(Y_test, Y_predict_gini_final))

In [None]:
# look at classification report for the above tuning.
# NOTE(review): Y_predict_gini_final comes from a model fitted on X_train/Y_train
# but is compared with Y_test_fs here -- confirm the two label arrays are the same.
print ("Classification report for Gini: \n", classification_report(Y_test_fs, Y_predict_gini_final))

In [None]:
# Assess predict score based on Confusion Matrix for Dataset after Feature Selection
# Confusion matrix over the labels 1-4 for the tuned Gini tree.
# NOTE(review): clf_gini_final was fitted on X_train/Y_train, yet the labels and the
# message here refer to the feature-selected data -- confirm which is intended.

print ('Testing confusion matrix for gini after feature selection is \n', confusion_matrix(Y_test_fs, Y_predict_gini_final, labels=[1,2,3,4]))

In [None]:
# look at classification report for the above tuning.
# Compares the tuned entropy tree's predictions with the feature-selected test labels.
print ("Classification report for Entropy: \n", classification_report(Y_test_fs, Y_predict_entropy_final))

In [None]:
# Assess predict score based on Confusion Matrix for Dataset after Feature Selection
# Confusion matrix over the labels 1-4 for the tuned entropy tree.

print ('Testing confusion matrix for entropy after feature selection is \n', confusion_matrix(Y_test_fs, Y_predict_entropy_final, labels=[1,2,3,4]))

In [None]:
# Tree visualisation dependencies.
# StringIO comes from the standard library: the `sklearn.externals.six` shim is
# no longer shipped with scikit-learn, and on Python 3 it was an alias for
# io.StringIO anyway.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
# Render the tuned Gini tree: export it as DOT text into an in-memory buffer,
# parse the text with pydotplus and display the resulting PNG inline.
dot_data = StringIO()
export_graphviz(clf_gini_final, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
# Render the tuned entropy tree: export it as DOT text into an in-memory buffer,
# parse the text with pydotplus and display the resulting PNG inline.
dot_data = StringIO()
export_graphviz(clf_entropy_final, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())