# Defining these helpers here so the kernel does not have to be restarted.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
import ds_functions as ds
from sklearn.tree import export_graphviz
import pydot
import g20_functions as g20

In [1]:
def get_values_GB(data_dict, depth, features):
    """Collect train/test scores per loss and learning rate.

    ``data_dict`` is indexed as ``data_dict[loss][depth][lr][est][features]``
    and every leaf is a mapping with ``"train"`` and ``"test"`` entries.
    ``depth`` and ``features`` are held fixed; the remaining levels are
    flattened for plotting (one chart per loss, one series per learning
    rate, estimators along x).

    Returns:
        ``{loss: {"train": {lr: [...]}, "test": {lr: [...]}}}`` where each
        list holds one score per estimator key, in the input's iteration
        order. A learning rate with no estimator entries gets no key.
    """
    output = {}
    for loss, by_depth in data_dict.items():
        train_series = {}
        test_series = {}
        output[loss] = {"train": train_series, "test": test_series}
        for lr, by_est in by_depth[depth].items():
            # The estimator key itself is not needed, only its position.
            for by_features in by_est.values():
                scores = by_features[features]
                train_series.setdefault(lr, []).append(scores["train"])
                test_series.setdefault(lr, []).append(scores["test"])
    return output

In [2]:
def plot_GB_ho(data_dict, depth, features, dataset="Toxic", n_estimators=None):
    """Plot hold-out gradient-boosting accuracy against the estimator count.

    Two figures are shown in order: test accuracy (performance) and then
    train accuracy (to judge overfitting). Each figure has one chart per
    loss; the series come from ``get_values_GB`` (one per learning rate).

    Args:
        data_dict: Nested results, indexed as
            ``data_dict[loss][depth][lr][est][features]``.
        depth: Key of the max-depth setting to plot.
        features: Key of the max-features setting to plot.
        dataset: Name inserted into the chart titles.
        n_estimators: X-axis values handed to ``ds.multiple_line_chart``.
            Defaults to ``[10, 50, 100, 200, 300]``; it should have one
            value per estimator entry in ``data_dict``.
    """
    if n_estimators is None:
        n_estimators = [10, 50, 100, 200, 300]
    losses = ['deviance', 'exponential']  # exponential == AdaBoost

    output = get_values_GB(data_dict, depth, features)

    for split in ("test", "train"):
        # plt.subplots() creates its own figure, so no plt.figure() call is
        # made first: that would only leave an extra, empty figure behind.
        _, axs = plt.subplots(1, len(losses), figsize=(10, 2), squeeze=False)
        for ax, loss in zip(axs[0], losses):
            ds.multiple_line_chart(n_estimators, output[loss][split], ax=ax,
                                   title='Gradient Boosting for {} with {} loss'.format(dataset, loss),
                                   xlabel='estimators', ylabel='{} accuracy'.format(split),
                                   percentage=True)
        plt.show()

In [4]:
def get_avg_std_GB(folds, depth, features):
    """Aggregate per-fold train/test scores into means and interval widths.

    Each element of ``folds`` is indexed as
    ``fold[loss][depth][lr][est][features]`` with ``"train"`` and ``"test"``
    leaves. ``depth`` and ``features`` are held fixed. The set of losses,
    learning rates and estimator keys is taken from the first fold.

    Returns:
        ``(avg, interval)``, both shaped
        ``{loss: {"train": {lr: [...]}, "test": {lr: [...]}}}`` with one list
        entry per estimator key. ``avg`` holds ``np.mean`` across folds and
        ``interval`` holds ``np.std * 0.95 / len(folds)``.

    NOTE(review): ``std * 0.95 / n_splits`` is not the usual 95% confidence
    half-width (about ``1.96 * std / sqrt(n_splits)``); confirm whether this
    formula is intended before relying on the interval values.
    """
    splits = ("train", "test")
    n_splits = len(folds)

    avg = {}
    interval = {}
    samples = {}

    # Build the empty result skeleton from the first fold's keys, with one
    # slot per fold for every (loss, split, lr, est) combination.
    for loss, by_depth in folds[0].items():
        learn_rates = by_depth[depth]
        avg[loss] = {split: {lr: [] for lr in learn_rates} for split in splits}
        interval[loss] = {split: {lr: [] for lr in learn_rates} for split in splits}
        samples[loss] = {
            split: {
                lr: {est: [None] * n_splits for est in by_est}
                for lr, by_est in learn_rates.items()
            }
            for split in splits
        }

    # Drop every fold's scores into its slot.
    for fold_idx, fold in enumerate(folds):
        for loss, by_depth in fold.items():
            for lr, by_est in by_depth[depth].items():
                for est, by_features in by_est.items():
                    scores = by_features[features]
                    samples[loss]["train"][lr][est][fold_idx] = scores["train"]
                    samples[loss]["test"][lr][est][fold_idx] = scores["test"]

    # Reduce across folds, keeping estimator order within each lr series.
    for loss, by_split in samples.items():
        for split, by_lr in by_split.items():
            for lr, by_est in by_lr.items():
                for fold_scores in by_est.values():
                    avg[loss][split][lr].append(np.mean(fold_scores, axis=0))
                    spread = np.std(fold_scores, axis=0)
                    interval[loss][split][lr].append(spread*0.95/n_splits)

    return avg, interval

In [None]:
def plot_GB_cv(data_dict, depth, features, dataset="Heart", n_estimators=None):
    """Plot cross-validated mean gradient-boosting accuracy per estimator count.

    Two figures are shown in order: mean test accuracy (performance) and
    then mean train accuracy (to judge overfitting). Each figure has one
    chart per loss. Before each chart the loss name and the matching
    interval values from ``get_avg_std_GB`` are printed; the intervals are
    not drawn on the charts.

    Args:
        data_dict: Sequence of per-fold result dicts; passed unchanged as
            ``folds`` to ``get_avg_std_GB``.
        depth: Key of the max-depth setting to plot.
        features: Key of the max-features setting to plot.
        dataset: Name inserted into the chart titles.
        n_estimators: X-axis values handed to ``ds.multiple_line_chart``.
            Defaults to ``[10, 50, 100, 200, 300]``; it should have one
            value per estimator entry in the fold results.
    """
    if n_estimators is None:
        n_estimators = [10, 50, 100, 200, 300]
    losses = ['deviance', 'exponential']  # exponential == AdaBoost

    avg, interval = get_avg_std_GB(data_dict, depth, features)

    for split in ("test", "train"):
        # plt.subplots() creates its own figure, so no plt.figure() call is
        # made first: that would only leave an extra, empty figure behind.
        _, axs = plt.subplots(1, len(losses), figsize=(10, 2), squeeze=False)
        for ax, loss in zip(axs[0], losses):
            print(loss)
            print(interval[loss][split])
            ds.multiple_line_chart(n_estimators, avg[loss][split], ax=ax,
                                   title='Gradient Boosting for {} with {} loss (mean)'.format(dataset, loss),
                                   xlabel='estimators', ylabel='{} accuracy'.format(split),
                                   percentage=True)
        plt.show()