# got tired of restarting the kernel so I'm doing these here

# In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
import ds_functions as ds
from sklearn.tree import export_graphviz
import pydot
import g20_functions as g20

# In [1]:
def get_values_RF(data_dict, depth):
    """Collect the train/test scores stored under one fixed depth.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``criterion -> depth -> series key -> scores``, where
        ``scores`` is indexable by ``"train"`` and ``"test"``.
        NOTE(review): per the original author's notes the series key is
        presumably ``max_features`` and the scores run over the number of
        estimators -- confirm against the code that builds ``data_dict``.
    depth : hashable
        Key of the depth entry to read for every criterion.

    Returns
    -------
    dict
        ``{criterion: {"train": {key: [score]}, "test": {key: [score]}}}``.
        Each score is appended to a list, so every series key maps to a
        one-element list holding the stored value unchanged.

    Raises
    ------
    KeyError
        If ``depth`` is missing for a criterion, or a scores entry lacks the
        ``"train"`` or ``"test"`` key.
    """
    output = {}
    for criteria, by_depth in data_dict.items():
        train_series = {}
        test_series = {}
        # Only the requested depth is read; each criterion becomes its own
        # group of series (one graph per criterion in the plotting code).
        for key, scores in by_depth[depth].items():
            train_series.setdefault(key, []).append(scores["train"])
            test_series.setdefault(key, []).append(scores["test"])
        output[criteria] = {"train": train_series, "test": test_series}
    return output

# In [2]:
def plot_RF_ho(data_dict):
    """Plot test and train accuracy against depth, one chart per criterion.

    Two rows of charts are shown one after the other: first the test
    accuracy (performance), then the train accuracy (to spot overfitting).
    Each row has one chart per split criterion.

    Parameters
    ----------
    data_dict : dict
        Results structure handed unchanged to ``get_values_DT``.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    # NOTE(review): get_values_DT is not defined in this section of the file;
    # presumably it comes from another cell/module -- confirm it is in scope.
    # Its result is indexed as output[criterion]["test" | "train"].
    output = get_values_DT(data_dict)

    max_depths = [2, 5, 10, 15, 20, 25]
    criteria = ['entropy', 'gini']

    for split in ("test", "train"):
        # plt.subplots creates its own figure, so no separate plt.figure()
        # call is made: that would only leave an empty extra figure behind.
        _, axs = plt.subplots(1, len(criteria), figsize=(10, 2), squeeze=False)
        for i, c in enumerate(criteria):
            ds.multiple_line_chart(max_depths, output[c][split], ax=axs[0, i],
                                   title='Decision Trees for Toxic with {} criteria'.format(c),
                                   xlabel='depth', ylabel='{} accuracy'.format(split),
                                   percentage=True)
        plt.show()

# In [4]:
def get_avg_std_RF(folds):
    """Aggregate per-fold train/test scores into a mean and a spread value.

    Parameters
    ----------
    folds : sequence of dict
        One nested mapping per fold, shaped
        ``criterion -> depth -> series key -> scores`` where ``scores`` is
        indexable by ``"train"`` and ``"test"``. The first fold is used as the
        template for which criteria, depths and series keys exist.

    Returns
    -------
    (avg, interval) : tuple of dict
        Both are shaped ``criterion -> "train" | "test" -> series key -> list``
        with one entry per depth, in the depth order of the first fold.
        ``avg`` holds the mean over folds; ``interval`` holds
        ``std * 0.95 / n_splits``.
    """
    n_splits = len(folds)
    splits = ("train", "test")

    avg = {}
    interval = {}
    per_fold = {}

    # Build the empty containers from the first fold.
    for criteria, by_depth in folds[0].items():
        avg[criteria] = {split: {} for split in splits}
        interval[criteria] = {split: {} for split in splits}
        per_fold[criteria] = {split: {} for split in splits}

        # The series keys are taken from the first depth entry only.
        first_depth_entry = next(iter(by_depth.values()), {})
        for key in first_depth_entry:
            for split in splits:
                per_fold[criteria][split][key] = {}
                interval[criteria][split][key] = []
                avg[criteria][split][key] = []

        # One slot per fold for every (series key, depth) pair.
        for depth, by_key in by_depth.items():
            for key in by_key:
                for split in splits:
                    per_fold[criteria][split][key][depth] = np.empty(n_splits, dtype=dict)

    # Drop each fold's scores into its slot.
    for fold_idx, fold in enumerate(folds):
        for criteria, by_depth in fold.items():
            for depth, by_key in by_depth.items():
                for key, scores in by_key.items():
                    for split in splits:
                        per_fold[criteria][split][key][depth][fold_idx] = scores[split]

    # Reduce over the fold axis, depth by depth.
    for criteria, by_split in per_fold.items():
        for split, by_key in by_split.items():
            for key, by_depth in by_key.items():
                for samples in by_depth.values():
                    values = list(samples)
                    avg[criteria][split][key].append(np.mean(values, axis=0))
                    spread = np.std(values, axis=0)
                    # NOTE(review): this is std * 0.95 / n_splits, not a
                    # standard-error based confidence interval -- confirm
                    # that this is the intended formula.
                    interval[criteria][split][key].append(spread * 0.95 / n_splits)

    return avg, interval

# In [None]:
def plot_RF_cv(data_dict):
    """Plot cross-validated mean test and train accuracy against depth.

    The per-fold results are aggregated with ``get_avg_std_RF``. Two rows of
    charts are then shown one after the other: mean test accuracy first, then
    mean train accuracy, with one chart per split criterion. Before each
    chart the criterion name and its interval values are printed.

    Parameters
    ----------
    data_dict : sequence of dict
        Per-fold results, passed unchanged to ``get_avg_std_RF``.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    max_depths = [2, 5, 10, 15, 20, 25]
    # Uses the aggregation helper defined in this file. The previous call
    # targeted get_avg_std_DT, which is not defined in this section.
    avg, interval = get_avg_std_RF(data_dict)

    criteria = ['entropy', 'gini']

    for split in ("test", "train"):
        # plt.subplots creates its own figure, so no separate plt.figure()
        # call is made: that would only leave an empty extra figure behind.
        _, axs = plt.subplots(1, len(criteria), figsize=(10, 2), squeeze=False)
        for i, c in enumerate(criteria):
            # The intervals are only printed, not drawn on the chart.
            print(c)
            print(interval[c][split])
            ds.multiple_line_chart(max_depths, avg[c][split], ax=axs[0, i],
                                   title='Decision Trees for Heart with {} criteria (mean)'.format(c),
                                   xlabel='depth', ylabel='{} accuracy'.format(split),
                                   percentage=True)
        plt.show()