# got tired of restarting the kernel so I'm doing these here

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
import ds_functions as ds
from sklearn.tree import export_graphviz
import pydot
import g20_functions as g20

In [1]:
def get_values_RF(data_dict,depth):
    """Reshape nested hold-out results into per-criterion plot series.

    ``data_dict`` is indexed as
    ``data_dict[criterion][depth][feature][estimator] -> {"train": v, "test": v}``.
    Only the sub-tree at the given ``depth`` is used (the depth is held
    fixed), and each criterion is meant to be drawn on its own chart.

    Returns a dict shaped as
    ``result[criterion]["train" | "test"][feature] -> list of values``,
    where each list holds one value per estimator entry, in the iteration
    order of the innermost input dict.
    """
    result = {}
    for criterion, by_depth in data_dict.items():
        by_feature = by_depth[depth]
        # One series per feature key; "train" is inserted before "test".
        result[criterion] = {
            split: {
                feature: [scores[split] for scores in by_estimator.values()]
                for feature, by_estimator in by_feature.items()
            }
            for split in ("train", "test")
        }
    return result

In [2]:
def plot_RF_ho(data_dict,depth):
    """Plot hold-out random-forest results for one fixed depth.

    Draws two figures, each with one subplot per split criterion
    ('entropy' and 'gini'): first the test values ("performance"), then the
    train values ("overfitting"). Within a subplot, ``ds.multiple_line_chart``
    receives the estimator counts as x values and one series per feature key.

    Args:
        data_dict: nested results as consumed by ``get_values_RF``
            (``criterion -> depth -> feature -> estimator -> {"train", "test"}``).
        depth: the depth key to select from ``data_dict``.
    """
    output = get_values_RF(data_dict,depth)

    # x axis; assumed to match the estimator keys stored in data_dict — TODO confirm
    n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]
    criteria = ['entropy', 'gini']

    # First the test scores (performance), then the train scores (overfitting).
    for split, ylabel in (("test", 'test accuracy'), ("train", 'train accuracy')):
        # plt.subplots already creates its own figure; a preceding
        # plt.figure() would only leave a stray empty figure behind.
        _, axs = plt.subplots(1, len(criteria), figsize=(10, 2), squeeze=False)
        for i, c in enumerate(criteria):
            ds.multiple_line_chart(n_estimators, output[c][split], ax=axs[0, i],
                                   title='Random Forests for Toxic with {} criteria'.format(c),
                                   xlabel='estimators', ylabel=ylabel,
                                   percentage=True)
        plt.show()

In [4]:
def get_avg_std_RF(folds,depth):
    """Aggregate per-fold results into mean and spread series for one depth.

    ``folds`` is a sequence of nested dicts, one per cross-validation fold,
    each indexed as
    ``fold[criterion][depth][feature][estimator] -> {"train": v, "test": v}``.
    The key structure is taken from ``folds[0]``; every fold is expected to
    carry the same keys.

    Returns:
        ``(avg, interval)``, both shaped as
        ``d[criterion]["train" | "test"][feature] -> list`` with one entry per
        estimator key. ``avg`` holds the mean across folds and ``interval``
        holds ``std * 0.95 / n_splits``.
    """
    n_splits = len(folds)
    splits = ("train", "test")

    # Phase 1: build the empty containers from the first fold's key structure.
    samples = {}
    avg = {}
    interval = {}
    for criterion, by_depth in folds[0].items():
        avg[criterion] = {split: {} for split in splits}
        interval[criterion] = {split: {} for split in splits}
        samples[criterion] = {split: {} for split in splits}
        for feature, by_estimator in by_depth[depth].items():
            for split in splits:
                interval[criterion][split][feature] = []
                avg[criterion][split][feature] = []
                # One object slot per fold for every estimator key.
                samples[criterion][split][feature] = {
                    est: np.empty(n_splits, dtype=object) for est in by_estimator
                }

    # Phase 2: drop every fold's value into its slot.
    for fold_idx, fold in enumerate(folds):
        for criterion, by_depth in fold.items():
            for feature, by_estimator in by_depth[depth].items():
                for est, scores in by_estimator.items():
                    for split in splits:
                        samples[criterion][split][feature][est][fold_idx] = scores[split]

    # Phase 3: reduce the per-fold values to a mean and a spread figure.
    for criterion, by_split in samples.items():
        for split, by_feature in by_split.items():
            for feature, by_estimator in by_feature.items():
                for slots in by_estimator.values():
                    per_fold = [slots[j] for j in range(n_splits)]
                    avg[criterion][split][feature].append(np.mean(per_fold, axis=0))
                    spread = np.std(per_fold, axis=0)
                    # NOTE(review): std*0.95/n is not the usual confidence-interval
                    # formula; kept exactly as in the original — confirm intent.
                    interval[criterion][split][feature].append(spread*0.95/n_splits)

    return avg, interval

In [None]:
def plot_RF_cv(data_dict,depth):
    """Plot cross-validated random-forest results for one fixed depth.

    Draws two figures, each with one subplot per split criterion
    ('entropy' and 'gini'): first the mean test values ("performance"), then
    the mean train values ("overfitting"). Before each subplot is drawn, the
    criterion name and its interval values are printed.

    Args:
        data_dict: the per-fold results; passed unchanged to
            ``get_avg_std_RF`` as its ``folds`` argument, so despite the name
            it is a sequence of per-fold nested dicts.
        depth: the depth key to select from every fold.
    """
    avg, interval = get_avg_std_RF(data_dict,depth)

    # x axis; assumed to match the estimator keys stored in each fold — TODO confirm
    n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]
    criteria = ['entropy', 'gini']

    # First the test scores (performance), then the train scores (overfitting).
    for split, ylabel in (("test", 'test accuracy'), ("train", 'train accuracy')):
        # plt.subplots already creates its own figure; a preceding
        # plt.figure() would only leave a stray empty figure behind.
        _, axs = plt.subplots(1, len(criteria), figsize=(10, 2), squeeze=False)
        for i, c in enumerate(criteria):
            print(c)
            print(interval[c][split])
            ds.multiple_line_chart(n_estimators, avg[c][split], ax=axs[0, i],
                                   title='Random Forests for Heart with {} criteria (mean)'.format(c),
                                   xlabel='estimators', ylabel=ylabel,
                                   percentage=True)
        plt.show()