In [1]:
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn import metrics

import pickle
import math
import re
import enchant
import pandas as pd
import os
import glob
import numpy as np
np.random.seed(512)

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the labelled attribute table and encode its target column as integers.
# Class name -> integer code used as the classification target.
dict_label = dict(zip(
    ['Datetime', 'Sentence', 'Custom Object', 'URL', 'Numbers', 'List'],
    range(6),
))
data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')

# Plain dict lookup: a label that is missing from dict_label raises KeyError.
data['y_act'] = list(map(dict_label.__getitem__, data['y_act']))
# Keep the target as a one-column DataFrame.
y = data.loc[:, ['y_act']]

In [3]:
# Descriptive-statistics features: missing values become 0 and the numeric
# columns are renamed to the `scaled_*` names they carry after standardisation.
data1 = (
    data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
    .fillna(0)
    .rename(columns={
        '%_nans': 'scaled_perc_nans',
        'mean_word_count': 'scaled_mean_token_count',
        'std_dev_word_count': 'scaled_std_dev_token_count',
    })
)
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']

# Cap the three numeric features to [-10000, 10000] before standardising.
data1[column_names_to_normalize] = data1[column_names_to_normalize].clip(lower=-10000, upper=10000)

# Standardise to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed later in the notebook.
x = np.nan_to_num(data1[column_names_to_normalize].values)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, index=data1.index, columns=column_names_to_normalize)
data1[column_names_to_normalize] = df_temp

# The target codes are stored as floats from here on.
y.y_act = y.y_act.astype(float)

print(f"> Data mean: \n{data1.mean()}")
print(f"> Data median: \n{data1.median()}")
print(f"> Data stdev: \n{data1.std()}")

# A disabled prototype used to live here: it flagged attribute names found in a
# pyenchant en_US dictionary via a `dictionary_item` column and dumped
# before/after CSV snapshots. It was commented out and is not part of the
# feature set.

> Data mean: 
scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64
> Data median: 
scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64
> Data stdev: 
scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64


In [4]:
print("===[VECTORIZATION]===")
# Fill missing cells before extracting any text column and coerce every
# document to str: CountVectorizer raises on NaN / non-string documents, and
# the attribute names used to be read before the fill.
data = data.fillna(0)
arr = data['Attribute_name'].astype(str).values
arr1 = [str(x) for x in data['sample_1'].values]
arr2 = [str(x) for x in data['sample_2'].values]

# One character-trigram vectorizer per text field, so each fitted vocabulary
# stays available (refitting a single instance discarded the first two).
trigram_options = dict(ngram_range=(3, 3), analyzer='char')
name_vectorizer = CountVectorizer(**trigram_options)
sample1_vectorizer = CountVectorizer(**trigram_options)
sample2_vectorizer = CountVectorizer(**trigram_options)
X = name_vectorizer.fit_transform(arr)
X1 = sample1_vectorizer.fit_transform(arr1)
X2 = sample2_vectorizer.fit_transform(arr2)
# Backward compatibility: `vectorizer` previously ended up fitted on sample_2.
vectorizer = sample2_vectorizer

# `vocabulary_` has one entry per feature, so its length equals the number of
# feature names without depending on the get_feature_names() accessor.
print(f"> Length of vectorized feature_names: {len(vectorizer.vocabulary_)}")
print(f"> Trigram features per field: name={X.shape[1]}, sample_1={X1.shape[1]}, sample_2={X2.shape[1]}")

data1.to_csv('data/preprocessing/before.csv')
attr_df = pd.DataFrame(X.toarray())
sample1_df = pd.DataFrame(X1.toarray())
sample2_df = pd.DataFrame(X2.toarray())

# NOTE(review): the three trigram frames all use integer column labels starting
# at 0, so data2 (and after.csv) contains repeated column labels next to the
# string-named statistics columns.
data2 = pd.concat([data1, attr_df, sample1_df, sample2_df], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)
# Same frame, same seed and same test_size as the split above.
atr_train,atr_test = train_test_split(data2, test_size=0.2,random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

# The cross-validation cells index rows positionally, so hand them arrays.
X_train_new = X_train_new.values
y_train_new = y_train_new.values

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  0  1  2  3  4  5  ...   8518  8519  8520  8521  8522  \
453            True  0  0  0  0  0  0  ...      0     0     0     0     0   
43             True  0  0  0  0  0  0  ...      0     0     0     0     0   
133            True  0  0  0  0  0  0  ...      0     0     0     0     0   
205           False  0  0  0  0  0  0  ...      0     0     0     0     0   
282           False  0  0  0  0  0  0  ...      0     0     0   

In [5]:
# Load the running results table, or start an empty one on the first run.
try:
    acc_df = pd.read_csv('data/model_data.csv')
except FileNotFoundError:
    acc_df = pd.DataFrame(columns=['Model', 'Params', 'Feats', 'Train', 'Validation', 'Test', 'Precision'])

# Row label for the next result. read_csv yields labels 0..len-1, so
# len(acc_df) is the first free label; the previous `len(acc_df)-1` pointed at
# the last stored row and overwrote it (and gave -1 for an empty file).
index = len(acc_df)

In [6]:
# Random-forest grid search inside a k-fold loop over the training split.
# For each fold: hold out 25% of the fold's training rows for validation, fit
# every (n_estimators, max_depth) pair, keep the pair with the highest
# validation accuracy, then score that model on the fold's training rows, the
# fold's test rows and the global hold-out split.
k = 5
kf = KFold(n_splits=k)
avg_train_acc,avg_test_acc = 0,0

n_estimators_grid = [5,25,50,75,100]
max_depth_grid = [5,10,25,50,100]

# Per-fold scores and their running sums.
avgsc_lst,avgsc_train_lst,avgsc_hld_lst = [],[],[]
avgsc,avgsc_train,avgsc_hld = 0,0,0

# Number of folds won by each hyper-parameter value; keys are str(value).
best_param_count = {'n_estimator': {}, 'max_depth': {}}

# The models are fit on plain arrays, so score the hold-out split as an array
# too. The frame also carries mixed str/int column labels, which newer
# scikit-learn releases reject.
X_test_values = X_test.values

for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    X_train_train, X_val,y_train_train,y_val = train_test_split(X_train_cur,y_train_cur, test_size=0.25,random_state=100)

    # Start below any reachable accuracy so the first candidate always becomes
    # the incumbent. This guarantees bestne/bestmd/bestPerformingModel are set
    # for this fold (never stale from a previous fold, never an unfitted
    # placeholder) while still selecting the first-seen maximum.
    bestPerformingModel, bestne, bestmd = None, None, None
    bestscore = -1.0
    print('='*10)
    for ne in n_estimators_grid:
        for md in max_depth_grid:
            clf = RandomForestClassifier(n_estimators=ne,max_depth=md)
            clf.fit(X_train_train, y_train_train.ravel())
            sc = clf.score(X_val, y_val)
            print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {sc}]")
            if sc > bestscore:
                bestne, bestmd = ne, md
                bestscore = sc
                bestPerformingModel = clf

    # Tally this fold's winning values.
    for param_name, winner in (('n_estimator', bestne), ('max_depth', bestmd)):
        tally = best_param_count[param_name]
        tally[str(winner)] = tally.get(str(winner), 0) + 1

    # NOTE(review): X_train_cur still contains the validation quarter, so the
    # "training" score covers rows the model was selected on but not fit on.
    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test_values, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld

    print()
    print(f"> Best n_estimator: {bestne} || Best max_depth: {bestmd}")
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)

[n_estimator: 5, max_depth: 5, accuracy: 0.6896551724137931]
[n_estimator: 5, max_depth: 10, accuracy: 0.6896551724137931]
[n_estimator: 5, max_depth: 25, accuracy: 0.7241379310344828]
[n_estimator: 5, max_depth: 50, accuracy: 0.7586206896551724]
[n_estimator: 5, max_depth: 100, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 5, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 10, accuracy: 0.735632183908046]
[n_estimator: 25, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 25, max_depth: 50, accuracy: 0.7586206896551724]
[n_estimator: 25, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 50, max_depth: 5, accuracy: 0.6091954022988506]
[n_estimator: 50, max_depth: 10, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 25, accuracy: 0.7816091954022989]
[n_estimator: 50, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 50, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 75, max_depth: 5, accuracy: 0.632183908045977]
[

[n_estimator: 100, max_depth: 5, accuracy: 0.6666666666666666]
[n_estimator: 100, max_depth: 10, accuracy: 0.735632183908046]
[n_estimator: 100, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 100, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 100, max_depth: 100, accuracy: 0.7701149425287356]

> Best n_estimator: 5 || Best max_depth: 50
> Best training score: 0.9248554913294798
> Best test score: 0.8023255813953488
> Best held score: 0.7706422018348624


In [7]:
# Per-class precision on the hold-out split, using the model kept from the
# last cross-validation fold.
y_pred = bestPerformingModel.predict(X_test)

# Pin the label order explicitly. Without `labels`, precision_score only
# reports the classes that occur in y_test/y_pred, so indexing the result by
# position could attach a score to the wrong class or raise IndexError when a
# class is absent.
class_names = list(dict_label)
class_codes = [dict_label[name] for name in class_names]
prec = metrics.precision_score(y_test, y_pred, labels=class_codes, average=None)
cat_prec = {name: prec[pos] for pos, name in enumerate(class_names)}

  'precision', 'predicted', average, warn_for)


In [8]:
# Majority vote over the folds for each hyper-parameter (keys are strings; on a
# tie max() keeps the value that was recorded first).
# NOTE(review): these voted parameters are what gets logged, while the scores,
# precision and confusion matrix come from bestPerformingModel, i.e. the
# winner of the last fold only.
bestne = max(best_param_count['n_estimator'], key=best_param_count['n_estimator'].get)
bestmd = max(best_param_count['max_depth'], key=best_param_count['max_depth'].get)
bestparams = {'n_estimator': bestne, 'max_depth': bestmd}
print(f"> Best n_estimator : {bestne} || Best max_depth : {bestmd}")
print(f"> Average training score list: {avgsc_train_lst}")
print(f"> Average testing score list: {avgsc_lst}")
print(f"> Average held score list: {avgsc_hld_lst}")
print()

# Derive the means from the per-fold lists rather than dividing the running
# sums in place: re-running this cell no longer shrinks the averages.
avgsc_train = sum(avgsc_train_lst) / len(avgsc_train_lst)
avgsc = sum(avgsc_lst) / len(avgsc_lst)
avgsc_hld = sum(avgsc_hld_lst) / len(avgsc_hld_lst)
# These are scalars, so they are no longer labelled as lists.
print(f"> Average training score: {avgsc_train}")
print(f"> Average testing score: {avgsc}")
print(f"> Average held score: {avgsc_hld}")

# Log the run in the results table under the next row label.
acc_df.loc[index] = ['random_forest', str(bestparams),"X_stats, X_name, X_sample1, X_sample2", avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
index += 1
print()

y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Best n_estimator : 50 || Best max_depth : 50
> Average training score list: [0.9507246376811594, 0.9333333333333333, 0.9335260115606936, 0.9479768786127167, 0.9248554913294798]
> Average testing score list: [0.8045977011494253, 0.8275862068965517, 0.813953488372093, 0.7906976744186046, 0.8023255813953488]
> Average held score list: [0.8165137614678899, 0.8256880733944955, 0.8073394495412844, 0.8165137614678899, 0.7706422018348624]

> Average training score list: 0.9380832705034766
> Average testing score list: 0.8078321304464048
> Average held score list: 0.8073394495412843

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[24  0  2  0  1  0]
 [ 2 13  7  0  0  0]
 [ 4  2 45  0  1  0]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  0  5  0  0  0]]


In [9]:
# save the model to disk
filename = 'data/pretrained/rf_finalized_model.sav'
pickle.dump(bestPerformingModel, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
y_prob = bestPerformingModel.predict_proba(X_test)

df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/rf_predictions.csv', index=False)

            0    1         2    3         4    5
0    0.203636  0.0  0.789091  0.0  0.007273  0.0
1    0.003636  0.4  0.589091  0.0  0.007273  0.0
2    1.000000  0.0  0.000000  0.0  0.000000  0.0
3    0.800000  0.0  0.200000  0.0  0.000000  0.0
4    0.003636  0.4  0.589091  0.0  0.007273  0.0
5    0.212766  0.0  0.782979  0.0  0.004255  0.0
6    0.000000  0.0  1.000000  0.0  0.000000  0.0
7    1.000000  0.0  0.000000  0.0  0.000000  0.0
8    1.000000  0.0  0.000000  0.0  0.000000  0.0
9    0.000000  0.4  0.600000  0.0  0.000000  0.0
10   0.216402  0.0  0.772070  0.0  0.011528  0.0
11   0.000000  0.8  0.000000  0.0  0.000000  0.2
12   1.000000  0.0  0.000000  0.0  0.000000  0.0
13   1.000000  0.0  0.000000  0.0  0.000000  0.0
14   0.016402  0.0  0.972070  0.0  0.011528  0.0
15   0.400000  0.0  0.600000  0.0  0.000000  0.0
16   0.000000  1.0  0.000000  0.0  0.000000  0.0
17   0.000000  0.2  0.600000  0.0  0.000000  0.2
18   0.000000  0.2  0.800000  0.0  0.000000  0.0
19   0.012766  0.2  

In [10]:
def test_feat_combos(index):
    """Grid-search a random forest on several feature subsets and log each result.

    For every feature combination the rows are split 80/20 (fixed seed), the
    training part goes through a 5-fold loop, and inside each fold every
    (n_estimators, max_depth) pair is fit on 75% of the fold's training rows
    and ranked by accuracy on the remaining 25%. The fold winner is scored on
    the fold's training rows, the fold's test rows and the 20% hold-out split.

    Reads the notebook-level objects ``data1``, ``attr_df``, ``sample1_df``,
    ``y`` and ``dict_label``, and writes one row per combination into the
    notebook-level ``acc_df``.

    Parameters
    ----------
    index : int
        Row label in ``acc_df`` for the first result; incremented after each
        combination.

    Returns
    -------
    int
        The first row label not used by this call. Earlier versions returned
        nothing, so callers that ignore the return value are unaffected.
    """
    combos = {
        "X_stats": data1,
        "X_name": attr_df,
        "X_stats, X_name": pd.concat([data1, attr_df], axis=1, sort=False),
        "X_sample1":  pd.concat([sample1_df], axis=1, sort=False),
        "X_name, X_sample1":  pd.concat([attr_df, sample1_df], axis=1, sort=False),
        "X_stats, X_sample1":  pd.concat([data1, sample1_df], axis=1, sort=False),
        "X_stats, X_name, X_sample1":  pd.concat([data1, attr_df, sample1_df], axis=1, sort=False)
    }

    k = 5
    n_estimators_grid = [5,25,50,75,100]
    max_depth_grid = [5,10,25,50,100]

    # Fixed class order for the per-class precision report.
    class_names = list(dict_label)
    class_codes = [dict_label[name] for name in class_names]

    for combo, features in combos.items():
        print("="*50, combo, "="*50)
        X_train, X_test, y_train, y_test = train_test_split(
            features, y, test_size=0.2, random_state=100)

        # Positional arrays for fold indexing; the hold-out split is scored as
        # an array as well, matching how the models are fit.
        X_train_new = X_train.reset_index(drop=True).values
        y_train_new = y_train.reset_index(drop=True).values
        X_test_values = X_test.values
        kf = KFold(n_splits=k)

        avgsc_lst,avgsc_train_lst,avgsc_hld_lst = [],[],[]

        # Number of folds won by each hyper-parameter value; keys are str(value).
        best_param_count = {'n_estimator': {}, 'max_depth': {}}
        for train_index, test_index in kf.split(X_train_new):
            X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
            y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
            X_train_train, X_val,y_train_train,y_val = train_test_split(X_train_cur,y_train_cur, test_size=0.25,random_state=100)

            # Start below any reachable accuracy so the first candidate always
            # becomes the incumbent: the fold's winner is never stale from a
            # previous fold and never an unfitted placeholder.
            bestPerformingModel, bestne, bestmd = None, None, None
            bestscore = -1.0
            for ne in n_estimators_grid:
                for md in max_depth_grid:
                    clf = RandomForestClassifier(n_estimators=ne,max_depth=md)
                    clf.fit(X_train_train, y_train_train.ravel())
                    sc = clf.score(X_val, y_val)
                    print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {sc}]")
                    if sc > bestscore:
                        bestne, bestmd = ne, md
                        bestscore = sc
                        bestPerformingModel = clf

            # Tally this fold's winning values.
            for param_name, winner in (('n_estimator', bestne), ('max_depth', bestmd)):
                tally = best_param_count[param_name]
                tally[str(winner)] = tally.get(str(winner), 0) + 1

            bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
            bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
            bscr_hld = bestPerformingModel.score(X_test_values, y_test)

            avgsc_train_lst.append(bscr_train)
            avgsc_lst.append(bscr)
            avgsc_hld_lst.append(bscr_hld)

            print('\t','-'*10)
            print(f"\t> Best n_estimator: {bestne} || Best max_depth: {bestmd}")
            print(f"\t> Best training score: {bscr_train}")
            print(f"\t> Best test score: {bscr}")
            print(f"\t> Best held score: {bscr_hld}")
        print('\t','-'*10)

        # Hold-out predictions of the last fold's winner, reused for both the
        # precision report and the confusion matrix.
        y_pred = bestPerformingModel.predict(X_test_values)
        # Pin the label order: without `labels`, only classes present in
        # y_test/y_pred are reported and positional lookup could mislabel a
        # score or raise IndexError.
        prec = metrics.precision_score(y_test, y_pred, labels=class_codes, average=None)
        cat_prec = dict(zip(class_names, prec))

        # Majority vote over the folds (ties keep the first value recorded).
        bestne = max(best_param_count['n_estimator'], key=best_param_count['n_estimator'].get)
        bestmd = max(best_param_count['max_depth'], key=best_param_count['max_depth'].get)
        bestparams = {'n_estimator': bestne, 'max_depth': bestmd}
        print(f"\t> Best n_estimator : {bestne} || Best max_depth : {bestmd}")
        print(f"\t> Average training score list: {avgsc_train_lst}")
        print(f"\t> Average testing score list: {avgsc_lst}")
        print(f"\t> Average held score list: {avgsc_hld_lst}")
        print()
        avgsc_train = sum(avgsc_train_lst) / k
        avgsc = sum(avgsc_lst) / k
        avgsc_hld = sum(avgsc_hld_lst) / k
        # These are scalars, so they are no longer labelled as lists.
        print(f"\t> Average training score: {avgsc_train}")
        print(f"\t> Average testing score: {avgsc}")
        print(f"\t> Average held score: {avgsc_hld}")
        acc_df.loc[index] = ['random_forest', str(bestparams), combo, avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
        index += 1
        print()

        cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
        print('\tConfusion Matrix: Actual (Row) vs Predicted (Column)')
        print('\t',cnf_matrix)

    return index

In [11]:
# Run the grid search for every feature combination; each result is written
# into acc_df starting at row label `index`.
test_feat_combos(index)
# Persist the accumulated results table (row labels are not written).
acc_df.to_csv('data/model_data.csv', index=False)

[n_estimator: 5, max_depth: 5, accuracy: 0.7701149425287356]
[n_estimator: 5, max_depth: 10, accuracy: 0.7816091954022989]
[n_estimator: 5, max_depth: 25, accuracy: 0.7931034482758621]
[n_estimator: 5, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 5, max_depth: 100, accuracy: 0.7701149425287356]
[n_estimator: 25, max_depth: 5, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 10, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 100, accuracy: 0.7816091954022989]
[n_estimator: 50, max_depth: 5, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 10, accuracy: 0.7931034482758621]
[n_estimator: 50, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 50, max_depth: 50, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 75, max_depth: 5, accuracy: 0.7816091954022989

[n_estimator: 75, max_depth: 50, accuracy: 0.7126436781609196]
[n_estimator: 75, max_depth: 100, accuracy: 0.7011494252873564]
[n_estimator: 100, max_depth: 5, accuracy: 0.735632183908046]
[n_estimator: 100, max_depth: 10, accuracy: 0.7011494252873564]
[n_estimator: 100, max_depth: 25, accuracy: 0.7126436781609196]
[n_estimator: 100, max_depth: 50, accuracy: 0.7011494252873564]
[n_estimator: 100, max_depth: 100, accuracy: 0.7126436781609196]
	 ----------
	> Best n_estimator: 5 || Best max_depth: 10
	> Best training score: 0.8641618497109826
	> Best test score: 0.7093023255813954
	> Best held score: 0.7614678899082569
	 ----------
	> Best n_estimator : 25 || Best max_depth : 10
	> Average training score list: [0.8724637681159421, 0.8666666666666667, 0.7890173410404624, 0.8526011560693642, 0.8641618497109826]
	> Average testing score list: [0.6781609195402298, 0.735632183908046, 0.7441860465116279, 0.8255813953488372, 0.7093023255813954]
	> Average held score list: [0.7981651376146789, 0

[n_estimator: 100, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 100, max_depth: 100, accuracy: 0.7241379310344828]
	 ----------
	> Best n_estimator: 100 || Best max_depth: 50
	> Best training score: 0.9335260115606936
	> Best test score: 0.8023255813953488
	> Best held score: 0.7798165137614679
[n_estimator: 5, max_depth: 5, accuracy: 0.6666666666666666]
[n_estimator: 5, max_depth: 10, accuracy: 0.7126436781609196]
[n_estimator: 5, max_depth: 25, accuracy: 0.7816091954022989]
[n_estimator: 5, max_depth: 50, accuracy: 0.7701149425287356]
[n_estimator: 5, max_depth: 100, accuracy: 0.6551724137931034]
[n_estimator: 25, max_depth: 5, accuracy: 0.6666666666666666]
[n_estimator: 25, max_depth: 10, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 100, accuracy: 0.7586206896551724]
[n_estimator: 50, max_depth: 5, accuracy: 0.6206896551724138]
[n_

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 100, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 5, accuracy: 0.7241379310344828]
[n_estimator: 25, max_depth: 10, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 25, accuracy: 0.8390804597701149]
[n_estimator: 25, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 25, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 50, max_depth: 5, accuracy: 0.6781609195402298]
[n_estimator: 50, max_depth: 10, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 25, accuracy: 0.8275862068965517]
[n_estimator: 50, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 50, max_depth: 100, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 5, accuracy: 0.7241379310344828]
[n_estimator: 75, max_depth: 10, accuracy: 0.7471264367816092]
[n_estimator: 75, max_depth: 25, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 100, accuracy: 0.8160919540

[n_estimator: 100, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 100, max_depth: 100, accuracy: 0.7931034482758621]
	 ----------
	> Best n_estimator: 25 || Best max_depth: 50
	> Best training score: 0.953757225433526
	> Best test score: 0.7906976744186046
	> Best held score: 0.7706422018348624
	 ----------
	> Best n_estimator : 25 || Best max_depth : 25
	> Average training score list: [0.9536231884057971, 0.9536231884057971, 0.9335260115606936, 0.9450867052023122, 0.953757225433526]
	> Average testing score list: [0.8850574712643678, 0.7816091954022989, 0.8488372093023255, 0.8023255813953488, 0.7906976744186046]
	> Average held score list: [0.8899082568807339, 0.8348623853211009, 0.8348623853211009, 0.8073394495412844, 0.7706422018348624]

	> Average training score list: 0.9479232638016253
	> Average testing score list: 0.8217054263565892
	> Average held score list: 0.8275229357798166

	Confusion Matrix: Actual (Row) vs Predicted (Column)
	 [[26  0  1  0  0  0]
 [ 0 13  8 

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.5517241379310345]
[n_estimator: 5, max_depth: 10, accuracy: 0.6666666666666666]
[n_estimator: 5, max_depth: 25, accuracy: 0.7126436781609196]
[n_estimator: 5, max_depth: 50, accuracy: 0.7701149425287356]
[n_estimator: 5, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 5, accuracy: 0.5402298850574713]
[n_estimator: 25, max_depth: 10, accuracy: 0.5632183908045977]
[n_estimator: 25, max_depth: 25, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 50, accuracy: 0.735632183908046]
[n_estimator: 25, max_depth: 100, accuracy: 0.7586206896551724]
[n_estimator: 50, max_depth: 5, accuracy: 0.5057471264367817]
[n_estimator: 50, max_depth: 10, accuracy: 0.5977011494252874]
[n_estimator: 50, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 50, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 100, accuracy: 0.7701149425287356]
[n_estimator: 75, max_depth: 5, accuracy: 0.5402298850574713]
[

[n_estimator: 100, max_depth: 5, accuracy: 0.6436781609195402]
[n_estimator: 100, max_depth: 10, accuracy: 0.6666666666666666]
[n_estimator: 100, max_depth: 25, accuracy: 0.7126436781609196]
[n_estimator: 100, max_depth: 50, accuracy: 0.7471264367816092]
[n_estimator: 100, max_depth: 100, accuracy: 0.7471264367816092]
	 ----------
	> Best n_estimator: 25 || Best max_depth: 50
	> Best training score: 0.8526011560693642
	> Best test score: 0.7674418604651163
	> Best held score: 0.8073394495412844
	 ----------
	> Best n_estimator : 5 || Best max_depth : 50
	> Average training score list: [0.8840579710144928, 0.8376811594202899, 0.8005780346820809, 0.8121387283236994, 0.8526011560693642]
	> Average testing score list: [0.7126436781609196, 0.7816091954022989, 0.7558139534883721, 0.627906976744186, 0.7674418604651163]
	> Average held score list: [0.7981651376146789, 0.7798165137614679, 0.7614678899082569, 0.7614678899082569, 0.8073394495412844]

	> Average training score list: 0.837411409901

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.6551724137931034]
[n_estimator: 5, max_depth: 10, accuracy: 0.6781609195402298]
[n_estimator: 5, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 5, max_depth: 50, accuracy: 0.7126436781609196]
[n_estimator: 5, max_depth: 100, accuracy: 0.632183908045977]
[n_estimator: 25, max_depth: 5, accuracy: 0.6091954022988506]
[n_estimator: 25, max_depth: 10, accuracy: 0.7126436781609196]
[n_estimator: 25, max_depth: 25, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 100, accuracy: 0.6781609195402298]
[n_estimator: 50, max_depth: 5, accuracy: 0.6091954022988506]
[n_estimator: 50, max_depth: 10, accuracy: 0.6896551724137931]
[n_estimator: 50, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 50, max_depth: 100, accuracy: 0.7011494252873564]
[n_estimator: 75, max_depth: 5, accuracy: 0.5747126436781609]
[

[n_estimator: 75, max_depth: 100, accuracy: 0.6781609195402298]
[n_estimator: 100, max_depth: 5, accuracy: 0.7011494252873564]
[n_estimator: 100, max_depth: 10, accuracy: 0.7471264367816092]
[n_estimator: 100, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 100, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 100, max_depth: 100, accuracy: 0.6781609195402298]
	 ----------
	> Best n_estimator: 100 || Best max_depth: 50
	> Best training score: 0.9364161849710982
	> Best test score: 0.7674418604651163
	> Best held score: 0.7889908256880734
	 ----------
	> Best n_estimator : 100 || Best max_depth : 50
	> Average training score list: [0.9304347826086956, 0.9217391304347826, 0.9161849710982659, 0.8988439306358381, 0.9364161849710982]
	> Average testing score list: [0.7701149425287356, 0.8045977011494253, 0.8023255813953488, 0.8023255813953488, 0.7674418604651163]
	> Average held score list: [0.8073394495412844, 0.8348623853211009, 0.8440366972477065, 0.7339449541284404,

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.5632183908045977]
[n_estimator: 5, max_depth: 10, accuracy: 0.6781609195402298]
[n_estimator: 5, max_depth: 25, accuracy: 0.6666666666666666]
[n_estimator: 5, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 5, max_depth: 100, accuracy: 0.7126436781609196]
[n_estimator: 25, max_depth: 5, accuracy: 0.5977011494252874]
[n_estimator: 25, max_depth: 10, accuracy: 0.6896551724137931]
[n_estimator: 25, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 25, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 100, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 5, accuracy: 0.5517241379310345]
[n_estimator: 50, max_depth: 10, accuracy: 0.6781609195402298]
[n_estimator: 50, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 50, max_depth: 100, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 5, accuracy: 0.5172413793103449

[n_estimator: 75, max_depth: 100, accuracy: 0.7816091954022989]
[n_estimator: 100, max_depth: 5, accuracy: 0.632183908045977]
[n_estimator: 100, max_depth: 10, accuracy: 0.6781609195402298]
[n_estimator: 100, max_depth: 25, accuracy: 0.7586206896551724]
[n_estimator: 100, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 100, max_depth: 100, accuracy: 0.7701149425287356]
	 ----------
	> Best n_estimator: 25 || Best max_depth: 50
	> Best training score: 0.9393063583815029
	> Best test score: 0.8023255813953488
	> Best held score: 0.8073394495412844
	 ----------
	> Best n_estimator : 50 || Best max_depth : 100
	> Average training score list: [0.9536231884057971, 0.9072463768115943, 0.9190751445086706, 0.9364161849710982, 0.9393063583815029]
	> Average testing score list: [0.735632183908046, 0.8275862068965517, 0.7441860465116279, 0.8255813953488372, 0.8023255813953488]
	> Average held score list: [0.7614678899082569, 0.7889908256880734, 0.7614678899082569, 0.7706422018348624, 0.

  'precision', 'predicted', average, warn_for)


	Confusion Matrix: Actual (Row) vs Predicted (Column)
	 [[26  0  1  0  0  0]
 [ 0 14  8  0  0  0]
 [ 5  1 46  0  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  0  5  0  0  0]]
[n_estimator: 5, max_depth: 5, accuracy: 0.6436781609195402]
[n_estimator: 5, max_depth: 10, accuracy: 0.6551724137931034]
[n_estimator: 5, max_depth: 25, accuracy: 0.7471264367816092]
[n_estimator: 5, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 5, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 5, accuracy: 0.5977011494252874]
[n_estimator: 25, max_depth: 10, accuracy: 0.7126436781609196]
[n_estimator: 25, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 25, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 50, max_depth: 5, accuracy: 0.6781609195402298]
[n_estimator: 50, max_depth: 10, accuracy: 0.7816091954022989]
[n_estimator: 50, max_depth: 25, accuracy: 0.7931034482758621]
[n_e

[n_estimator: 50, max_depth: 100, accuracy: 0.7931034482758621]
[n_estimator: 75, max_depth: 5, accuracy: 0.7241379310344828]
[n_estimator: 75, max_depth: 10, accuracy: 0.735632183908046]
[n_estimator: 75, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 75, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 75, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 100, max_depth: 5, accuracy: 0.6781609195402298]
[n_estimator: 100, max_depth: 10, accuracy: 0.7586206896551724]
[n_estimator: 100, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 100, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 100, max_depth: 100, accuracy: 0.7931034482758621]
	 ----------
	> Best n_estimator: 100 || Best max_depth: 50
	> Best training score: 0.9566473988439307
	> Best test score: 0.8372093023255814
	> Best held score: 0.8256880733944955
	 ----------
	> Best n_estimator : 50 || Best max_depth : 50
	> Average training score list: [0.9478260869565217, 0.92173

  'precision', 'predicted', average, warn_for)
