In [1]:
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn import metrics

import pickle
import math
import re
import enchant
import pandas as pd
import os
import glob
import numpy as np
np.random.seed(512)

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the labelled attribute table and encode its 'y_act' class names as integers.
# Class codes follow the order of this list: Datetime=0 ... List=5.
dict_label = {
    class_name: class_code
    for class_code, class_name in enumerate(
        ['Datetime', 'Sentence', 'Custom Object', 'URL', 'Numbers', 'List'])
}
data = pd.read_csv('data/needs_extraction_data/labelled_added.csv')

# A label missing from dict_label raises KeyError here rather than passing through.
data['y_act'] = list(map(dict_label.__getitem__, data['y_act']))
# Single-column frame holding the encoded target.
y = data.loc[:, ['y_act']]

In [3]:
# Build the descriptive-statistics feature table (data1) from the labelled data:
# select the stats columns, fill missing values with 0, rename the numeric
# columns to their 'scaled_*' names, cap extreme values and standardise them.
data1 = data[['%_nans', 'mean_word_count',
              'std_dev_word_count', 'has_delimiters', 'mean_stopword_total',
              'mean_whitespace_count', 'mean_char_count', 'mean_delim_count',
              'stdev_stopword_total', 'stdev_whitespace_count', 'stdev_char_count',
              'stdev_delim_count', 'has_url', 'has_date']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={
    'mean_word_count': 'scaled_mean_token_count',
    'std_dev_word_count': 'scaled_std_dev_token_count',
    '%_nans': 'scaled_perc_nans',
    'mean_stopword_total': 'scaled_mean_stopword_total',
    'mean_whitespace_count': 'scaled_mean_whitespace_count',
    'mean_char_count': 'scaled_mean_char_count',
    'mean_delim_count': 'scaled_mean_delim_count',
    'stdev_stopword_total': 'scaled_stdev_stopword_total',
    'stdev_whitespace_count': 'scaled_stdev_whitespace_count',
    'stdev_char_count': 'scaled_stdev_char_count',
    'stdev_delim_count': 'scaled_stdev_delim_count'
})

# Numeric columns that are clipped and then standardised; the has_* flag
# columns are left untouched.
column_names_to_normalize = ['scaled_mean_token_count',
                             'scaled_std_dev_token_count',
                             'scaled_perc_nans',
                             'scaled_mean_stopword_total',
                             'scaled_mean_whitespace_count',
                             'scaled_mean_char_count',
                             'scaled_mean_delim_count',
                             'scaled_stdev_stopword_total',
                             'scaled_stdev_whitespace_count',
                             'scaled_stdev_char_count',
                             'scaled_stdev_delim_count']

# Cap every numeric feature to [-CLIP_LIMIT, CLIP_LIMIT] in one vectorised call
# so that extreme values do not dominate the mean/std used by the scaler.
CLIP_LIMIT = 10000
data1[column_names_to_normalize] = data1[column_names_to_normalize].clip(
    lower=-CLIP_LIMIT, upper=CLIP_LIMIT)

x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook.
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(
    x_scaled, columns=column_names_to_normalize, index=data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: {data1.mean()}\n")
print(f"> Data median: {data1.median()}\n")
print(f"> Data stdev: {data1.std()}")

> Data mean: scaled_perc_nans                -2.745801e-16
scaled_mean_token_count         -1.117919e-16
scaled_std_dev_token_count      -2.236863e-17
has_delimiters                   3.105360e-01
scaled_mean_stopword_total       8.619107e-18
scaled_mean_whitespace_count    -1.126127e-16
scaled_mean_char_count           5.130421e-17
scaled_mean_delim_count         -5.915375e-17
scaled_stdev_stopword_total      6.413026e-17
scaled_stdev_whitespace_count   -2.236863e-17
scaled_stdev_char_count         -3.488686e-18
scaled_stdev_delim_count         9.516930e-17
has_url                          8.687616e-02
has_date                         7.560074e-01
dtype: float64

> Data median: scaled_perc_nans                -0.653046
scaled_mean_token_count         -0.144106
scaled_std_dev_token_count      -0.171320
has_delimiters                   0.000000
scaled_mean_stopword_total      -0.178121
scaled_mean_whitespace_count    -0.144106
scaled_mean_char_count          -0.166657
scaled_mean_delim_

In [4]:
# Turn the attribute name and the two sample values of every row into
# character-trigram count features, join them with the stats features and
# split the result into train / held-out test sets.
print("===[VECTORIZATION]===")
arr = data['Attribute_name'].values
data = data.fillna(0)
arr1 = [str(value) for value in data['sample_1'].values]
arr2 = [str(value) for value in data['sample_2'].values]

# One vectorizer per text source: re-fitting a single instance three times
# would discard the vocabularies learned for the name and for sample_1, which
# are needed to featurise new data for the trained model.
# NOTE(review): the vectorizers are fitted on all rows, i.e. before the
# train/test split below.
name_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
sample1_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
sample2_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X = name_vectorizer.fit_transform(arr)
X1 = sample1_vectorizer.fit_transform(arr1)
X2 = sample2_vectorizer.fit_transform(arr2)
# Kept for backward compatibility: `vectorizer` used to end up fitted on sample_2.
vectorizer = sample2_vectorizer

# len(vocabulary_) equals the number of feature names and works on every
# scikit-learn version (get_feature_names() was removed in newer releases).
print(f"> Length of vectorized feature_names: {len(vectorizer.vocabulary_)}")

data1.to_csv('data/preprocessing/before.csv')
# Prefix the n-gram columns so that the three blocks do not share the integer
# labels 0..n once concatenated (duplicate labels are ambiguous in after.csv).
attr_df = pd.DataFrame(X.toarray()).add_prefix('name_')
sample1_df = pd.DataFrame(X1.toarray()).add_prefix('sample1_')
sample2_df = pd.DataFrame(X2.toarray()).add_prefix('sample2_')

data2 = pd.concat([data1, attr_df, sample1_df, sample2_df], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)
# Same data, test_size and random_state as above, so this yields the same row
# split as X_train / X_test; kept because the names may be used elsewhere.
atr_train,atr_test = train_test_split(data2, test_size=0.2,random_state=100)

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

# Plain arrays so the cross-validation loop can index rows by position.
X_train_new = X_train_new.values
y_train_new = y_train_new.values

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  scaled_mean_stopword_total  scaled_mean_whitespace_count  \
453            True                    0.945220                      0.686283   
43             True                    0.126161                      0.162079   
133            True                   -0.187400                     -0.148544   
205           False                   -0.178121                     -0.141062   
282           False                   -0.187

In [5]:
# Results log: reuse the saved table when it exists, otherwise start an empty
# one with the expected columns.
try:
    acc_df = pd.read_csv('data/model_data.csv')
except FileNotFoundError:
    acc_df = pd.DataFrame(
        columns=['Model', 'Params', 'Feats', 'Train', 'Validation', 'Test', 'Precision'])
# Row label at which the next result is written (0 for a fresh table).
index = len(acc_df)

In [6]:
# Nested model selection for the random forest: an outer k-fold split of the
# training rows and, inside every fold, a 75/25 train/validation split that is
# used to pick n_estimators / max_depth from the grids below.
k = 5
kf = KFold(n_splits=k)
avg_train_acc,avg_test_acc = 0,0

n_estimators_grid = [5,25,50,75,100]
max_depth_grid = [5,10,25,50,100]

# Per-fold scores of each fold's winning model, plus their running sums.
avgsc_lst,avgsc_train_lst,avgsc_hld_lst = [],[],[]
avgsc,avgsc_train,avgsc_hld = 0,0,0

# Number of folds in which each hyper-parameter value won (keys are str(value)).
best_param_count = {'n_estimator': Counter(), 'max_depth': Counter()}
for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    X_train_train, X_val,y_train_train,y_val = train_test_split(X_train_cur,y_train_cur, test_size=0.25,random_state=100)

    # Reset the winner for every fold. Starting below any possible accuracy
    # guarantees that the first grid point is always accepted, so the best
    # parameters can never be unbound or carried over from the previous fold,
    # and the model scored below is always a fitted one.
    bestPerformingModel = None
    bestne = bestmd = None
    bestscore = float('-inf')
    print('='*10)
    for ne in n_estimators_grid:
        for md in max_depth_grid:
            clf = RandomForestClassifier(n_estimators=ne,max_depth=md)
            clf.fit(X_train_train, y_train_train.ravel())
            sc = clf.score(X_val, y_val)
            print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {sc}]")
            # Strict comparison: on ties the earliest grid point is kept.
            if sc > bestscore:
                bestne = ne
                bestmd = md
                bestscore = sc
                bestPerformingModel = clf

    best_param_count['n_estimator'][str(bestne)] += 1
    best_param_count['max_depth'][str(bestmd)] += 1

    # "Training" score is taken on the whole outer-fold training part
    # (fit rows plus validation rows); "test" is the outer-fold hold-out part;
    # "held" is the separate X_test / y_test split.
    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld

    print()
    print(f"> Best n_estimator: {bestne} || Best max_depth: {bestmd}")
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)
# After the loop bestPerformingModel is the winner of the LAST fold only.

[n_estimator: 5, max_depth: 5, accuracy: 0.6666666666666666]
[n_estimator: 5, max_depth: 10, accuracy: 0.7011494252873564]
[n_estimator: 5, max_depth: 25, accuracy: 0.7471264367816092]
[n_estimator: 5, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 5, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 5, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 10, accuracy: 0.7586206896551724]
[n_estimator: 25, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 100, accuracy: 0.8160919540229885]
[n_estimator: 50, max_depth: 5, accuracy: 0.7126436781609196]
[n_estimator: 50, max_depth: 10, accuracy: 0.7471264367816092]
[n_estimator: 50, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 50, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 50, max_depth: 100, accuracy: 0.7931034482758621]
[n_estimator: 75, max_depth: 5, accuracy: 0.6896551724137931

[n_estimator: 75, max_depth: 100, accuracy: 0.8620689655172413]
[n_estimator: 100, max_depth: 5, accuracy: 0.7471264367816092]
[n_estimator: 100, max_depth: 10, accuracy: 0.7816091954022989]
[n_estimator: 100, max_depth: 25, accuracy: 0.8275862068965517]
[n_estimator: 100, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 100, max_depth: 100, accuracy: 0.8275862068965517]

> Best n_estimator: 50 || Best max_depth: 50
> Best training score: 0.9653179190751445
> Best test score: 0.872093023255814
> Best held score: 0.8348623853211009


In [7]:
# Per-class precision of the selected model on the held-out test split.
y_pred = bestPerformingModel.predict(X_test)

# Pass the class codes explicitly: without `labels`, precision_score only
# reports the classes that occur in y_test or y_pred, so a missing class would
# shorten the array and shift (or break) the name -> precision mapping.
class_names = list(dict_label)
class_ids = [dict_label[class_name] for class_name in class_names]
prec = metrics.precision_score(y_test, y_pred, labels=class_ids, average=None)
cat_prec = dict(zip(class_names, prec))

  'precision', 'predicted', average, warn_for)


In [8]:
# Summarise the cross-validation run: most frequently selected hyper-parameters,
# per-fold and mean scores, one row in the results log and the confusion matrix
# on the held-out test split.
bestne = max(best_param_count['n_estimator'], key=best_param_count['n_estimator'].get)
bestmd = max(best_param_count['max_depth'], key=best_param_count['max_depth'].get)
bestparams = {'n_estimator': bestne, 'max_depth': bestmd}
print(f"> Best n_estimator : {bestne} || Best max_depth : {bestmd}")
print(f"> Average training score list: {avgsc_train_lst}")
print(f"> Average testing score list: {avgsc_lst}")
print(f"> Average held score list: {avgsc_hld_lst}")
print()
# Derive the means from the per-fold lists instead of dividing the running
# sums in place, so re-running this cell does not divide by k a second time.
avgsc_train = sum(avgsc_train_lst) / len(avgsc_train_lst)
avgsc = sum(avgsc_lst) / len(avgsc_lst)
avgsc_hld = sum(avgsc_hld_lst) / len(avgsc_hld_lst)
print(f"> Average training score: {avgsc_train}")
print(f"> Average testing score: {avgsc}")
print(f"> Average held score: {avgsc_hld}")
# NOTE: this appends a row and advances `index` each time the cell is executed.
acc_df.loc[index] = ['random_forest', str(bestparams),"X_stats, X_name, X_sample1, X_sample2", avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
index += 1
print()

y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Best n_estimator : 50 || Best max_depth : 100
> Average training score list: [0.9594202898550724, 0.9536231884057971, 0.9624277456647399, 0.9682080924855492, 0.9653179190751445]
> Average testing score list: [0.8275862068965517, 0.8505747126436781, 0.8488372093023255, 0.8372093023255814, 0.872093023255814]
> Average held score list: [0.8532110091743119, 0.8532110091743119, 0.8256880733944955, 0.8532110091743119, 0.8348623853211009]

> Average training score list: 0.9617994470972606
> Average testing score list: 0.8472600908847902
> Average held score list: 0.8440366972477064

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[23  0  4  0  0  0]
 [ 0 19  3  0  0  0]
 [ 3  2 47  0  0  0]
 [ 0  0  0  2  0  0]
 [ 1  0  0  0  0  0]
 [ 0  2  3  0  0  0]]


In [9]:
# save the model to disk (context manager guarantees the file is flushed and closed)
filename = 'data/pretrained/rf_finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(bestPerformingModel, model_file)

# load the model from disk
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Accuracy of the reloaded model on the held-out split (round-trip check).
result = loaded_model.score(X_test, y_test)
y_prob = bestPerformingModel.predict_proba(X_test)

# One row per test sample, one column per class in the model's class order.
df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/rf_predictions.csv', index=False)

            0         1         2     3         4     5
0    0.068966  0.140513  0.746479  0.02  0.024042  0.00
1    0.020000  0.440000  0.200000  0.28  0.020000  0.04
2    1.000000  0.000000  0.000000  0.00  0.000000  0.00
3    0.860892  0.000000  0.099015  0.00  0.040092  0.00
4    0.025766  0.401282  0.412175  0.00  0.040777  0.12
5    0.431923  0.020513  0.484231  0.00  0.063333  0.00
6    0.000000  0.020769  0.977596  0.00  0.001635  0.00
7    0.967692  0.000000  0.024615  0.00  0.007692  0.00
8    0.900000  0.000000  0.060000  0.00  0.040000  0.00
9    0.001000  0.440000  0.419000  0.00  0.020000  0.12
10   0.104376  0.040769  0.767678  0.04  0.047177  0.00
11   0.020000  0.700000  0.180000  0.00  0.040000  0.06
12   0.980000  0.000000  0.000000  0.00  0.020000  0.00
13   0.940000  0.000000  0.040000  0.00  0.020000  0.00
14   0.033069  0.040769  0.868443  0.00  0.057719  0.00
15   0.612815  0.020513  0.261996  0.00  0.104676  0.00
16   0.000000  0.880000  0.080000  0.02  0.00000

In [10]:
def _select_best_forest(X_fit, y_fit, X_val, y_val, n_estimators_grid, max_depth_grid):
    """Fit one random forest per grid point and return the best one on the validation set.

    Returns a tuple ``(model, n_estimators, max_depth)``. The best score starts
    below any possible accuracy, so the first grid point is always accepted and
    the returned model is always fitted; on ties the earliest grid point wins.
    """
    best_model, best_ne, best_md = None, None, None
    best_score = float('-inf')
    for ne in n_estimators_grid:
        for md in max_depth_grid:
            clf = RandomForestClassifier(n_estimators=ne,max_depth=md)
            clf.fit(X_fit, y_fit.ravel())
            sc = clf.score(X_val, y_val)
            print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {sc}]")
            if sc > best_score:
                best_model, best_ne, best_md, best_score = clf, ne, md, sc
    return best_model, best_ne, best_md


def test_feat_combos(index):
    """Run the random-forest nested cross-validation on several feature subsets.

    For every combination of the stats, attribute-name and sample_1 feature
    frames (read from the notebook globals ``data1``, ``attr_df`` and
    ``sample1_df``, with target ``y``) the data is split 80/20, a 5-fold grid
    search is run on the training part and one summary row is written to the
    global ``acc_df``.

    Parameters
    ----------
    index : int
        Row label of ``acc_df`` at which the first result row is written.

    Returns
    -------
    int
        The row label following the last row written, so that callers can keep
        appending to ``acc_df`` (callers that ignore the result are unaffected).
    """
    combos = {
        "X_stats": data1,
        "X_name": attr_df,
        "X_stats, X_name": pd.concat([data1, attr_df], axis=1, sort=False),
        "X_sample1":  pd.concat([sample1_df], axis=1, sort=False),
        "X_name, X_sample1":  pd.concat([attr_df, sample1_df], axis=1, sort=False),
        "X_stats, X_sample1":  pd.concat([data1, sample1_df], axis=1, sort=False),
        "X_stats, X_name, X_sample1":  pd.concat([data1, attr_df, sample1_df], axis=1, sort=False)
    }

    # Class names and codes in a fixed order, used to align the precision array.
    class_names = list(dict_label)
    class_ids = [dict_label[class_name] for class_name in class_names]

    k = 5
    n_estimators_grid = [5,25,50,75,100]
    max_depth_grid = [5,10,25,50,100]

    for combo in combos:
        print("="*50, combo, "="*50)
        X_train, X_test, y_train, y_test = train_test_split(
            combos[combo], y, test_size=0.2, random_state=100)

        # Plain arrays so the folds can index rows by position.
        X_train_new = X_train.reset_index(drop=True).values
        y_train_new = y_train.reset_index(drop=True).values
        kf = KFold(n_splits=k)

        avgsc_lst,avgsc_train_lst,avgsc_hld_lst = [],[],[]

        # Number of folds in which each hyper-parameter value won.
        best_param_count = {'n_estimator': Counter(), 'max_depth': Counter()}
        for train_index, test_index in kf.split(X_train_new):
            X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
            y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
            X_train_train, X_val,y_train_train,y_val = train_test_split(X_train_cur,y_train_cur, test_size=0.25,random_state=100)

            bestPerformingModel, bestne, bestmd = _select_best_forest(
                X_train_train, y_train_train, X_val, y_val,
                n_estimators_grid, max_depth_grid)

            best_param_count['n_estimator'][str(bestne)] += 1
            best_param_count['max_depth'][str(bestmd)] += 1

            bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
            bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
            bscr_hld = bestPerformingModel.score(X_test, y_test)

            avgsc_train_lst.append(bscr_train)
            avgsc_lst.append(bscr)
            avgsc_hld_lst.append(bscr_hld)

            print('\t','-'*10)
            print(f"\t> Best n_estimator: {bestne} || Best max_depth: {bestmd}")
            print(f"\t> Best training score: {bscr_train}")
            print(f"\t> Best test score: {bscr}")
            print(f"\t> Best held score: {bscr_hld}")
        print('\t','-'*10)

        # bestPerformingModel is the winner of the last fold.
        y_pred = bestPerformingModel.predict(X_test)
        # Explicit labels keep the array aligned with class_names even when a
        # class is missing from both y_test and y_pred.
        prec = metrics.precision_score(y_test, y_pred, labels=class_ids, average=None)
        cat_prec = dict(zip(class_names, prec))

        bestne = max(best_param_count['n_estimator'], key=best_param_count['n_estimator'].get)
        bestmd = max(best_param_count['max_depth'], key=best_param_count['max_depth'].get)
        bestparams = {'n_estimator': bestne, 'max_depth': bestmd}
        print(f"\t> Best n_estimator : {bestne} || Best max_depth : {bestmd}")
        print(f"\t> Average training score list: {avgsc_train_lst}")
        print(f"\t> Average testing score list: {avgsc_lst}")
        print(f"\t> Average held score list: {avgsc_hld_lst}")
        print()
        avgsc_train = sum(avgsc_train_lst) / k
        avgsc = sum(avgsc_lst) / k
        avgsc_hld = sum(avgsc_hld_lst) / k
        print(f"\t> Average training score: {avgsc_train}")
        print(f"\t> Average testing score: {avgsc}")
        print(f"\t> Average held score: {avgsc_hld}")
        acc_df.loc[index] = ['random_forest', str(bestparams), combo, avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
        index += 1
        print()

        cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
        print('\tConfusion Matrix: Actual (Row) vs Predicted (Column)')
        print('\t',cnf_matrix)

    return index

In [11]:
# Evaluate every feature-subset combination starting at the current row label,
# then write the accumulated results log back to disk without the row index.
test_feat_combos(index)
acc_df.to_csv(path_or_buf='data/model_data.csv', index=False)

[n_estimator: 5, max_depth: 5, accuracy: 0.8620689655172413]
[n_estimator: 5, max_depth: 10, accuracy: 0.8505747126436781]
[n_estimator: 5, max_depth: 25, accuracy: 0.8735632183908046]
[n_estimator: 5, max_depth: 50, accuracy: 0.8620689655172413]
[n_estimator: 5, max_depth: 100, accuracy: 0.8850574712643678]
[n_estimator: 25, max_depth: 5, accuracy: 0.8275862068965517]
[n_estimator: 25, max_depth: 10, accuracy: 0.8620689655172413]
[n_estimator: 25, max_depth: 25, accuracy: 0.8735632183908046]
[n_estimator: 25, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 25, max_depth: 100, accuracy: 0.8620689655172413]
[n_estimator: 50, max_depth: 5, accuracy: 0.8620689655172413]
[n_estimator: 50, max_depth: 10, accuracy: 0.8620689655172413]
[n_estimator: 50, max_depth: 25, accuracy: 0.8505747126436781]
[n_estimator: 50, max_depth: 50, accuracy: 0.8735632183908046]
[n_estimator: 50, max_depth: 100, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 5, accuracy: 0.8505747126436781

[n_estimator: 75, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 100, accuracy: 0.8390804597701149]
[n_estimator: 100, max_depth: 5, accuracy: 0.8275862068965517]
[n_estimator: 100, max_depth: 10, accuracy: 0.8390804597701149]
[n_estimator: 100, max_depth: 25, accuracy: 0.8160919540229885]
[n_estimator: 100, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 100, max_depth: 100, accuracy: 0.8390804597701149]
	 ----------
	> Best n_estimator: 50 || Best max_depth: 100
	> Best training score: 0.9624277456647399
	> Best test score: 0.872093023255814
	> Best held score: 0.8256880733944955
	 ----------
	> Best n_estimator : 25 || Best max_depth : 100
	> Average training score list: [0.9565217391304348, 0.9681159420289855, 0.9624277456647399, 0.9624277456647399, 0.9624277456647399]
	> Average testing score list: [0.7816091954022989, 0.8160919540229885, 0.8604651162790697, 0.8604651162790697, 0.872093023255814]
	> Average held score list: [0.8073394495412844

  'precision', 'predicted', average, warn_for)


[n_estimator: 25, max_depth: 10, accuracy: 0.7241379310344828]
[n_estimator: 25, max_depth: 25, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 100, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 5, accuracy: 0.7126436781609196]
[n_estimator: 50, max_depth: 10, accuracy: 0.7241379310344828]
[n_estimator: 50, max_depth: 25, accuracy: 0.7471264367816092]
[n_estimator: 50, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 50, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 75, max_depth: 5, accuracy: 0.6781609195402298]
[n_estimator: 75, max_depth: 10, accuracy: 0.7126436781609196]
[n_estimator: 75, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 75, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 75, max_depth: 100, accuracy: 0.735632183908046]
[n_estimator: 100, max_depth: 5, accuracy: 0.632183908045977]
[n_estimator: 100, max_depth: 10, accuracy: 0.712643678160

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 5, max_depth: 100, accuracy: 0.8160919540229885]
[n_estimator: 25, max_depth: 5, accuracy: 0.7586206896551724]
[n_estimator: 25, max_depth: 10, accuracy: 0.8160919540229885]
[n_estimator: 25, max_depth: 25, accuracy: 0.8505747126436781]
[n_estimator: 25, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 25, max_depth: 100, accuracy: 0.8275862068965517]
[n_estimator: 50, max_depth: 5, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 10, accuracy: 0.8160919540229885]
[n_estimator: 50, max_depth: 25, accuracy: 0.8390804597701149]
[n_estimator: 50, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 50, max_depth: 100, accuracy: 0.8390804597701149]
[n_estimator: 75, max_depth: 5, accuracy: 0.7816091954022989]
[n_estimator: 75, max_depth: 10, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 25, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 50, accuracy: 0.839080459770

[n_estimator: 100, max_depth: 25, accuracy: 0.8735632183908046]
[n_estimator: 100, max_depth: 50, accuracy: 0.8735632183908046]
[n_estimator: 100, max_depth: 100, accuracy: 0.8505747126436781]
	 ----------
	> Best n_estimator: 50 || Best max_depth: 25
	> Best training score: 0.976878612716763
	> Best test score: 0.8372093023255814
	> Best held score: 0.8532110091743119
	 ----------
	> Best n_estimator : 50 || Best max_depth : 25
	> Average training score list: [0.9652173913043478, 0.9623188405797102, 0.9682080924855492, 0.9739884393063584, 0.976878612716763]
	> Average testing score list: [0.8620689655172413, 0.8160919540229885, 0.9069767441860465, 0.872093023255814, 0.8372093023255814]
	> Average held score list: [0.8532110091743119, 0.8348623853211009, 0.8532110091743119, 0.8532110091743119, 0.8532110091743119]

	> Average training score list: 0.9693222752785458
	> Average testing score list: 0.8588879978615342
	> Average held score list: 0.8495412844036696

	Confusion Matrix: Actual

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.5517241379310345]
[n_estimator: 5, max_depth: 10, accuracy: 0.6666666666666666]
[n_estimator: 5, max_depth: 25, accuracy: 0.7126436781609196]
[n_estimator: 5, max_depth: 50, accuracy: 0.7701149425287356]
[n_estimator: 5, max_depth: 100, accuracy: 0.7471264367816092]
[n_estimator: 25, max_depth: 5, accuracy: 0.5402298850574713]
[n_estimator: 25, max_depth: 10, accuracy: 0.5632183908045977]
[n_estimator: 25, max_depth: 25, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 50, accuracy: 0.735632183908046]
[n_estimator: 25, max_depth: 100, accuracy: 0.7586206896551724]
[n_estimator: 50, max_depth: 5, accuracy: 0.5057471264367817]
[n_estimator: 50, max_depth: 10, accuracy: 0.5977011494252874]
[n_estimator: 50, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 50, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 100, accuracy: 0.7701149425287356]
[n_estimator: 75, max_depth: 5, accuracy: 0.5402298850574713]
[

[n_estimator: 100, max_depth: 5, accuracy: 0.6436781609195402]
[n_estimator: 100, max_depth: 10, accuracy: 0.6666666666666666]
[n_estimator: 100, max_depth: 25, accuracy: 0.7126436781609196]
[n_estimator: 100, max_depth: 50, accuracy: 0.7471264367816092]
[n_estimator: 100, max_depth: 100, accuracy: 0.7471264367816092]
	 ----------
	> Best n_estimator: 25 || Best max_depth: 50
	> Best training score: 0.8526011560693642
	> Best test score: 0.7674418604651163
	> Best held score: 0.8073394495412844
	 ----------
	> Best n_estimator : 5 || Best max_depth : 50
	> Average training score list: [0.8840579710144928, 0.8376811594202899, 0.8005780346820809, 0.8121387283236994, 0.8526011560693642]
	> Average testing score list: [0.7126436781609196, 0.7816091954022989, 0.7558139534883721, 0.627906976744186, 0.7674418604651163]
	> Average held score list: [0.7981651376146789, 0.7798165137614679, 0.7614678899082569, 0.7614678899082569, 0.8073394495412844]

	> Average training score list: 0.837411409901

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.6551724137931034]
[n_estimator: 5, max_depth: 10, accuracy: 0.6781609195402298]
[n_estimator: 5, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 5, max_depth: 50, accuracy: 0.7126436781609196]
[n_estimator: 5, max_depth: 100, accuracy: 0.632183908045977]
[n_estimator: 25, max_depth: 5, accuracy: 0.6091954022988506]
[n_estimator: 25, max_depth: 10, accuracy: 0.7126436781609196]
[n_estimator: 25, max_depth: 25, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 50, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 100, accuracy: 0.6781609195402298]
[n_estimator: 50, max_depth: 5, accuracy: 0.6091954022988506]
[n_estimator: 50, max_depth: 10, accuracy: 0.6896551724137931]
[n_estimator: 50, max_depth: 25, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 50, accuracy: 0.7931034482758621]
[n_estimator: 50, max_depth: 100, accuracy: 0.7011494252873564]
[n_estimator: 75, max_depth: 5, accuracy: 0.5747126436781609]
[

[n_estimator: 75, max_depth: 100, accuracy: 0.6781609195402298]
[n_estimator: 100, max_depth: 5, accuracy: 0.7011494252873564]
[n_estimator: 100, max_depth: 10, accuracy: 0.7471264367816092]
[n_estimator: 100, max_depth: 25, accuracy: 0.7701149425287356]
[n_estimator: 100, max_depth: 50, accuracy: 0.8045977011494253]
[n_estimator: 100, max_depth: 100, accuracy: 0.6781609195402298]
	 ----------
	> Best n_estimator: 100 || Best max_depth: 50
	> Best training score: 0.9364161849710982
	> Best test score: 0.7674418604651163
	> Best held score: 0.7889908256880734
	 ----------
	> Best n_estimator : 100 || Best max_depth : 50
	> Average training score list: [0.9304347826086956, 0.9217391304347826, 0.9161849710982659, 0.8988439306358381, 0.9364161849710982]
	> Average testing score list: [0.7701149425287356, 0.8045977011494253, 0.8023255813953488, 0.8023255813953488, 0.7674418604651163]
	> Average held score list: [0.8073394495412844, 0.8348623853211009, 0.8440366972477065, 0.7339449541284404,

  'precision', 'predicted', average, warn_for)


[n_estimator: 5, max_depth: 5, accuracy: 0.5862068965517241]
[n_estimator: 5, max_depth: 10, accuracy: 0.7931034482758621]
[n_estimator: 5, max_depth: 25, accuracy: 0.7931034482758621]
[n_estimator: 5, max_depth: 50, accuracy: 0.8620689655172413]
[n_estimator: 5, max_depth: 100, accuracy: 0.7816091954022989]
[n_estimator: 25, max_depth: 5, accuracy: 0.7241379310344828]
[n_estimator: 25, max_depth: 10, accuracy: 0.7586206896551724]
[n_estimator: 25, max_depth: 25, accuracy: 0.8160919540229885]
[n_estimator: 25, max_depth: 50, accuracy: 0.8390804597701149]
[n_estimator: 25, max_depth: 100, accuracy: 0.8620689655172413]
[n_estimator: 50, max_depth: 5, accuracy: 0.735632183908046]
[n_estimator: 50, max_depth: 10, accuracy: 0.7701149425287356]
[n_estimator: 50, max_depth: 25, accuracy: 0.8160919540229885]
[n_estimator: 50, max_depth: 50, accuracy: 0.8390804597701149]
[n_estimator: 50, max_depth: 100, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 5, accuracy: 0.6896551724137931]

[n_estimator: 75, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 100, max_depth: 5, accuracy: 0.7241379310344828]
[n_estimator: 100, max_depth: 10, accuracy: 0.7701149425287356]
[n_estimator: 100, max_depth: 25, accuracy: 0.8390804597701149]
[n_estimator: 100, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 100, max_depth: 100, accuracy: 0.8160919540229885]
	 ----------
	> Best n_estimator: 75 || Best max_depth: 25
	> Best training score: 0.9364161849710982
	> Best test score: 0.8837209302325582
	> Best held score: 0.8532110091743119
	 ----------
	> Best n_estimator : 5 || Best max_depth : 50
	> Average training score list: [0.9478260869565217, 0.9304347826086956, 0.9364161849710982, 0.9595375722543352, 0.9364161849710982]
	> Average testing score list: [0.7586206896551724, 0.8275862068965517, 0.7790697674418605, 0.872093023255814, 0.8837209302325582]
	> Average held score list: [0.7889908256880734, 0.8165137614678899, 0.8165137614678899, 0.8165137614678899, 0.8

  'precision', 'predicted', average, warn_for)


	Confusion Matrix: Actual (Row) vs Predicted (Column)
	 [[24  0  3  0  0  0]
 [ 0 18  4  0  0  0]
 [ 2  1 49  0  0  0]
 [ 0  0  0  2  0  0]
 [ 1  0  0  0  0  0]
 [ 0  2  3  0  0  0]]
[n_estimator: 5, max_depth: 5, accuracy: 0.6551724137931034]
[n_estimator: 5, max_depth: 10, accuracy: 0.6896551724137931]
[n_estimator: 5, max_depth: 25, accuracy: 0.7816091954022989]
[n_estimator: 5, max_depth: 50, accuracy: 0.8160919540229885]
[n_estimator: 5, max_depth: 100, accuracy: 0.7586206896551724]
[n_estimator: 25, max_depth: 5, accuracy: 0.7011494252873564]
[n_estimator: 25, max_depth: 10, accuracy: 0.7931034482758621]
[n_estimator: 25, max_depth: 25, accuracy: 0.8045977011494253]
[n_estimator: 25, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 25, max_depth: 100, accuracy: 0.8045977011494253]
[n_estimator: 50, max_depth: 5, accuracy: 0.7471264367816092]
[n_estimator: 50, max_depth: 10, accuracy: 0.7816091954022989]
[n_estimator: 50, max_depth: 25, accuracy: 0.8275862068965517]
[n_e

[n_estimator: 50, max_depth: 100, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 5, accuracy: 0.7701149425287356]
[n_estimator: 75, max_depth: 10, accuracy: 0.8275862068965517]
[n_estimator: 75, max_depth: 25, accuracy: 0.8390804597701149]
[n_estimator: 75, max_depth: 50, accuracy: 0.8505747126436781]
[n_estimator: 75, max_depth: 100, accuracy: 0.8620689655172413]
[n_estimator: 100, max_depth: 5, accuracy: 0.735632183908046]
[n_estimator: 100, max_depth: 10, accuracy: 0.7816091954022989]
[n_estimator: 100, max_depth: 25, accuracy: 0.8390804597701149]
[n_estimator: 100, max_depth: 50, accuracy: 0.8275862068965517]
[n_estimator: 100, max_depth: 100, accuracy: 0.8390804597701149]
	 ----------
	> Best n_estimator: 50 || Best max_depth: 50
	> Best training score: 0.9682080924855492
	> Best test score: 0.872093023255814
	> Best held score: 0.8440366972477065
	 ----------
	> Best n_estimator : 100 || Best max_depth : 50
	> Average training score list: [0.9594202898550724, 0.959420

  'precision', 'predicted', average, warn_for)
