In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

import pickle
import numpy as np # linear algebra
import pandas as pd

In [2]:
# Load the labelled dataset and encode the target class names as integers.
# The integer codes follow the order in which the class names are listed.
dict_label = {
    name: code
    for code, name in enumerate(
        ['Datetime', 'Sentence', 'Custom Object', 'URL', 'Numbers', 'List'])
}
data = pd.read_csv('data/needs_extraction_data/labelled_added.csv')

# Direct dict indexing is deliberate: a class name that is not in dict_label
# raises KeyError instead of silently producing a missing target.
data['y_act'] = [dict_label[class_name] for class_name in data['y_act']]
# Keep the target as a one-column DataFrame.
y = data.loc[:, ['y_act']]

In [3]:
# Numeric / boolean descriptive statistics used as the "X_stats" feature set.
# Missing statistics are treated as 0.
data1 = data[['%_nans', 'mean_word_count',
              'std_dev_word_count', 'has_delimiters', 'mean_stopword_total',
              'mean_whitespace_count', 'mean_char_count', 'mean_delim_count',
              'stdev_stopword_total', 'stdev_whitespace_count', 'stdev_char_count',
              'stdev_delim_count', 'has_url', 'has_date']]
data1 = data1.fillna(0)

# Columns that get standardised are renamed with a "scaled_" prefix; the
# has_* flag columns keep their names and are not scaled.
data1 = data1.rename(columns={
    'mean_word_count': 'scaled_mean_token_count',
    'std_dev_word_count': 'scaled_std_dev_token_count',
    '%_nans': 'scaled_perc_nans',
    'mean_stopword_total': 'scaled_mean_stopword_total',
    'mean_whitespace_count': 'scaled_mean_whitespace_count',
    'mean_char_count': 'scaled_mean_char_count',
    'mean_delim_count': 'scaled_mean_delim_count',
    'stdev_stopword_total': 'scaled_stdev_stopword_total',
    'stdev_whitespace_count': 'scaled_stdev_whitespace_count',
    'stdev_char_count': 'scaled_stdev_char_count',
    'stdev_delim_count': 'scaled_stdev_delim_count'
})

column_names_to_normalize = ['scaled_mean_token_count',
                             'scaled_std_dev_token_count',
                             'scaled_perc_nans',
                             'scaled_mean_stopword_total',
                             'scaled_mean_whitespace_count',
                             'scaled_mean_char_count',
                             'scaled_mean_delim_count',
                             'scaled_stdev_stopword_total',
                             'scaled_stdev_whitespace_count',
                             'scaled_stdev_char_count',
                             'scaled_stdev_delim_count']

# Cap every to-be-scaled column to [-CLIP_LIMIT, CLIP_LIMIT] so that extreme
# (or infinite) values cannot dominate the mean/variance used for scaling.
CLIP_LIMIT = 10000
data1[column_names_to_normalize] = data1[column_names_to_normalize].clip(
    lower=-CLIP_LIMIT, upper=CLIP_LIMIT)

# Standardise the capped columns to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed in a later cell, so held-out rows influence the scaling.
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
scaler = StandardScaler()  # kept in a variable so the fitted statistics stay available
x_scaled = scaler.fit_transform(x)
df_temp = pd.DataFrame(
    x_scaled, columns=column_names_to_normalize, index=data1.index)
data1[column_names_to_normalize] = df_temp

# Targets are stored as floats for the estimators used below.
y['y_act'] = y['y_act'].astype(float)

print(f"> Data mean: {data1.mean()}\n")
print(f"> Data median: {data1.median()}\n")
print(f"> Data stdev: {data1.std()}")

> Data mean: scaled_perc_nans                -2.745801e-16
scaled_mean_token_count         -1.117919e-16
scaled_std_dev_token_count      -2.236863e-17
has_delimiters                   3.105360e-01
scaled_mean_stopword_total       8.619107e-18
scaled_mean_whitespace_count    -1.126127e-16
scaled_mean_char_count           5.130421e-17
scaled_mean_delim_count         -5.915375e-17
scaled_stdev_stopword_total      6.413026e-17
scaled_stdev_whitespace_count   -2.236863e-17
scaled_stdev_char_count         -3.488686e-18
scaled_stdev_delim_count         9.516930e-17
has_url                          8.687616e-02
has_date                         7.560074e-01
dtype: float64

> Data median: scaled_perc_nans                -0.653046
scaled_mean_token_count         -0.144106
scaled_std_dev_token_count      -0.171320
has_delimiters                   0.000000
scaled_mean_stopword_total      -0.178121
scaled_mean_whitespace_count    -0.144106
scaled_mean_char_count          -0.166657
scaled_mean_delim_

In [4]:
print("===[VECTORIZATION]===")
# Missing values become 0 and every document is cast to str *before*
# vectorising: CountVectorizer cannot process NaN or other non-string entries.
data = data.fillna(0)
arr = [str(x) for x in data['Attribute_name'].values]
arr1 = [str(x) for x in data['sample_1'].values]
arr2 = [str(x) for x in data['sample_2'].values]

# One character-trigram vectorizer per text column. Each column gets its own
# vocabulary, and every fitted vocabulary stays available afterwards
# (re-fitting a single shared vectorizer would keep only the last one).
# NOTE(review): the vocabularies are fitted on all rows, before the
# train/test split below, so held-out rows contribute to the feature space.
name_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
sample1_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')  # sample_2
X = name_vectorizer.fit_transform(arr)
X1 = sample1_vectorizer.fit_transform(arr1)
X2 = vectorizer.fit_transform(arr2)

# Reports the sample_2 vocabulary size; vocabulary_ has one entry per feature.
print(f"> Length of vectorized feature_names: {len(vectorizer.vocabulary_)}")

data1.to_csv('data/preprocessing/before.csv')
# Prefix the count columns so the combined frame has unique, all-string column
# labels, and reuse data1's index so pd.concat aligns the frames row by row.
attr_df = pd.DataFrame(X.toarray(), index=data1.index).add_prefix('name_')
sample1_df = pd.DataFrame(X1.toarray(), index=data1.index).add_prefix('sample1_')
sample2_df = pd.DataFrame(X2.toarray(), index=data1.index).add_prefix('sample2_')

data2 = pd.concat([data1, attr_df, sample1_df, sample2_df], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')

# 80/20 split with a fixed seed; the second call repeats the same split of
# the feature frame alone.
X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)
atr_train, atr_test = train_test_split(data2, test_size=0.2, random_state=100)

print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

# Positional NumPy views of the training data for KFold indexing. The target
# is flattened to 1-D, which is the shape scikit-learn estimators expect.
X_train_new = X_train.reset_index(drop=True).values
y_train_new = y_train.reset_index(drop=True).values.ravel()

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  scaled_mean_stopword_total  scaled_mean_whitespace_count  \
453            True                    0.945220                      0.686283   
43             True                    0.126161                      0.162079   
133            True                   -0.187400                     -0.148544   
205           False                   -0.178121                     -0.141062   
282           False                   -0.187

In [5]:
# Resume the running results table when an earlier run saved one; otherwise
# start from an empty table with the expected columns.
try:
    acc_df = pd.read_csv('data/model_data.csv')
except FileNotFoundError:
    acc_df = pd.DataFrame(
        columns=['Model', 'Params', 'Feats', 'Train', 'Validation', 'Test', 'Precision'])
# Row label for the next result: one past the rows already recorded
# (0 for a brand-new table).
index = len(acc_df)

In [6]:
# K-fold evaluation of an RBF SVM. Within each fold the fold's training part
# is split again (75/25) and every (C, gamma) pair is scored on that inner
# validation split; the winner is then scored on the fold's training part,
# the fold's test part and the held-out test set.
k = 5
kf = KFold(n_splits=k)
avg_train_acc, avg_test_acc = 0, 0  # not updated by this cell

cvals = [0.1, 1, 10, 100, 1000]
gamavals = [0.0001, 0.001, 0.01, 0.1, 1, 10]

# Per-fold scores of the fold winner, plus running sums of the same scores.
avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
avgsc, avgsc_train, avgsc_hld = 0, 0, 0

# How often each C / gamma value (as a string) won a fold.
best_param_count = {'C': {}, 'gamma': {}}

# scikit-learn estimators expect a 1-D target; flattening avoids the
# DataConversionWarning raised for an (n, 1) column vector.
y_train_flat = np.ravel(y_train_new)

for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_flat[train_index], y_train_flat[test_index]
    X_train_train, X_val, y_train_train, y_val = train_test_split(
        X_train_cur, y_train_cur, test_size=0.25, random_state=100)

    # Reset the winner for every fold. Starting below any possible accuracy
    # guarantees the first fitted candidate is taken, so a fold can never
    # fall back to an unfitted model or to the previous fold's parameters.
    bestPerformingModel, bestcval, bestgval, bestscore = None, None, None, -1.0
    print('='*10)
    for cval in cvals:
        for gval in gamavals:
            clf = svm.SVC(C=cval, decision_function_shape="ovo", gamma=gval, probability=True)
            clf.fit(X_train_train, y_train_train)
            sc = clf.score(X_val, y_val)
            print(f"[C: {cval}, gamma: {gval}, accuracy: {sc}]")
            # Strict comparison: on ties the earlier candidate is kept.
            if sc > bestscore:
                bestcval = cval
                bestgval = gval
                bestscore = sc
                bestPerformingModel = clf

    # Tally this fold's winning parameters.
    c_key, gamma_key = str(bestcval), str(bestgval)
    best_param_count['C'][c_key] = best_param_count['C'].get(c_key, 0) + 1
    best_param_count['gamma'][gamma_key] = best_param_count['gamma'].get(gamma_key, 0) + 1

    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld
    print()
    print(f"> Best C: {bestcval} || Best gamma: {bestgval}")
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
    print('='*10)



  y = column_or_1d(y, warn=True)


[C: 0.1, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.01, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.1, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 1, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 10, accuracy: 0.4367816091954023]
[C: 1, gamma: 0.0001, accuracy: 0.45977011494252873]
[C: 1, gamma: 0.001, accuracy: 0.47126436781609193]
[C: 1, gamma: 0.01, accuracy: 0.7816091954022989]
[C: 1, gamma: 0.1, accuracy: 0.5287356321839081]
[C: 1, gamma: 1, accuracy: 0.4367816091954023]
[C: 1, gamma: 10, accuracy: 0.4367816091954023]
[C: 10, gamma: 0.0001, accuracy: 0.47126436781609193]
[C: 10, gamma: 0.001, accuracy: 0.7586206896551724]
[C: 10, gamma: 0.01, accuracy: 0.8275862068965517]
[C: 10, gamma: 0.1, accuracy: 0.5287356321839081]
[C: 10, gamma: 1, accuracy: 0.4367816091954023]
[C: 10, gamma: 10, accuracy: 0.4367816091954023]
[C: 100, gamma: 0.0001, accuracy: 0.7471264367816092]
[C: 100, gamma: 0.001, accuracy: 0

[C: 1000, gamma: 0.1, accuracy: 0.5862068965517241]
[C: 1000, gamma: 1, accuracy: 0.5287356321839081]
[C: 1000, gamma: 10, accuracy: 0.5287356321839081]

> Best C: 100 || Best gamma: 0.001
> Best training score: 0.9479768786127167
> Best test score: 0.813953488372093
> Best held score: 0.8165137614678899


In [7]:
# Per-class precision of the last fold's best model on the held-out test set.
y_pred = bestPerformingModel.predict(X_test)

# Request the classes explicitly so `prec` always has one entry per class, in
# code order, even when a class is absent from this split. Without `labels`
# the array only covers classes that occur in y_test / y_pred, and positional
# lookup would attach values to the wrong class name or raise IndexError.
# Classes that are never predicted still trigger scikit-learn's
# undefined-precision warning.
class_codes = sorted(dict_label.values())
prec = metrics.precision_score(y_test, y_pred, labels=class_codes, average=None)

code_to_name = {code: name for name, code in dict_label.items()}
cat_prec = {code_to_name[code]: prec[pos] for pos, code in enumerate(class_codes)}

  'precision', 'predicted', average, warn_for)


In [8]:
# Summarise the cross-validation run: the (C, gamma) values that won the most
# folds, the per-fold scores and their averages. On ties the value that won
# first is reported.
bestcval = max(best_param_count['C'], key=best_param_count['C'].get)
bestgval = max(best_param_count['gamma'], key=best_param_count['gamma'].get)
bestparams = {'C': bestcval, 'gamma': bestgval}
print(f"> Best C : {bestcval} || Best gamma : {bestgval}")
print(f"> Per-fold training scores: {avgsc_train_lst}")
print(f"> Per-fold testing scores: {avgsc_lst}")
print(f"> Per-fold held scores: {avgsc_hld_lst}")
print()
# Averages are derived from the per-fold lists rather than by dividing the
# running sums in place, so re-running this cell does not divide again.
avgsc_train = sum(avgsc_train_lst) / len(avgsc_train_lst)
avgsc = sum(avgsc_lst) / len(avgsc_lst)
avgsc_hld = sum(avgsc_hld_lst) / len(avgsc_hld_lst)
print(f"> Average training score: {avgsc_train}")
print(f"> Average testing score: {avgsc}")
print(f"> Average held score: {avgsc_hld}")
# Record the run in the results table under the next free row label.
acc_df.loc[index] = ['rbf_svm', str(bestparams),"X_stats, X_name, X_sample1, X_sample2", avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
index += 1
print()

# Confusion matrix of the last fold's best model on the held-out test set.
y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Best n_estimator : 10 || Best max_depth : 0.01
> Average training score list: [0.9594202898550724, 0.9420289855072463, 0.9364161849710982, 0.9479768786127167, 0.9479768786127167]
> Average testing score list: [0.7816091954022989, 0.7931034482758621, 0.7906976744186046, 0.7906976744186046, 0.813953488372093]
> Average held score list: [0.8165137614678899, 0.8532110091743119, 0.8623853211009175, 0.8532110091743119, 0.8165137614678899]

> Average training score list: 0.9467638435117701
> Average testing score list: 0.7940122961774927
> Average held score list: 0.8403669724770643

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[24  0  3  0  0  0]
 [ 0 15  7  0  0  0]
 [ 0  4 48  0  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  2  3  0  0  0]]


In [9]:
# Save the model to disk. The context manager closes (and flushes) the file
# handle even if pickling fails.
filename = 'data/pretrained/svm_finalized_model.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(bestPerformingModel, model_file)

# Load the model back from disk to check the round trip. Only unpickle files
# this notebook wrote itself: unpickling untrusted data can execute code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)

# Class-membership probabilities for the held-out test set, one row per
# sample. Columns are positional (0..n_classes-1), not class names.
y_prob = bestPerformingModel.predict_proba(X_test)

df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/svm_predictions.csv', index=False)

            0         1         2         3         4         5
0    0.050191  0.190817  0.606397  0.019551  0.060109  0.072936
1    0.000710  0.471062  0.064973  0.413661  0.000560  0.049034
2    0.972477  0.010076  0.004033  0.004598  0.003835  0.004981
3    0.954424  0.011392  0.012680  0.004816  0.010146  0.006541
4    0.010583  0.313748  0.510585  0.025192  0.008992  0.130900
5    0.198713  0.061708  0.712828  0.004310  0.011233  0.011208
6    0.021402  0.040071  0.894343  0.003807  0.027164  0.013213
7    0.963618  0.013814  0.006999  0.005107  0.004364  0.006097
8    0.709331  0.065208  0.063210  0.013592  0.083719  0.064939
9    0.014227  0.593120  0.284747  0.013244  0.009985  0.084678
10   0.111423  0.070668  0.638055  0.012992  0.074922  0.091940
11   0.004466  0.759793  0.133714  0.012168  0.003369  0.086491
12   0.993529  0.001556  0.000640  0.002988  0.000476  0.000811
13   0.956614  0.011455  0.009036  0.006044  0.008047  0.008804
14   0.122889  0.137258  0.548581  0.013

In [10]:
def _select_best_svc(X_fit, y_fit, X_val, y_val, cvals, gamavals):
    """Fit one RBF SVC per (C, gamma) pair and return the best on the validation split.

    Returns a ``(model, C, gamma)`` tuple. On ties the earlier candidate wins.
    """
    # Start below any possible accuracy so the first fitted candidate is
    # always taken and the result is never an unfitted model.
    best_model, best_c, best_gamma, best_score = None, None, None, -1.0
    for cval in cvals:
        for gval in gamavals:
            clf = svm.SVC(C=cval, decision_function_shape="ovo", gamma=gval, probability=True)
            clf.fit(X_fit, y_fit)
            sc = clf.score(X_val, y_val)
            print(f"[C: {cval}, gamma: {gval}, accuracy: {sc}]")
            if sc > best_score:
                best_model, best_c, best_gamma, best_score = clf, cval, gval, sc
    return best_model, best_c, best_gamma


def test_feat_combos(index):
    """Repeat the SVM cross-validation experiment for several feature subsets.

    For each combination of the notebook-level feature frames (``data1``,
    ``attr_df``, ``sample1_df``) the data is split 80/20, a 5-fold grid search
    over C and gamma is run on the training part, and one summary row is
    written to the notebook-level ``acc_df``.

    Parameters
    ----------
    index : int
        Row label in ``acc_df`` for the first result; incremented per combination.

    Returns
    -------
    int
        The next free row label after all combinations were recorded, so the
        caller can keep its own counter in sync.
    """
    combos = {
        "X_stats": data1,
        "X_name": attr_df,
        "X_stats, X_name": pd.concat([data1, attr_df], axis=1, sort=False),
        "X_sample1":  pd.concat([sample1_df], axis=1, sort=False),
        "X_name, X_sample1":  pd.concat([attr_df, sample1_df], axis=1, sort=False),
        "X_stats, X_sample1":  pd.concat([data1, sample1_df], axis=1, sort=False),
        "X_stats, X_name, X_sample1":  pd.concat([data1, attr_df, sample1_df], axis=1, sort=False)
    }

    k = 5
    cvals = [0.1, 1, 10, 100, 1000]
    gamavals = [0.0001, 0.001, 0.01, 0.1, 1, 10]

    # Class codes / names used to label the per-class precision values.
    class_codes = sorted(dict_label.values())
    code_to_name = {code: name for name, code in dict_label.items()}

    for combo, features in combos.items():
        print("="*50, combo, "="*50)
        X_train, X_test, y_train, y_test = train_test_split(
            features, y, test_size=0.2, random_state=100)

        X_train_new = X_train.reset_index(drop=True).values
        # 1-D target: avoids the DataConversionWarning for (n, 1) vectors.
        y_train_new = np.ravel(y_train.reset_index(drop=True).values)
        kf = KFold(n_splits=k)

        avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
        avgsc, avgsc_train, avgsc_hld = 0, 0, 0

        # How often each C / gamma value (as a string) won a fold.
        best_param_count = {'C': {}, 'gamma': {}}
        for train_index, test_index in kf.split(X_train_new):
            X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
            y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
            X_train_train, X_val, y_train_train, y_val = train_test_split(
                X_train_cur, y_train_cur, test_size=0.25, random_state=100)

            print('-'*10)
            bestPerformingModel, bestcval, bestgval = _select_best_svc(
                X_train_train, y_train_train, X_val, y_val, cvals, gamavals)

            c_key, gamma_key = str(bestcval), str(bestgval)
            best_param_count['C'][c_key] = best_param_count['C'].get(c_key, 0) + 1
            best_param_count['gamma'][gamma_key] = best_param_count['gamma'].get(gamma_key, 0) + 1

            bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
            bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
            bscr_hld = bestPerformingModel.score(X_test, y_test)

            avgsc_train_lst.append(bscr_train)
            avgsc_lst.append(bscr)
            avgsc_hld_lst.append(bscr_hld)

            avgsc_train = avgsc_train + bscr_train
            avgsc = avgsc + bscr
            avgsc_hld = avgsc_hld + bscr_hld
            print()
            print(f"\t> Best C: {bestcval} || Best gamma: {bestgval}")
            print(f"\t> Best training score: {bscr_train}")
            print(f"\t> Best test score: {bscr}")
            print(f"\t> Best held score: {bscr_hld}")
        print('\t', '-'*10)

        # Per-class precision of the last fold's best model. `labels` pins one
        # entry per class in code order, so a class missing from this split
        # cannot shift values onto the wrong name or cause an IndexError.
        y_pred = bestPerformingModel.predict(X_test)
        prec = metrics.precision_score(y_test, y_pred, labels=class_codes, average=None)
        cat_prec = {code_to_name[code]: prec[pos] for pos, code in enumerate(class_codes)}

        # Parameters that won the most folds (first winner on ties).
        bestcval = max(best_param_count['C'], key=best_param_count['C'].get)
        bestgval = max(best_param_count['gamma'], key=best_param_count['gamma'].get)
        bestparams = {'C': bestcval, 'gamma': bestgval}
        print(f"\t> Best C : {bestcval} || Best gamma : {bestgval}")
        print(f"\t> Per-fold training scores: {avgsc_train_lst}")
        print(f"\t> Per-fold testing scores: {avgsc_lst}")
        print(f"\t> Per-fold held scores: {avgsc_hld_lst}")
        print()
        avgsc_train = avgsc_train/k
        avgsc = avgsc/k
        avgsc_hld = avgsc_hld/k
        print(f"\t> Average training score: {avgsc_train}")
        print(f"\t> Average testing score: {avgsc}")
        print(f"\t> Average held score: {avgsc_hld}")
        acc_df.loc[index] = ['rbf_svm', str(bestparams), combo, avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
        index += 1
        print()

        cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
        print('\tConfusion Matrix: Actual (Row) vs Predicted (Column)')
        print('\t', cnf_matrix)

    return index

In [11]:
# Run the SVM experiment for every feature-set combination; each combination
# adds one row to acc_df, starting at row label `index`. The notebook-level
# `index` itself is not changed by this call.
test_feat_combos(index)
# Persist the accumulated results table (without the row labels).
acc_df.to_csv('data/model_data.csv', index=False)

----------
[C: 0.1, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.01, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.1, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 1, accuracy: 0.5862068965517241]
[C: 0.1, gamma: 10, accuracy: 0.5977011494252874]
[C: 1, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 1, gamma: 0.001, accuracy: 0.45977011494252873]
[C: 1, gamma: 0.01, accuracy: 0.47126436781609193]
[C: 1, gamma: 0.1, accuracy: 0.6206896551724138]


  y = column_or_1d(y, warn=True)


[C: 1, gamma: 1, accuracy: 0.6091954022988506]
[C: 1, gamma: 10, accuracy: 0.6436781609195402]
[C: 10, gamma: 0.0001, accuracy: 0.45977011494252873]
[C: 10, gamma: 0.001, accuracy: 0.47126436781609193]
[C: 10, gamma: 0.01, accuracy: 0.6091954022988506]
[C: 10, gamma: 0.1, accuracy: 0.6551724137931034]
[C: 10, gamma: 1, accuracy: 0.7126436781609196]
[C: 10, gamma: 10, accuracy: 0.7701149425287356]
[C: 100, gamma: 0.0001, accuracy: 0.47126436781609193]
[C: 100, gamma: 0.001, accuracy: 0.6206896551724138]
[C: 100, gamma: 0.01, accuracy: 0.6436781609195402]
[C: 100, gamma: 0.1, accuracy: 0.7011494252873564]
[C: 100, gamma: 1, accuracy: 0.7931034482758621]
[C: 100, gamma: 10, accuracy: 0.7816091954022989]
[C: 1000, gamma: 0.0001, accuracy: 0.6091954022988506]
[C: 1000, gamma: 0.001, accuracy: 0.6666666666666666]
[C: 1000, gamma: 0.01, accuracy: 0.7241379310344828]
[C: 1000, gamma: 0.1, accuracy: 0.8160919540229885]
[C: 1000, gamma: 1, accuracy: 0.8045977011494253]
[C: 1000, gamma: 10, accur

  'precision', 'predicted', average, warn_for)


[C: 0.1, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.001, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.01, accuracy: 0.4367816091954023]
[C: 0.1, gamma: 0.1, accuracy: 0.47126436781609193]
[C: 0.1, gamma: 1, accuracy: 0.45977011494252873]
[C: 0.1, gamma: 10, accuracy: 0.45977011494252873]
[C: 1, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 1, gamma: 0.001, accuracy: 0.4367816091954023]
[C: 1, gamma: 0.01, accuracy: 0.6666666666666666]
[C: 1, gamma: 0.1, accuracy: 0.7471264367816092]
[C: 1, gamma: 1, accuracy: 0.6436781609195402]
[C: 1, gamma: 10, accuracy: 0.6436781609195402]
[C: 10, gamma: 0.0001, accuracy: 0.4367816091954023]
[C: 10, gamma: 0.001, accuracy: 0.6896551724137931]
[C: 10, gamma: 0.01, accuracy: 0.7701149425287356]
[C: 10, gamma: 0.1, accuracy: 0.7586206896551724]
[C: 10, gamma: 1, accuracy: 0.6436781609195402]
[C: 10, gamma: 10, accuracy: 0.6436781609195402]
[C: 100, gamma: 0.0001, accuracy: 0.6896551724137931]
[C: 100, gamma: 0.001, accuracy: 0

[C: 1000, gamma: 1, accuracy: 0.6896551724137931]
[C: 1000, gamma: 10, accuracy: 0.6896551724137931]

	> Best C: 10 || Best gamma: 0.01
	> Best training score: 0.9104046242774566
	> Best test score: 0.7441860465116279
	> Best held score: 0.7981651376146789
	 ----------
	> Best n_estimator : 100 || Best max_depth : 0.01
	> Average training score list: [0.8724637681159421, 0.9246376811594202, 0.9277456647398844, 0.930635838150289, 0.9104046242774566]
	> Average testing score list: [0.8390804597701149, 0.7701149425287356, 0.8023255813953488, 0.7906976744186046, 0.7441860465116279]
	> Average held score list: [0.8073394495412844, 0.7798165137614679, 0.7798165137614679, 0.7889908256880734, 0.7981651376146789]

	> Average training score list: 0.9131775152885986
	> Average testing score list: 0.7892809409248864
	> Average held score list: 0.7908256880733945

	Confusion Matrix: Actual (Row) vs Predicted (Column)
	 [[24  0  3  0  0  0]
 [ 2 13  7  0  0  0]
 [ 2  2 48  0  0  0]
 [ 0  0  0  2  0 

[C: 0.1, gamma: 10, accuracy: 0.5287356321839081]
[C: 1, gamma: 0.0001, accuracy: 0.5287356321839081]
[C: 1, gamma: 0.001, accuracy: 0.5517241379310345]
[C: 1, gamma: 0.01, accuracy: 0.7126436781609196]
[C: 1, gamma: 0.1, accuracy: 0.735632183908046]
[C: 1, gamma: 1, accuracy: 0.6666666666666666]
[C: 1, gamma: 10, accuracy: 0.6551724137931034]
[C: 10, gamma: 0.0001, accuracy: 0.5517241379310345]
[C: 10, gamma: 0.001, accuracy: 0.7126436781609196]
[C: 10, gamma: 0.01, accuracy: 0.8045977011494253]
[C: 10, gamma: 0.1, accuracy: 0.7701149425287356]
[C: 10, gamma: 1, accuracy: 0.6666666666666666]
[C: 10, gamma: 10, accuracy: 0.6666666666666666]
[C: 100, gamma: 0.0001, accuracy: 0.7126436781609196]
[C: 100, gamma: 0.001, accuracy: 0.8045977011494253]
[C: 100, gamma: 0.01, accuracy: 0.7701149425287356]
[C: 100, gamma: 0.1, accuracy: 0.7471264367816092]
[C: 100, gamma: 1, accuracy: 0.6781609195402298]
[C: 100, gamma: 10, accuracy: 0.6666666666666666]
[C: 1000, gamma: 0.0001, accuracy: 0.80459

[C: 10, gamma: 0.1, accuracy: 0.632183908045977]
[C: 10, gamma: 1, accuracy: 0.39080459770114945]
[C: 10, gamma: 10, accuracy: 0.4942528735632184]
[C: 100, gamma: 0.0001, accuracy: 0.6206896551724138]
[C: 100, gamma: 0.001, accuracy: 0.7241379310344828]
[C: 100, gamma: 0.01, accuracy: 0.735632183908046]
[C: 100, gamma: 0.1, accuracy: 0.632183908045977]
[C: 100, gamma: 1, accuracy: 0.39080459770114945]
[C: 100, gamma: 10, accuracy: 0.4942528735632184]
[C: 1000, gamma: 0.0001, accuracy: 0.7241379310344828]
[C: 1000, gamma: 0.001, accuracy: 0.735632183908046]
[C: 1000, gamma: 0.01, accuracy: 0.735632183908046]
[C: 1000, gamma: 0.1, accuracy: 0.632183908045977]
[C: 1000, gamma: 1, accuracy: 0.39080459770114945]
[C: 1000, gamma: 10, accuracy: 0.4942528735632184]

	> Best C: 10 || Best gamma: 0.01
	> Best training score: 0.8497109826589595
	> Best test score: 0.7093023255813954
	> Best held score: 0.7981651376146789
----------
[C: 0.1, gamma: 0.0001, accuracy: 0.5287356321839081]
[C: 0.1, ga

[C: 1000, gamma: 0.001, accuracy: 0.7241379310344828]
[C: 1000, gamma: 0.01, accuracy: 0.7471264367816092]
[C: 1000, gamma: 0.1, accuracy: 0.5977011494252874]
[C: 1000, gamma: 1, accuracy: 0.4482758620689655]
[C: 1000, gamma: 10, accuracy: 0.4482758620689655]

	> Best C: 100 || Best gamma: 0.001
	> Best training score: 0.9277456647398844
	> Best test score: 0.813953488372093
	> Best held score: 0.8256880733944955
----------
[C: 0.1, gamma: 0.0001, accuracy: 0.4827586206896552]
[C: 0.1, gamma: 0.001, accuracy: 0.4827586206896552]
[C: 0.1, gamma: 0.01, accuracy: 0.4827586206896552]
[C: 0.1, gamma: 0.1, accuracy: 0.4827586206896552]
[C: 0.1, gamma: 1, accuracy: 0.4827586206896552]
[C: 0.1, gamma: 10, accuracy: 0.4827586206896552]
[C: 1, gamma: 0.0001, accuracy: 0.5172413793103449]
[C: 1, gamma: 0.001, accuracy: 0.5172413793103449]
[C: 1, gamma: 0.01, accuracy: 0.735632183908046]
[C: 1, gamma: 0.1, accuracy: 0.6206896551724138]
[C: 1, gamma: 1, accuracy: 0.4827586206896552]
[C: 1, gamma: 1

[C: 0.1, gamma: 0.001, accuracy: 0.4482758620689655]
[C: 0.1, gamma: 0.01, accuracy: 0.4482758620689655]
[C: 0.1, gamma: 0.1, accuracy: 0.4482758620689655]
[C: 0.1, gamma: 1, accuracy: 0.4482758620689655]
[C: 0.1, gamma: 10, accuracy: 0.4482758620689655]
[C: 1, gamma: 0.0001, accuracy: 0.45977011494252873]
[C: 1, gamma: 0.001, accuracy: 0.45977011494252873]
[C: 1, gamma: 0.01, accuracy: 0.6781609195402298]
[C: 1, gamma: 0.1, accuracy: 0.632183908045977]
[C: 1, gamma: 1, accuracy: 0.5057471264367817]
[C: 1, gamma: 10, accuracy: 0.4827586206896552]
[C: 10, gamma: 0.0001, accuracy: 0.47126436781609193]
[C: 10, gamma: 0.001, accuracy: 0.6781609195402298]
[C: 10, gamma: 0.01, accuracy: 0.7126436781609196]
[C: 10, gamma: 0.1, accuracy: 0.6551724137931034]
[C: 10, gamma: 1, accuracy: 0.4942528735632184]
[C: 10, gamma: 10, accuracy: 0.4827586206896552]
[C: 100, gamma: 0.0001, accuracy: 0.6551724137931034]
[C: 100, gamma: 0.001, accuracy: 0.7011494252873564]
[C: 100, gamma: 0.01, accuracy: 0.74

[C: 1, gamma: 10, accuracy: 0.41379310344827586]
[C: 10, gamma: 0.0001, accuracy: 0.45977011494252873]
[C: 10, gamma: 0.001, accuracy: 0.6551724137931034]
[C: 10, gamma: 0.01, accuracy: 0.7701149425287356]
[C: 10, gamma: 0.1, accuracy: 0.5747126436781609]
[C: 10, gamma: 1, accuracy: 0.41379310344827586]
[C: 10, gamma: 10, accuracy: 0.41379310344827586]
[C: 100, gamma: 0.0001, accuracy: 0.6666666666666666]
[C: 100, gamma: 0.001, accuracy: 0.7471264367816092]
[C: 100, gamma: 0.01, accuracy: 0.7586206896551724]
[C: 100, gamma: 0.1, accuracy: 0.5632183908045977]
[C: 100, gamma: 1, accuracy: 0.41379310344827586]
[C: 100, gamma: 10, accuracy: 0.41379310344827586]
[C: 1000, gamma: 0.0001, accuracy: 0.7471264367816092]
[C: 1000, gamma: 0.001, accuracy: 0.735632183908046]
[C: 1000, gamma: 0.01, accuracy: 0.7586206896551724]
[C: 1000, gamma: 0.1, accuracy: 0.5517241379310345]
[C: 1000, gamma: 1, accuracy: 0.41379310344827586]
[C: 1000, gamma: 10, accuracy: 0.41379310344827586]

	> Best C: 10 || 