In [1]:
################################
# Scientific imports
################################
import gc
import matplotlib.pyplot as plt
import numpy as np
import fnmatch

################################
# General imports
################################
import csv, math, io, os, os.path, sys, random, time, json
import pandas as pd
import seaborn as sb
from tqdm.notebook import tqdm, trange

################################
# SciKitLearn Imports
################################
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

from IPython.display import display

################################
# MatPlotLib Settings
################################
#plt.rcParams["figure.figsize"] = (20,9)
# `sb` is seaborn (imported above); this call is made only for its side
# effect — presumably applying seaborn's default plot styling (TODO confirm
# against the installed seaborn version). The figure-size override above is
# left disabled.
sb.set()

In [2]:
# Collect the paths of the JSON result files found under ./confusionmatrices/

jsonList = []

for dirpath, _subdirs, filenames in os.walk("./confusionmatrices/"):
    for fname in filenames:
        # Files with "_alpha_" in their name are reported but not collected
        if fnmatch.fnmatch(fname, '*_alpha_*'):
            print("Alpha file found: {}".format(fname))
            continue
        if fname.endswith(".json"):
            jsonList.append(os.path.join(dirpath, fname))

# Sort the paths so later cells see them in a stable order
jsonList = sorted(jsonList)
print("Number of JSON files: {}".format(len(jsonList)))
for idx in range(len(jsonList)):
    print ("{}: {}".format(idx, jsonList[idx]))
    

Alpha file found: NaiveBayes_FFT_alpha_2.json
Alpha file found: NaiveBayes_FOLDED_alpha_9.json
Alpha file found: NaiveBayes_FFT_alpha_4.json
Alpha file found: NaiveBayes_FOLDED_alpha_1.json
Alpha file found: NaiveBayes_FOLDED_alpha_5.json
Alpha file found: NaiveBayes_FFT_alpha_8.json
Alpha file found: NaiveBayes_FOLDED_alpha_2.json
Alpha file found: NaiveBayes_FFT_alpha_7.json
Alpha file found: NaiveBayes_FOLDED_alpha_3.json
Alpha file found: NaiveBayes_FOLDED_alpha_10.json
Alpha file found: NaiveBayes_FFT_alpha_1.json
Alpha file found: NaiveBayes_FOLDED_alpha_4.json
Alpha file found: NaiveBayes_FFT_alpha_5.json
Alpha file found: NaiveBayes_FFT_alpha_6.json
Alpha file found: NaiveBayes_FFT_alpha_9.json
Alpha file found: NaiveBayes_FFT_alpha_10.json
Alpha file found: NaiveBayes_FOLDED_alpha_6.json
Alpha file found: NaiveBayes_FFT_alpha_3.json
Alpha file found: NaiveBayes_FOLDED_alpha_8.json
Alpha file found: NaiveBayes_FOLDED_alpha_7.json
Number of JSON files: 9
0: ./confusionmatrices/N

In [3]:
# Parse every collected JSON file; jsondata keeps the same order as jsonList
jsondata = []
for json_path in jsonList:
    with open(json_path) as handle:
        jsondata.append(json.load(handle))

In [4]:
# One entry per top-level key of each loaded JSON document, in load order.
# Duplicates are not removed and the order is not changed.
algodata = []
algolist = [name for algorithm in jsondata for name in algorithm]
algolist

['NaiveBayes',
 'NaiveBayes_FFT',
 'NaiveBayes_FOLDED',
 'RandomTree',
 'RandomTree_FFT',
 'RandomTree_FOLDED',
 'SVM',
 'SVM_FFT',
 'SVM_FOLDED']

In [5]:
# Value stored under every top-level key of each loaded JSON document,
# visited in the same document/key order that algolist is built in
keylist = [algorithm[name] for algorithm in jsondata for name in algorithm]

keylist

[[{'tstart': '2021-07-05 05:56:47.105082',
   'tdelta': '0:00:37.684380',
   'tfinish': '2021-07-05 05:57:24.789462',
   'TN': '835',
   'FP': '2422',
   'FN': '143',
   'TP': '600',
   'dateran': '2021-07-05 05:56:47'}],
 [{'tstart': '2021-07-12 13:58:58.564361',
   'tdelta': '0:00:38.065440',
   'tfinish': '2021-07-12 13:59:36.629801',
   'TN': '2234',
   'FP': '1023',
   'FN': '489',
   'TP': '254',
   'dateran': '2021-07-12 13:58:58'}],
 [{'tstart': '2021-07-05 05:59:40.714287',
   'tdelta': '0:00:37.274896',
   'tfinish': '2021-07-05 06:00:17.989183',
   'TN': '835',
   'FP': '2422',
   'FN': '143',
   'TP': '600',
   'dateran': '2021-07-05 05:59:40'}],
 [{'tstart': '2021-06-15 02:18:38.289980',
   'tdelta': '0:17:02.313948',
   'tfinish': '2021-06-15 02:35:40.603928',
   'TN': '3108',
   'FP': '149',
   'FN': '719',
   'TP': '24',
   'dateran': '2021-06-15 02:18:38'}],
 [{'tstart': '2021-06-15 02:59:32.451708',
   'tdelta': '0:11:45.243566',
   'tfinish': '2021-06-15 03:11:17.695

In [6]:
# Column names are the keys of the very first result record
columnList = [column for column in keylist[0][0].keys()]
columnList

['tstart', 'tdelta', 'tfinish', 'TN', 'FP', 'FN', 'TP', 'dateran']

In [7]:
# Flatten the first result record of every keylist entry into a plain list
# of its values, keeping the record's own key order.
jsondatalist = [list(records[0].values()) for records in keylist]

print(jsondatalist[0])

# Convert TN, FP, FN, TP into ints, not strings.
# Positions 3 .. len(row)-2 hold those counts for the column order shown by
# columnList; the three leading time fields and the trailing 'dateran'
# field are left as strings.
for row in jsondatalist:
    for col in range(3, len(row) - 1):
        try:
            row[col] = int(row[col])
        except ValueError:
            # Make the failure visible; the value stays an unconverted string.
            print("Row is a time not an int")

# Print the converted rows
print(jsondatalist)

['2021-07-05 05:56:47.105082', '0:00:37.684380', '2021-07-05 05:57:24.789462', '835', '2422', '143', '600', '2021-07-05 05:56:47']
[['2021-07-05 05:56:47.105082', '0:00:37.684380', '2021-07-05 05:57:24.789462', 835, 2422, 143, 600, '2021-07-05 05:56:47'], ['2021-07-12 13:58:58.564361', '0:00:38.065440', '2021-07-12 13:59:36.629801', 2234, 1023, 489, 254, '2021-07-12 13:58:58'], ['2021-07-05 05:59:40.714287', '0:00:37.274896', '2021-07-05 06:00:17.989183', 835, 2422, 143, 600, '2021-07-05 05:59:40'], ['2021-06-15 02:18:38.289980', '0:17:02.313948', '2021-06-15 02:35:40.603928', 3108, 149, 719, 24, '2021-06-15 02:18:38'], ['2021-06-15 02:59:32.451708', '0:11:45.243566', '2021-06-15 03:11:17.695274', 3247, 10, 742, 1, '2021-06-15 02:59:32'], ['2021-07-05 06:05:16.610343', '1:09:03.704532', '2021-07-05 07:14:20.314875', 3195, 62, 726, 17, '2021-07-05 06:05:16'], ['2021-06-15 03:16:57.728669', '12:17:27.989239', '2021-06-15 15:34:25.717908', 3257, 0, 743, 0, '2021-06-15 03:16:57'], ['2021-0

In [8]:
# One row per algorithm name, one column per record field
df = pd.DataFrame(data=jsondatalist, index=algolist, columns=columnList)
df

Unnamed: 0,tstart,tdelta,tfinish,TN,FP,FN,TP,dateran
NaiveBayes,2021-07-05 05:56:47.105082,0:00:37.684380,2021-07-05 05:57:24.789462,835,2422,143,600,2021-07-05 05:56:47
NaiveBayes_FFT,2021-07-12 13:58:58.564361,0:00:38.065440,2021-07-12 13:59:36.629801,2234,1023,489,254,2021-07-12 13:58:58
NaiveBayes_FOLDED,2021-07-05 05:59:40.714287,0:00:37.274896,2021-07-05 06:00:17.989183,835,2422,143,600,2021-07-05 05:59:40
RandomTree,2021-06-15 02:18:38.289980,0:17:02.313948,2021-06-15 02:35:40.603928,3108,149,719,24,2021-06-15 02:18:38
RandomTree_FFT,2021-06-15 02:59:32.451708,0:11:45.243566,2021-06-15 03:11:17.695274,3247,10,742,1,2021-06-15 02:59:32
RandomTree_FOLDED,2021-07-05 06:05:16.610343,1:09:03.704532,2021-07-05 07:14:20.314875,3195,62,726,17,2021-07-05 06:05:16
SVM,2021-06-15 03:16:57.728669,12:17:27.989239,2021-06-15 15:34:25.717908,3257,0,743,0,2021-06-15 03:16:57
SVM_FFT,2021-06-15 03:17:01.118975,12:11:30.035255,2021-06-15 15:28:31.154230,3257,0,743,0,2021-06-15 03:17:01
SVM_FOLDED,2021-07-08 02:10:34.184898,0:46:36.043480,2021-07-08 02:57:10.228378,553,0,114,0,2021-07-08 02:10:34


accuracy = (TP + TN) / (TP + FP + TN + FN)  
precision = TP / (TP + FP)  
recall = TP / (TP + FN)  

In [9]:
# Total number of classified samples per row
df['sum'] = df['TP'] + df['TN'] + df['FP'] + df['FN']

# Share of all samples that were classified correctly
df['accuracy'] = (df['TP'] + df['TN']) / df['sum']
# Share of predicted positives that are true positives
df['precision'] = df['TP'] / (df['TP'] + df['FP'])
# Share of actual positives that were predicted positive
df['recall'] = df['TP'] / (df['TP'] + df['FN'])

In [10]:
# Remove the start/finish/date-ran columns from the frame

#df['common-ness'] = (df.TP + df.FN)/df['sum']
df = df.drop(['tstart', 'tfinish', 'dateran'], axis="columns")

In [11]:
# Display the frame: tdelta, the four counts, and the derived sum/accuracy/precision/recall columns
df

Unnamed: 0,tdelta,TN,FP,FN,TP,sum,accuracy,precision,recall
NaiveBayes,0:00:37.684380,835,2422,143,600,4000,0.35875,0.198544,0.807537
NaiveBayes_FFT,0:00:38.065440,2234,1023,489,254,4000,0.622,0.198904,0.341857
NaiveBayes_FOLDED,0:00:37.274896,835,2422,143,600,4000,0.35875,0.198544,0.807537
RandomTree,0:17:02.313948,3108,149,719,24,4000,0.783,0.138728,0.032301
RandomTree_FFT,0:11:45.243566,3247,10,742,1,4000,0.812,0.090909,0.001346
RandomTree_FOLDED,1:09:03.704532,3195,62,726,17,4000,0.803,0.21519,0.02288
SVM,12:17:27.989239,3257,0,743,0,4000,0.81425,,0.0
SVM_FFT,12:11:30.035255,3257,0,743,0,4000,0.81425,,0.0
SVM_FOLDED,0:46:36.043480,553,0,114,0,667,0.829085,,0.0


**TODO:** Make a plot for Naive Bayes showing how accuracy, precision, recall, etc. change with varying alpha.

**TODO:** Include all(?) hyperparameters in the pipeline, calculate accuracy/recall/precision for each run, and save the results so they can be plotted (bar, scatter, etc.).

In [12]:
# Show the algorithm names again; the base names are derived from this list next
algolist

['NaiveBayes',
 'NaiveBayes_FFT',
 'NaiveBayes_FOLDED',
 'RandomTree',
 'RandomTree_FFT',
 'RandomTree_FOLDED',
 'SVM',
 'SVM_FFT',
 'SVM_FOLDED']

In [13]:
# Base algorithm names: the entries of algolist that contain no underscore
uniquealgolist = list(filter(lambda name: "_" not in name, algolist))
uniquealgolist

['NaiveBayes', 'RandomTree', 'SVM']

In [14]:
# One sub-frame per base algorithm: every row of df whose index label
# begins with that base name, in the order of uniquealgolist
dfTable = [df[df.index.str.startswith(base)] for base in uniquealgolist]

In [16]:
# Sub-frame for the first base name in uniquealgolist
dfTable[0]

Unnamed: 0,tdelta,TN,FP,FN,TP,sum,accuracy,precision,recall
NaiveBayes,0:00:37.684380,835,2422,143,600,4000,0.35875,0.198544,0.807537
NaiveBayes_FFT,0:00:38.065440,2234,1023,489,254,4000,0.622,0.198904,0.341857
NaiveBayes_FOLDED,0:00:37.274896,835,2422,143,600,4000,0.35875,0.198544,0.807537


In [17]:
# Sub-frame for the second base name in uniquealgolist
dfTable[1]

Unnamed: 0,tdelta,TN,FP,FN,TP,sum,accuracy,precision,recall
RandomTree,0:17:02.313948,3108,149,719,24,4000,0.783,0.138728,0.032301
RandomTree_FFT,0:11:45.243566,3247,10,742,1,4000,0.812,0.090909,0.001346
RandomTree_FOLDED,1:09:03.704532,3195,62,726,17,4000,0.803,0.21519,0.02288


In [15]:
# Sub-frame for the third base name in uniquealgolist
dfTable[2]

Unnamed: 0,tdelta,TN,FP,FN,TP,sum,accuracy,precision,recall
SVM,12:17:27.989239,3257,0,743,0,4000,0.81425,,0.0
SVM_FFT,12:11:30.035255,3257,0,743,0,4000,0.81425,,0.0
SVM_FOLDED,0:46:36.043480,553,0,114,0,667,0.829085,,0.0


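This shows that SVM is essentially useless for what I want it to do: in all three variants it never predicts the positive class (TP = 0 and FP = 0), so recall is 0, precision is undefined (0/0), and its seemingly high accuracy only reflects the share of negatives in the test set.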