In [None]:
print(dt.now(), "keep instance running")

2021-01-23 14:18:55.853404 keep instance running


# SET UP

## Imports

In [10]:
from datetime import datetime as dt
import json
from json import JSONDecodeError
import traceback

import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import math
import random
from sklearn.metrics import confusion_matrix , accuracy_score , roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split as split 
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

print(dt.now())

2021-01-23 20:43:49.930882


## read/write


In [11]:
def read_f(path):
    try:
        f = open(path, "r")
        s = f.read()
        f.close()
        return s
    except FileNotFoundError:
        print("file not found")
        return None
    except Exception as e:
        raise e
#end

def append_f(path, s):
    f = open(path, "a+")
    f.write(s)
    f.close()
    return
#end

def write_f(path, s):
    f = open(path, "w+")
    f.write(s)
    f.close()
    return
#end

def write_csv_1d(path, s_list):
    s = "\n".join(s_list)
    write_f(path, s)
    return
#end

def valid_json(path):
    j = read_f(path)
    try:
        json.loads(j)
        return True
    except:
        return False
#end

def print_i(i):
    s = str(i)
    if (i%50)==0:
        s = "\n" + s
    print(s, end=", ")
    return i+1
#end

def date_str():
    now = dt.now()
    date = now.strftime("%d_%H_%M_%S")
    return date
#end

print(dt.now())

2021-01-23 20:43:50.297333


## Global Vars

In [17]:
folder_path = "/content/drive/MyDrive/4th_Year/ML_final_assignment/"
source_files_generic_path = folder_path + "token_split/reviews_tfidf_INFIX.json"
FILE_infix = ["1", "2", "2stop", "3", "3stop", "1_2", "1_3", "2_3", "1_2_3", "1_2stop", "1_3stop", "2stop_3stop", "1_2stop_3stop"]
generic_results_path = folder_path + "results/YTYPE_TIME.txt"
generic_plot_path = folder_path + "results/PLOT_TIME.png"
print(dt.now())

2021-01-23 20:55:42.511720


### read in data

In [18]:
def read_in_json_file(file_path, printbool=False):
    reviews_json = json.loads(read_f(file_path))
    dfnan = DataFrame(reviews_json)
    df = dfnan.fillna(0)
    columns = df.columns.tolist()
    columns = [col.strip() for col in columns]
    parameter_list = (columns[2:]) # Get index column and all token columns

    x = np.zeros(shape=(len(df.index), len(parameter_list)))
    print("   ", dt.now(), "dataframe to numpy array", x.shape)
    col_count = 0
    for parameter_name in parameter_list:
        if printbool: print(col_count, parameter_name)
        feature_index = columns.index(parameter_name)
        col = np.array(df.iloc[:,feature_index])
        for i in range(len(col)):
            x[i, col_count] = col[i]
        col_count += 1
        if col_count%1000 == 0:
            print("     ", dt.now(), "done", col_count)
    print("   ", dt.now(), "finished numpyfying")
    y_voted = np.array(df.iloc[:,0])
    y_access = np.array(df.iloc[:,1])
    print("    x.shape:", x.shape, "-- y_voted:", y_voted.shape, "-- y_access:", y_access.shape, "\n")
    return (x, y_voted, y_access)
#end
print(dt.now())

2021-01-23 20:55:47.636449


# Cross Validation

## general x-valid functions

In [19]:
def fig_path(title):
    title = title.replace(" ", "_").replace(":","").replace("\n", "")
    pth = generic_plot_path.replace("PLOT", title).replace("TIME", date_str())
    return pth

def calcMeanStdDev(array):
    n = len(array)
    mean = sum(array)/n
    print("   mean=", mean)
    running_sum = 0
    for x in array:
        tmp = (x-mean)**2
        running_sum += tmp
    #end for
    var = running_sum/(n-1)
    stddev = math.sqrt(var)
    print("   stddev=", stddev)
    return (mean, stddev)
#end

def plotErrBar(x, means, stddevs, title, x_axis, legendloc="upper left", col="green"):
    # plots an error bar of means and standard deviations on the Y axis against a given X
    x_np = np.array(x)
    means_np = np.array(means)
    err = np.array(stddevs)
    errpoints = np.append(means_np+err, means_np-err)
    x_np2 = np.append(x_np, x_np)

    plt.rc("font", size=18)
    fig, ax = plt.subplots(figsize=(12,8))
    ax.set_xlabel(x_axis)
    ax.set_ylabel("mean misclassification error")
    plt.rc("font", size=14)
    plt.title(title)
    ax.errorbar(x_np, means_np, yerr=err, ecolor=col, fmt='--x', label="standard dev")
    ax.scatter(x_np2, errpoints, color=col, marker='o', label="mean")
    ax.legend(loc=legendloc)
    plt.savefig(fig_path(title))
#end

def plotMisclasses(X_axis, Y_all_misclass, legend=FILE_infix, legendloc="upper right", x_label="C", class_label = "Token", model_name="SVC", output_type="voted_up"):
    # takes in X_axis (C/k values), Y_all_misclass, legend
    # plots X_axis_C against Y[i,:] for all legend_poly[i]
    fig, ax = plt.subplots(figsize=(12,8))
    plt.rc("font", size=18)
    title= model_name + " " + output_type + "\nMisclass Rate " + x_label + " values by " + class_label

    # add scatter for each row in Y_all
    cols = ["#ff028d", "#a03623", "#02ab2e", "#fac205", "#bf77f6", "#13eac9", "#0165fc", "#430541", "#ae7181", "#1f6357", "#fe83cc", "#8d8468", "#ca0147"]
    for i in range(len(legend)):
        print("i=", i)
        lab = (class_label + "=" + str(legend[i]).replace("_", ","))
        ax.scatter(X_axis, Y_all_misclass[i], color=cols[i], marker='o', label=lab)
    #end for

    ax.legend(loc=legendloc)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Misclassification Score")
    plt.title(title)
    plt.savefig(fig_path(title))
    print("plotted", title)
#end plotXY()

def confMatrix(actual, prediction):
    # prints and returns matrix and accuracy
    matrix = confusion_matrix(actual, prediction)
    accuracy = accuracy_score(actual, prediction)
    misclass = 1-accuracy
    s = ("Confusion Matrix=\n" + str(matrix) + 
            "\nAccurracy = " + str(accuracy) +
            "\nMisclass  = " + str(misclass) )
    print("      Misclassification = ", misclass)
    return (s, matrix, accuracy, misclass)
#end

def applyKFold(model, X, y, results_path, f=5):
    kfold = KFold(n_splits=f)
    misclass_array = []
    for train, test in kfold.split(y):
        print("\n   ", dt.now(), "kfold")
        model.fit(X[train],y[train])
        y_pred = model.predict(X[test])
        (s, _, _, misclass) = confMatrix(y[test], y_pred)
        if results_path is not None: append_f(results_path, (s+"\n\n"))
        misclass_array.append(misclass)
    #end for
    return misclass_array
#end

print(dt.now())

2021-01-23 20:56:14.436435


## logistic regression and SVC L2 cross-validation

In [7]:
def tokens_c_values_x_valid(train_voted=True, tokens=None, ModelType=SVC, model_str="SVC", Cs=None, plot_logC=True, plot_errbar=False, results_path=None):
    if Cs is None: Cs = [0.001, 0.01, 0.1, 1, 10]
    if tokens is None: tokens = FILE_infix

    if train_voted: output_type = "voted_up"
    else: output_type = "early_access"
    title = (model_str + " " + output_type + "\nTokens TT: C value CC")

    if plot_logC:
        c_np = np.log10(Cs)
        label = r'$log_{10}(C)$'
    else:
        c_np = np.array(Cs)
        label = "C"

    all_final_misclasses = []
    for t in tokens:
        t_title = title.replace("TT", t)
        print("\n\n       ", dt.now(), ":", t_title, end=" -- ")
        means_t = []
        stddevs_t = []
        final_misclass_t = []
        # get data
        (X, y_voted, y_access) = read_in_json_file(source_files_generic_path.replace("INFIX", t))
        if train_voted: y = y_voted
        else: y = y_access
        print(X[:1],"\n", X[1:])
        print(X.shape)
        for c in Cs:
            print(dt.now(), c)
            c_title = t_title.replace("CC", str(c))
            model = ModelType(C=c, max_iter=5000)
            if results_path is not None: append_f(results_path, ("\n\n\n"+ c_title + "\n"))
            misclass_array = applyKFold(model, X, y, results_path)
            (mean_c, stddev_c) = calcMeanStdDev(misclass_array)
            if results_path is not None:
                s = ("Mean    = " + str(mean_c) + "\nStd Dev = " + str(stddev_c))
                append_f(results_path, (s+"\n\n"))
            means_t.append(mean_c)
            stddevs_t.append(stddev_c)
            final_misclass_t.append(misclass_array[-1])
        #end for
        if plot_errbar:
            title = (t_title.replace("C value CC", "Error Bars"))
            plotErrBar(c_np, means_t, stddevs_t, title, x_label=label)
        all_final_misclasses.append(final_misclass_t)
        #end if
    #end for
    plotMisclasses(c_np, all_final_misclasses, legend=tokens, x_label=label, model_name=model_str, output_type=output_type)
    return
#end

def log_reg_x_valid(tokens=None, Cs=None, plot_logC=True, results_path=None, y_voted=True, plot_errbar=False):
    tokens_c_values_x_valid(train_voted=y_voted, tokens=tokens, ModelType=LogisticRegression, 
                            model_str="LogisticRegression", Cs=Cs, plot_logC=plot_logC, results_path=results_path)

def svc_x_valid(tokens=None, Cs=None, plot_logC=None, results_path=None, y_voted=True, plot_errbar=False):
    tokens_c_values_x_valid(train_voted=y_voted, tokens=tokens, ModelType=SVC, 
                            model_str="SVC", Cs=Cs, plot_logC=plot_logC, results_path=results_path)
print(dt.now())

2021-01-23 17:00:05.208121


## Decision Tree cross-validation

In [20]:
def get_features_value(all_features, feature_param):
    if feature_param is None:
        max_feats = all_features
        feature_param = "None"
    elif feature_param == "sqrt": max_feats = math.sqrt(all_features)
    elif feature_param == "log2": max_feats = math.log2(all_features)
    else: 
        max_feats = (feature_param*all_features)
        feature_param = str(feature_param)
    max_feats = math.ceil(max_feats)
    ft_str = (str(max_feats) + " (" + feature_param + ")")
    return (max_feats, ft_str)

def dec_tree_x_valid(train_voted=True, tokens=None, max_feat=None, plot_errbar=True, results_path=None):
    if max_feat is None: max_feat = [None, "sqrt", "log2", 0.1, 0.25, 0.5, 0.75, 0.9]
    if tokens is None: tokens = FILE_infix

    if train_voted: output_type = "voted_up"
    else: output_type = "early_access"
    title = ("DecisionTree " + output_type + "\nTokens TT: max_features MAX")

    all_final_misclasses = []
    for t in tokens:
        t_title = title.replace("TT", t)
        print("\n\n       ", dt.now(), ":", t_title)
        means_t = []
        stddevs_t = []
        final_misclass_t = []
        # get data
        (X, y_voted, y_access) = read_in_json_file(source_files_generic_path.replace("INFIX", t))
        if train_voted: y = y_voted
        else: y = y_access
        print(X[:1],"\n", X[1:])
        print(X.shape)
        (_, x_feat) = X.shape
        feat_values = []
        for feat in max_feat:
            (ft_val, ft_str) = get_features_value(x_feat, feat)
            feat_values.append(ft_val)
            print(dt.now(), ft_str)
            l_title = t_title.replace("MAX", ft_str)
            model = DecisionTreeClassifier(max_features=feat)

            if results_path is not None: append_f(results_path, ("\n\n\n"+ l_title + "\n"))
            misclass_array = applyKFold(model, X, y, results_path)
            (mean_c, stddev_c) = calcMeanStdDev(misclass_array)
            if results_path is not None:
                s = ("Mean    = " + str(mean_c) + "\nStd Dev = " + str(stddev_c))
                append_f(results_path, (s+"\n\n"))
            means_t.append(mean_c)
            stddevs_t.append(stddev_c)
            final_misclass_t.append(misclass_array[-1])
        #end for
        if plot_errbar:
            eb_title = (t_title.replace("max_features MAX", "Error Bars"))
            try: plotErrBar(feat_values, means_t, stddevs_t, eb_title, x_label="max_features")
            except Exception as e:
                print("\nCould not plot error bars:", e, "\n")
                continue
        all_final_misclasses.append(final_misclass_t)
        #end if
    #end for
    s = "Misclassification Array\n["
    for row in all_final_misclasses:
        s = s + "    [" + ", ".join(row) + "]\n"
    s = s+"]"
    if results_path is not None: append_f(results_path, ("\n\n\n"+ all_final_misclasses))
    # plotMisclasses(plot_max_feat, all_final_misclasses, legend=tokens, x_label="min_samples_leaf", model_name="DecisionTreeClassifier", output_type=output_type)
    return
#end


print(dt.now())

2021-01-23 20:56:17.779900


## RUN X-VALIDATION

In [None]:
def cross_validate(voted=True, model="Logistic", tokens=None, values=None):
    if voted: output = "voted_up"
    else: output = "early_access"
    results_path = generic_results_path.replace("YTYPE", (output+"_"+model)).replace("TIME", date_str())
    print(results_path)
    write_f(results_path, "")
    if model=="SVC": svc_x_valid(results_path=results_path, y_voted=voted, tokens=tokens, Cs=values)
    elif model=="Logistic": log_reg_x_valid(results_path=results_path, y_voted=voted, tokens=tokens, Cs=values)
    elif model=="Decision": dec_tree_x_valid(results_path=results_path, train_voted=voted, tokens=tokens, max_feat=values)
    else: print("not valid model:  ", model)
    print("finished")

#cross_validate(model="Logistic")
#cross_validate(model="SVC")
cross_validate(model="Decision")

/content/drive/MyDrive/4th_Year/ML_final_assignment/results/voted_up_Decision_23_20_56_20.txt


        2021-01-23 20:56:20.792183 : DecisionTree voted_up
Tokens 1: max_features MAX
    2021-01-23 20:56:28.444248 dataframe to numpy array (4791, 6252)
      2021-01-23 20:56:29.959717 done 1000
      2021-01-23 20:56:31.468098 done 2000
      2021-01-23 20:56:33.072789 done 3000
      2021-01-23 20:56:34.697322 done 4000
      2021-01-23 20:56:36.313592 done 5000
      2021-01-23 20:56:37.893596 done 6000
    2021-01-23 20:56:38.318148 finished numpyfying
    x.shape: (4791, 6252) -- y_voted: (4791,) -- y_access: (4791,) 

[[  10.92684516   10.41597967 1002.53804348 ...    0.
     0.            0.        ]] 
 [[  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 ...
 [  0.           0.56302593   0.         

# General Functions

## run models

In [None]:
def trainLogReg(X, y, C):
    model = LogisticRegression(penalty="l2", C=C)

# Run Voted_Up

/content/drive/MyDrive/4th_Year/ML_final_assignment/results/voted_up_23_16_05_59.txt


        2021-01-23 16:05:59.491074 : LogisticRegression voted_up
Tokens 1: C value CC --     2021-01-23 16:06:05.524425 dataframe to numpy array (4791, 6252)
      2021-01-23 16:06:06.999792 done 1000
      2021-01-23 16:06:08.500665 done 2000
      2021-01-23 16:06:10.000960 done 3000
      2021-01-23 16:06:11.516687 done 4000
      2021-01-23 16:06:13.111979 done 5000
      2021-01-23 16:06:14.688132 done 6000
    2021-01-23 16:06:15.095580 finished numpyfying
    x.shape: (4791, 6252) -- y_voted: (4791,) -- y_access: (4791,) 

[[  10.92684516   10.41597967 1002.53804348 ...    0.
     0.            0.        ]] 
 [[  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 ...
 [  0.           0.56302593   0.         ... 21

# Run Early_Access