# with regularization, original models

In [1]:
import pandas as pd

import numpy as np

import math

import copy

import matplotlib.pyplot as plt

import statistics

In [2]:
def get_average_and_std(lst):
    """Return ``(mean, sample standard deviation)`` of the numbers in ``lst``.

    The mean is computed as ``sum(lst) / len(lst)``, so an empty ``lst``
    raises ``ZeroDivisionError``. ``statistics.stdev`` requires at least two
    values and raises ``statistics.StatisticsError`` otherwise.
    """
    mean_value = sum(lst) / len(lst)
    spread = statistics.stdev(lst)
    return mean_value, spread

def remove_nan_and_float(lists):
    """Strip NaN entries from every row and cast what remains to ``int``.

    An entry is dropped when its string form is exactly ``'nan'``; every
    other entry is passed through ``int()``. Returns a new list of lists and
    leaves ``lists`` untouched.
    """
    def _clean(row):
        # Compare on the string form, which is how a float NaN renders.
        return [int(value) for value in row if str(value) != 'nan']

    return [_clean(row) for row in lists]

def import_log(filepath):
    """Load the CSV at ``filepath`` and return its rows as lists of ints.

    The file is read with ``pandas.read_csv``; each row is then cleaned by
    ``remove_nan_and_float``, which drops NaN cells and casts the rest to int.
    NOTE(review): NaN cells presumably pad traces shorter than the widest
    row -- confirm against the CSV layout.
    """
    frame = pd.read_csv(filepath)
    raw_rows = frame.values.tolist()
    return remove_nan_and_float(raw_rows)

def count_variant(log, variant):
    """Return how many traces in ``log`` compare equal to ``variant``."""
    return sum(1 for trace in log if trace == variant)

def get_counts(log, variants):
    """Return, for each entry of ``variants``, its number of occurrences in ``log``.

    The result is a list of ints aligned with ``variants`` (0 for a variant
    that never occurs). ``log`` is tallied once up front instead of being
    rescanned for every variant, so the cost is linear in
    ``len(log) + len(variants)`` rather than their product.

    Traces and variants are compared as tuples of their elements, so their
    elements must be hashable.
    """
    from collections import Counter

    # One pass over the log; tuples are used because lists cannot be dict keys.
    tally = Counter(tuple(trace) for trace in log)
    return [tally[tuple(variant)] for variant in variants]

def get_variants_list(lst):
    """Return the distinct traces found in ``lst``, each as a list.

    Duplicates are removed through a set, so the order of the returned
    variants is whatever the set yields and should not be relied upon.
    """
    # Lists are unhashable, so deduplicate on their tuple form.
    unique_traces = {tuple(trace) for trace in lst}
    return [list(trace) for trace in unique_traces]

In [3]:
def get_fitness(occ_each_trvar_sim, occ_each_trvar_tr):
    """Return the count-based fitness of a simulated log against a training log.

    Both arguments are index-aligned lists of occurrence counts.
    For every position the smaller of the two counts is divided by the total
    of ``occ_each_trvar_tr``; the quotients are summed. Returns 0 when
    ``occ_each_trvar_sim`` is empty.

    The total is computed once rather than once per element.
    """
    total_train = sum(occ_each_trvar_tr)
    return sum(
        min(sim_count, occ_each_trvar_tr[i]) / total_train
        for i, sim_count in enumerate(occ_each_trvar_sim)
    )

def get_precision(occ_each_simvar_sim, occ_each_simvar_trte):
    """Return the count-based precision of a simulated log.

    Both arguments are index-aligned lists of occurrence counts.
    For every position the smaller of the two counts is divided by the total
    of ``occ_each_simvar_sim``; the quotients are summed. Returns 0 when
    ``occ_each_simvar_sim`` is empty.

    The total is computed once rather than once per element.
    """
    total_sim = sum(occ_each_simvar_sim)
    return sum(
        min(sim_count, occ_each_simvar_trte[i]) / total_sim
        for i, sim_count in enumerate(occ_each_simvar_sim)
    )

def get_generalization(occ_each_tevar_sim, occ_each_tevar_te):
    """Return the count-based generalization of a simulated log on a test log.

    Both arguments are index-aligned lists of occurrence counts.
    For every position the smaller of the two counts is divided by the total
    of ``occ_each_tevar_te``; the quotients are summed. Returns 0 when
    ``occ_each_tevar_sim`` is empty.

    The total is computed once rather than once per element.
    """
    total_test = sum(occ_each_tevar_te)
    return sum(
        min(sim_count, occ_each_tevar_te[i]) / total_test
        for i, sim_count in enumerate(occ_each_tevar_sim)
    )

In [4]:
def LOVOCV(modelname):
    """Evaluate ``modelname`` over all folds with the count-based metrics.

    There is one fold per row of ``Variants/CSV/Variants_<modelname>.csv``.
    For fold ``i`` the training, test and simulated logs are read from
    ``LOVOCV/<modelname>/{Training_Logs/Train,Test_Logs/Test,Simulated_Logs/Sim}<i>.csv``
    and scored with ``get_fitness``, ``get_precision`` and
    ``get_generalization``. Precision is measured against the full log
    ``Full_Logs/CSV/Log_<modelname>.csv``.

    Returns:
        ``(fitness_ave, fitness_std, precision_ave, precision_std,
        generalization_ave, generalization_std)`` across the folds.
    """
    variants = import_log('Variants/CSV/Variants_'+modelname+'.csv')
    traintestlog = import_log('Full_Logs/CSV/Log_'+modelname+'.csv')

    fitness_arr = []
    precision_arr = []
    generalization_arr = []

    fold_dir = 'LOVOCV/'+modelname
    for i_var in range(len(variants)):
        fold = str(i_var)
        train_log = import_log(fold_dir+'/Training_Logs/Train'+fold+".csv")
        test_log = import_log(fold_dir+'/Test_Logs/Test'+fold+".csv")
        sim_log = import_log(fold_dir+'/Simulated_Logs/Sim'+fold+".csv")

        # Distinct variants of each log; the count lists below are aligned to them.
        trvar = get_variants_list(train_log)
        simvar = get_variants_list(sim_log)
        tevar = get_variants_list(test_log)

        # How often the simulated log produces each train / test / simulated variant.
        occ_each_trvar_sim = get_counts(sim_log, trvar)
        occ_each_tevar_sim = get_counts(sim_log, tevar)
        occ_each_simvar_sim = get_counts(sim_log, simvar)

        # Reference counts: train variants in the train log, test variants in
        # the test log, simulated variants in the full (train + test) log.
        occ_each_trvar_tr = get_counts(train_log, trvar)
        occ_each_tevar_te = get_counts(test_log, tevar)
        occ_each_simvar_trte = get_counts(traintestlog, simvar)

        fitness_arr.append(get_fitness(occ_each_trvar_sim, occ_each_trvar_tr))
        precision_arr.append(get_precision(occ_each_simvar_sim, occ_each_simvar_trte))
        generalization_arr.append(get_generalization(occ_each_tevar_sim, occ_each_tevar_te))

    generalization_ave, generalization_std = get_average_and_std(generalization_arr)
    precision_ave, precision_std = get_average_and_std(precision_arr)
    fitness_ave, fitness_std = get_average_and_std(fitness_arr)

    return fitness_ave, fitness_std, precision_ave, precision_std, generalization_ave, generalization_std

In [5]:
def get_abs_fitness(occ_each_trvar_sim, occ_each_trvar_tr):
    """Return the fraction of entries in ``occ_each_trvar_sim`` that are positive.

    ``occ_each_trvar_tr`` is accepted for signature parity with the other
    fitness functions but is not used. An empty ``occ_each_trvar_sim`` raises
    ``ZeroDivisionError``.
    """
    reproduced = sum(1 for count in occ_each_trvar_sim if count > 0)
    return reproduced / len(occ_each_trvar_sim)

def get_abs_precision(occ_each_simvar_sim, occ_each_simvar_trte):
    """Return the fraction of entries in ``occ_each_simvar_trte`` that are positive.

    ``occ_each_simvar_sim`` is accepted for signature parity with the other
    precision functions but is not used. An empty ``occ_each_simvar_trte``
    raises ``ZeroDivisionError``.
    """
    observed = sum(1 for count in occ_each_simvar_trte if count > 0)
    return observed / len(occ_each_simvar_trte)

def get_abs_generalization(occ_each_tevar_sim, occ_each_tevar_te):
    """Return the fraction of entries in ``occ_each_tevar_sim`` that are positive.

    ``occ_each_tevar_te`` is accepted for signature parity with the other
    generalization functions but is not used. An empty ``occ_each_tevar_sim``
    raises ``ZeroDivisionError``.
    """
    # Float start value keeps the accumulator a float, as in a 0.0-seeded loop.
    reproduced = sum((1.0 for count in occ_each_tevar_sim if count > 0), 0.0)
    return reproduced / len(occ_each_tevar_sim)

def Abs_LOVOCV(modelname):
    """Evaluate ``modelname`` over all folds with the "abs" (variant-level) metrics.

    There is one fold per row of ``Variants/CSV/Variants_<modelname>.csv``.
    For fold ``i`` the training, test and simulated logs are read from
    ``LOVOCV/<modelname>/{Training_Logs/Train,Test_Logs/Test,Simulated_Logs/Sim}<i>.csv``
    and scored with ``get_abs_fitness``, ``get_abs_precision`` and
    ``get_abs_generalization``. Precision is measured against the full log
    ``Full_Logs/CSV/Log_<modelname>.csv``.

    Returns:
        ``(fitness_ave, fitness_std, precision_ave, precision_std,
        generalization_ave, generalization_std)`` across the folds.
    """
    variants = import_log('Variants/CSV/Variants_'+modelname+'.csv')
    traintestlog = import_log('Full_Logs/CSV/Log_'+modelname+'.csv')

    fitness_arr = []
    precision_arr = []
    generalization_arr = []

    fold_dir = 'LOVOCV/'+modelname
    for i_var in range(len(variants)):
        fold = str(i_var)
        train_log = import_log(fold_dir+'/Training_Logs/Train'+fold+".csv")
        test_log = import_log(fold_dir+'/Test_Logs/Test'+fold+".csv")
        sim_log = import_log(fold_dir+'/Simulated_Logs/Sim'+fold+".csv")

        # Distinct variants of each log; the count lists below are aligned to them.
        trvar = get_variants_list(train_log)
        simvar = get_variants_list(sim_log)
        tevar = get_variants_list(test_log)

        # How often the simulated log produces each train / test / simulated variant.
        occ_each_trvar_sim = get_counts(sim_log, trvar)
        occ_each_tevar_sim = get_counts(sim_log, tevar)
        occ_each_simvar_sim = get_counts(sim_log, simvar)

        # Reference counts: train variants in the train log, test variants in
        # the test log, simulated variants in the full (train + test) log.
        occ_each_trvar_tr = get_counts(train_log, trvar)
        occ_each_tevar_te = get_counts(test_log, tevar)
        occ_each_simvar_trte = get_counts(traintestlog, simvar)

        fitness_arr.append(get_abs_fitness(occ_each_trvar_sim, occ_each_trvar_tr))
        precision_arr.append(get_abs_precision(occ_each_simvar_sim, occ_each_simvar_trte))
        generalization_arr.append(get_abs_generalization(occ_each_tevar_sim, occ_each_tevar_te))

    generalization_ave, generalization_std = get_average_and_std(generalization_arr)
    precision_ave, precision_std = get_average_and_std(precision_arr)
    fitness_ave, fitness_std = get_average_and_std(fitness_arr)

    return fitness_ave, fitness_std, precision_ave, precision_std, generalization_ave, generalization_std

In [6]:
def get_relaxed_fitness(occ_each_trvar_sim, occ_each_trvar_tr):
    """Return the share of training occurrences whose variant was simulated.

    Adds up ``occ_each_trvar_tr[i]`` for every index ``i`` where
    ``occ_each_trvar_sim[i]`` is positive and divides by the total of
    ``occ_each_trvar_tr``. Both lists are index-aligned.
    """
    covered = sum(
        occ_each_trvar_tr[i]
        for i, sim_count in enumerate(occ_each_trvar_sim)
        if sim_count > 0
    )
    return covered / sum(occ_each_trvar_tr)

def get_relaxed_precision(occ_each_simvar_sim, occ_each_simvar_trte):
    """Return the share of simulated occurrences whose variant exists in the reference log.

    Adds up ``occ_each_simvar_sim[i]`` for every index ``i`` where
    ``occ_each_simvar_trte[i]`` is positive and divides by the total of
    ``occ_each_simvar_sim``. Both lists are index-aligned.
    """
    supported = sum(
        occ_each_simvar_sim[i]
        for i, ref_count in enumerate(occ_each_simvar_trte)
        if ref_count > 0
    )
    return supported / sum(occ_each_simvar_sim)

def get_relaxed_generalization(occ_each_tevar_sim, occ_each_tevar_te):
    """Return the share of test occurrences whose variant was simulated.

    Adds up ``occ_each_tevar_te[i]`` for every index ``i`` where
    ``occ_each_tevar_sim[i]`` is positive and divides by the total of
    ``occ_each_tevar_te``. Both lists are index-aligned.
    """
    # Float start value keeps the accumulator a float, as in a 0.0-seeded loop.
    covered = sum(
        (
            occ_each_tevar_te[i]
            for i, sim_count in enumerate(occ_each_tevar_sim)
            if sim_count > 0
        ),
        0.0,
    )
    return covered / sum(occ_each_tevar_te)


def Relaxed_LOVOCV(modelname):
    """Evaluate ``modelname`` over all folds with the "relaxed" metrics.

    There is one fold per row of ``Variants/CSV/Variants_<modelname>.csv``.
    For fold ``i`` the training, test and simulated logs are read from
    ``LOVOCV/<modelname>/{Training_Logs/Train,Test_Logs/Test,Simulated_Logs/Sim}<i>.csv``
    and scored with ``get_relaxed_fitness``, ``get_relaxed_precision`` and
    ``get_relaxed_generalization``. Precision is measured against the full
    log ``Full_Logs/CSV/Log_<modelname>.csv``.

    Returns:
        ``(fitness_ave, fitness_std, precision_ave, precision_std,
        generalization_ave, generalization_std)`` across the folds.
    """
    variants = import_log('Variants/CSV/Variants_'+modelname+'.csv')
    traintestlog = import_log('Full_Logs/CSV/Log_'+modelname+'.csv')

    fitness_arr = []
    precision_arr = []
    generalization_arr = []

    fold_dir = 'LOVOCV/'+modelname
    for i_var in range(len(variants)):
        fold = str(i_var)
        train_log = import_log(fold_dir+'/Training_Logs/Train'+fold+".csv")
        test_log = import_log(fold_dir+'/Test_Logs/Test'+fold+".csv")
        sim_log = import_log(fold_dir+'/Simulated_Logs/Sim'+fold+".csv")

        # Distinct variants of each log; the count lists below are aligned to them.
        trvar = get_variants_list(train_log)
        simvar = get_variants_list(sim_log)
        tevar = get_variants_list(test_log)

        # How often the simulated log produces each train / test / simulated variant.
        occ_each_trvar_sim = get_counts(sim_log, trvar)
        occ_each_tevar_sim = get_counts(sim_log, tevar)
        occ_each_simvar_sim = get_counts(sim_log, simvar)

        # Reference counts: train variants in the train log, test variants in
        # the test log, simulated variants in the full (train + test) log.
        occ_each_trvar_tr = get_counts(train_log, trvar)
        occ_each_tevar_te = get_counts(test_log, tevar)
        occ_each_simvar_trte = get_counts(traintestlog, simvar)

        fitness_arr.append(get_relaxed_fitness(occ_each_trvar_sim, occ_each_trvar_tr))
        precision_arr.append(get_relaxed_precision(occ_each_simvar_sim, occ_each_simvar_trte))
        generalization_arr.append(get_relaxed_generalization(occ_each_tevar_sim, occ_each_tevar_te))

    generalization_ave, generalization_std = get_average_and_std(generalization_arr)
    precision_ave, precision_std = get_average_and_std(precision_arr)
    fitness_ave, fitness_std = get_average_and_std(fitness_arr)

    return fitness_ave, fitness_std, precision_ave, precision_std, generalization_ave, generalization_std

In [7]:
# Count-based metrics for Model1, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model1'))

(0.9434824035254169, 0.003814911380181415, 0.9425604166666662, 0.003648669265174541, 0.838107257173785, 0.11298382482617117)


In [8]:
# Count-based metrics for Model2, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model2'))

(0.9419887144057115, 0.003588618589880502, 0.9417407226562498, 0.003576039807724661, 0.9151766853341539, 0.08741317285643475)


In [9]:
# Count-based metrics for Model3, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model3'))

(0.9438307699742784, 0.003393662311613891, 0.9435296630859372, 0.003528518755011565, 0.9116449939885665, 0.09709693944921975)


In [10]:
# Count-based metrics for Model4, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model4'))

(0.9239855524734958, 0.0075794129233915635, 0.9201074218750003, 0.009219942711047093, 0.7085783137478495, 0.13618040934169812)


In [11]:
# Count-based metrics for Model5, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model5'))

(0.9007110753456445, 0.009274843741363092, 0.8979138321995467, 0.009524925104730679, 0.6747360678994965, 0.21257192074029777)


In [12]:
# Count-based metrics for Model6, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(LOVOCV('Model6'))

(0.8561908793042657, 0.024030686233839627, 0.8529492455418376, 0.024692149401689633, 0.8649942842590461, 0.20762063073014608)


In [13]:
# "Abs" (variant-level) metrics for Model1, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model1'))

(1.0, 0.0, 0.592817309845936, 0.040743252436084926, 1.0, 0.0)


In [14]:
# "Abs" (variant-level) metrics for Model2, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model2'))

(1.0, 0.0, 0.7022721243451426, 0.038507781830217094, 1.0, 0.0)


In [15]:
# "Abs" (variant-level) metrics for Model3, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model3'))

(1.0, 0.0, 0.828526207636802, 0.06168067083875466, 1.0, 0.0)


In [16]:
# "Abs" (variant-level) metrics for Model4, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model4'))

(1.0, 0.0, 0.36387977912628505, 0.04203203823115827, 1.0, 0.0)


In [17]:
# "Abs" (variant-level) metrics for Model5, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model5'))

(1.0, 0.0, 0.24589466628219733, 0.056336116885143055, 1.0, 0.0)


In [18]:
# "Abs" (variant-level) metrics for Model6, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Abs_LOVOCV('Model6'))

(0.7380952380952382, 0.036767142673781554, 0.6289286209405094, 0.04321359919758086, 1.0, 0.0)


In [19]:
# "Relaxed" metrics for Model1, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model1'))

(1.0, 0.0, 0.9925361111111112, 0.0014195157268760438, 1.0, 0.0)


In [20]:
# "Relaxed" metrics for Model2, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model2'))

(1.0, 0.0, 0.9956884765624999, 0.0007988152088445702, 1.0, 0.0)


In [21]:
# "Relaxed" metrics for Model3, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model3'))

(1.0, 0.0, 0.9978314208984371, 0.0010187549294445124, 1.0, 0.0)


In [22]:
# "Relaxed" metrics for Model4, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model4'))

(1.0, 0.0, 0.9654394531249998, 0.007285332867741843, 1.0, 0.0)


In [23]:
# "Relaxed" metrics for Model5, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model5'))

(1.0, 0.0, 0.9608843537414965, 0.010472682020909556, 1.0, 0.0)


In [24]:
# "Relaxed" metrics for Model6, printed as (fitness_ave, fitness_std,
# precision_ave, precision_std, generalization_ave, generalization_std).
print(Relaxed_LOVOCV('Model6'))

(0.9701992143813984, 0.00560463634003266, 0.9519890260631001, 0.011495785661841116, 1.0, 0.0)
