In [None]:
import numpy as np
import sys
from sklearn.metrics import roc_auc_score
from scipy.interpolate import UnivariateSpline
from tqdm.notebook import tqdm
import warnings

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics import auc
from sklearn.metrics import roc_curve


In [None]:
# Signal samples per analysis channel: {channel: {signal id: signal name}}.
# - The channel key (a string) and the integer signal id are used by the cells
#   below to build score/result file names.
# - The signal name is used as the x tick label in the AUC plot and as the
#   first column of the results CSV.
# - Id 999 ("Secret") is special-cased by the plotting cell, which passes an
#   output file template for the combined signal scores of that sample.
SIG_NAMES = {
    '1': {
        1: "stop2b1000_neutralino300",
        2: "glgl1400_neutralino1100",
        28: "monojet_Zp2000.0_DM_50.0",
        29: "glgl1600_neutralino800",
        30: "monotop_200_A",
        31: "stlp_st1000",
        32: "sqsq_sq1800_neut800",
        33: "sqsq1_sq1400_neut800",
        999: "Secret"
    },
    '2a': {
        25: "chaneut_cha250_neut150",
        26: "chaneut_cha400_neut200",
        27: "pp24mt_50",
        28: "pp23mt_50",
        29: "gluino_1000.0_neutralino_1.0",
        30: "chaneut_cha200_neut50",
        31: "chaneut_cha300_neut100",
        999: "Secret"
    },
    '2b': {
        1: "pp24mt_50",
        2: "chaneut_cha200_neut50",
        3: "stlp_st1000",
        4: "chacha_cha600_neut200",
        5: "pp23mt_50",
        6: "chaneut_cha250_neut150",
        7: "chacha_cha400_neut60",
        34: "gluino_1000.0_neutralino_1.0",
        35: "chacha_cha300_neut140",
        999: "Secret"
    },
    '3': {
        1: "glgl1600_neutralino800",
        2: "monojet_Zp2000.0_DM_50.0",
        3: "gluino_1000.0_neutralino_1.0",
        4: "stop2b1000_neutralino300",
        5: "sqsq1_sq1400_neut800",
        6: "monotop_200_A",
        7: "monoV_Zp2000.0_DM_1.0",
        8: "stlp_st1000",
        34: "sqsq_sq1800_neut800",
        35: "glgl1400_neutralino1100",
        999: "Secret"
    }
}

In [None]:
def determine_auc(bkg_events, sig_events, bkg_efficiencies=(10.0**-2, 10.0**-3, 10.0**-4)):
    """Compute the ROC AUC and the signal efficiency at fixed background efficiencies.

    Parameters
    ----------
    bkg_events : array-like
        Anomaly scores of the background dataset (labelled 0).
    sig_events : array-like
        Anomaly scores of the signal dataset (labelled 1).
    bkg_efficiencies : sequence of float, optional
        Background efficiencies (false-positive rates) at which the signal
        efficiency is reported. Defaults to 10^-2, 10^-3 and 10^-4.

    Returns
    -------
    tuple
        ``(AUC, eps_1, ..., eps_k)`` with one signal efficiency per entry of
        ``bkg_efficiencies``; with the defaults this is
        ``(AUC, epsilon1, epsilon2, epsilon3)``. Each ``eps`` is the
        true-positive rate at the first ROC point whose false-positive rate
        is at or above the requested value, or 0.0 if no ROC point reaches it.
    """
    bkg_events = np.ravel(bkg_events)
    sig_events = np.ravel(sig_events)

    # Stitch scores and labels together: background = 0, signal = 1.
    events = np.concatenate((bkg_events, sig_events))
    labels = np.concatenate((np.zeros(len(bkg_events)), np.ones(len(sig_events))))

    # Build the ROC curve with sklearn and integrate it.
    FPR, TPR, _ = roc_curve(labels, events)
    AUC = auc(FPR, TPR)

    # roc_curve returns FPR in non-decreasing order, so the first index with
    # FPR >= target can be found by a left-sided binary search instead of
    # scanning the curve point by point.
    targets = np.atleast_1d(np.asarray(bkg_efficiencies, dtype=float))
    first_at_or_above = np.searchsorted(FPR, targets, side='left')
    epsilons = [TPR[k] if k < len(TPR) else 0.0 for k in first_at_or_above]

    return tuple([AUC] + epsilons)

In [None]:
## Combine the per-algorithm anomaly scores (AND / OR / AVG / PROD) and score each combination
algorithms = ['fixed-mse', 'flow']

# Reduction applied across algorithms (axis 0) for each combination type.
_COMBINERS = {'AND': np.min, 'OR': np.max, 'AVG': np.mean, 'PROD': np.prod}
# Order in which the metrics are returned when all combinations are requested.
_ALL_CALC_TYPES = ('AND', 'OR', 'AVG', 'PROD')


def _normalise_to_bkg_rate(bkg_scores, sig_scores, n_bins):
    """Map scores to ``1 - (background efficiency at that score)``.

    The background efficiency at a score is estimated from an ``n_bins``
    histogram of ``bkg_scores`` (fraction of background in or above the bin)
    and interpolated with a spline through the bins' lower edges.

    Returns the transformed ``(bkg, sig)`` arrays.
    """
    bkg_hist, bins = np.histogram(bkg_scores, bins=n_bins)
    # Reversed cumulative sum == sum(bkg_hist[j:]) for every j, in one pass.
    CDF_binned = bkg_hist[::-1].cumsum()[::-1] / len(bkg_scores)
    # Function mapping a score x -> background efficiency at x.
    CDF = UnivariateSpline(bins[:-1], CDF_binned, s=0)

    # Clip signal points outside the background range; otherwise the spline
    # extrapolates to background efficiencies above 1 or below 0.
    sig_clipped = np.clip(sig_scores, np.min(bkg_scores), np.max(bkg_scores))
    return 1 - CDF(bkg_scores), 1 - CDF(sig_clipped)


def calculate_metrics(resultname, bg_vae, sig_vae, bg_flow, sig_flow, calc_type=False, n_bins=10000):
    """Combine VAE and flow anomaly scores and evaluate the combination(s).

    Each algorithm's scores are first normalised to a uniform background rate,
    then combined per event across the two algorithms:
    AND = minimum, OR = maximum, AVG = mean, PROD = product.

    Parameters
    ----------
    resultname : str or False
        File name template containing ``XXX``; when truthy, the combined
        *signal* scores of every evaluated combination are written with
        ``np.savetxt`` to ``resultname`` with ``XXX`` replaced by the
        combination name. Falsy values disable saving.
    bg_vae, sig_vae : 1D array-like
        Background / signal anomaly scores of the first algorithm.
    bg_flow, sig_flow : 1D array-like
        Background / signal anomaly scores of the second algorithm; must have
        the same lengths as ``bg_vae`` / ``sig_vae`` respectively.
    calc_type : {'AND', 'OR', 'AVG', 'PROD'} or other, optional
        Combination to evaluate. Any other value (default ``False``)
        evaluates all four.
    n_bins : int, optional
        Number of histogram bins used for the background-rate normalisation.

    Returns
    -------
    tuple
        The ``determine_auc`` result for the requested combination, or the
        four results in the order ``(AND, OR, AVG, PROD)`` when all
        combinations are evaluated.

    Raises
    ------
    ValueError
        If the two algorithms do not provide the same number of events.
    """
    if len(bg_vae) != len(bg_flow) or len(sig_vae) != len(sig_flow):
        raise ValueError("both algorithms must score the same background and signal events")

    all_bkg_ascores = np.array([bg_vae, bg_flow])
    all_sig_ascores = np.array([sig_vae, sig_flow])

    # Normalise each algorithm to a uniform background rate.
    new_bkg_ascores = np.empty(all_bkg_ascores.shape, dtype=float)
    new_sig_ascores = np.empty(all_sig_ascores.shape, dtype=float)
    for k in range(all_bkg_ascores.shape[0]):
        new_bkg_ascores[k], new_sig_ascores[k] = _normalise_to_bkg_rate(
            all_bkg_ascores[k], all_sig_ascores[k], n_bins
        )

    # Only evaluate what was asked for; anything unrecognised means "all".
    requested = (calc_type,) if calc_type in _ALL_CALC_TYPES else _ALL_CALC_TYPES

    combined_sig = {}
    metrics = {}
    for name in requested:
        combine = _COMBINERS[name]
        bkg = combine(new_bkg_ascores, axis=0)
        combined_sig[name] = combine(new_sig_ascores, axis=0)
        metrics[name] = determine_auc(bkg, combined_sig[name])

    if resultname:
        print("Saving scores to ",resultname)
        for name in requested:
            np.savetxt(resultname.replace('XXX', name), combined_sig[name])

    if len(requested) == 1:
        return metrics[requested[0]]
    return tuple(metrics[name] for name in _ALL_CALC_TYPES)

In [None]:
# Fixed target: per-channel AUC of the VAE, the flow and their AND/OR/PROD/AVG combinations.
# Metrics are cached in text files in the working directory; the score files
# under data/ are only read when a metric is missing from that cache.
fig = plt.figure(figsize=(20, 20))

legend_titles = ['VAE', 'Flow', 'AND', 'OR', 'PROD', 'AVG']
legend_colors = ['blue', 'green', 'purple', 'red', 'orange', 'black']
legend_lines = [Line2D([0], [0], color=color, lw=4) for color in legend_colors]
font_size = 16

calc_types = ['AND', 'OR', 'AVG', 'PROD']
# File-name prefix of the fixed-target score files for each combination type.
calc_file_prefix = {'AND': 'min', 'OR': 'max', 'AVG': 'avg', 'PROD': 'prod'}
# AUC_score keys in the same order as legend_titles / legend_colors.
plot_keys = ['VAE', 'FLOW', 'AND', 'OR', 'PROD', 'AVG']

# Scaled scores loaded for the signal currently being processed.
_score_memo = {}


def _unit_scale(bg, sig):
    """Min-max scale both arrays using the joint minimum and maximum of bg and sig."""
    joint = np.concatenate((bg, sig))
    min_score = np.amin(joint)
    max_score = np.amax(joint)
    return (bg - min_score) / (max_score - min_score), (sig - min_score) / (max_score - min_score)


def _load_scaled_scores(channel, signal, calc_type):
    """Read the VAE and flow score files and return scaled scores.

    Events whose flow score is not finite are dropped from both algorithms.
    Returns ``(vae_bg, vae_sig, flow_bg, flow_sig)``; the flow scores are
    returned as ``1 - scaled`` value.
    """
    vae_stem = 'data/fixed-mse/fixed_target_combined_scores/' + calc_file_prefix[calc_type]
    vae_bg = np.loadtxt(vae_stem + '-bg-' + str(channel) + '-' + str(signal))
    vae_sig = np.loadtxt(vae_stem + '-sig-' + str(channel) + '-' + str(signal))

    flow_bg = np.loadtxt('data/flow/scores_effenc_final_' + str(channel) + '_bg.csv')
    flow_sig = np.loadtxt('data/flow/scores_effenc_final_' + str(channel) + '_' + str(signal) + '.csv')

    bg_notinf = np.isfinite(flow_bg)
    sig_notinf = np.isfinite(flow_sig)

    vae_bg_scores, vae_sig_scores = _unit_scale(vae_bg[bg_notinf], vae_sig[sig_notinf])
    flow_bg_scaled, flow_sig_scaled = _unit_scale(flow_bg[bg_notinf], flow_sig[sig_notinf])
    return vae_bg_scores, vae_sig_scores, 1 - flow_bg_scaled, 1 - flow_sig_scaled


def _scaled_scores(channel, signal, calc_type):
    """Memoised ``_load_scaled_scores`` so each file set is read at most once per signal."""
    key = (channel, signal, calc_type)
    if key not in _score_memo:
        _score_memo[key] = _load_scaled_scores(channel, signal, calc_type)
    return _score_memo[key]


def _load_or_compute(fname, compute, as_row=False):
    """Return the metrics cached in ``fname``; on a miss call ``compute()`` and cache its result.

    ``as_row`` writes the metrics as a single row instead of one value per
    line, matching the layout of the existing vae/flow cache files.
    """
    try:
        return np.loadtxt(fname)
    except (OSError, ValueError):
        # Missing or unreadable cache file: fall through and recompute.
        pass
    metrics = compute()
    np.savetxt(fname, [metrics] if as_row else metrics)
    return metrics


for i, CHANNEL in enumerate(tqdm(SIG_NAMES, desc="channels"), start=1):
    ax = fig.add_subplot(2, 2, i)
    ax.set_title('Channel ' + str(CHANNEL), fontsize=font_size)
    my_xticks = []
    x = []

    for j, SIGNAL in enumerate(tqdm(SIG_NAMES[CHANNEL], desc="signals", leave=False), start=1):
        AUC_score = {}
        tag = CHANNEL + "-" + str(SIGNAL)

        # For the secret sample the combined signal scores are written out as well.
        save_individual_scores = False
        if SIGNAL == 999:
            save_individual_scores = "secret999_results/Combined-FixedTarget-Flow-XXX_FixedTargetMSE_FlowLikelihood-chan"+str(CHANNEL)+".csv"

        for calc_type in calc_types:
            AUC_score[calc_type] = _load_or_compute(
                "combined_result_" + tag + '-' + calc_type,
                lambda: calculate_metrics(
                    save_individual_scores,
                    *_scaled_scores(CHANNEL, SIGNAL, calc_type),
                    calc_type
                ),
            )
            # The VAE scores depend on calc_type; the value left in
            # AUC_score['VAE'] (and plotted) is the one of the last calc_type.
            AUC_score['VAE'] = _load_or_compute(
                "vae_result_" + tag + '-' + calc_type,
                lambda: determine_auc(*_scaled_scores(CHANNEL, SIGNAL, calc_type)[:2]),
                as_row=True,
            )

        # The flow metrics are evaluated once per signal. The cache file keeps
        # the suffix of the last calc_type so existing cache files stay valid.
        flow_calc_type = calc_types[-1]
        AUC_score['FLOW'] = _load_or_compute(
            "flow_result_" + tag + '-' + flow_calc_type,
            lambda: determine_auc(*_scaled_scores(CHANNEL, SIGNAL, flow_calc_type)[2:]),
            as_row=True,
        )
        _score_memo.clear()  # free the score arrays before the next signal

        # First entry of every metrics record is the AUC.
        for key, color in zip(plot_keys, legend_colors):
            ax.scatter(j, AUC_score[key][0], color=color)

        my_xticks.append(SIG_NAMES[CHANNEL][SIGNAL])
        x.append(j)

    ax.set_xticks(x)
    ax.set_xticklabels(my_xticks, rotation=90, fontsize=font_size)
    if i == 1 or i == 3:
        ax.set_ylabel("AUC", fontsize=font_size)
    if i == 1:
        ax.legend(
            legend_lines,
            legend_titles,
            fontsize=font_size,
            loc="upper right",
            ncol=6,
            bbox_to_anchor=(0.3, 1.05, 1, 0.1),
            borderaxespad=0,
            frameon=False
        )

fig.subplots_adjust(hspace=0.6, wspace=0.05)


In [None]:
def append_to_result(row, path="combined_scores.csv"):
    """Append one comma-separated line to a results file.

    Parameters
    ----------
    row : iterable
        Fields of the line; each field is converted with ``str`` and the
        fields are joined with commas (no quoting or escaping is applied).
    path : str, optional
        File to append to; created if it does not exist.
    """
    # The context manager closes (and flushes) the file on every call.
    with open(path, "a") as f:
        f.write(','.join(str(field) for field in row) + "\n")


In [None]:
# Fixed target: append the cached combined metrics of every non-secret signal to the results CSV.
for CHANNEL in SIG_NAMES:
    for SIGNAL in SIG_NAMES[CHANNEL]:
        signal = SIG_NAMES[CHANNEL][SIGNAL]
        # SIGNAL is the integer id, so the skip has to test the signal *name*.
        if signal == 'Secret':
            continue

        FNAME = "combined_result_" + CHANNEL + "-" + str(SIGNAL)
        algorithm = 'Combined-FixedTarget-Flow-'
        anomaly_score = 'Fixed target MSE + Flow Likelihood'
        channel = CHANNEL

        # Load all four records first so a missing file aborts before any row
        # of this signal has been written. Rows are written in this order.
        combined_metrics = {
            combination: np.loadtxt(FNAME + '-' + combination)
            for combination in ('AND', 'OR', 'PROD', 'AVG')
        }

        for combination, metrics in combined_metrics.items():
            # The first four values of each record are written, two decimals each.
            formatted = ["{:.2f}".format(value) for value in metrics[:4]]
            append_to_result([signal, algorithm + combination, anomaly_score, channel] + formatted)
