In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.signal_categories import topological_category_labels, topological_category_colors, topological_category_labels_latex
from src.signal_categories import filetype_category_labels, filetype_category_colors
from src.signal_categories import del1g_simple_category_labels, del1g_simple_category_colors, del1g_detailed_category_labels_latex
from src.signal_categories import del1g_detailed_category_labels, del1g_detailed_category_colors, del1g_detailed_category_labels_latex
from src.signal_categories import train_category_labels, train_category_colors, train_category_labels_latex


In [None]:
import umap
import sklearn
from sklearn.preprocessing import StandardScaler

from sklearn.manifold import TSNE, MDS, Isomap
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from src.file_locations import intermediate_files_location

# Record the library versions used for this run alongside the notebook output.
for _lib_name, _lib_module in (("sklearn", sklearn), ("umap", umap)):
    print(f"{_lib_name} version: {_lib_module.__version__}")


In [None]:
# Training run whose outputs are visualized; it names the sub-directory of
# ../training_outputs/ that is read from and written to.
# Earlier runs are kept here for quick switching:
#training = "first_combined_training"
#training = "first_wc_training"
#training = "first_lantern_training"
#training = "first_mixed_del1g_iso_training"
#training = "mixed_del1g_iso_training"
training = "with_numu_generic_pandora_glee"

# Dataframe column holding the integer truth category used to color the plots.
sig_category_name = "del1g_simple_signal_category"
#sig_category_name = "del1g_detailed_signal_category"

# Supported truth-category columns -> (labels, LaTeX labels, colors).
# Note that the simple del1g column is paired with the train_category_* tables.
_category_tables = {
    "del1g_simple_signal_category": (
        train_category_labels,
        train_category_labels_latex,
        train_category_colors,
    ),
    "del1g_detailed_signal_category": (
        del1g_detailed_category_labels,
        del1g_detailed_category_labels_latex,
        del1g_detailed_category_colors,
    ),
    "topological_signal_category": (
        topological_category_labels,
        topological_category_labels_latex,
        topological_category_colors,
    ),
}

if sig_category_name not in _category_tables:
    raise ValueError(f"Invalid sig_category_name: {sig_category_name}")

sig_categories, sig_categories_latex, sig_colors = _category_tables[sig_category_name]


# File Loading

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only point these paths at files produced by this project's own pipeline.
print("loading all_df.pkl...")
with open(f"{intermediate_files_location}/all_df.pkl", "rb") as f:
    all_df = pickle.load(f)
print(f"{all_df.shape=}")

# this only includes predictions for events passing the preselection used during training
print("loading predictions.pkl...")
with open(f"../training_outputs/{training}/predictions.pkl", "rb") as f:
    pred_df = pickle.load(f)
print(f"{pred_df.shape=}")

# Left merge keeps every row of all_df; events without a prediction get NaN
# in the columns that come from pred_df.
print("merging all_df and predictions.pkl...")
merged_df = pd.merge(all_df, pred_df, on=["filetype", "run", "subrun", "event"], how="left")

# Drop the two file types excluded from this visualization. The explicit
# .copy() makes merged_df an independent frame, so later column assignments
# do not raise pandas' SettingWithCopyWarning.
merged_df = merged_df.query("filetype != 'data' and filetype != 'isotropic_one_gamma_overlay'").copy()

In [None]:
# Number of merged events per file type, shown as the cell output.
filetype_counts = merged_df["filetype"].value_counts()
filetype_counts

In [None]:
# Names of the per-class probability columns, one per training category.
prob_categories = ["prob_" + cat for cat in train_category_labels]

# Events without a prediction have NaN scores after the left merge; mark them
# with the sentinel value -1. assign() returns a new frame, so this never
# writes into a slice of another dataframe.
merged_df = merged_df.assign(**{prob: merged_df[prob].fillna(-1) for prob in prob_categories})

num_train_events = len(merged_df.query("used_for_training == True"))
num_test_events = len(merged_df.query("used_for_testing == True"))
if num_test_events == 0:
    # Without test events the scale factor is undefined and nothing would be
    # left to plot, so fail here with a clear message.
    raise ValueError("No events with used_for_testing == True; cannot compute the test fraction")
frac_test = num_test_events / (num_train_events + num_test_events)
print(f"weighting up by the fraction of test events: {frac_test:.3f}")

# Only test events are kept, and each of them has its weight divided by the
# test fraction. Filtering first and then scaling the whole column gives the
# same weights as scaling row by row before the filter.
merged_df = merged_df.query("used_for_testing == True")
merged_df = merged_df.assign(wc_net_weight=merged_df["wc_net_weight"] / frac_test)

print(f"{merged_df.shape=}")
presel_merged_df = merged_df.query("wc_kine_reco_Enu > 0")
print(f"{presel_merged_df.shape=}")

presel_merged_df


In [None]:
# Alternative: a plain uniform sample
#sampled_presel_merged_df = presel_merged_df.sample(n=1000)

# Events drawn per category = sample_frac * (sum of weights in that category),
# capped at the number of available events.
sample_frac = 1.5

# Fixed seed so that re-running the notebook selects the same events.
sample_seed = 42

# Collect the per-category samples and concatenate once at the end, instead
# of growing a dataframe inside the loop.
sampled_category_dfs = []
for sig_category_i, sig_category_label in enumerate(sig_categories):
    curr_df = presel_merged_df.query(f"{sig_category_name} == {sig_category_i}")

    # Categories with "1g" in their label get a very large fraction, so the
    # cap below normally keeps all of their available events.
    curr_sample_frac = 1e6 if "1g" in sig_category_label else sample_frac

    num_weighted_events = np.sum(curr_df["wc_net_weight"].to_numpy())
    num_sample_events = int(curr_sample_frac * num_weighted_events)
    num_sample_events = min(num_sample_events, len(curr_df))

    print(sig_category_label, num_weighted_events, num_sample_events)

    sampled_category_dfs.append(curr_df.sample(n=num_sample_events, random_state=sample_seed))

# The original row index is preserved. With no categories, fall back to an
# empty dataframe.
if sampled_category_dfs:
    sampled_presel_merged_df = pd.concat(sampled_category_dfs)
else:
    sampled_presel_merged_df = pd.DataFrame()


In [None]:
# Score matrix for the sampled events: one row per event, one column per
# entry of prob_categories.
probs_2d_arr = sampled_presel_merged_df.loc[:, prob_categories].to_numpy()


# Multi-Class Probability Visualization

In [None]:
# The scores are passed to UMAP as-is; despite the name, no scaling is applied.
probs_scaled = probs_2d_arr

# Fixed random_state so the embedding, and the figures saved from it, are
# reproducible across runs. Note: umap-learn runs single-threaded when a
# random_state is given.
reducer = umap.UMAP(random_state=42)

# Fit and embed the same data in a single step.
umap_result = reducer.fit_transform(probs_scaled)


In [None]:
plt.rcParams.update({'font.size': 14})

# Create the output directory up front so that savefig cannot fail on a
# missing folder.
score_vis_dir = f"../training_outputs/{training}/score_vis"
os.makedirs(score_vis_dir, exist_ok=True)

# Integer truth-category code of every sampled event, aligned with the rows
# of umap_result.
true_sig_categories = sampled_presel_merged_df[sig_category_name].to_numpy()
plt.figure(figsize=(12, 7))
for i in range(len(sig_categories)):
    # Categories whose label contains "del1g" or "iso1g" are not drawn.
    if "del1g" in sig_categories[i] or "iso1g" in sig_categories[i]:
        continue
    true_sig_category_mask = true_sig_categories == i
    plt.scatter(umap_result[true_sig_category_mask, 0], umap_result[true_sig_category_mask, 1], alpha=0.2, s=1, c=sig_colors[i])
    # Empty scatter: gives the legend a large, opaque marker for this category.
    plt.scatter([], [], s=50, c=sig_colors[i], label=sig_categories_latex[i])
plt.title('UMAP Visualization of NGEM Multi-Class BDT Probability Scores')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')

# Pad the axes; the extra 40% on the right leaves room for the legend.
min_x, max_x = umap_result[:, 0].min(), umap_result[:, 0].max()
min_y, max_y = umap_result[:, 1].min(), umap_result[:, 1].max()
x_diff = max_x - min_x
y_diff = max_y - min_y
plt.xlim(min_x - x_diff * 0.1, max_x + x_diff * 0.4)
plt.ylim(min_y - y_diff * 0.1, max_y + y_diff * 0.1)

# The dimensionality is read from the score matrix rather than hard-coded.
# The last piece is a raw string because "\g" is not a valid escape sequence.
num_score_dims = probs_2d_arr.shape[1]
annotation_text = (
    f"UMAP projection from {num_score_dims}D Multi-Class\n"
    "BDT score space to 2D, preserving\n"
    "distances as much as possible\n\n"
    "Including nominal prediction\n"
    "and Del1g signal sample after\n"
    r"WC generic $\geq$ 1 shower preselection"
)
plt.text(0.03, 0.95, annotation_text, ha='left', va='top', transform=plt.gca().transAxes, fontsize=13)

plt.legend(loc="upper right", ncol=1, fontsize=12)
plt.savefig(f"{score_vis_dir}/umap_visualization.png")
plt.savefig(f"{score_vis_dir}/umap_visualization.pdf")
plt.savefig(f"{score_vis_dir}/umap_visualization.jpeg", dpi=400)

# Set to True to also save one small UMAP figure per truth category.
print_all = False
if print_all:
    # Create the output directory up front so that savefig cannot fail on a
    # missing folder.
    per_category_dir = f"../training_outputs/{training}/score_vis"
    os.makedirs(per_category_dir, exist_ok=True)

    # Axis limits are the same for every category, so compute them once.
    min_x, max_x = umap_result[:, 0].min(), umap_result[:, 0].max()
    min_y, max_y = umap_result[:, 1].min(), umap_result[:, 1].max()
    x_diff = max_x - min_x
    y_diff = max_y - min_y

    for i in range(len(sig_categories)):
        plt.figure(figsize=(3, 2))
        # true_sig_categories holds integer category codes, so compare with
        # the index i and not with the label string.
        true_sig_category_mask = true_sig_categories == i
        plt.scatter(umap_result[true_sig_category_mask, 0], umap_result[true_sig_category_mask, 1], alpha=0.6, s=2, c=sig_colors[i])
        # Use the LaTeX labels of the selected category set.
        plt.scatter([], [], s=50, c=sig_colors[i], label=sig_categories_latex[i])
        plt.title(sig_categories[i])
        plt.xlabel('UMAP Dimension 1')
        plt.ylabel('UMAP Dimension 2')
        plt.xlim(min_x - x_diff * 0.1, max_x + x_diff * 0.5)
        plt.ylim(min_y - y_diff * 0.1, max_y + y_diff * 0.1)
        plt.savefig(f"{per_category_dir}/{sig_categories[i]}_umap_visualization.png")
