In [None]:
import os

import pendulum
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb

import plotly.graph_objects as go

from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    auc,
    precision_recall_curve,
)

import diquark.constants as const
from diquark.plotting import make_histogram, make_histogram_with_double_gaussian_fit
from diquark.helpers import mass_score_cut

import tensorflow as tf

# Short aliases for the Keras namespaces used when building the network.
tfkl = tf.keras.layers
tfk = tf.keras

# Every path used below ("data/...", "models/...") is relative, so step up one
# level when the kernel was started inside a directory named "notebooks".
# os.path.basename is used instead of splitting on "/" so the check also works
# with platform-specific path separators.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

In [None]:
# Timestamp-based identifier so each execution writes into its own directory.
run_id = pendulum.now().strftime("%Y%m%d%H%M%S")
workdir = f"models/run_{run_id}"

# makedirs creates the intermediate `workdir` too. exist_ok=True keeps the cell
# re-runnable and guarantees the plots sub-directory exists even when `workdir`
# itself was already present (the previous existence check skipped it then).
os.makedirs(f"{workdir}/plots", exist_ok=True)

# Data Preparation

In [None]:
# Read the full event sample, then replace every missing value with 0.
df = pd.read_parquet("data/full_sample.parquet")
df = df.fillna(0)

In [None]:
# --- Split by class -------------------------------------------------------
# Signal (target == 1) and background (target == 0) are partitioned
# independently so both classes get the same 80/20 train/test ratio.
df_sig = df[df["target"] == 1]
df_bkg = df[df["target"] == 0]

df_sig_train = df_sig.sample(frac=0.8, random_state=0)
df_sig_test = df_sig.drop(index=df_sig_train.index)

df_bkg_train = df_bkg.sample(frac=0.8, random_state=0)
df_bkg_test = df_bkg.drop(index=df_bkg_train.index)

# --- Balance the training set ---------------------------------------------
# Draw signal training rows with replacement until there are as many of them
# as background training rows. Only the training split is oversampled; the
# test split keeps its original class proportions.
df_sig_train_oversampled = resample(
    df_sig_train,
    replace=True,
    n_samples=df_bkg_train.shape[0],
    random_state=0,
)

# --- Assemble and shuffle -------------------------------------------------
df_train = shuffle(
    pd.concat([df_sig_train_oversampled, df_bkg_train]),
    random_state=0,
)
df_test = shuffle(
    pd.concat([df_sig_test, df_bkg_test]),
    random_state=0,
)

# --- Features / labels ----------------------------------------------------
# The label, the truth tag and the invariant mass are excluded from the model
# inputs. The scaler is fitted on the training features only and then applied
# unchanged to the test features.
non_feature_columns = ["target", "Truth", "inv_mass"]
scaler = MinMaxScaler()

x_train = scaler.fit_transform(df_train.drop(columns=non_feature_columns).to_numpy())
x_test = scaler.transform(df_test.drop(columns=non_feature_columns).to_numpy())

y_train = df_train["target"].to_numpy()
y_test = df_test["target"].to_numpy()

In [None]:
# Persist both shuffled splits (all columns, unscaled) inside the run directory.
# The two writes are independent of each other.
df_test.to_parquet(f"{workdir}/test.parquet")
df_train.to_parquet(f"{workdir}/train.parquet")

In [None]:
# Keep only the truth tag and the invariant mass, re-indexed 0..n-1 so that the
# row index lines up positionally with the rows of x_test / x_train.
test_df = df_test[["Truth", "inv_mass"]].reset_index(drop=True)
train_df = df_train[["Truth", "inv_mass"]].reset_index(drop=True)

# Map each truth label (in order of first appearance) to the array of
# invariant masses of the events carrying that label.
m6j_test = {
    truth_label: test_df.loc[test_df["Truth"] == truth_label, "inv_mass"].to_numpy()
    for truth_label in test_df["Truth"].unique()
}
m6j_train = {
    truth_label: train_df.loc[train_df["Truth"] == truth_label, "inv_mass"].to_numpy()
    for truth_label in train_df["Truth"].unique()
}

# Store both dictionaries alongside the other run artefacts.
joblib.dump(m6j_test, f"{workdir}/m6j_test.data.joblib")
joblib.dump(m6j_train, f"{workdir}/m6j_train.data.joblib")

In [None]:
# Bundle the scaled feature matrices and the label vectors into one .npz file.
np.savez(
    f"{workdir}/data.npz",
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

# TensorFlow NN Model

In [None]:
# Show (n_samples, n_features) of the scaled training matrix; the second entry
# is the number of input features the network below receives per event.
x_train.shape

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable, matching the
# random_state=0 used for the data split.
tfk.utils.set_random_seed(0)

# Track the raw count of false positives in addition to accuracy.
metric_instance = tfk.metrics.FalsePositives(name="fp")

# Take the input width from the data instead of hard-coding it, and pass it as
# a proper one-element tuple: `(76)` is just the integer 76, not a shape.
n_features = x_train.shape[1]

# Fully connected binary classifier: three ReLU hidden layers with dropout
# after the first two, and a single sigmoid output unit.
model = tfk.Sequential(
    [
        tfkl.InputLayer(input_shape=(n_features,)),
        tfkl.Dense(64, activation="relu", name="dense_1"),
        tfkl.Dropout(0.2, name="dropout_1"),
        tfkl.Dense(32, activation="relu", name="dense_2"),
        tfkl.Dropout(0.1, name="dropout_2"),
        tfkl.Dense(32, activation="relu", name="dense_3"),
        tfkl.Dense(1, activation="sigmoid", name="output"),
    ]
)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[metric_instance, "accuracy"])
model.summary()

In [None]:
# Train for five epochs in mini-batches of 128 events. The test split is passed
# as validation data, so the per-epoch val_* metrics are computed on it.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_test, y_test),
)

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
fig = go.Figure()

# (history key, legend entry, line colour) for each curve, in drawing order.
loss_curves = (
    ("loss", "Training Loss", "blue"),
    ("val_loss", "Validation Loss", "red"),
)
for history_key, legend_name, line_color in loss_curves:
    fig.add_trace(
        go.Scatter(
            x=history.epoch,
            y=history.history[history_key],
            name=legend_name,
            mode="lines",
            line=dict(color=line_color),
        )
    )

fig.update_layout(
    title="Training and Validation Loss",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    width=800,
    height=600,
)
fig.show()

In [None]:
# Store the trained network in the run directory.
model_path = f"{workdir}/model.keras"
model.save(model_path)

In [None]:
# Network output for every test event.
y_pred_nn = model.predict(x_test)

# Flatten once, then group the scores by truth label. test_df was re-indexed
# 0..n-1, so its index values can be used directly to index the score array.
nn_scores_flat = y_pred_nn.flatten()
scores_test_nn = {
    truth_label: nn_scores_flat[test_df.loc[test_df["Truth"] == truth_label].index]
    for truth_label in test_df["Truth"].unique()
}

In [None]:
# Stacked, log-scaled histogram of the neural-network output on the test set,
# one component per truth label, using 50 bins and no percentile clipping.
fig = make_histogram(scores_test_nn, 50, clip_top_prc=100, clip_bottom_prc=0, cross=None)
fig.update_layout(
    title_text="Test scores distribution",
    barmode="stack",
    yaxis_type="log",
    # These are the NN scores; the axis was previously mislabelled as
    # "Random Forest Output".
    xaxis_title="NN Output",
    yaxis_title="Count",
)
fig.write_image(f"{workdir}/plots/NN-output.pdf")
fig.show()

In [None]:
# Select test events by NN score before histogramming their invariant mass.
# NOTE(review): cut=0.99 with prc=True presumably means a percentile-based
# threshold; confirm against diquark.helpers.mass_score_cut.
nn_selected_masses = mass_score_cut(m6j_test, scores_test_nn, cut=0.99, prc=True)

# 20-bin mass histogram with a double-Gaussian fit, using the ATLAS
# cross-section table passed via `cross`.
fig = make_histogram_with_double_gaussian_fit(
    nn_selected_masses,
    20,
    clip_top_prc=100,
    cross=const.CROSS_SECTION_ATLAS_130_85,
)
fig.update_layout(
    title="6-jet Mass",
    xaxis_title="Invariant Mass [GeV]",
    yaxis_title_text="count x sigma",
    width=1600 * (5 / 6),
    height=900 * (5 / 6),
    barmode="stack",
    bargap=0,
)
fig.write_image(f"{workdir}/plots/6jet_mass_NN_cut_fit.pdf")
fig.show()

# Scikit Models

In [None]:
# Random forest on the scaled, class-balanced training set, using all CPU
# cores. random_state=0 fixes the bootstrap and feature sub-sampling so the
# saved model is reproducible, consistent with the seeded data split above.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0).fit(x_train, y_train)
print("RFC trained")
joblib.dump(rf_clf, f"{workdir}/rfc.joblib", compress=3)

In [None]:
# Gradient-boosted trees (XGBoost, histogram-based tree construction) trained
# on the same scaled, class-balanced training set.
gb_clf = xgb.XGBClassifier(tree_method="hist")
gb_clf = gb_clf.fit(x_train, y_train)
print("GBC trained")

# Persist the fitted classifier in the run directory.
joblib.dump(gb_clf, f"{workdir}/gbc.joblib", compress=3)

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba) for
# every test event, from each tree-based classifier.
y_pred_rf = rf_clf.predict_proba(x_test)[:, 1]
y_pred_gb = gb_clf.predict_proba(x_test)[:, 1]

In [None]:
# One weight per test event, looked up from the cross-section table by the
# event's truth label (same row order as test_df, hence as y_test).
sample_weights = [
    const.CROSS_SECTION_ATLAS_130_85[truth_label] for truth_label in test_df["Truth"]
]

In [None]:
def _pr_curve_with_auc(y_true, y_score):
    """Return (precision, recall, thresholds, area under the PR curve).

    The curve is computed without sample weights. Passing
    ``sample_weight=sample_weights`` to ``precision_recall_curve`` would give
    the cross-section-weighted variant instead.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    return precision, recall, thresholds, auc(recall, precision)


# Neural network
precision_nn, recall_nn, thresholds_nn, pr_auc_nn = _pr_curve_with_auc(y_test, y_pred_nn)

# Gradient boosting
precision_gb, recall_gb, thresholds_gb, pr_auc_gb = _pr_curve_with_auc(y_test, y_pred_gb)

# Random forest
precision_rf, recall_rf, thresholds_rf, pr_auc_rf = _pr_curve_with_auc(y_test, y_pred_rf)

In [None]:
# Overlay the precision-recall curves of the three classifiers. The threshold
# of each point is attached as hover data.
fig = go.Figure()

# (legend label, recall, precision, thresholds, PR-AUC), in drawing order.
pr_curves = (
    ("BDT", recall_gb, precision_gb, thresholds_gb, pr_auc_gb),
    ("RF", recall_rf, precision_rf, thresholds_rf, pr_auc_rf),
    ("NN", recall_nn, precision_nn, thresholds_nn, pr_auc_nn),
)
for curve_label, curve_recall, curve_precision, curve_thresholds, curve_auc in pr_curves:
    fig.add_trace(
        go.Scatter(
            x=curve_recall,
            y=curve_precision,
            customdata=curve_thresholds,
            hovertemplate="Threshold=%{customdata}<br>Recall=%{x}<br>Precision=%{y}",
            mode="lines",
            name=f"{curve_label} - AUC={curve_auc:.3f}",
        )
    )

fig.update_layout(
    # The curves plotted here are computed without sample weights, so the title
    # must not claim cross-section weighting. Restore the "Cross-Section
    # Weighted" prefix only if sample_weight is passed to
    # precision_recall_curve.
    title="Precision-Recall Curves",
    xaxis_title="Recall",
    yaxis_title="Precision",
    width=1200 * (2 / 3),
    height=800 * (2 / 3),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.write_image(f"{workdir}/plots/PR-curve.pdf")
fig.show()

In [None]:
# Per-feature importance scores reported by each fitted tree ensemble.
rf_importance = rf_clf.feature_importances_
gb_importance = gb_clf.feature_importances_

# Feature names, in the same column order that was used to build x_train.
feature_names = df_train.drop(["target", "Truth", "inv_mass"], axis=1).columns

# Grouped bar chart: one pair of bars (RF, GB) per feature.
fig = go.Figure()

# Bars for the random forest
fig.add_trace(
    go.Bar(
        x=feature_names,
        y=rf_importance,
        name="Random Forest",
        offsetgroup=1,
        marker=dict(color="#E4D91B"),
    )
)

# Bars for the gradient-boosting model
fig.add_trace(
    go.Bar(
        x=feature_names,
        y=gb_importance,
        name="Gradient Boosting",
        offsetgroup=2,
        marker=dict(color="#D91BE4"),
    )
)

fig.update_layout(
    # Both classifiers are shown, so the title names both (it previously
    # mentioned the random forest only).
    title="Feature Importances for Random Forest and Gradient Boosting",
    xaxis_title="Features",
    yaxis_title="Importance Value",
    xaxis=dict(tickangle=45),
    barmode="group",
    width=1200,
)
fig.write_image(f"{workdir}/plots/feature_importances.pdf")
fig.show()

In [None]:
# Group the random-forest scores by truth label. test_df was re-indexed
# 0..n-1, so its index values can be used directly to index the score array.
rf_scores_flat = y_pred_rf.flatten()
scores_test_rf = {
    truth_label: rf_scores_flat[test_df.loc[test_df["Truth"] == truth_label].index]
    for truth_label in test_df["Truth"].unique()
}

In [None]:
# Stacked, log-scaled histogram of the random-forest output on the test set,
# one component per truth label, using 50 bins and no percentile clipping.
# NOTE(review): the NN histogram is built with identical make_histogram
# arguments but labels its y-axis "Count"; confirm which label is right.
fig = make_histogram(
    scores_test_rf,
    50,
    clip_top_prc=100,
    clip_bottom_prc=0,
    cross=None,
)
fig.update_layout(
    title_text="Data Sample Content by Model Output Cut",
    xaxis_title="RF Output",
    yaxis_title="Probability Density",
    yaxis_type="log",
    barmode="stack",
    width=1600 * (5 / 6),
    height=900 * (5 / 6),
)
fig.write_image(f"{workdir}/plots/RF-output.pdf")
fig.show()

In [None]:
# Select test events by random-forest score before histogramming their mass.
# NOTE(review): 0.99 with prc=True presumably means a percentile-based
# threshold; confirm against diquark.helpers.mass_score_cut.
rf_selected_masses = mass_score_cut(m6j_test, scores_test_rf, 0.99, prc=True)

# 20-bin mass histogram with a double-Gaussian fit, using the ATLAS
# cross-section table passed via `cross`.
fig = make_histogram_with_double_gaussian_fit(
    rf_selected_masses,
    20,
    clip_top_prc=100,
    cross=const.CROSS_SECTION_ATLAS_130_85,
)
fig.update_layout(
    title="6-jet Mass",
    xaxis_title="Invariant Mass [GeV]",
    yaxis_title_text="count x sigma",
    width=1600 * (2 / 3),
    height=900 * (2 / 3),
    barmode="stack",
    bargap=0,
)

# Untitled legend pinned to the top-left corner of the plotting area.
fig.update_legends(
    title_text="",
    itemsizing="constant",
    font=dict(size=16),
    xanchor="left",
    x=0.01,
    yanchor="top",
    y=0.99,
)

# The PDF export of this figure is left disabled, as in the original cell:
# fig.write_image(f"{workdir}/plots/6jet_mass_RF_cut_05_fit.pdf")
fig.show()