In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [3]:
# Load the pickled background and signal samples.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on .pkl files from a trusted source.
with open('split_combined_df.pkl', 'rb') as bkg_file:
    df_b = pickle.load(bkg_file)

# The signal pickle holds a mapping (its .values() are concatenated below);
# every entry is stacked into one frame with a fresh 0..n-1 index.
with open('split_signal_dfs.pkl', 'rb') as sig_file:
    signal_frames = pickle.load(sig_file)
df_s = pd.concat(signal_frames.values(), ignore_index=True)

In [4]:
# Stack background rows first, then signal rows, under a fresh 0..n-1 index.
X = pd.concat([df_b, df_s], ignore_index=True)

# Labels follow the same row order: 0 for every background row, 1 for every signal row.
n_background, n_signal = len(df_b), len(df_s)
y = np.array([0] * n_background + [1] * n_signal)

# Each 'photon_trigMatched' entry is a sequence; expand it into two scalar
# columns and drop the original sequence-valued column.
# (assumes every entry has exactly two elements — TODO confirm)
trig_matched = pd.DataFrame(X['photon_trigMatched'].tolist(), index=X.index)
X[['trigMatched_1', 'trigMatched_2']] = trig_matched
X = X.drop(columns='photon_trigMatched')

# Hold out 5% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [5]:
# Wrap the feature tables in XGBoost DMatrix containers.
# dtest is built without labels: it is only used for prediction, and y_test
# is kept separately for scoring.
dtest = xgb.DMatrix(X_test)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtot = xgb.DMatrix(X, label=y)  # full sample (train + test rows together)

# Booster parameters: logistic objective for binary classification, so
# predictions are probabilities in [0, 1].
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # Log loss is equivalent to cross-entropy
    'eta': 0.1,  # Learning rate
    'max_depth': 5,
    'subsample': 0.8,  # Subsample to prevent overfitting
    'colsample_bytree': 0.8  # Feature sampling
}

# Number of boosting rounds. The previous comment said 100 while the code
# trained 500; the value lives in one named constant so the two cannot drift.
NUM_BOOST_ROUND = 500

# Train the model on the training split only.
bst = xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND)

In [None]:
# Log loss on the training split (in sample) and on the held-out split (out of sample).
train_scores = bst.predict(dtrain)
print("in sample:", log_loss(y_train, train_scores))

test_scores = bst.predict(dtest)
print("out of sample:", log_loss(y_test, test_scores))

# Score the full sample. Rows are ordered background first, then signal
# (that is how X was concatenated), so a positional slice separates the classes.
# Note that this sample includes the rows the model was trained on.
y_pred = bst.predict(dtot)
bkg_scores = y_pred[:len(df_b)]
sig_scores = y_pred[len(df_b):]

# NOTE: the printed labels are kept as-is, but these are *mean predicted
# probabilities* per class, not rates measured at a decision threshold.
s = np.mean(sig_scores)
fp = np.mean(bkg_scores)
print("sensitivity:", s, "false_positives:", fp)

plt.hist(bkg_scores, bins=50, range=(0,1), alpha=0.5, label='background', color='r')
plt.hist(sig_scores, bins=50, range=(0,1), alpha=0.5, label='signal', color='b')
plt.legend()
# Mean-score markers use the colour of the histogram they summarise
# (previously swapped: the signal mean was red and the background mean blue).
plt.axvline(s, color='b')
plt.axvline(fp, color='r');

in sample: 0.14686226124423926
out of sample: 0.14668331580937274
sensitivity: 0.74243796 false_positives: 0.09476431
