# imports:

In [1]:
from sklearn.svm import SVR as svr
from sklearn.svm import NuSVR
from sklearn.ensemble import ExtraTreesRegressor as etr
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.neighbors import KNeighborsRegressor as Kneigh
from sklearn.ensemble import AdaBoostRegressor as Ada
from sklearn.ensemble import HistGradientBoostingRegressor as HG
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import uproot
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load events

In [2]:
# Load the four event tables (ZS and FULL PT1/PT2/PT3) from pickles in the
# current working directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only point these paths at files you produced or otherwise trust.
(
    eventsWithReg_ZS_EB,
    eventsWithReg_FULL_PT1_EB,
    eventsWithReg_FULL_PT2_EB,
    eventsWithReg_FULL_PT3_EB,
) = map(
    pd.read_pickle,
    (
        "eventsWithReg_ZS_EB.pkl",
        "eventsWithReg_FULL_PT1_EB.pkl",
        "eventsWithReg_FULL_PT2_EB.pkl",
        "eventsWithReg_FULL_PT3_EB.pkl",
    ),
)

In [3]:
# Down-sample each event table to a fixed row count (80000 for FULL_PT1,
# 100000 for the others).
# random_state is pinned so that a fresh "Restart & Run All" draws the same
# rows; without it every run trains and evaluates on a different subset.
# NOTE: each name is overwritten with its own sample, so re-run the load cell
# first if the full tables are needed again.
eventsWithReg_ZS_EB = eventsWithReg_ZS_EB.sample(100000, random_state=0)
eventsWithReg_FULL_PT1_EB = eventsWithReg_FULL_PT1_EB.sample(80000, random_state=0)
eventsWithReg_FULL_PT2_EB = eventsWithReg_FULL_PT2_EB.sample(100000, random_state=0)
eventsWithReg_FULL_PT3_EB = eventsWithReg_FULL_PT3_EB.sample(100000, random_state=0)

# get regression input variables and target variable:

In [4]:
# Regression inputs and target are the same columns for every sample, so the
# column lists are spelled out once and reused.
reg_input_columns = ["clusrawE", "clusIetaIx", "clusIphiIy", "ietamod20", "iphimod20", "nhits_mod"]
# A one-element list (not a bare string) keeps the target as a one-column DataFrame.
target_columns = ["tgtvar"]

# NOTE(review): the original carried the author note "change target to
# ln(corrE/genE)" on the ZS target; it is unclear from this notebook whether
# that change has already been applied upstream.
regVars_ZS_EB = eventsWithReg_ZS_EB[reg_input_columns]
tgtvar_ZS_EB = eventsWithReg_ZS_EB[target_columns]

regVars_FULL_PT1_EB = eventsWithReg_FULL_PT1_EB[reg_input_columns]
tgtvar_FULL_PT1_EB = eventsWithReg_FULL_PT1_EB[target_columns]

regVars_FULL_PT2_EB = eventsWithReg_FULL_PT2_EB[reg_input_columns]
tgtvar_FULL_PT2_EB = eventsWithReg_FULL_PT2_EB[target_columns]

regVars_FULL_PT3_EB = eventsWithReg_FULL_PT3_EB[reg_input_columns]
tgtvar_FULL_PT3_EB = eventsWithReg_FULL_PT3_EB[target_columns]


In [5]:
# Sanity check: print the row count of each feature table, one per line,
# in the order ZS, FULL_PT1, FULL_PT2, FULL_PT3.
for feature_table in (
    regVars_ZS_EB,
    regVars_FULL_PT1_EB,
    regVars_FULL_PT2_EB,
    regVars_FULL_PT3_EB,
):
    print(len(feature_table))

100000
80000
100000
100000


# split data:

In [14]:
# Hold out 15% of each sample for testing.
# Features, target and the full event table go through a single
# train_test_split call so the three resulting splits stay row-aligned.
# random_state is pinned so the held-out rows are identical on every run;
# otherwise the test sets pickled later would not match a retrained model.
# NOTE: the ZS event splits are spelled "events_ZB_*" (not "ZS"); the original
# names are kept because other cells may reference them.
xtrain_ZS, xtest_ZS, ytrain_ZS, ytest_ZS, events_ZB_train, events_ZB_test = train_test_split(
    regVars_ZS_EB, tgtvar_ZS_EB, eventsWithReg_ZS_EB, test_size=0.15, random_state=0
)
# Flatten the one-column training target to 1-D for fitting; the test target
# is left as a DataFrame.
ytrain_ZS = np.ravel(ytrain_ZS)

xtrain_FULL_PT1, xtest_FULL_PT1, ytrain_FULL_PT1, ytest_FULL_PT1, events_FULL_PT1_train, events_FULL_PT1_test = train_test_split(
    regVars_FULL_PT1_EB, tgtvar_FULL_PT1_EB, eventsWithReg_FULL_PT1_EB, test_size=0.15, random_state=0
)
ytrain_FULL_PT1 = np.ravel(ytrain_FULL_PT1)

xtrain_FULL_PT2, xtest_FULL_PT2, ytrain_FULL_PT2, ytest_FULL_PT2, events_FULL_PT2_train, events_FULL_PT2_test = train_test_split(
    regVars_FULL_PT2_EB, tgtvar_FULL_PT2_EB, eventsWithReg_FULL_PT2_EB, test_size=0.15, random_state=0
)
ytrain_FULL_PT2 = np.ravel(ytrain_FULL_PT2)

In [6]:
# Hold out 15% of the FULL_PT3 sample for testing. Features, target and the
# full event table are split in one call so the resulting pieces stay
# row-aligned.
# random_state is pinned so the held-out rows (which are pickled further down)
# are the same on every run.
xtrain_FULL_PT3, xtest_FULL_PT3, ytrain_FULL_PT3, ytest_FULL_PT3, events_FULL_PT3_train, events_FULL_PT3_test = train_test_split(
    regVars_FULL_PT3_EB, tgtvar_FULL_PT3_EB, eventsWithReg_FULL_PT3_EB, test_size=0.15, random_state=0
)
# Flatten the one-column training target to 1-D for fitting; the test target
# is left as a DataFrame.
ytrain_FULL_PT3 = np.ravel(ytrain_FULL_PT3)

# run models:

In [94]:
# Tree-ensemble regressors for the ZS sample: XGBoost (quiet logging) and
# extra-trees (100 trees, fixed seed), both fit on the ZS training split.
train_zs = (xtrain_ZS, ytrain_ZS)

XGBR_ZS = XGBRegressor(verbosity=0).fit(*train_zs)
ETR_ZS = etr(random_state=0, n_estimators=100).fit(*train_zs)

In [None]:
# Remaining regressors for the ZS sample, all fit on the same training split.
# XGBR_ZS is deliberately NOT refit here: it is already trained in the
# XGBR/ETR cell, and the FULL_PT1/PT2/PT3 versions of this cell do not repeat
# it either.
train_zs = (xtrain_ZS, ytrain_ZS)

# Nu-SVR with an RBF kernel; every argument is written out explicitly.
NSVR_ZS = NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
                max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False).fit(*train_zs)

# Epsilon-SVR.
SVR_ZS = svr(C=1.0, epsilon=0.2).fit(*train_zs)

# Histogram gradient boosting. random_state is pinned because, per the
# scikit-learn docs, the default early stopping on large training sets holds
# out a random validation subset, which would make the fit non-reproducible.
HGBR_ZS = HG(random_state=0).fit(*train_zs)

# AdaBoost, 100 estimators.
ADA_ZS = Ada(random_state=0, n_estimators=100).fit(*train_zs)

# k-nearest neighbours, k=2.
KNR_ZS = Kneigh(n_neighbors=2).fit(*train_zs)

# Random forest limited to depth-2 trees.
RFR_ZS = rfr(max_depth=2, random_state=0).fit(*train_zs)

In [32]:
# Tree-ensemble regressors for the FULL_PT1 sample: XGBoost (quiet logging)
# and extra-trees (100 trees, fixed seed), both fit on the PT1 training split.
train_pt1 = (xtrain_FULL_PT1, ytrain_FULL_PT1)

XGBR_FULL_PT1 = XGBRegressor(verbosity=0).fit(*train_pt1)
ETR_FULL_PT1 = etr(random_state=0, n_estimators=100).fit(*train_pt1)

In [33]:
# Remaining regressors for the FULL_PT1 sample, all fit on the same training split.
train_pt1 = (xtrain_FULL_PT1, ytrain_FULL_PT1)

# Nu-SVR with an RBF kernel; every argument is written out explicitly.
NSVR_FULL_PT1 = NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
                      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False).fit(*train_pt1)

# Epsilon-SVR.
SVR_FULL_PT1 = svr(C=1.0, epsilon=0.2).fit(*train_pt1)

# Histogram gradient boosting. random_state is pinned because, per the
# scikit-learn docs, the default early stopping on large training sets holds
# out a random validation subset, which would make the fit non-reproducible.
HGBR_FULL_PT1 = HG(random_state=0).fit(*train_pt1)

# AdaBoost, 100 estimators.
ADA_FULL_PT1 = Ada(random_state=0, n_estimators=100).fit(*train_pt1)

# k-nearest neighbours, k=2.
KNR_FULL_PT1 = Kneigh(n_neighbors=2).fit(*train_pt1)

# Random forest limited to depth-2 trees.
RFR_FULL_PT1 = rfr(max_depth=2, random_state=0).fit(*train_pt1)

In [34]:
# Tree-ensemble regressors for the FULL_PT2 sample: XGBoost (quiet logging)
# and extra-trees (100 trees, fixed seed), both fit on the PT2 training split.
train_pt2 = (xtrain_FULL_PT2, ytrain_FULL_PT2)

XGBR_FULL_PT2 = XGBRegressor(verbosity=0).fit(*train_pt2)
ETR_FULL_PT2 = etr(random_state=0, n_estimators=100).fit(*train_pt2)

In [35]:
# Remaining regressors for the FULL_PT2 sample, all fit on the same training split.
train_pt2 = (xtrain_FULL_PT2, ytrain_FULL_PT2)

# Nu-SVR with an RBF kernel; every argument is written out explicitly.
NSVR_FULL_PT2 = NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
                      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False).fit(*train_pt2)

# Epsilon-SVR.
SVR_FULL_PT2 = svr(C=1.0, epsilon=0.2).fit(*train_pt2)

# Histogram gradient boosting. random_state is pinned because, per the
# scikit-learn docs, the default early stopping on large training sets holds
# out a random validation subset, which would make the fit non-reproducible.
HGBR_FULL_PT2 = HG(random_state=0).fit(*train_pt2)

# AdaBoost, 100 estimators.
ADA_FULL_PT2 = Ada(random_state=0, n_estimators=100).fit(*train_pt2)

# k-nearest neighbours, k=2.
KNR_FULL_PT2 = Kneigh(n_neighbors=2).fit(*train_pt2)

# Random forest limited to depth-2 trees.
RFR_FULL_PT2 = rfr(max_depth=2, random_state=0).fit(*train_pt2)

In [7]:
# Tree-ensemble regressors for the FULL_PT3 sample: XGBoost (quiet logging)
# and extra-trees (100 trees, fixed seed), both fit on the PT3 training split.
train_pt3 = (xtrain_FULL_PT3, ytrain_FULL_PT3)

XGBR_FULL_PT3 = XGBRegressor(verbosity=0).fit(*train_pt3)
ETR_FULL_PT3 = etr(random_state=0, n_estimators=100).fit(*train_pt3)

In [18]:
# Nu-SVR with an RBF kernel for the FULL_PT3 sample; every argument is written
# out explicitly.
nusvr_settings = dict(
    kernel='rbf',
    gamma='scale',
    degree=3,
    coef0=0.0,
    nu=0.5,
    C=1.0,
    tol=0.001,
    shrinking=True,
    cache_size=200,
    max_iter=-1,
    verbose=False,
)
NSVR_FULL_PT3 = NuSVR(**nusvr_settings).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [19]:
# Epsilon-SVR for the FULL_PT3 sample (C=1.0, epsilon=0.2).
svr_settings = {"C": 1.0, "epsilon": 0.2}
SVR_FULL_PT3 = svr(**svr_settings).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [8]:
# Histogram gradient boosting for the FULL_PT3 sample.
# random_state is pinned because, per the scikit-learn docs, the default early
# stopping on large training sets holds out a random validation subset, which
# would make the fit (and the model pickled later) non-reproducible.
HGBR_FULL_PT3 = HG(random_state=0).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [9]:
# AdaBoost for the FULL_PT3 sample: 100 estimators, fixed seed.
ada_settings = {"n_estimators": 100, "random_state": 0}
ADA_FULL_PT3 = Ada(**ada_settings).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [10]:
# k-nearest-neighbours regressor for the FULL_PT3 sample, using the 2 closest
# training points.
knr_settings = {"n_neighbors": 2}
KNR_FULL_PT3 = Kneigh(**knr_settings).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [11]:
# Random forest for the FULL_PT3 sample: trees capped at depth 2, fixed seed.
rfr_settings = {"random_state": 0, "max_depth": 2}
RFR_FULL_PT3 = rfr(**rfr_settings).fit(xtrain_FULL_PT3, ytrain_FULL_PT3)

In [17]:
# Score of the FULL_PT3 extra-trees model on the held-out test split
# (for scikit-learn regressors, .score() is the R^2 coefficient).
etr_full_pt3_test_score = ETR_FULL_PT3.score(xtest_FULL_PT3, ytest_FULL_PT3)
etr_full_pt3_test_score

0.4804192305312387

# save models:

In [None]:
# Persist every fitted ZS / FULL_PT1 / FULL_PT2 model to its own pickle file
# in the current working directory (existing files are overwritten).
# Each file is opened in a `with` block so the handle is flushed and closed
# right after the dump; the bare `pickle.dump(obj, open(...))` form never
# closes the file explicitly.
models_to_save = {
    'XGBR_model_ZS.sav': XGBR_ZS,
    'ETR_model_ZS.sav': ETR_ZS,
    'NSVR_model_ZS.sav': NSVR_ZS,
    'SVR_model_ZS.sav': SVR_ZS,
    'HGBR_model_ZS.sav': HGBR_ZS,
    'ADA_model_ZS.sav': ADA_ZS,
    'KNR_model_ZS.sav': KNR_ZS,
    'RFR_model_ZS.sav': RFR_ZS,

    'XGBR_model_FULL_PT1.sav': XGBR_FULL_PT1,
    'ETR_model_FULL_PT1.sav': ETR_FULL_PT1,
    'NSVR_model_FULL_PT1.sav': NSVR_FULL_PT1,
    'SVR_model_FULL_PT1.sav': SVR_FULL_PT1,
    'HGBR_model_FULL_PT1.sav': HGBR_FULL_PT1,
    'ADA_model_FULL_PT1.sav': ADA_FULL_PT1,
    'KNR_model_FULL_PT1.sav': KNR_FULL_PT1,
    'RFR_model_FULL_PT1.sav': RFR_FULL_PT1,

    'XGBR_model_FULL_PT2.sav': XGBR_FULL_PT2,
    'ETR_model_FULL_PT2.sav': ETR_FULL_PT2,
    'NSVR_model_FULL_PT2.sav': NSVR_FULL_PT2,
    'SVR_model_FULL_PT2.sav': SVR_FULL_PT2,
    'HGBR_model_FULL_PT2.sav': HGBR_FULL_PT2,
    'ADA_model_FULL_PT2.sav': ADA_FULL_PT2,
    'KNR_model_FULL_PT2.sav': KNR_FULL_PT2,
    'RFR_model_FULL_PT2.sav': RFR_FULL_PT2,
}

# dicts preserve insertion order, so files are written in the order listed.
for model_path, fitted_model in models_to_save.items():
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [20]:
# Persist every fitted FULL_PT3 model to its own "*_tgt.sav" pickle file in the
# current working directory (existing files are overwritten).
# Each file is opened in a `with` block so the handle is flushed and closed
# right after the dump; the bare `pickle.dump(obj, open(...))` form never
# closes the file explicitly.
pt3_models_to_save = {
    'XGBR_model_FULL_PT3_tgt.sav': XGBR_FULL_PT3,
    'ETR_model_FULL_PT3_tgt.sav': ETR_FULL_PT3,
    'NSVR_model_FULL_PT3_tgt.sav': NSVR_FULL_PT3,
    'SVR_model_FULL_PT3_tgt.sav': SVR_FULL_PT3,
    'HGBR_model_FULL_PT3_tgt.sav': HGBR_FULL_PT3,
    'ADA_model_FULL_PT3_tgt.sav': ADA_FULL_PT3,
    'KNR_model_FULL_PT3_tgt.sav': KNR_FULL_PT3,
    'RFR_model_FULL_PT3_tgt.sav': RFR_FULL_PT3,
}

# dicts preserve insertion order, so files are written in the order listed.
for model_path, fitted_model in pt3_models_to_save.items():
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [None]:
# Write the held-out features and targets of the ZS, FULL_PT1 and FULL_PT2
# samples to pickle files in the current working directory.
test_sets_to_save = (
    (xtest_ZS, "xtest_ZS.pkl"),
    (ytest_ZS, "ytest_ZS.pkl"),
    (xtest_FULL_PT1, "xtest_FULL_PT1.pkl"),
    (ytest_FULL_PT1, "ytest_FULL_PT1.pkl"),
    (xtest_FULL_PT2, "xtest_FULL_PT2.pkl"),
    (ytest_FULL_PT2, "ytest_FULL_PT2.pkl"),
)
for test_frame, pickle_path in test_sets_to_save:
    test_frame.to_pickle(pickle_path)

In [21]:
# Write the held-out FULL_PT3 features, targets and full event rows to
# "*_tgt.pkl" pickle files in the current working directory.
pt3_test_sets_to_save = (
    (xtest_FULL_PT3, "xtest_FULL_PT3_tgt.pkl"),
    (ytest_FULL_PT3, "ytest_FULL_PT3_tgt.pkl"),
    (events_FULL_PT3_test, "events_FULL_PT3_test_tgt.pkl"),
)
for test_frame, pickle_path in pt3_test_sets_to_save:
    test_frame.to_pickle(pickle_path)