In [None]:
import uproot
import ROOT 
import matplotlib.pyplot as plt
from math import pi
import awkward as ak
import numpy as np

import vector
vector.register_awkward()

import torch
import torch.nn as nn
import pandas as pd
import math
import random

from ROOT import TCanvas, TH1F, TLegend, TFile

In [None]:
# Base location of the input ROOT files; the "Events" tree is read from each one.
server = "file:/eos/home-d/dkondrat/"
treedata, treettbar, treedy = (
    uproot.open(f"{server}{sample_name}.root")["Events"]
    for sample_name in ("data", "ttbar", "dy")
)

In [None]:
def _muon_event_table(tree):
    """Zip the muon, nMuon and MET_pt branches of `tree` into one awkward array.

    The field names (nmu, mu_pt, ...) are the short aliases used in the
    cells that follow.
    NOTE(review): nMuon and MET_pt look like per-event quantities zipped
    with per-muon ones; later cells read `met[:, 0]`, so they are
    presumably broadcast to every muon -- confirm.
    """
    field_to_branch = {
        "nmu": "nMuon",
        "mu_pt": "Muon_pt",
        "mu_eta": "Muon_eta",
        "mu_charge": "Muon_charge",
        "mu_id": "Muon_isGlobal",
        "mu_phi": "Muon_phi",
        "mu_mass": "Muon_mass",
        "met": "MET_pt",
    }
    columns = {field: tree[branch].array() for field, branch in field_to_branch.items()}
    return ak.Array(ak.zip(columns))


muonsdy = _muon_event_table(treedy)
muonstt = _muon_event_table(treettbar)
muonsdata = _muon_event_table(treedata)

# Branch names read a second time as plain event arrays (used by the PyROOT cells).
mu_branches = ['Muon_pt', 'Muon_charge', 'Muon_isGlobal', 'Muon_eta']
met_branches = ['MET_pt', 'MET_phi', 'MET_sumEt']
branches = mu_branches + met_branches

dyevents, ttbarevents, dataevents = (
    tree.arrays(filter_name=branches) for tree in (treedy, treettbar, treedata)
)


## **Plotting using matplotlib**

In [None]:
# Print the values of some branches from the Drell-Yan sample.
print('muon_pt=', muonsdy.mu_pt, '\n',
      'muon_eta=', muonsdy.mu_eta, '\n')

# Plot the pT of all muons with matplotlib: simulation as step histograms,
# data as points with sqrt(N) error bars.
plt.figure(figsize=(6,5))
plt.hist(ak.flatten(muonsdy.mu_pt), bins=150, range=(0, 150), histtype='step', linewidth=2, color='blue', label='DY+Jets')
plt.hist(ak.flatten(muonstt.mu_pt), bins=150, range=(0, 150), histtype='step', linewidth=2, color='orange', label='ttbar')
# The data histogram is drawn with linewidth=0; it is only used to obtain
# the bin contents `n` and the bin edges `bins`.
n, bins, patches = plt.hist(ak.flatten(muonsdata.mu_pt), bins=150, range=(0, 150), histtype='step', linewidth=0)

errory = np.sqrt(n)
# Place each data point at the centre of its bin. np.linspace(0, 150, 150)
# has a spacing of 150/149, so the points would drift away from the bins.
bin_centres = 0.5 * (bins[:-1] + bins[1:])
plt.errorbar(bin_centres, n, yerr=errory, fmt='o', markersize=3, color='k', label='Data')

# Raw string: "\m" is not a valid escape sequence in a normal string literal.
plt.xlabel(r'Muon $p_{\mathrm{T}}$ [GeV]')
plt.ylabel('Events')
plt.yscale('log')
plt.legend()

plt.savefig("muon_pt.pdf")
plt.show()
plt.clf()

## **Plotting using PyROOT**

In [None]:
# Book and fill one muon-pT histogram per sample with PyROOT.
# Only the first muon of each event (pt[0]) is filled.
c1 = TCanvas( 'c1', 'Dynamic Filling Example', 200, 10, 700, 500 )
c1.Clear()
c1.cd()

colors_ = [600, 800, 1]  # ROOT colour indices: kBlue, kOrange, kBlack

h = []
for i, obj in enumerate([dyevents['Muon_pt'], ttbarevents['Muon_pt'], dataevents['Muon_pt']]):
    hist = ROOT.TH1F("muon_pt"+str(i), "Muon pT", 150, 0, 150)
    # Style the histogram once, outside the event loop. Doing it per filled
    # event repeats the same call for every event and leaves a sample with
    # no muons unstyled.
    hist.SetLineColor(colors_[i])
    hist.SetLineWidth(2)
    for pt in obj:
        # Skip events without any muon.
        if len(pt) != 0:
            hist.Fill(pt[0])
    h.append(hist)

In [None]:
# Draw the three muon-pT histograms on a log-y canvas and save the canvas as a PDF.
c1.Clear()
c1.SetLogy()

# h[0] is drawn first; the other two are overlaid with "same".
for hist, draw_option in zip(h, ('', 'same', 'lep,same')):
    hist.Draw(draw_option)

# Statistics box, y range and axis titles are set on h[0].
first_hist = h[0]
first_hist.SetStats(0)
y_axis = first_hist.GetYaxis()
y_axis.SetRangeUser(0.2, 1e5)
y_axis.SetTitle("Events")
first_hist.GetXaxis().SetTitle("#mu pT [GeV]")

legend = TLegend(0.60, 0.70, 0.88, 0.88)
legend.SetTextSize(0.05)
for hist, entry_label in zip(h, ("DY+Jets", "ttbar", "Data")):
    legend.AddEntry(hist, entry_label, "lp")
legend.Draw()

c1.Modified()
c1.Update()

c1.Draw()
c1.SaveAs("muon_pt_hist.pdf")

## Let's apply some selections

In [None]:
def _select_two_good_muons(muons):
    """Apply the muon selection to one sample.

    A muon passes when mu_id == 1 (filled from Muon_isGlobal), pT > 20 and
    |eta| < 2.4. Returns a tuple (mask, good_muons, two_muon_events):
    the per-muon mask, the muons passing it, and the subset of events in
    which exactly two muons pass.
    """
    passes = (muons.mu_id == 1) & (muons.mu_pt > 20) & (abs(muons.mu_eta) < 2.4)
    selected = muons[passes]
    has_exactly_two = ak.sum(passes, axis=-1) == 2
    return passes, selected, selected[has_exactly_two]


muon_maskdy, good_muonsdy, two_muonsdy = _select_two_good_muons(muonsdy)
muon_masktt, good_muonstt, two_muonstt = _select_two_good_muons(muonstt)
muon_maskdd, good_muons_data, two_muons_data = _select_two_good_muons(muonsdata)


## Let's plot the pT distribution of the leading muon after the selection


In [None]:
# pT of the first selected muon in events with exactly two selected muons
# (selection: pT > 20 GeV, |eta| < 2.4, global muon).
plt.figure(figsize=(5,4))
plt.hist(two_muonsdy.mu_pt[:,0], bins=150, range=[0,150], histtype='step', linewidth=2, color='blue', label='DY+Jets with criteria')
plt.hist(two_muonstt.mu_pt[:,0], bins=150, range=[0,150], histtype='step', linewidth=2, color='orange', label='ttbar with criteria')
# Drawn with linewidth=0: only the bin contents and edges are needed for the data points.
n, bins, patches = plt.hist(two_muons_data.mu_pt[:,0], bins=150, range=[0,150], histtype='step', linewidth=0)

errory = np.sqrt(n)
# Data points belong at the bin centres; np.linspace(0, 150, 150) does not
# coincide with them (its spacing is 150/149).
bin_centres = 0.5 * (bins[:-1] + bins[1:])
plt.errorbar(bin_centres, n, yerr=errory, fmt='o', markersize=3, color='k', label='Data')

plt.legend()
plt.xlabel(r'Muon $p_T$ (GeV)')
plt.ylabel('Events')
plt.yscale('log')
plt.savefig("muon_pt_cuts.pdf")
plt.show()
plt.clf()

## Let's plot the MET

In [None]:
# MET in events with exactly two selected muons. `met` is stored once per
# muon in the zipped arrays, so the first entry of each event is used.
plt.figure(figsize=(5,4))
plt.hist(two_muonsdy.met[:,0], bins=150, range=[0,150], histtype='step', linewidth=2, color='blue', label='DY+Jets')
plt.hist(two_muonstt.met[:,0], bins=150, range=[0,150], histtype='step', linewidth=2, color='orange', label='ttbar')
# Drawn with linewidth=0: only the bin contents and edges are needed for the data points.
n, bins, patches = plt.hist(two_muons_data.met[:,0], bins=150, range=[0,150], histtype='step', linewidth=0)

errory = np.sqrt(n)
# Data points belong at the bin centres; np.linspace(0, 150, 150) does not
# coincide with them (its spacing is 150/149).
bin_centres = 0.5 * (bins[:-1] + bins[1:])
plt.errorbar(bin_centres, n, yerr=errory, fmt='o', markersize=3, color='k', label='Data')

plt.legend()
plt.xlabel(r'MET (GeV)')
plt.ylabel('Events')

plt.savefig("met.pdf")
plt.show()
plt.clf()

## Let's select two muons of opposite charge


In [None]:
def _keep_opposite_charge(two_muons):
    """Keep the events whose two muons carry different charges.

    Returns (mask, selected_events); the mask compares mu_charge of the
    first and second muon of every event.
    """
    charge = two_muons.mu_charge
    is_opposite = charge[:, 0] != charge[:, 1]
    return is_opposite, two_muons[is_opposite]


# Drell-Yan
opp_muonsdy, two_opp_good_muonsdy = _keep_opposite_charge(two_muonsdy)

# ttbar
opp_muonstt, two_opp_good_muonstt = _keep_opposite_charge(two_muonstt)

# Data
opp_muons_data, two_opp_good_muons_data = _keep_opposite_charge(two_muons_data)

'''We have two good muons now in Data as well'''

In [None]:
def _to_momentum4d(muons):
    """Repack the pt/eta/phi/mass fields of `muons` as an array named "Momentum4D"."""
    components = {
        "pt": muons.mu_pt,
        "eta": muons.mu_eta,
        "phi": muons.mu_phi,
        "mass": muons.mu_mass,
    }
    return ak.Array(ak.zip(components), with_name="Momentum4D")


# Per-muon four-vectors and their per-event sum (the dimuon system) for each sample.
mu_p4dy = _to_momentum4d(two_opp_good_muonsdy)
dimuon_p4dy = mu_p4dy[:, 0] + mu_p4dy[:, 1]

mu_p4tt = _to_momentum4d(two_opp_good_muonstt)
dimuon_p4tt = mu_p4tt[:, 0] + mu_p4tt[:, 1]

mu_p4_data = _to_momentum4d(two_opp_good_muons_data)
dimuon_p4_data = mu_p4_data[:, 0] + mu_p4_data[:, 1]

## Plotting the dimuon mass

In [None]:
# Dimuon invariant mass: simulation as step histograms, data as points
# with sqrt(N) error bars.
plt.figure(figsize=(5,4))

dimuon_mass_dy = dimuon_p4dy.M
dimuon_mass_tt = dimuon_p4tt.M
dimuon_mass_dd = dimuon_p4_data.M

plt.hist(dimuon_mass_dy, bins=150, range=[0,500], histtype='step', linewidth=2, color='blue', label='DY+Jets')
plt.hist(dimuon_mass_tt, bins=150, range=[0,500], histtype='step', linewidth=2, color='orange', label='ttbar')
# Drawn with linewidth=0: only the bin contents and edges are needed for the data points.
n, bins, patches = plt.hist(dimuon_mass_dd, bins=150, range=[0,500], histtype='step', linewidth=0)

errory = np.sqrt(n)
# Data points belong at the bin centres; np.linspace(0, 500, 150) starts at
# the lower edge and has a spacing of 500/149, so it does not match the bins.
bin_centres = 0.5 * (bins[:-1] + bins[1:])
plt.errorbar(bin_centres, n, yerr=errory, fmt='o', markersize=3, color='k', label='Data')

plt.title('Dimuon invariant mass')
plt.xlabel('Dimuon invariant mass [GeV]')
plt.ylabel('Events')
plt.yscale('log')
plt.legend()
plt.savefig("dimuon_mass.pdf")
plt.show()
plt.clf()


# Training a DNN using PyTorch to select the Drell-Yan events 

In [None]:
# Create one pandas DataFrame per sample with the inputs of the DNN.

def _dnn_input_frame(mu_p4, met):
    """Build the per-event feature table of one sample.

    Parameters
    ----------
    mu_p4 : awkward array of muon four-vectors; entries 0 and 1 of every
        event are used as mu1 and mu2.
    met : one MET value per event.

    Returns
    -------
    pandas.DataFrame with the mu1_* and mu2_* columns, 'dimuon_mass'
    and 'MET'.
    """
    mu1 = mu_p4[:, 0]
    mu2 = mu_p4[:, 1]
    dimu = mu1 + mu2

    df1 = pd.DataFrame(mu1.to_list()).add_prefix('mu1_')
    df2 = pd.DataFrame(mu2.to_list()).add_prefix('mu2_')
    df_mass = pd.DataFrame(dimu.M, columns=['dimuon_mass'])
    df_met = pd.DataFrame(met, columns=['MET'])
    return pd.concat([df1, df2, df_mass, df_met], axis=1)


df_dy = _dnn_input_frame(mu_p4dy, two_opp_good_muonsdy.met[:,0])
df_tt = _dnn_input_frame(mu_p4tt, two_opp_good_muonstt.met[:,0])
df_data = _dnn_input_frame(mu_p4_data, two_opp_good_muons_data.met[:,0])

load_features = ['mu1_pt', 'mu1_eta', 'mu2_pt', 'mu2_eta', 'dimuon_mass','MET']

# Drell-Yan is the signal (label 1), ttbar the background (label 0).
# .assign() returns a new frame, so the label column is not written into a
# column slice of df_dy / df_tt (which triggers SettingWithCopyWarning).
df_sig = df_dy[load_features].assign(label=1.0)
df_bkg = df_tt[load_features].assign(label=0.0)

# NOTE: the rows are ordered signal first, background second.
dataset = pd.concat([df_sig, df_bkg], ignore_index=True)
dataset = dataset.dropna()



In [None]:
# Train on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators.
seed = 123
random.seed(seed)
np.random.seed(seed)
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# cuDNN settings: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class NeuralNet(nn.Module):
    """Fully connected network whose output passes through a sigmoid.

    The layer stack is Linear(input_size, hidden_sizes[0]) -> ReLU, then
    Linear -> ELU for every further hidden width, and finally
    Linear(hidden_sizes[-1], num_classes) -> Sigmoid.

    Parameters
    ----------
    input_size : number of input features.
    hidden_sizes : sequence with the width of every hidden layer
        (at least one entry).
    num_classes : number of output nodes.
    """

    def __init__(self, input_size, hidden_sizes, num_classes):
        super().__init__()
        layers = [nn.Linear(input_size, hidden_sizes[0]), nn.ReLU()]
        for width_in, width_out in zip(hidden_sizes[:-1], hidden_sizes[1:]):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.ELU())

        layers.append(nn.Linear(hidden_sizes[-1], num_classes))
        layers.append(nn.Sigmoid())
        self.layers = nn.ModuleList(layers)

    def get_weights(self):
        """Return the weight tensors of all Linear layers, in forward order.

        The module has no `weight` attribute of its own (returning
        `self.weight` raised AttributeError); the weights live on the
        individual Linear layers.
        """
        return [layer.weight for layer in self.layers if isinstance(layer, nn.Linear)]

    def forward(self, x):
        """Pass `x` through every layer in order and return the sigmoid output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    
# Network architecture: one input node per feature, two hidden layers, one output node.
input_size = len(load_features)
hidden_sizes = [16, 8]
num_classes = 1

# Optimisation settings.
learning_rate = 0.001
num_epochs = 20
batch_size = 256


In [None]:
# Model, loss, optimiser and exponential learning-rate decay (one step per epoch).
model = NeuralNet(input_size, hidden_sizes, num_classes).to(device)
model = model.float()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
decayRate = 0.6
my_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

# `dataset` holds all signal rows followed by all background rows. Shuffle it
# (reproducibly) before the 70/30 split; otherwise the training part and the
# validation part would contain very different class mixtures.
shuffled = dataset.sample(frac=1.0, random_state=seed).reset_index(drop=True)
train_size = int(0.7*len(shuffled))

train_data = shuffled.iloc[:train_size, :]
val_data = shuffled.iloc[train_size:, :]

train = train_data.drop(columns=["label"]).values
val = val_data.drop(columns=["label"]).values

train_labels = train_data["label"].values
val_labels = val_data["label"].values

total_step = train_size
n_batches = int(train_size/batch_size)  # a trailing partial batch is not used

print("RUNNING... ")

for epoch in range(num_epochs):
    # Running sums of (loss * number of events) and of the event counts since
    # the last printout; their ratio is the mean loss that gets printed.
    mean_loss = 0
    tot_wgt = 0
    val_mean_loss = 0
    val_tot_wgt = 0
    for i in range(n_batches):
        # Move tensors to the configured device
        batch = slice(i*batch_size, (i+1)*batch_size)
        data = torch.from_numpy(train[batch]).to(device)
        label = torch.from_numpy(train_labels[batch].reshape((batch_size,1))).to(device)

        outputs = model(data.float())
        loss = criterion(outputs, label.float())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        mean_loss += loss.item()*batch_size
        tot_wgt += batch_size

        # Evaluate one validation batch for every fourth training batch.
        if i%4 == 0:
            j = int(i/4)
            val_batch = slice(j*batch_size, (j+1)*batch_size)
            val_features = val[val_batch]
            # Skip the evaluation if the validation sample is exhausted.
            if len(val_features) > 0:
                val_inputs = torch.from_numpy(val_features).to(device)
                val_label = torch.from_numpy(val_labels[val_batch].reshape(-1, 1)).to(device)
                # No gradients are needed for validation.
                with torch.no_grad():
                    val_outputs = model(val_inputs.float())
                    val_loss = criterion(val_outputs, val_label.float())
                val_mean_loss += val_loss.item()*len(val_features)
                val_tot_wgt += len(val_features)

        if (i+1) % 150 == 0:
            train_loss_avg = mean_loss/tot_wgt
            val_loss_avg = val_mean_loss/val_tot_wgt if val_tot_wgt else float('nan')
            print ('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, int(total_step/batch_size), train_loss_avg, val_loss_avg))
            mean_loss=0
            tot_wgt=0
            val_mean_loss=0
            val_tot_wgt=0
    my_lr_scheduler.step()

torch.save(model.state_dict(), 'model.ckpt')

'''The model has been saved. Lets now evaluate the results'''



## Plotting the DNN Score

In [None]:
# Load the trained weights and switch the model to evaluation mode.
# map_location lets the checkpoint load on whichever device is in use now.
model.load_state_dict(torch.load("model.ckpt", map_location=device))
model.eval()

df_sig = df_dy[load_features]
df_bkg = df_tt[load_features]
df_data = df_data[load_features]

# NOTE(review): the next cell uses the scores as masks on the awkward event
# arrays, which only lines up if dropna() removes no rows -- confirm.
df_sig = df_sig.dropna()
df_bkg = df_bkg.dropna()
df_data = df_data.dropna()


def _dnn_scores(inputs):
    """Return the model output for the tensor `inputs` as a flat numpy array."""
    # Inference only: no_grad() avoids building an autograd graph for the whole sample.
    with torch.no_grad():
        scores = model(inputs.float())
    return scores.cpu().numpy().ravel()


sig = torch.from_numpy(df_sig.values).to(device)
bkg = torch.from_numpy(df_bkg.values).to(device)
data = torch.from_numpy(df_data.values).to(device)

sig_scores = _dnn_scores(sig)
bkg_scores = _dnn_scores(bkg)
data_scores = _dnn_scores(data)

# DNN score distributions of signal (DY) and background (ttbar).
bins = np.linspace(0, 1, 100)
plt.figure(figsize=(5,4))

plt.hist(sig_scores, bins, alpha=0.3, label='sig')
plt.hist(bkg_scores, bins, alpha=0.3, label='bkg')
plt.xlabel('DNN Score')
plt.ylabel('Events')
plt.legend(loc='upper left')


## Let's check the dimuon mass after selection with the DNN-based discriminator

In [None]:
# Select Drell-Yan-like events: keep the events whose DNN score exceeds the cut.
dnn_cut = 0.7
sig_dy = two_opp_good_muonsdy[sig_scores > dnn_cut]
# The data events must be masked with the scores of the data events
# (data_scores), not with the scores of the Drell-Yan simulation.
data_dy = two_opp_good_muons_data[data_scores > dnn_cut]
bkg_tt = two_opp_good_muonstt[bkg_scores > dnn_cut]


def _dnn_momentum4d(muons):
    """Repack the pt/eta/phi/mass fields of `muons` as an array named "Momentum4D"."""
    return ak.Array(ak.zip({"pt": muons.mu_pt,
                            "eta": muons.mu_eta,
                            "phi": muons.mu_phi,
                            "mass": muons.mu_mass}), with_name="Momentum4D")


# Four-vectors and dimuon system after the DNN cut, for each sample.
mu_p4_dnn_dy = _dnn_momentum4d(sig_dy)
dimuon_dnn_dy = mu_p4_dnn_dy[:,0] + mu_p4_dnn_dy[:,1]

mu_p4_dnn_tt = _dnn_momentum4d(bkg_tt)
dimuon_dnn_tt = mu_p4_dnn_tt[:,0] + mu_p4_dnn_tt[:,1]

mu_p4_dnn_dd = _dnn_momentum4d(data_dy)
dimuon_dnn_data = mu_p4_dnn_dd[:,0] + mu_p4_dnn_dd[:,1]

In [None]:
# Dimuon invariant mass after the DNN cut: simulation as step histograms,
# data as points with sqrt(N) error bars.
plt.figure(figsize=(5,4))

dimuon_dnn_dymass = dimuon_dnn_dy.M
dimuon_dnn_ttmass = dimuon_dnn_tt.M
dimuon_dnn_ddmass = dimuon_dnn_data.M

plt.hist(dimuon_dnn_dymass, bins=150, range=[0,500], histtype='step', linewidth=2, color='blue', label='DY+Jets')
plt.hist(dimuon_dnn_ttmass, bins=150, range=[0,500], histtype='step', linewidth=2, color='orange', label='ttbar')
# Drawn with linewidth=0: only the bin contents and edges are needed for the data points.
n, bins, patches = plt.hist(dimuon_dnn_ddmass, bins=150, range=[0,500], histtype='step', linewidth=0)

errory = np.sqrt(n)
# Data points belong at the bin centres; np.linspace(0, 500, 150) starts at
# the lower edge and has a spacing of 500/149, so it does not match the bins.
bin_centres = 0.5 * (bins[:-1] + bins[1:])
plt.errorbar(bin_centres, n, yerr=errory, fmt='o', markersize=3, color='k', label='Data')

plt.title('Dimuon invariant mass')
plt.xlabel('Dimuon invariant mass [GeV]')
plt.ylabel('Events')
plt.yscale('log')
plt.legend()
plt.savefig("dimuon_mass_dnncut.pdf")
plt.show()
plt.clf()

## We see that the selected data events are predominantly Drell-Yan events