### pytorch notebook for reweighting using density ratio estimation with calibrated classifiers

The idea behind this notebook is to reweight one distribution $p_0(x)$ to look like another distribution $p_1(x)$.  

The reweighting technique is based on [Approximating Likelihood Ratios with Calibrated Discriminative Classifiers](http://inspirehep.net/record/1377273). 

In this notebook, V+jets samples generated with Madgraph5 ($p_0(x)$) and Sherpa ($p_1(x)$) are compared, and weights are derived to reweight Madgraph5 to look like Sherpa.  

The performance of the weights, i.e. how well the reweighted original distribution matches the target distribution, is assessed by training a discriminator to differentiate the original distribution with weights applied from a target distribution.  

Work in progress by Leonora Vesterbacka. 

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import theano
from itertools import product
import root_numpy
import pandas as pd
import uproot
import torch
# Seed every random number generator this notebook imports so that a
# Restart & Run All gives the same resampling and the same torch results.
# NOTE(review): torch randomness is presumably used by the training further
# down (not visible in this part of the notebook) — confirm.
np.random.seed(314)
torch.manual_seed(314);  # trailing ';' hides the returned Generator repr

Welcome to JupyROOT 6.18/00


In [2]:
# Training-set selection: "all" or "max balanced".
# The two samples differ a lot in size, so using every event gives an
# unbalanced dataset; some techniques deal with that better than others.
data_to_use = "all"

# Calibration method: "isotonic", "kde" or "histogram".
# The histogram and kde calibrations don't work very well with very peaked
# output score distributions, but the isotonic approach does.
calibration_type = "isotonic"

# Phase space used for the training: "two" trains on two variables only,
# "all" trains on the full list of variables.
do = "two"

# NOTE(review): presumably switches input normalisation on/off further down
# in the notebook — confirm.
normalize = False

In [3]:
# Choose the ROOT branches to train on (`variables`), their axis labels
# (`vlabels`), the histogram binning per variable (`binning`) and the name of
# the weight branch (`weights`) according to the training mode `do`.
# The labels are raw strings: their LaTeX backslashes (\mathrm, \eta) are not
# Python escape sequences and must reach matplotlib unchanged.
if do == "two":
    binning = [range(0, 2400, 200), range(0, 15, 1)]
    variables = ['VpT', 'Njets']
    vlabels = [r'V $\mathrm{p_{T}}$ [GeV]', 'Number of jets']

    weights = ['normweight']
elif do == "all":
    etaV = [-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    etaJ = [-2.8, -2.4, -2, -1.6, -1.2, -0.8, -0.4, 0, 0.4, 0.8, 1.2, 1.6, 2, 2.4, 2.8]
    variables = ['VpT', 'Njets', 'j1pT', 'j2pT', 'HT', 'ptmiss', 'l1pT', 'Veta', 'j1eta', 'j2eta']
    vlabels = [
        r'V $\mathrm{p_{T}}$ [GeV]',
        'Number of jets',
        r'Leading jet $\mathrm{p_{T}}$ [GeV]',
        r'Subleading jet $\mathrm{p_{T}}$ [GeV]',
        r'$\mathrm{H_{T}}$ [GeV]',
        r'$\mathrm{p_{T}^{miss}}$ [GeV]',
        r'Leading lepton $\mathrm{p_{T}}$ [GeV]',
        r'V $\eta$',
        r'Leading jet $\eta$',
        r'Subleading jet $\eta$',
    ]
    # one binning per entry of `variables`, in the same order
    binning = [
        range(0, 2400, 100),
        range(0, 15, 1),
        range(0, 2700, 100),
        range(0, 2700, 100),
        range(0, 4000, 200),
        range(0, 600, 50),
        range(0, 1500, 50),
        etaV,
        etaJ,
        etaJ,
    ]
    weights = ['normweight']
    #truthWeight is the generator weight
else:
    # fail here with a clear message instead of a NameError on `variables` later
    raise ValueError('unknown training mode do=%r, expected "two" or "all"' % (do,))
# Read the training variables of both samples from ROOT and convert each one
# to a DataFrame and then to a torch tensor.
# Madgraph is the original sample, Sherpa is the target sample.

# --- original (Madgraph) ---
original = root_numpy.root2array(
    '/eos/user/m/mvesterb/data/madgraph/one/Nominal.root',
    branches=variables,
)
oDF = pd.DataFrame(original, columns=variables)
oT = torch.tensor(oDF[variables].values)

# --- target (Sherpa) ---
target = root_numpy.root2array(
    '/eos/user/m/mvesterb/data/sherpa/one/Nominal.root',
    branches=variables,
)
tDF = pd.DataFrame(target, columns=variables)
tT = torch.tensor(tDF[variables].values)

# Currently disabled: read the sample weights into separate dataframes
# (intended for resampling).
#originalW = root_numpy.root2array('/eos/user/m/mvesterb/data/madgraph/one/Nominal.root', branches=weights)
#targetW   = root_numpy.root2array('/eos/user/m/mvesterb/data/sherpa/one/Nominal.root', branches=weights)
#oWDF  = pd.DataFrame(originalW,columns=weights)
#tWDF  = pd.DataFrame(targetW,columns=weights)

# quick look at the original sample
print("oT",oT)

('oT', tensor([[51.6451,  1.0000],
        [ 2.4802,  0.0000],
        [14.1178,  0.0000],
        ...,
        [71.5885,  0.0000],
        [ 9.1785,  0.0000],
        [10.2159,  0.0000]], dtype=torch.float64))


A discriminator is trained to differentiate the original and the target distributions from each other, as well as to differentiate the target distribution from the original distribution with the learned carl weights applied. Well-learned weights would make the target and reweighted distributions very similar and indistinguishable for the discriminator. 

In [4]:
# Draw 2*N row indices (with replacement) from the N original events; the
# first N drawn events form the training part, the last N the test part.
# NOTE(review): the draw is with replacement from a single pool, so the same
# event can end up in both X0_all and X0_test — confirm this is intended.
# NOTE(review): despite its name, n_target is the number of events in the
# *original* sample (oT), not in the target sample.
n_target = oT.shape[0]
rand = np.random.choice(np.arange(oT.shape[0]), size=2 * n_target, replace=True)
randomized_original = oT[rand]

X0_all, X0_test = randomized_original[:n_target], randomized_original[n_target:]

# the target sample is used as it is
X1_all = tT

# sizes of: original, original (train), original (test), target
for _tensor in (oT, X0_all, X0_test, X1_all):
    print(_tensor.shape)

torch.Size([20981, 2])
torch.Size([20981, 2])
torch.Size([20981, 2])
torch.Size([83316, 2])


In [5]:
# Build the labelled training sets from the original (X0) and target (X1)
# samples. Label convention: y = 1 for X0 entries, y = 0 for X1 entries.
num1 = X0_all.shape[0]
num2 = X1_all.shape[0]

# "all": every entry of both samples (unbalanced whenever num1 != num2)
X_all = torch.cat([X0_all, X1_all])
y_all = torch.ones(num1 + num2, dtype=torch.int)
y_all[num1:] = 0

# "max balanced": randomly resample X0 (with replacement) to have the same
# number of entries as X1, so that both classes contribute num2 entries.
# (Drawing only num1 entries here, as was done before, leaves this set exactly
# as unbalanced as "all".)
X0_s = X0_all[np.random.choice(num1, num2, replace=True)]
X_s = torch.cat((X0_s, X1_all))
y_s = torch.ones(num2 + num2, dtype=torch.int)
y_s[num2:] = 0

# alternative balanced set: resample X1 to the number of entries of X0.
# Not selectable through data_to_use in this cell; kept because later cells
# may use it.
X1_x = X1_all[np.random.choice(num2, num1, replace=True)]
X_x = torch.cat((X0_all, X1_x))
y_x = torch.ones(num1 + num1, dtype=torch.int)
y_x[num1:] = 0

# now use the flags to decide which of the datasets to use
if data_to_use == "all":
    X, X0, X1, y = X_all, X0_all, X1_all, y_all
elif data_to_use == "max balanced":
    X, X0, X1, y = X_s, X0_s, X1_all, y_s
else:
    # stop here with a clear message instead of failing on None below
    raise ValueError(
        'unknown data_to_use=%r, expected "all" or "max balanced"' % (data_to_use,)
    )

# Enable gradients with respect to the inputs. This modifies the selected
# tensors in place. On a fresh run X was concatenated before gradients were
# enabled, so gradients taken with respect to X do not propagate to X0 / X1.
X0.requires_grad_(True)
X1.requires_grad_(True)
X.requires_grad_(True)

print(X0.shape)
print(X1.shape)
print(y.shape)
print(X.shape)

torch.Size([20981, 2])
torch.Size([83316, 2])
torch.Size([104297])
torch.Size([104297, 2])
