In [None]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import metrics
import xgboost as xgb

In [None]:
%%time
# Resolve the working directory once; figures go to <cwd>/fig and the input
# tables are read from <cwd>/data/<name>.csv.
workdir = os.getcwd()
fig_dir = workdir + '/fig'
os.makedirs('fig', exist_ok=True)

# Names of the input tables; position in this list is the integer key used
# for every per-algorithm dictionary in the notebook.
algo_name = ['T23', 'S', 'S0', 'S10', 'S20', 'T', 'T0', 'T10', 'T20']

# algo[i] holds the DataFrame read from the CSV named after algo_name[i].
algo = {}
for i, csv_stem in enumerate(algo_name):
    csv_path = workdir + '/data/{}.csv'.format(csv_stem)
    algo[i] = pd.read_csv(csv_path)

In [None]:
%%time
# Selection cuts and label encoding.
# Rows are kept when genpart_pt > ptcut and etamin < |genpart_exeta| < etamax,
# then rows containing any missing value are dropped.
ptcut = 10
etamin = 1.6
etamax = 2.9

algo_cut = {}
for i in algo:
    abs_eta = algo[i]['genpart_exeta'].abs()
    sel = (algo[i]['genpart_pt'] > ptcut) & (abs_eta > etamin) & (abs_eta < etamax)

    # Take an explicit copy of the selected rows so the relabelling below
    # writes to an independent frame instead of a slice of algo[i].
    selected = algo[i].loc[sel].dropna().copy()

    # Encode the particle id as a binary class label: +-11 -> 0, +-211 -> 1.
    # The result is assigned back to the column: an in-place replace() on
    # frame['col'] is a chained assignment, which pandas warns about and which
    # does not modify the frame when Copy-on-Write is enabled.
    selected['genpart_pid'] = (
        selected['genpart_pid'].replace([-11, 11], 0).replace([-211, 211], 1)
    )
    algo_cut[i] = selected

In [None]:
from sklearn.model_selection import train_test_split
# Input columns handed to the split. 'cl3d_pt' is included here so that it is
# split together with the other columns.
columns = ['cl3d_eta', 'cl3d_showerlength',
           'cl3d_coreshowerlength', 'cl3d_firstlayer', 'cl3d_maxlayer', 'cl3d_szz',
           'cl3d_seetot', 'cl3d_spptot', 'cl3d_srrtot', 'cl3d_srrmean', 'cl3d_pt']

# Fixed seed so that Restart & Run All reproduces the same train/test
# partition; without it every execution shuffles the rows differently.
SPLIT_SEED = 42

X_train = {}
X_test = {}
y_train = {}
y_test = {}

# 80/20 split per algorithm; the target is the encoded 'genpart_pid' column.
for i in algo:
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(
        algo_cut[i][columns],
        algo_cut[i]['genpart_pid'],
        test_size=0.2,
        random_state=SPLIT_SEED,
    )

In [None]:
# Set the test-sample cl3d_pt aside in X_pt and remove that column from the
# feature matrices and from the feature list.
# The cell is written so it can be executed more than once: the original
# version raised KeyError / ValueError on a second run because the column had
# already been dropped and the name already removed from `columns`.
X_pt = {}
for i in algo:
    # Look the values up in algo_cut by row label rather than in X_test, so
    # they are still available after the column has been dropped from X_test.
    # Relies on algo_cut[i] having unique row labels (it is a filtered view of
    # a frame read with the default index).
    X_pt[i] = algo_cut[i].loc[X_test[i].index, 'cl3d_pt']
    X_test[i] = X_test[i].drop(columns='cl3d_pt', errors='ignore')
    X_train[i] = X_train[i].drop(columns='cl3d_pt', errors='ignore')

if 'cl3d_pt' in columns:
    columns.remove('cl3d_pt')


In [None]:
# Build one xgboost DMatrix per algorithm for the training and the test
# samples, labelled with the encoded target and the current feature names.
train = {
    key: xgb.DMatrix(data=X_train[key], label=y_train[key], feature_names=columns)
    for key in algo
}
test = {
    key: xgb.DMatrix(data=X_test[key], label=y_test[key], feature_names=columns)
    for key in algo
}

In [None]:
def plot_discr(feature, algo_idx=None, nbins=20):
    """Overlay the training-sample histograms of one feature for both classes.

    Draws on the current matplotlib axes: rows of ``X_train`` with label 0 are
    plotted as 'electrons' and rows with label 1 as 'pions'.

    Parameters
    ----------
    feature : str
        Column of ``X_train[algo_idx]`` to histogram; also used as x label.
    algo_idx : hashable, optional
        Key into ``X_train`` / ``y_train``. When omitted, the notebook-global
        loop variable ``i`` is used, which is how existing callers invoke this
        function from inside a ``for i in algo`` loop.
    nbins : int, optional
        Number of bins passed to ``plt.hist`` for each class (default 20).
    """
    if algo_idx is None:
        # Backward-compatible fallback: read the enclosing loop's index.
        algo_idx = i

    values = X_train[algo_idx][feature]
    labels = y_train[algo_idx]

    plt.hist(values[labels == 0],
             histtype='step', color='midnightblue', label='electrons', bins=nbins)
    plt.hist(values[labels == 1],
             histtype='step', color='firebrick', label='pions', bins=nbins)
    plt.xlabel(feature, fontsize=12)
    plt.ylabel('Events', fontsize=12)
    plt.legend(frameon=False)

In [None]:
%%time
# One figure per algorithm: a 4x3 grid with the class-separated histogram of
# every feature, saved under fig/feature_repart/<algorithm name>.png.
os.makedirs('fig/feature_repart', exist_ok=True)
for i in algo:  # plot_discr reads this loop variable as a global
    panel_title = algo_name[i]
    plt.figure(figsize=(15, 30))
    for j, feature in enumerate(columns, start=1):
        plt.subplot(4, 3, j)
        plot_discr(feature)
        plt.title(panel_title)
    plt.savefig(fig_dir + '/feature_repart/{}.png'.format(panel_title))

In [None]:
%%time
# Correlation matrices of the input features, computed separately for rows
# with genpart_pid == 0 ("signal") and genpart_pid == 1 ("background").
# One two-panel figure per algorithm is written to fig/corr_matrix/.
os.makedirs(name='fig/corr_matrix', exist_ok=True)
corrsig = {}
corrbac = {}
for i in algo:
    is_signal = algo_cut[i]['genpart_pid'] == 0
    is_background = algo_cut[i]['genpart_pid'] == 1
    corrsig[i] = algo_cut[i].loc[is_signal, columns].corr()
    corrbac[i] = algo_cut[i].loc[is_background, columns].corr()

    fig, ax = plt.subplots(ncols=2, figsize=(20, 10), sharey=True, sharex=True)
    fig.suptitle(algo_name[i])

    panels = zip(ax, (corrsig[i], corrbac[i]), ('signal', 'background'))
    for axs, corr, panel_title in panels:
        cax = axs.matshow(corr, cmap='PiYG', vmin=-1, vmax=1)

        # Configure ticks on the panel itself. plt.xticks / plt.yticks act on
        # pyplot's current axes, which is not necessarily the panel in `axs`.
        tick_positions = list(range(len(corr.columns)))
        axs.set_xticks(tick_positions)
        axs.set_xticklabels(corr.columns)
        axs.set_yticks(tick_positions)
        axs.set_yticklabels(corr.columns)
        axs.tick_params(axis='x', labelrotation=90)

        # Large pad leaves room for the rotated tick labels above the matrix.
        axs.set_title(panel_title, pad=70)
        fig.colorbar(cax, ax=axs)

    fig.savefig(fig_dir + '/corr_matrix/correlation_matrix_%s.png' % algo_name[i])

In [None]:
%%time
# Tag every selected frame with its algorithm name and a boolean 'electron'
# flag (True where genpart_pid == 0), then stack all of them into algo_all.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated rows on every iteration, and the
# former `if i == 0` initialisation only worked when key 0 was visited first.
tagged_frames = []
for i in algo:
    # assign() returns a new frame, so no column is written into a filtered
    # slice; the dictionary entry is rebound so algo_cut[i] still ends up
    # carrying the two extra columns.
    algo_cut[i] = algo_cut[i].assign(
        algo=algo_name[i],
        electron=algo_cut[i]['genpart_pid'] == 0,
    )
    tagged_frames.append(algo_cut[i])

algo_all = pd.concat(tagged_frames)


In [None]:
%%time
# Violin plots of every feature for all algorithms, split by the 'electron'
# flag, stacked vertically in a single figure.

# Manual y-axis limits, keyed by feature name instead of subplot position so
# they stay attached to the right feature if `columns` is reordered or edited.
ylim_by_feature = {
    'cl3d_firstlayer': {'top': 18},
    'cl3d_maxlayer': {'bottom': 8, 'top': 35},
    'cl3d_szz': {'top': 30},
}

plt.figure(figsize=(25, 60))

# One row per feature; the row count follows the feature list rather than a
# hard-coded 10, which would fail as soon as the list grows.
n_rows = len(columns)
for j, feature in enumerate(columns, start=1):
    plt.subplot(n_rows, 1, j)
    ax = sns.violinplot(y=feature, x='algo', hue='electron', data=algo_all, split=True)
    if feature in ylim_by_feature:
        ax.set_ylim(**ylim_by_feature[feature])

plt.suptitle('Violin plot')
plt.savefig(fig_dir + '/violinplotall.png')