In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import sys

import matplotlib.pyplot as plt

sys.path.append('../..')
from plotting.matplotlib_setup import configure_latex, savefig, set_size_decorator, savefig, thiner_border

# Locations used for the report: the LaTeX source file and the figure folder.
tex_dir = 'porocilo/main.tex'
images_dir = 'porocilo/images'

# Apply the shared plotting configuration; images_dir is passed as the
# global save path.
configure_latex(global_save_path=images_dir, style=['science', 'notebook'])

%config InlineBackend.figure_format = 'pdf'

In [None]:
from NIST_tests import RNG_test
from random_helper_functions import get_bitstring, binary_tree_walk
from benford_helper_functions import get_first_digit, benfords_test, normalize
from stat_tests import chi2_test, ks_test

Data source (Kaggle, CERN electron collision data): https://www.kaggle.com/fedesoriano/cern-electron-collision-data

# Load data

In [None]:
# Read the dielectron CSV file into the DataFrame `df` (path is relative to
# the notebook's working directory).
df = pd.read_csv('dielectron_data/dielectron.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Build a 5x4 grid of axes through the project's figure-size wrapper.
sized_subplots = set_size_decorator(plt.subplots, fraction=1.8, ratio='4:3')
fig, axs = sized_subplots(5, 4)

# The last panel of the grid is not handed to df.hist below, so hide it.
axs[-1, -1].set_visible(False)

# One step histogram per DataFrame column, drawn on all panels but the last.
used_panels = axs.flatten()[:-1]
hist_axes = df.hist(ax=used_panels, bins=40, histtype='step', lw=1.5)
axs = [thiner_border(panel) for panel in hist_axes]

# No grid lines; scientific notation on every y axis.
for ax in axs:
    ax.grid(False)
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

# savefig('dielectron_hists', tight_layout=False)

# Naboji

In [None]:
# Replace every -1 in the two charge columns with 0, so that each entry can
# later be written as one character of a bit string.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = value): the chained form assigns into an intermediate
# object, triggers SettingWithCopyWarning, and leaves `df` unchanged when
# pandas Copy-on-Write is active.
df.loc[df['Q1'] == -1, 'Q1'] = 0
df.loc[df['Q2'] == -1, 'Q2'] = 0

In [None]:
# Charge columns as NumPy arrays.
Q1, Q2 = df['Q1'].values, df['Q2'].values

# Each array written out as one long string: every entry is converted to its
# string form and the pieces are joined without a separator.
q1 = ''.join(Q1.astype(str))
q2 = ''.join(Q2.astype(str))

In [None]:
# All Q1 entries followed by all Q2 entries, written as a single string.
charges_appended = np.concatenate([df['Q1'].values, df['Q2'].values])
q = ''.join(charges_appended.astype(str))

In [None]:
# Stack the two charge arrays as rows and transpose, giving one (Q1, Q2) pair
# per row; flattening row by row then alternates Q1[0], Q2[0], Q1[1], Q2[1], ...
charge_pairs = np.vstack((Q1, Q2)).T
Q = ''.join(charge_pairs.flatten().astype(str))

In [None]:
# Run RNG_test on each of the four charge strings, in this order:
# Q1 only, Q2 only, Q1 followed by Q2, and Q1/Q2 interleaved.
t1, t2, t3, t4 = [RNG_test(bits, short_df=True) for bits in (q1, q2, q, Q)]

# Stack the four result frames on top of each other.
test_q_df = pd.concat([t1, t2, t3, t4])

In [None]:
# Number the result columns 1..n. The count is taken from the frame itself
# rather than hard-coded, so the labelling keeps working if RNG_test returns
# a different number of columns.
test_q_df.columns = list(range(1, test_q_df.shape[1] + 1))

# Row labels, in the order the four results were concatenated:
# Q1, Q2, Q1 followed by Q2, Q1/Q2 interleaved.
test_q_df.index = [r'$p_{Q_1}$', r'$p_{Q_2}$', r'$p_{Q_1 Q_2}$', r'$p_{Q_1, Q_2}$']

In [None]:
# Display the RNG_test results for the four charge strings.
test_q_df

In [None]:
# Count the positions where Q1 and Q2 differ (False) and where they agree (True).
np.unique(Q1 == Q2, return_counts=True)

In [None]:
# Distinct values in Q1 and how often each occurs.
np.unique(Q1, return_counts=True)

In [None]:
# Distinct values in Q2 and how often each occurs.
np.unique(Q2, return_counts=True)

In [None]:
# Distinct values and their counts, read directly from the 'Q1' column of df.
np.unique(df['Q1'].values, return_counts=True)

In [None]:
# Distinct values and their counts, read directly from the 'Q2' column of df.
np.unique(df['Q2'].values, return_counts=True)

# Mnozenje

In [None]:
# Momentum-component columns, in the order they are multiplied together.
# NOTE(review): 'px1 ' is written with a trailing space — presumably this
# matches the CSV header exactly; confirm before "fixing" it.
momentum_columns = ['px1 ', 'py1', 'pz1', 'px2', 'py2', 'pz2']

# Absolute values of the components, one row per column.
ps = np.abs(np.array([df[name].values for name in momentum_columns]))

r = np.arange(0, len(ps))

# dists[k] is the element-wise product of the first k + 1 rows of ps.
dists = [np.prod(ps[:k + 1], axis=0) for k in r]

In [None]:
fig, ax = set_size_decorator(plt.subplots, fraction=0.8, ratio='golden')(1, 1)

# One curve per product distribution: histogram counts over log-spaced bins,
# drawn against log10 of each bin's upper edge.
for n_factors, product in enumerate(dists, start=1):
    # Bin edges run from the decade at or below the minimum to the decade
    # above the maximum.
    low_exp = np.floor(np.log10(product.min()))
    high_exp = np.floor(np.log10(product.max())) + 1
    edges = np.logspace(low_exp, high_exp, 400)

    counts, edges = np.histogram(product, bins=edges)
    upper_edges = edges[1:]

    ax.plot(np.log10(upper_edges), counts, lw=1, label=f'$N={n_factors}$')

ax.legend(fontsize=8, loc='upper left')
ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

ax.set_xlabel(r'$\log_{10} X$')
ax.set_ylabel('$N$')

# savefig('dielectron_lognorm')

In [None]:
from benford_helper_functions import benford_ft

In [None]:
# Result accumulators; the loop below appends one entry to each per
# distribution in `dists`.
f1s = []
first_digits = []
fracs = []
rng_tests = []
chi2_tests, ks_tests = [], []

# Significance level passed to chi2_test and ks_test.
alpha = 0.01

for i, lognorm in enumerate(dists):
    # Log-spaced bin edges from the decade at or below the minimum to the
    # decade above the maximum, with as many edges as there are samples.
    bins = np.logspace(np.floor(np.log10(lognorm.min())), 
                       np.floor(np.log10(lognorm.max())) + 1, 
                       len(lognorm))
    
    n, bins = np.histogram(lognorm, bins=bins)
    # Drop the last edge so `bins` has one entry per count, then switch the
    # edges to log10 scale before normalizing.
    bins = bins[:-1]
    bins = np.log10(bins)
    pdf = normalize(n, bins)
    
    # f1 = benfords_test(n, bins)
    freq, SF, sf, PDF, OST, ost = benford_ft(pdf, bins, shift=True)
    # Order the indices by |SF| and read |PDF| at the second-smallest one.
    # NOTE(review): presumably this selects the first non-zero frequency of
    # the transform — confirm against benford_ft.
    ind = np.argsort(np.abs(SF))
    f1 = np.abs(PDF)[ind[1]]
    
    f1s.append(f1)
    
    first_digit = get_first_digit(lognorm)
    # Relative frequency of each distinct first-digit value, in the sorted
    # order returned by np.unique.
    # NOTE(review): the summary table reads entry 0 as the share of digit 1,
    # which assumes the smallest value present is 1 — confirm.
    _, c = np.unique(first_digit, return_counts=True)
    c = c / np.sum(c)
    first_digits.append(c)
    
    # Fractional part of log10 of every sample.
    frac = np.log10(lognorm) % 1
    fracs.append(frac)
    
    # Both tests are run on the fractional parts; the chi2 test uses
    # int(sqrt(sample count)) bins.
    chi2_tests.append(chi2_test(frac, n_bins=int(np.sqrt(len(frac))), alpha=alpha))
    ks_tests.append(ks_test(frac, alpha=alpha))
    
    # bits = binary_tree_walk(frac).astype(str)
    # Turn the fractional parts into one long string and run RNG_test on it.
    bits = get_bitstring(frac, length=32)
    bits = ''.join(bits)
    test = RNG_test(bits, short_df=True)
    rng_tests.append(test)

# NOTE(review): this rebinds `df`, which until now held the table loaded from
# the CSV. Cells that read the dielectron columns from `df` cannot be re-run
# after this one without reloading the data, and later cells that use `df`
# see the RNG test table instead.
df = pd.concat([i for i in rng_tests])
# Rows are labelled $p_1$, $p_2$, ... and columns are numbered from 1.
df.index = [f'$p_{i}$' for i in range(1, len(df)+1)]
df.columns = [i + 1 for i in range(len(df.columns))]

In [None]:
# Display the concatenated RNG_test results (`df` now refers to that table,
# not to the CSV data).
df

In [None]:
# Benford's-law probability of a leading digit 1, used as the reference for
# the first entry of each first-digit frequency array.
benford_first = np.log10(2)

# Column label -> list of pre-formatted strings, one per product distribution.
# For both test result lists, [0][0][0] is shown as the statistic and
# [0][0][1] as the p-value (presumably the layout returned by chi2_test and
# ks_test — confirm against stat_tests).
dct = {
    r'$n_1$': [f'{freqs[0]:.4f}' for freqs in first_digits],
    r'$\Delta n_1$': [f'{abs(freqs[0] - benford_first):.4f}' for freqs in first_digits],
    r'$f_1$': [f'{value:.5f}' for value in f1s],
    r'$\chi^2$': [f'{result[0][0][0]:.2f}' for result in chi2_tests],
    r'$d$': [f'{result[0][0][0]:.4f}' for result in ks_tests],
    r'$p_{\chi^2}$': [f'{result[0][0][1]:.4f}' for result in chi2_tests],
    r'$p_d$': [f'{result[0][0][1]:.4f}' for result in ks_tests],
}

In [None]:
# Summary table built from the formatted columns in dct, one row per product
# distribution.
test_df = pd.DataFrame(dct)

# Label the rows $N=1$, $N=2$, ... using the table's own length, so the
# labelling does not depend on what the unrelated name `df` currently holds.
test_df.index = [f'$N={i}$' for i in range(1, len(test_df) + 1)]

# Sort by the chi-square statistic in ascending order. The column holds
# formatted strings, so the sort key converts them to floats; a plain string
# sort would order them lexicographically (e.g. '10.50' before '9.30').
# The raw string keeps the backslash in the label without relying on the
# invalid escape sequence '\c'.
test_df = test_df.sort_values(by=[r'$\chi^2$'], key=lambda col: col.astype(float))