# Import libraries

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, truncnorm, multivariate_normal
from sklearn.linear_model import LinearRegression
import pylab
import time


In [None]:
import pickle

# Functions

In [None]:
def dist_trunc_dep(meana,stda,meanb,stdb,covar,n):
    ref_matrix=[[stda**2,covar],[covar,stdb**2]]
    ref_sample=multivariate_normal.rvs(mean=(meana,meanb),cov=ref_matrix, size=3*n)
    ref_sample = ref_sample[ref_sample[:,1] > 0]
    return ref_sample

In [None]:
def var_q(df2):
    """Copy each article's ``energy == 0`` fit values onto all of its rows.

    ``df2`` is addressed by ``(article, energy)`` row keys and is modified in
    place. For every article the ``energy == 0`` group is used as the
    reference, and its values are written to new columns on every
    ``(article, energy)`` row of that article:

    ========================  =================
    source (energy == 0 row)  written column
    ========================  =================
    ``alpha_fit``             ``a_fit_ref``
    ``alpha_fit_err``         ``a_fit_ref_err``
    ``beta_fit``              ``b_fit_ref``
    ``beta_fit_err``          ``b_fit_ref_err``
    ``covar_fit``             ``covar_ref``
    ========================  =================

    Energy groups are visited in ascending order, so the reference group must
    come first within its article.

    Raises:
        ValueError: if an energy group is reached before the article's
            ``energy == 0`` reference has been seen, or if the reference
            group holds more than one distinct value in a source column.
    """
    ref_columns = (
        ('alpha_fit', 'a_fit_ref'),
        ('alpha_fit_err', 'a_fit_ref_err'),
        ('beta_fit', 'b_fit_ref'),
        ('beta_fit_err', 'b_fit_ref_err'),
        ('covar_fit', 'covar_ref'),
    )

    for name, group in df2.groupby('article'):
        # Reset per article so a missing energy == 0 row can never silently
        # reuse the reference values of the previous article.
        reference = None
        for energy, data in group.groupby('energy'):
            if energy == 0:
                reference = {}
                for source, target in ref_columns:
                    distinct = data[source].unique()
                    if len(distinct) != 1:
                        raise ValueError(
                            "article {!r}: expected exactly one distinct {} "
                            "value at energy 0, got {}".format(
                                name, source, len(distinct)))
                    # Store a scalar rather than a 1-element array.
                    reference[target] = distinct[0]

            if reference is None:
                raise ValueError(
                    "article {!r} has no energy == 0 reference row".format(name))

            for target, value in reference.items():
                df2.loc[(name, energy), target] = value

In [None]:
def dict_q(df2,num):
    """Draw (alpha, beta) samples for every article and energy in ``df2``.

    ``df2`` is grouped by ``article`` and then by ``energy``. For each group
    ``dist_trunc_dep`` is called with ``n=num`` and the sampled columns are
    stored in a nested dict::

        d[article][0]      -> {'ref_alpha_dep', 'ref_beta_dep'}
        d[article][energy] -> {'alpha_dep', 'beta_dep', 'LET'}   (energy != 0)

    The ``energy == 0`` entry is sampled from the ``a_fit_ref`` /
    ``a_fit_ref_err`` / ``b_fit_ref`` / ``b_fit_ref_err`` columns, so those
    columns must already exist on ``df2``. All other energies are sampled
    from ``alpha_fit`` / ``alpha_fit_err`` / ``beta_fit`` / ``beta_fit_err``.
    Both cases take the covariance from ``covar_fit``.

    Every distribution parameter is reduced to a scalar with ``.values.max()``
    before it is handed to ``dist_trunc_dep``, so the 2x2 covariance matrix is
    built from plain numbers.

    Returns:
        The nested dict described above. The sample arrays are whatever
        ``dist_trunc_dep`` returns; their lengths are not normalised here.
    """
    d = {}
    for name, group in df2.groupby('article'):
        d[name] = {}
        for energy, data in group.groupby('energy'):

            if energy == 0:
                ref_sample = np.array(dist_trunc_dep(
                    data.a_fit_ref.values.max(),
                    data.a_fit_ref_err.values.max(),
                    data.b_fit_ref.values.max(),
                    data.b_fit_ref_err.values.max(),
                    data.covar_fit.values.max(),
                    num))
                d[name][0] = {
                    'ref_alpha_dep': ref_sample[:, 0],
                    'ref_beta_dep': ref_sample[:, 1],
                }

            else:
                sample = np.array(dist_trunc_dep(
                    data.alpha_fit.values.max(),
                    data.alpha_fit_err.values.max(),
                    data.beta_fit.values.max(),
                    data.beta_fit_err.values.max(),
                    data.covar_fit.values.max(),
                    num))
                d[name][energy] = {
                    'alpha_dep': sample[:, 0],
                    'beta_dep': sample[:, 1],
                    'LET': data.LET.values.max(),
                }
    return d

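The slope $q$ is obtained from a linear fit through the origin of

\begin{align}
\frac{\alpha}{\alpha_{\mathrm{phot}}} = 1 + \frac{q\,L}{\left(\frac{\alpha}{\beta}\right)_{\mathrm{phot}}}
\end{align}

where $L$ is the LET and the subscript "phot" marks the reference (energy $= 0$) fit values.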

In [None]:
def find_q_sk(d,i):
    """Fit the slope q for the ``i``-th draw of every sampled distribution.

    ``d`` is the nested dict ``d[article][energy]`` built by ``dict_q``. For
    every entry with ``energy > 0`` one regression point is formed from the
    ``i``-th samples::

        y = alpha / alpha_ref - 1
        x = LET * beta_ref / alpha_ref

    where the ``*_ref`` values come from ``d[article][0]``. A straight line
    through the origin is fitted to all points with ``LinearRegression``.

    Returns:
        ``(q, R)``: the fitted slope and the value of ``reg.score`` on the
        same points.

    When ``q < -1`` the last article key, the last energy key and ``i`` are
    printed as a diagnostic.
    """
    regressors = []
    responses = []

    for article, per_energy in d.items():
        for energy in per_energy:
            if not energy > 0:
                continue
            current = per_energy[energy]
            reference = per_energy[0]
            alpha_ref = reference['ref_alpha_dep'][i]
            beta_ref = reference['ref_beta_dep'][i]
            responses.append(current['alpha_dep'][i] / alpha_ref - 1)
            regressors.append(current['LET'] * beta_ref / alpha_ref)

    # sklearn expects a 2-D design matrix with one column per feature.
    design = np.asarray(regressors).reshape(-1, 1)
    model = LinearRegression(fit_intercept=False).fit(design, responses)
    q = model.coef_[0]
    if q < -1:
        # `article` and `energy` are the last keys visited by the loops above.
        print(article, energy, i)
    R = model.score(design, responses)
    return q, R

In [None]:
def save_dict_to_file(data_dict,name):
    """Pickle ``data_dict`` to the file at path ``name``.

    The file is opened in binary write mode, so an existing file is
    overwritten. The ``with`` block guarantees the handle is closed even if
    ``pickle.dump`` raises.
    """
    with open(name, "wb") as a_file:
        pickle.dump(data_dict, a_file)

In [None]:
# Plot styling pushed into matplotlib's rcParams for the rest of the notebook.
params = {
    # legend
    'legend.fontsize': '20',
    # ticks drawn inwards, with an extra set on the top axis
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'xtick.top': True,
    # figure size
    'figure.figsize': (10, 6),
    # axis label and title sizes
    'axes.labelsize': '26',
    'axes.titlesize': '26',
    # tick label sizes
    'xtick.labelsize': '20',
    'ytick.labelsize': '20',
    # padding of the major ticks
    'xtick.major.pad': '16',
    'ytick.major.pad': '16',
}

pylab.rcParams.update(params)

# Open data

In [None]:
# open_fname: HDF5 file the data sets are read from.
# save_fname: HDF5 file the q distributions are written to.
open_fname, save_fname = (
    os.path.join('tmp', fname) for fname in ('fitted_data.h5', 'distrib_q.h5')
)

In [None]:
# Number of q fits to run; fit i uses the i-th draw of every sample.
size = 50000

# Load the first data set and collapse it to one row per (article, energy).
df = pd.read_hdf(open_fname, "data_1").groupby(["article", 'energy']).max()

# Add the reference columns, then sample alpha/beta for every group.
var_q(df)
d = dict_q(df, size)

# One (q, R) pair per draw index, unzipped into two parallel tuples.
qtmp, Rtmp = zip(*(find_q_sk(d, i) for i in range(size)))

In [None]:
# Store the fitted q values as a single-column table.
q = pd.DataFrame(qtmp)
q.to_csv(path_or_buf='tmp/best_q.csv')

# Sample-size comparison

In [None]:
# Sample sizes to benchmark.
# Larger sizes (50000, 100000, 200000, 500000) are currently left out.
sample_size = [
    100, 200, 500,
    1000, 2000, 5000,
    10000, 20000,
]

# Row labels of the comparison table, one per statistic recorded for a size.
index = [
    'mean',
    'std',
    'median',
    'quantile025',
    'quantile975',
    'R2',
    'time[s]',
]

In [None]:
# Empty result table: one row per statistic, one column per sample size.
sample_compare = pd.DataFrame(index=index, columns=sample_size)

In [None]:
# Reload the first data set and collapse it to one row per (article, energy).
df = pd.read_hdf(open_fname,"data_1")
df = df.groupby(["article",'energy']).max()

for size in sample_size:
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    var_q(df)
    # create dict
    d = dict_q(df,size)

    # calculate q values and R^2
    qtmp,Rtmp=zip(*[find_q_sk(d,i) for i in range(size)])

    R=pd.DataFrame(np.asarray(Rtmp),columns = ['r'])
    qdf=pd.DataFrame(np.asarray(qtmp),columns = ['q'])
    end = time.perf_counter()

    # compare calculation time for different sample sizes; the values are
    # listed in the same order as the row labels in `index`
    sample_compare[size] = (
        qdf.q.mean(),
        qdf.q.std(),
        qdf.q.median(),
        qdf.q.quantile(0.025),
        qdf.q.quantile(0.975),
        R.r.mean(),
        (end-start),
    )

In [None]:
# Show the table with one row per sample size.
sample_compare.transpose()

In [None]:
# Export the transposed table (one row per sample size) to Excel.
sample_compare.transpose().to_excel("results/compare_samples_size.xlsx")

# Comparison of q distributions

In [None]:
# HDF5 keys of the data sets to compare: "data_1" ... "data_7".
datas = ["data_{}".format(k) for k in range(1, 8)]

# Row labels of the q comparison table.
index2 = [
    'mean',
    'std',
    'median',
    'r2',
    'quantile025',
    'quantile975',
    "conditions",
    "exp no",
]

# Per-data-set labels, paired position by position with `datas`.
conditions = [
    "-",
    "AB",
    "ABEFG",
    "ABCD",
    "ABC",
    "ABE",
    "ABG",
]
exp_no = [24, 24, 10, 4, 20, 13, 16]

In [None]:
# Empty result table: one row per statistic, one column per data set.
q_compare = pd.DataFrame(index=index2, columns=datas)

In [None]:
# Sample size used for every data set in the q comparison.
num = 5000

In [None]:
for data_q,condition,exp in zip(datas,conditions,exp_no):
    # Load one data set and collapse it to one row per (article, energy).
    df2 = pd.read_hdf(open_fname,data_q)
    df2 = df2.groupby(["article",'energy']).max()
    var_q(df2)
    # create dict
    d = dict_q(df2,num)

    # calculate q values and R^2
    qtmp,Rtmp=zip(*[find_q_sk(d,i) for i in range(num)])

    R=pd.DataFrame(np.asarray(Rtmp),columns = ['r'])
    qdf=pd.DataFrame(np.asarray(qtmp),columns = ['q'])

    # q distribution parameters, listed in the same order as the row labels
    # in `index2`
    q_compare[data_q] = (
        qdf.q.mean(),
        qdf.q.std(),
        qdf.q.median(),
        R.r.mean(),
        qdf.q.quantile(0.025),
        qdf.q.quantile(0.975),
        condition,
        exp,
    )
    # save q distributions; `key` is passed by keyword because newer pandas
    # deprecates positional arguments to to_hdf after the path
    qdf.to_hdf(save_fname, key=data_q, format='table')

    # keep the sampled alpha/beta dict of the first data set only
    if data_q == 'data_1':
        save_dict_to_file(d,'tmp/sample1_dict_alpha_beta.pkl')


In [None]:
# Show the table with one row per data set.
q_compare.transpose()

In [None]:
# Export the transposed table (one row per data set) to Excel.
q_compare.transpose().to_excel("results/compare_q_distributions.xlsx")