# Import libraries

In [None]:
import os
import pandas as pd
import matplotlib

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats 
import scikit_posthocs as sp
import statsmodels.api as sm
import pylab

In [None]:
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

In [None]:
import pingouin as pg
from statsmodels.stats.proportion import proportion_effectsize
from statsmodels.graphics.gofplots import qqplot_2samples

# Functions

In [None]:
# Notebook-wide Matplotlib defaults (legend/label/tick font sizes, inward
# ticks, tick padding and the default figure size). The values are passed to
# rcParams exactly as written here.
style_param = {
    'legend.fontsize': '16',
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'xtick.top': True,
    'figure.figsize': (10, 6),
    'axes.labelsize': '26',
    'axes.titlesize': '26',
    'xtick.labelsize': '20',
    'ytick.labelsize': '20',
    'xtick.major.pad': '16',
    'ytick.major.pad': '16',
}

# Apply the overrides to every figure created after this point.
pylab.rcParams.update(style_param)

# Read data

In [None]:
# Path of the HDF5 file that the later pd.read_hdf calls open.
open_fname = os.path.join("tmp", "distrib_q.h5")

In [None]:
# Three parallel lists, one entry per sample:
#   columns - DataFrame column labels '0' .. '6'
#   datas   - matching dataset keys 'data_0' .. 'data_6'
#   colors  - line colour used for that sample in the KDE comparison plot
columns = [str(index) for index in range(7)]
datas = ['data_' + label for label in columns]
colors = 'gray blue green purple orange pink black'.split()

In [None]:
# Start from an empty frame; the loading loop below adds one column per sample.
df = pd.DataFrame()

In [None]:
# Read each dataset from the HDF5 file and store the first 5000 values of its
# 'q' column as one column of df (labelled '0' .. '6').
for hdf_key, column in zip(datas, columns):
    sample_frame = pd.read_hdf(open_fname, key=hdf_key)
    df[column] = sample_frame.q.values[:5000]

# Q-distribution

In [None]:
# Full 'data_0' table (unlike df['0'], it is not cut to the first 5000 rows);
# its 'q' column is the reference distribution in the plots below.
qdf = pd.read_hdf(open_fname, key="data_0")

In [None]:
# Histogram and kernel-density estimate of the reference q distribution, with
# the Wedenberg value and the sample mean drawn as vertical lines.
fig, ax = plt.subplots(figsize=(10, 6))

q_values = qdf.q
q_mean = q_values.mean()  # computed once; used for the line and for its label

q_values.hist(bins=400, density=True, color='grey', ax=ax)
q_values.plot.kde(ax=ax, color='black')

# Solid red line at 0.434 plus two unlabelled dashed lines at 0.366 and 0.513.
# NOTE(review): the dashed lines look like an uncertainty band around the
# Wedenberg value - confirm and consider labelling them.
ax.axvline(x=0.434, label='Wedenberg q = 0.434', c='red')
ax.axvline(0.366, color='red', ls='--', alpha=0.6)
ax.axvline(0.513, color='red', ls='--', alpha=0.6)

# Drawn through the Axes object (not pyplot's implicit "current axes") so the
# line is tied to this figure like every other artist in the cell.
ax.axvline(q_mean, color='orange', ls='--',
           label='mean q = {:0.3f}'.format(q_mean), alpha=0.9)

ax.set_xlim(0, 0.6)
ax.set_xlabel('\n' + r'$q\; [Gy\cdot \mu m \cdot keV^{-1}]$')
ax.set_ylabel("Probability distribution [-]\n")
ax.legend()

# Major grid with default styling, then a faint dotted minor grid.
ax.grid()
ax.minorticks_on()
ax.grid(which='minor', linestyle=':', linewidth=0.2, color='k')

In [None]:
# Export the data behind the q-distribution figure together with the image.
# Neither to_csv nor savefig creates a missing directory, so make sure the
# output folder exists first.
os.makedirs("results", exist_ok=True)

# One-column table of the q values with a fresh 0..n-1 index.
tmp = pd.DataFrame({'q': list(qdf.q)})

# NOTE(review): the CSV is written to an extension-less path. savefig gets the
# same extension-less name; per the Matplotlib docs it then appends the
# default format's extension, so the two outputs should not collide - confirm.
tmp.to_csv("results/figure_5_7")
fig.savefig(fname="results/figure_5_7", dpi=700)

# Compare all samples with q-distributions

In [None]:
# Inputs for the box plot, one entry per column of df:
#   vals  - the raw values of the column
#   names - the column label (used as tick label)
#   xs    - jittered x positions for the overlaid scatter points
vals = [df[name].values for name in df.columns]
names = list(df.columns)

# One normal draw per data point, centred on 1, 2, ... with sd 0.04;
# the spread of the jitter can be adjusted through the 0.04.
xs = [
    np.random.normal(position, 0.04, values.shape[0])
    for position, values in enumerate(vals, start=1)
]

In [None]:
# Notched box plot of every sample (outliers hidden) with the individual
# points scattered on top, plus horizontal reference lines.
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels` -
# confirm against the installed version before changing it.
plt.boxplot(vals, labels=names, notch=True, showmeans=False, showfliers=False)

for jittered_x, sample_values in zip(xs, vals):
    plt.scatter(jittered_x, sample_values, alpha=0.2, color='gray')

plt.ylabel(r'$q [Gy\times \mu m \times keV^{-1}]$')
plt.xlabel("Sample number")

# Wedenberg value, then mean and 97.5 % / 2.5 % quantiles of the full
# 'data_0' distribution.
reference_mean = qdf.q.mean()
plt.axhline(0.434, color='red', label='Wedenberg q = 0.434', alpha=0.6)
plt.axhline(reference_mean, color='orange', ls='--',
            label=f'mean q = {reference_mean:0.3f}', alpha=0.9)
for level in (0.975, 0.025):
    plt.axhline(qdf.q.quantile(level), color='orange', ls=':', alpha=0.9)

plt.legend()
plt.show()

In [None]:
# Kernel-density estimate of every sample on one axes, with each sample mean
# marked on the x-axis and the reference ('data_0') mean and quantiles drawn
# as vertical lines.
fig, ax = plt.subplots(figsize=(10, 6))

for sample, color in zip(columns, colors):
    df[sample].plot.kde(ax=ax, color=color)
    # Mean marker at y = 0, in the same colour as the sample's curve.
    ax.scatter(df[sample].mean(), 0, color=color)

ax.set_xlabel(r'$q [Gy\times \mu m \times keV^{-1}]$')
ax.set_ylabel("Probability distribution [-]\n")

# Mean (solid) and 97.5 % / 2.5 % quantiles (dashed) of the full 'data_0' set.
ax.axvline(qdf.q.mean(), color='gray', label="0 sample mean", alpha=0.9)
ax.axvline(qdf.q.quantile(0.975), color='gray', ls='--', alpha=0.9)
ax.axvline(qdf.q.quantile(0.025), color='gray', ls='--', alpha=0.9)

# The legend only lists artists that exist when it is created, so build it
# after the labelled reference line has been drawn.
ax.legend(title='Number \nof sample', title_fontsize=15)

ax.set_xlim(0, 0.6)
ax.grid()
ax.minorticks_on()
ax.grid(which='minor', linestyle=':', linewidth=0.2, color='k')

Shapiro-Wilk

In [None]:
# Print the Shapiro-Wilk statistic and p-value together with the skewness and
# kurtosis of every sample column.
for sample in columns:
    series = df[sample]
    statistic, p_value = stats.shapiro(series)
    report = (
        f"{sample}\nshapiro-wilk test = {statistic!s}"
        f", \np-value = {p_value!s}"
        f", \nskew = {series.skew(axis=0)!s}"
        f", \nkurtosis = {series.kurtosis(axis=0)!s}\n"
    )
    print("SAMPLE ", report)

Cohen, ETA, CLES

In [None]:
# Print effect sizes of sample '0' against every sample (including itself):
# Cohen's d, eta-square and CLES from pingouin, Cohen's h from statsmodels.
reference = df['0']
for sample in columns:
    other = df[sample]

    cohens_d = pg.compute_effsize(reference, other, eftype='cohen')
    eta = pg.compute_effsize(reference, other, eftype='eta-square')
    # NOTE(review): proportion_effectsize is meant for proportions; here the
    # two sample means of q are passed in their place - confirm this is intended.
    cohen_h = proportion_effectsize(np.mean(reference.values),
                                    np.mean(other.values))
    cles = pg.compute_effsize(reference, other, eftype='CLES')

    summary = (
        f"\ncohen_d: {cohens_d!s} \neta: {eta!s}"
        f"\ncohen_h: {cohen_h!s}\nCLES: {cles!s}\n"
    )
    print("SAMPLES ", 0, sample, summary)

Q-Q plot

In [None]:
# Two-sample Q-Q plots of sample '0' against every sample (including itself),
# all drawn on the same axes.
fig, ax = plt.subplots(sharex=True, sharey=True)

reference = df['0']
for sample in columns:
    # line='45' requests a 45-degree reference line on each call.
    qqplot_2samples(reference, df[sample], ax=ax, line='45')

# Replace the axis labels with wording that fits the overlay.
ax.set_xlabel("First sample")
ax.set_ylabel("Other sample")

plt.show()