In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from bioinfokit.analys import stat
import csv

----------------------------------SSI ANOVA AND TUKEY HSD TESTING----------------------------------

In [5]:
# Result files of the SSI experiment (file stem, without '.csv') and the short
# classifier label used as the column name for each of them. The two lists are
# parallel: files[i] holds the results reported under names[i].
files = ['LDA_SSI_results','SVML_SSI_results','SVMR_SSI_results',
         'MLP1_SSI_results','MLP2_SSI_results','MLP3_SSI_results',
         'NN1_SSI_results','NN2_SSI_results','NN3_SSI_results',]
names = ['LDA','SVML','SVMR',
         'MLP1','MLP2','MLP3',
         'NNET1','NNET2','NNET3',]


# Read the 'mean' column of every result file and drop its last row.
# NOTE(review): the last row is presumably a summary line appended after the
# individual results -- confirm against the files.
# Forward slashes are used so the paths resolve on every platform, not only on Windows.
ssi_means = {
    name: pd.read_csv('SSI/' + fname + '.csv')['mean'].iloc[:-1]
    for name, fname in zip(names, files)
}

# Create a dataframe with these values (one column per classifier)
df = pd.DataFrame(ssi_means)
#df.to_csv('SSI_Stats.csv',index=False)

In [None]:
# Box Plot Classifiers Mean
# files[i] is the result file (stem, without '.csv') of the classifier names[i].
files = ['LDA_SSI_results','SVML_SSI_results','SVMR_SSI_results',
         'MLP1_SSI_results','MLP2_SSI_results','MLP3_SSI_results',
         'NN1_SSI_results','NN2_SSI_results','NN3_SSI_results',]
names = ['LDA','SVML','SVMR',
         'MLP1','MLP2','MLP3',
         'NNET1','NNET2','NNET3',]

values_mean = []

# Set to 1 to (re)build SSI_Stats.csv from the per-classifier result files;
# leave at 0 to plot from an already existing SSI_Stats.csv.
create_table = 0
if(create_table==1):
    # One array per classifier: its 'mean' column without the last row.
    for fname in files:
        results = pd.read_csv('SSI/'+fname+'.csv')
        values_mean.append(results['mean'].values[:-1])

    # values_mean is laid out as (classifier, value). Transpose it so every CSV
    # row holds one value per classifier, in the order of `names`. A plain
    # reshape to (30, 9) would keep the flat order and scatter each
    # classifier's values across all columns.
    values_bp = np.array(values_mean).T

    # 'w' rather than 'a': re-running the cell must replace the table instead
    # of appending a second header and a second copy of the rows.
    with open('SSI_Stats.csv','w',newline='') as f:
        writer = csv.writer(f,delimiter=',')
        writer.writerow(names)
        writer.writerows(values_bp)

ssi_df = pd.read_csv('SSI_Stats.csv')
# box plot data: one box per column of the table
plt.boxplot(ssi_df.values,labels=names)
plt.title('SSI Classifiers Performance')


In [6]:
# Load the SSI table; its columns are expected to be the classifier labels in `names`.
# Forward slash so the path resolves on every platform, not only on Windows.
df = pd.read_csv('Stats_Results/SSI_Stats.csv')
# Reshape the dataframe from wide (one column per classifier) to long format
# (one row per value), which is what the statsmodels formula API expects.
df_melt = pd.melt(df.reset_index(),
                  id_vars=['index'],
                  value_vars=names)

df_melt.columns = ['Index', 'Classifier', 'Mean']

print(df_melt)

# stats.f_oneway takes one sample per group and returns the ANOVA F and p value.
# The groups are derived from `names` so they always match the columns melted above.
fvalue, pvalue = stats.f_oneway(*(df[name] for name in names))
print(fvalue, pvalue)

# Ordinary Least Squares (OLS) model with Classifier as a categorical factor,
# followed by the type-2 ANOVA table of that fit.
model = ols(formula='Mean ~ C(Classifier)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table


     Index Classifier    Mean
0        0        LDA  97.917
1        1        LDA  88.750
2        2        LDA  97.083
3        3        LDA  94.167
4        4        LDA  86.667
..     ...        ...     ...
265     25      NNET3  85.417
266     26      NNET3  80.417
267     27      NNET3  95.833
268     28      NNET3  92.500
269     29      NNET3  77.500

[270 rows x 3 columns]
0.346945261104829 0.9466959608480108


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Classifier),199.248019,8.0,0.346945,0.946696
Residual,18736.288834,261.0,,


In [7]:
# One-way ANOVA through bioinfokit on the long-format table df_melt:
# 'Mean' is the response, 'Classifier' the categorical factor.
ssi_anova_args = dict(df=df_melt,
                      res_var='Mean',
                      anova_model='Mean ~ C(Classifier)')
res = stat()
res.anova_stat(**ssi_anova_args)
# Cell output: the ANOVA summary table (F and p value)
res.anova_summary

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Classifier),8.0,199.248019,24.906002,0.346945,0.946696
Residual,261.0,18736.288834,71.786547,,


In [8]:
# Pairwise comparison of the classifiers with Tukey's HSD (bioinfokit),
# run on the long-format table df_melt with 'Classifier' as grouping factor.
ssi_tukey_args = dict(df=df_melt,
                      res_var='Mean',
                      xfac_var='Classifier',
                      anova_model='Mean ~ C(Classifier)')
res = stat()
res.tukey_hsd(**ssi_tukey_args)
# Cell output: one row per pair of classifiers
res.tukey_summary

  mult_group[ele] = df[df[xfac_var] == ele].mean().loc[res_var]


Unnamed: 0,group1,group2,Diff,Lower,Upper,q-value,p-value
0,LDA,SVML,2.639,-4.203924,9.481924,1.705998,0.9
1,LDA,SVMR,2.361233,-4.481691,9.204158,1.526434,0.9
2,LDA,MLP1,2.736233,-4.106691,9.579158,1.768855,0.9
3,LDA,MLP2,3.000033,-3.842891,9.842958,1.93939,0.9
4,LDA,MLP3,2.514,-4.328924,9.356924,1.625191,0.9
5,LDA,NNET1,2.458433,-4.384491,9.301358,1.589269,0.9
6,LDA,NNET2,2.7918,-4.051124,9.634724,1.804776,0.9
7,LDA,NNET3,1.7918,-5.051124,8.634724,1.15832,0.9
8,SVML,SVMR,0.277767,-6.565158,7.120691,0.179564,0.9
9,SVML,MLP1,0.097233,-6.745691,6.940158,0.062857,0.9


----------------------------------LOOP ANOVA AND TUKEY HSD TESTING----------------------------------

In [None]:
# Result files of the LOOP experiment (file stem, without '.csv') and the short
# classifier label used as the column name for each of them. The two lists are
# parallel: files[i] holds the results reported under names[i].
files = ['LDA_LOOP_results','SVML_LOOP_results','SVMR_LOOP_results',
         'MLP1_LOOP_results','MLP2_LOOP_results','MLP3_LOOP_results',
         'NN1_LOOP_results','NN2_LOOP_results','NN3_LOOP_results',]
names = ['LDA','SVML','SVMR',
         'MLP1','MLP2','MLP3',
         'NNET1','NNET2','NNET3',]

# Read the 'all' column of every result file and drop its last row.
# NOTE(review): the last row is presumably a summary line appended after the
# individual results -- confirm against the files.
# Forward slashes are used so the paths resolve on every platform, not only on Windows.
loop_values = {
    name: pd.read_csv('LOOP/' + fname + '.csv')['all'].iloc[:-1]
    for name, fname in zip(names, files)
}

# One column per classifier
df = pd.DataFrame(loop_values)
#df.to_csv('LOOP_Stats.csv',index=False)

In [None]:
# Labels shown under the boxes, one per column of the table read below
names = ['LDA', 'SVML', 'SVMR', 'MLP1', 'MLP2', 'MLP3', 'NNET1', 'NNET2', 'NNET3']

loop_df = pd.read_csv('LOOP_Stats.csv')

# Draw one box per column of the LOOP table, then title the figure
loop_matrix = loop_df.values
plt.boxplot(loop_matrix, labels=names)
plt.title('LOOP Classifiers Testing Performance')

In [None]:
# Load the LOOP table; its columns are expected to be the classifier labels in `names`.
# The directory name is spelled 'Stats_Results', matching the SSI section of this
# notebook; a forward slash keeps the path valid on every platform.
df = pd.read_csv('Stats_Results/LOOP_Stats.csv')
# Reshape the dataframe from wide (one column per classifier) to long format
# (one row per value), which is what the statsmodels formula API expects.
df_melt = pd.melt(df.reset_index(),
                  id_vars=['index'],
                  value_vars=names)

df_melt.columns = ['Index', 'Classifier', 'Mean']

print(df_melt)

# stats.f_oneway takes one sample per group and returns the ANOVA F and p value.
# The groups are derived from `names` so they always match the columns melted above.
fvalue, pvalue = stats.f_oneway(*(df[name] for name in names))
print(fvalue, pvalue)

# Ordinary Least Squares (OLS) model with Classifier as a categorical factor,
# followed by the type-2 ANOVA table of that fit.
model = ols('Mean ~ C(Classifier)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table


In [None]:
# One-way ANOVA through bioinfokit on the long-format LOOP table df_melt.
# The factor is written as C(Classifier), i.e. explicitly categorical, so the
# formula (and the row label of the summary table) matches the OLS fit on the
# same data and the SSI section of this notebook.
res = stat()
res.anova_stat(df=df_melt, res_var='Mean', anova_model='Mean ~ C(Classifier)')
res.anova_summary
# output (ANOVA F and p value)

In [None]:
# Tukey's HSD pairwise comparison of the classifiers on the LOOP table df_melt.
# The factor is written as C(Classifier), i.e. explicitly categorical, so the
# formula matches the one used for the OLS fit on the same data and for the
# SSI section of this notebook.
res = stat()
res.tukey_hsd(df=df_melt, res_var='Mean', xfac_var='Classifier', anova_model='Mean ~ C(Classifier)')
res.tukey_summary

In [None]:
# Residual diagnostics for the model held by `res`.
# NOTE(review): relies on `res` carrying anova_std_residuals / anova_model_out
# from the cell run just before this one -- confirm bioinfokit sets them there.

# Q-Q plot of the standardized residuals against the 45-degree reference line
standardized_residuals = res.anova_std_residuals
sm.qqplot(standardized_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()

# Histogram of the raw residuals of the fitted model
model_residuals = res.anova_model_out.resid
plt.hist(model_residuals, bins='auto', histtype='bar', ec='k')
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()