In [1]:
import pickle
import pandas as pd
from scipy import stats
import re
import glob
import os

## Result Extraction 

In [2]:
output_path = './data/outputs/' 

bert_single_path = './data/outputs/bert_single/'
bert_all_path = './data/outputs/bert_all/'
textCNN_single_path = './data/outputs/textCNN_single/'
textCNN_all_path = './data/outputs/textCNN_all/'
mdfend_path = './data/outputs/mdfend/'
mdfend_linux_path = './data/outputs/mdfend_all_linux/'

In [3]:
def load_files(path, model_name):
    pattern = model_name + '_'+'*.txt'
    files = glob.glob(path + pattern)
    files.sort()
    return files

In [4]:
def extract_results(files):
    lastlines_list = []
    model_results = []
    
    #extracting the last line in the txt file which contains the required 'metric' result
    for path in files:
        with open(path, 'rt', encoding='utf-16') as file:
            last_line = file.readlines()[-1]
            lastlines_list.append(last_line)
    
    #in the last lines, extract the result using regex
    for line in lastlines_list:
        regex_res = re.search(r"'metric': (\d*\.?\d*)", line)
        model_results.append(float(regex_res.group(1)))
    
    return model_results

### MDFEND (Windows, Linux)  

In [5]:
mdfend_files = load_files(mdfend_path, 'mdfend')
mdfend_files

['./data/outputs/mdfend/mdfend_split1.txt',
 './data/outputs/mdfend/mdfend_split10.txt',
 './data/outputs/mdfend/mdfend_split2.txt',
 './data/outputs/mdfend/mdfend_split3.txt',
 './data/outputs/mdfend/mdfend_split4.txt',
 './data/outputs/mdfend/mdfend_split5.txt',
 './data/outputs/mdfend/mdfend_split6.txt',
 './data/outputs/mdfend/mdfend_split7.txt',
 './data/outputs/mdfend/mdfend_split8.txt',
 './data/outputs/mdfend/mdfend_split9.txt']

In [6]:
mdfend_results = extract_results(mdfend_files)
mdfend_results

[0.9119858785475026,
 0.8981886055318535,
 0.9121668397369331,
 0.9146110261584848,
 0.9118747545185808,
 0.9099491482634673,
 0.9134571797928244,
 0.8992159562144284,
 0.905232480567278,
 0.9074053137365743]

In [7]:
mdfend_linux_files = load_files(mdfend_linux_path, 'mdfend__all_linux')
mdfend_linux_files

['./data/outputs/mdfend_all_linux/mdfend__all_linux_split1.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split10.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split2.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split3.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split4.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split5.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split6.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split7.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split8.txt',
 './data/outputs/mdfend_all_linux/mdfend__all_linux_split9.txt']

In [8]:
mdfend_linux_results = extract_results(mdfend_linux_files)
mdfend_linux_results

[0.913111614710914,
 0.9096853074258747,
 0.9176564122533748,
 0.9173472827039489,
 0.9085928842363477,
 0.9050579241853562,
 0.9123715790382456,
 0.8878496236257587,
 0.9122847026345797,
 0.9228234697756938]

### All Domain Models (bert_all, textCNN_all)

In [9]:
bert_all_files = load_files(bert_all_path, 'bert_all')
bert_all_files

['./data/outputs/bert_all/bert_all_split1.txt',
 './data/outputs/bert_all/bert_all_split10.txt',
 './data/outputs/bert_all/bert_all_split2.txt',
 './data/outputs/bert_all/bert_all_split3.txt',
 './data/outputs/bert_all/bert_all_split4.txt',
 './data/outputs/bert_all/bert_all_split5.txt',
 './data/outputs/bert_all/bert_all_split6.txt',
 './data/outputs/bert_all/bert_all_split7.txt',
 './data/outputs/bert_all/bert_all_split8.txt',
 './data/outputs/bert_all/bert_all_split9.txt']

In [10]:
bert_all_results = extract_results(bert_all_files)
bert_all_results

[0.8511725365251586,
 0.8580633762998926,
 0.8621345563266907,
 0.872463025629272,
 0.8620685936151855,
 0.8455070575330759,
 0.8751693153785113,
 0.8625365820611735,
 0.8615209563580597,
 0.8596430577762413]

In [11]:
def load_textCNNall_files(path, model_name):
    files = glob.glob(path + model_name + '*')
    files.sort()
    return files

In [12]:
textCNN_all_files = load_textCNNall_files(textCNN_all_path, 'textCNN_all')
textCNN_all_files

['./data/outputs/textCNN_all/textCNN_all_split1',
 './data/outputs/textCNN_all/textCNN_all_split10',
 './data/outputs/textCNN_all/textCNN_all_split2',
 './data/outputs/textCNN_all/textCNN_all_split3',
 './data/outputs/textCNN_all/textCNN_all_split4',
 './data/outputs/textCNN_all/textCNN_all_split5',
 './data/outputs/textCNN_all/textCNN_all_split6',
 './data/outputs/textCNN_all/textCNN_all_split7',
 './data/outputs/textCNN_all/textCNN_all_split8',
 './data/outputs/textCNN_all/textCNN_all_split9']

In [13]:
for old_name in textCNN_all_files:
    new_name = old_name + '.txt'
    os.rename(old_name, new_name)

In [14]:
textCNN_all_files = load_textCNNall_files(textCNN_all_path, 'textCNN_all')
textCNN_all_files

['./data/outputs/textCNN_all/textCNN_all_split1.txt',
 './data/outputs/textCNN_all/textCNN_all_split10.txt',
 './data/outputs/textCNN_all/textCNN_all_split2.txt',
 './data/outputs/textCNN_all/textCNN_all_split3.txt',
 './data/outputs/textCNN_all/textCNN_all_split4.txt',
 './data/outputs/textCNN_all/textCNN_all_split5.txt',
 './data/outputs/textCNN_all/textCNN_all_split6.txt',
 './data/outputs/textCNN_all/textCNN_all_split7.txt',
 './data/outputs/textCNN_all/textCNN_all_split8.txt',
 './data/outputs/textCNN_all/textCNN_all_split9.txt']

In [15]:
textCNN_all_lastlines = []

for path in textCNN_all_files:
    with open(path, 'rt') as file:
        lines = []
        for line in file:
            if len(line) > 1 or line != '\n':
                lines.append(line)
        last_line = lines[-1]
        textCNN_all_lastlines.append(last_line)
        
textCNN_all_results = []

for line in textCNN_all_lastlines:
    regex_res = re.search(r"'metric': (\d*\.?\d*)", line)
    textCNN_all_results.append(float(regex_res.group(1)))

textCNN_all_results

[0.8865532288800062,
 0.8664752041620086,
 0.8786584623706809,
 0.8904521263200309,
 0.8822460971283461,
 0.8936260504201681,
 0.890503337041157,
 0.8740123909852344,
 0.9014069136880278,
 0.8998356763558197]

### Single Domain Models 

In [16]:
def load_single_files(path, model_name, split):
    pattern = model_name + '_'+ split + '_' + '*.txt'
    files = glob.glob(path + pattern)
    files.sort()
    return files

In [17]:
def Average(lst):
    return sum(lst) / len(lst)

In [18]:
split_list = ['split1', 'split2', 'split3','split4', 'split5','split6','split7','split8','split9','split10']

In [19]:
def get_single_results(model_results_path, model_name):
    
    model_results = []
    
    for split in split_list:
        split_files = load_single_files(model_results_path, model_name, split)
        split_results = extract_results(split_files)
        split_average = Average(split_results)
        model_results.append(split_average)
    
    return model_results

In [20]:
# bert_single_results = []

# bert_sing_sp1 = load_single_files(bert_single_path, 'bert_single', 'split1')
# bert_sing_sp1

# bert_sing_sp1_res = extract_results(bert_sing_sp1)
# bert_sing_sp1_res

# bert_single_results.append(Average(bert_sing_sp1_res))

In [21]:
bert_single_results = get_single_results(bert_single_path, 'bert_single')
bert_single_results

[0.8139784571135675,
 0.8305183182096836,
 0.8357128615401916,
 0.8396200498943195,
 0.8486487485452342,
 0.8108514885644428,
 0.8324167521570492,
 0.8302196045553049,
 0.8521883109448545,
 0.8321764340539288]

In [22]:
textCNN_single_results = get_single_results(textCNN_single_path, 'textcnn_single')
textCNN_single_results

#problem with split 6 and 10: output weird

[0.8383755973025862,
 0.8601500364291469,
 0.8610446193244641,
 0.8488344288810125,
 0.8682147935918467,
 0.8491993629993432,
 0.8572114552778779,
 0.8596245166895511,
 0.8694261935998127,
 0.8771106138438012]

### All Model Results 

In [23]:
pd.set_option("display.precision", 16)

results = pd.DataFrame()
results['bert_single'] = bert_single_results
results['textCNN_single'] = textCNN_single_results
results['bert_all'] = bert_all_results
results['textCNN_all'] = textCNN_all_results
results['mdfend_windows'] = mdfend_results

In [24]:
results

Unnamed: 0,bert_single,textCNN_single,bert_all,textCNN_all,mdfend_windows
0,0.8139784571135675,0.8383755973025862,0.8511725365251586,0.8865532288800062,0.9119858785475026
1,0.8305183182096836,0.8601500364291469,0.8580633762998926,0.8664752041620086,0.8981886055318535
2,0.8357128615401916,0.8610446193244641,0.8621345563266907,0.8786584623706809,0.9121668397369332
3,0.8396200498943195,0.8488344288810125,0.872463025629272,0.8904521263200309,0.9146110261584848
4,0.8486487485452342,0.8682147935918467,0.8620685936151855,0.8822460971283461,0.9118747545185808
5,0.8108514885644428,0.8491993629993432,0.8455070575330759,0.8936260504201681,0.9099491482634672
6,0.8324167521570492,0.8572114552778779,0.8751693153785113,0.890503337041157,0.9134571797928244
7,0.8302196045553049,0.8596245166895511,0.8625365820611735,0.8740123909852344,0.8992159562144284
8,0.8521883109448545,0.8694261935998127,0.8615209563580597,0.9014069136880278,0.905232480567278
9,0.8321764340539288,0.8771106138438012,0.8596430577762413,0.8998356763558197,0.9074053137365744


In [25]:
#results.to_latex()

In [26]:
results_stats=pd.DataFrame()
results_stats["mean"]=results.mean()
results_stats["Std.Dev"]=results.std()
results_stats["Var"]=results.var()

In [27]:
results_stats.T

Unnamed: 0,bert_single,textCNN_single,bert_all,textCNN_all,mdfend_windows
mean,0.8326331025578577,0.8589191617939441,0.8610279057503261,0.8863769487351479,0.9084087183067926
Std.Dev,0.0130392424106784,0.0113303731098349,0.008719707012675,0.0111497513203114,0.0058244156267302
Var,0.0001700218426444,0.0001283773548081,7.60332903869e-05,0.0001243169545048,3.39238173929e-05


In [28]:
#results_stats.to_latex()

### MDFEDN Results: Linux, Windows 

In [29]:
pd.set_option("display.precision", 16)

results_mdfend = pd.DataFrame()
results_mdfend['mdfend_linux'] = mdfend_linux_results
results_mdfend['mdfend_windows'] = mdfend_results

In [30]:
results_mdfend

Unnamed: 0,mdfend_linux,mdfend_windows
0,0.913111614710914,0.9119858785475026
1,0.9096853074258748,0.8981886055318535
2,0.9176564122533748,0.9121668397369332
3,0.9173472827039488,0.9146110261584848
4,0.9085928842363477,0.9118747545185808
5,0.9050579241853562,0.9099491482634672
6,0.9123715790382456,0.9134571797928244
7,0.8878496236257587,0.8992159562144284
8,0.9122847026345796,0.905232480567278
9,0.9228234697756938,0.9074053137365744


In [45]:
#results_mdfend.to_latex()

## Significance Testing 

### Paired Sample t-Test: MDFEND on different operating systems

We perform a test to see if there is a significant performance difference between the MDFEND model output when ran on Linux and Windows. In particular, we want to test whether there is a significantly different model output with respect to the F1-score measured on all domains.

We hence test the Null hypothesis that the expected average performance of the MDFEND model on Linux and on Windows are equal against the alternative that they are different (i.e. we perform a two-tailed test and the H0 will be rejected when the t-value is sufficiently small or large)

Have 9 dof (degrees of freedom): we reject the Null if the absolute value of the statistic |t| > t(95, df = 9) = 2.262 at the 5% level, or |t| > t(99, df=9) = 3.250 at the 1% level 

In [32]:
Index = ['t-statistic', 'p-value', 'df']
mdfend_testing_results = pd.DataFrame(index = Index)

In [33]:
mdfend_test = stats.ttest_rel(results_mdfend['mdfend_linux'], results_mdfend['mdfend_windows'])
mdfend_testing_results['mdfend'] = [mdfend_test.statistic, mdfend_test.pvalue, mdfend_test.df]

In [34]:
mdfend_testing_results

Unnamed: 0,mdfend
t-statistic,0.9001088989262811
p-value,0.3915103919149541
df,9.0


Given that |t| = |0.9001| < t(95, df=9) = 2.262, we cannot reject the Null and conclude that the difference in genre classification f1-score between MDFEND ran on Linux or Microsoft is not significant (Note: the same holds true for the stricter 99% error level).

In [43]:
#mdfend_testing_results.to_latex()

###  Paired Sample t-Test: between models

We perform a test to see if there is a significant performance difference between the MDFEND (ran on Microsoft) and the other models. In particular, we want to test whether the MDFEND model performs better with respect to the F1-score measured on all domains.

We hence test the Null hypothesis that the expected average performance of the MDFEND and a respective other model are equal against the alternative that the expected average performance of the MDFEND model is greater than that of the other model (i.e. we perform a one-tailed test and the H0 will be rejected when the t-value is sufficiently large)

Have 9 dof (degrees of freedom): we reject the Null if the t-statistic is larger than t(99, df = 9) = 2.821 (value at the 1% level)

In [35]:
testing_results = pd.DataFrame(index = Index)

#### MDFEND vs. bert_single

In [36]:
bert_single_test = stats.ttest_rel(results['mdfend_windows'], results['bert_single'], alternative = 'greater')
testing_results['bert_single'] = [bert_single_test.statistic, bert_single_test.pvalue, bert_single_test.df]

#### MDFEND vs. textCNN_single


In [37]:
textCNN_single_test = stats.ttest_rel(results['mdfend_windows'], results['textCNN_single'], alternative = 'greater')
testing_results['textCNN_single'] = [textCNN_single_test.statistic, textCNN_single_test.pvalue, textCNN_single_test.df]

#### MDFEND vs. bert_all


In [38]:
bert_all_test = stats.ttest_rel(results['mdfend_windows'], results['bert_all'], alternative = 'greater')
testing_results['bert_all'] = [bert_all_test.statistic, bert_all_test.pvalue, bert_all_test.df]

#### MDFEND vs. textCNN_all


In [39]:
textCNN_all_test = stats.ttest_rel(results['mdfend_windows'], results['textCNN_all'], alternative = 'greater')
testing_results['textCNN_all'] = [textCNN_all_test.statistic, textCNN_all_test.pvalue, textCNN_all_test.df]

In [40]:
testing_results

Unnamed: 0,bert_single,textCNN_single,bert_all,textCNN_all
t-statistic,16.699281326094393,10.942467904475055,16.143482516753675,7.032337433286879
p-value,2.21368041e-08,8.413499269e-07,2.97530668e-08,3.05165343274e-05
df,9.0,9.0,9.0,9.0


In all cases be reject the Null and conclude that the MDFEND model performs significantly better with respect to classification f1-score compared to all other tested models. 

In [41]:
#testing_results.to_latex()