In [1]:
import pandas as pd
import numpy as np
from os import path

In [2]:
# Relative path to the project's data root; every input/output path below is built from it.
DATA_DIR = "../../data"
# Sub-directory of <DATA_DIR>/evaluations that holds the evaluation run analysed here.
EVAL_DIR = "2021-12-03"

In [3]:
# Names of the classifiers, as they appear in the `model_name` column.
# NOTE(review): neither list is referenced by the cells visible below - confirm they are still needed.
models = [
    "logistic_regression",
    "random_forest",
    "gradient_boosting",
    "SVM",
    "multilayer_perceptron",
]

# Names of the feature sets, as they appear in the `data` column.
datasets = [
    "integer_encoded",
    "pybiomed",
    "protparam",
    "bert",
    "seqvec",
    "onehot",
]

In [6]:
# Load the combined evaluation table for the selected run.
# The file is tab-separated even though it carries a .csv extension.
# NOTE(review): the first column comes back as "Unnamed: 0" (looks like a saved
# index) - consider index_col=0 if it should not travel into the exported tables.
all_evals_csv = path.join(DATA_DIR, "evaluations", EVAL_DIR, "all.csv")
eval_data = pd.read_csv(all_evals_csv, sep="\t")
eval_data.head()

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
0,logistic_regression,integer_encoded,none,0.403101,0.210802,0.677824,0.325,0.530612,0.623201,../../data/evaluations/2021-12-03/logistic_reg...
1,random_forest,integer_encoded,none,0.381579,0.164975,0.606695,0.281553,0.591837,0.601182,../../data/evaluations/2021-12-03/random_fores...
2,gradient_boosting,integer_encoded,none,0.142857,0.025427,0.748954,0.238095,0.102041,0.508915,../../data/evaluations/2021-12-03/gradient_boo...
3,SVM,integer_encoded,none,0.366412,0.156925,0.65272,0.292683,0.489796,0.592266,../../data/evaluations/2021-12-03/SVM_integer_...
4,multilayer_perceptron,integer_encoded,none,0.314607,0.161002,0.74477,0.35,0.285714,0.574436,../../data/evaluations/2021-12-03/multilayer_p...


# Model statistics

In [7]:
# Mean F1 and MCC per model, taken over all of that model's evaluation rows.
(
    eval_data[["model_name", "F1", "MCC"]]
    .groupby("model_name")
    .mean()
)

Unnamed: 0_level_0,F1,MCC
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SVM,0.505961,0.341874
gradient_boosting,0.389943,0.300588
logistic_regression,0.517365,0.36533
multilayer_perceptron,0.445881,0.307169
random_forest,0.492545,0.361214


In [8]:
# Best F1 and best MCC per model. Each column is maximised independently,
# so the two values in a row may come from different evaluation rows.
(
    eval_data[["model_name", "F1", "MCC"]]
    .groupby("model_name")
    .max()
)

Unnamed: 0_level_0,F1,MCC
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SVM,0.654545,0.55847
gradient_boosting,0.551181,0.446294
logistic_regression,0.631579,0.528096
multilayer_perceptron,0.621359,0.518664
random_forest,0.573529,0.455851


In [9]:
# Rows that reach the maximum F1 score within their model.
# `transform` broadcasts each model's best F1 back onto every row of that model,
# so the comparison yields a boolean mask aligned with `eval_data`; ties are all kept.
# The string alias "max" is used instead of the Python builtin `max`: newer pandas
# versions emit a FutureWarning for the builtin and will stop mapping it to the
# optimised groupby reduction.
idx = eval_data.groupby("model_name")["F1"].transform("max") == eval_data["F1"]
eval_data[idx]

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
5,logistic_regression,pybiomed,none,0.631579,0.528096,0.824268,0.553846,0.734694,0.791031,../../data/evaluations/2021-12-03/logistic_reg...
54,multilayer_perceptron,pybiomed,over-sampling,0.621359,0.518664,0.83682,0.592593,0.653061,0.768636,../../data/evaluations/2021-12-03/multilayer_p...
58,SVM,pybiomed,smote,0.654545,0.55847,0.841004,0.590164,0.734694,0.801557,../../data/evaluations/2021-12-03/SVM_pybiomed...
62,gradient_boosting,protparam,over-sampling,0.551181,0.420153,0.761506,0.448718,0.714286,0.743985,../../data/evaluations/2021-12-03/gradient_boo...
96,random_forest,protparam,under-sampling,0.573529,0.455851,0.757322,0.448276,0.795918,0.771643,../../data/evaluations/2021-12-03/random_fores...


In [10]:
# Same best-F1-per-model rows, without the long `filename` column.
eval_data.loc[idx].drop(columns="filename")

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC
5,logistic_regression,pybiomed,none,0.631579,0.528096,0.824268,0.553846,0.734694,0.791031
54,multilayer_perceptron,pybiomed,over-sampling,0.621359,0.518664,0.83682,0.592593,0.653061,0.768636
58,SVM,pybiomed,smote,0.654545,0.55847,0.841004,0.590164,0.734694,0.801557
62,gradient_boosting,protparam,over-sampling,0.551181,0.420153,0.761506,0.448718,0.714286,0.743985
96,random_forest,protparam,under-sampling,0.573529,0.455851,0.757322,0.448276,0.795918,0.771643


In [11]:
# Write the best-F1-per-model rows (minus `filename`) to disk.
# The row index is not written; the separator is to_csv's default comma.
best_by_model = eval_data.loc[idx].drop(columns="filename")
best_by_model.to_csv(
    path.join(DATA_DIR, "evaluations/outputs/max_f1_by_model.csv"),
    index=False,
)

# Data statistics

In [12]:
# Mean F1 and MCC per feature set, taken over all of that feature set's evaluation rows.
(
    eval_data[["data", "F1", "MCC"]]
    .groupby("data")
    .mean()
)

Unnamed: 0_level_0,F1,MCC
data,Unnamed: 1_level_1,Unnamed: 2_level_1
bert,0.544577,0.439848
integer_encoded,0.321711,0.143826
protparam,0.407687,0.206984
pybiomed,0.485722,0.384056
seqvec,0.472637,0.346151


In [13]:
# Best F1 and best MCC per feature set. Each column is maximised independently,
# so the two values in a row may come from different evaluation rows.
(
    eval_data[["data", "F1", "MCC"]]
    .groupby("data")
    .max()
)

Unnamed: 0_level_0,F1,MCC
data,Unnamed: 1_level_1,Unnamed: 2_level_1
bert,0.622951,0.518283
integer_encoded,0.403101,0.210802
protparam,0.573529,0.455851
pybiomed,0.654545,0.55847
seqvec,0.592593,0.478405


In [14]:
# lines with maximum F1 score
idx = eval_data.groupby(['data'])['F1'].transform(max) == eval_data['F1']
eval_data[idx]

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
0,logistic_regression,integer_encoded,none,0.403101,0.210802,0.677824,0.325,0.530612,0.623201,../../data/evaluations/2021-12-03/logistic_reg...
30,logistic_regression,seqvec,none,0.592593,0.478405,0.8159,0.542373,0.653061,0.755478,../../data/evaluations/2021-12-03/logistic_reg...
58,SVM,pybiomed,smote,0.654545,0.55847,0.841004,0.590164,0.734694,0.801557,../../data/evaluations/2021-12-03/SVM_pybiomed...
78,SVM,bert,smote,0.622951,0.518283,0.807531,0.520548,0.77551,0.79565,../../data/evaluations/2021-12-03/SVM_bert_smo...
96,random_forest,protparam,under-sampling,0.573529,0.455851,0.757322,0.448276,0.795918,0.771643,../../data/evaluations/2021-12-03/random_fores...


In [15]:
# Same best-F1-per-feature-set rows, without the long `filename` column.
eval_data.loc[idx].drop(columns="filename")

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC
0,logistic_regression,integer_encoded,none,0.403101,0.210802,0.677824,0.325,0.530612,0.623201
30,logistic_regression,seqvec,none,0.592593,0.478405,0.8159,0.542373,0.653061,0.755478
58,SVM,pybiomed,smote,0.654545,0.55847,0.841004,0.590164,0.734694,0.801557
78,SVM,bert,smote,0.622951,0.518283,0.807531,0.520548,0.77551,0.79565
96,random_forest,protparam,under-sampling,0.573529,0.455851,0.757322,0.448276,0.795918,0.771643


In [16]:
# Write the best-F1-per-feature-set rows (minus `filename`) to disk.
# The row index is not written; the separator is to_csv's default comma.
best_by_data = eval_data.loc[idx].drop(columns="filename")
best_by_data.to_csv(
    path.join(DATA_DIR, "evaluations/outputs/max_f1_by_data.csv"),
    index=False,
)