In [1]:
import pandas as pd
import numpy as np
from os import path

In [2]:
# Relative path to the project's data directory.
DATA_DIR = "../../data"
# Evaluation run to analyse: the sub-directory of <DATA_DIR>/evaluations that holds all.csv.
EVAL_DIR = "2021-11-28"

In [3]:
# Model identifiers, spelled as they appear in the `model_name` column of the results.
models = [
    "logistic_regression",
    "random_forest",
    "gradient_boosting",
    "SVM",
    "multilayer_perceptron",
]

# Feature-set identifiers, spelled as they appear in the `data` column of the results.
datasets = [
    "integer_encoded",
    "pybiomed",
    "protparam",
    "bert",
    "seqvec",
    "onehot",
]

In [4]:
# Combined results table for the selected evaluation run.
# Note: the file is tab-separated even though it carries a .csv extension.
all_results_path = path.join(DATA_DIR, "evaluations", EVAL_DIR, "all.csv")
eval_data = pd.read_csv(all_results_path, sep="\t")
eval_data.head()

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
0,logistic_regression,integer_encoded,none,0.406015,0.212265,0.669456,0.321429,0.55102,0.62551,../../data/evaluations/2021-11-28/logistic_reg...
1,random_forest,integer_encoded,none,0.138889,0.009999,0.740586,0.217391,0.102041,0.503652,../../data/evaluations/2021-11-28/random_fores...
2,gradient_boosting,integer_encoded,none,0.147059,0.04232,0.757322,0.263158,0.102041,0.514178,../../data/evaluations/2021-11-28/gradient_boo...
3,SVM,integer_encoded,none,0.372093,0.166877,0.661088,0.3,0.489796,0.59753,../../data/evaluations/2021-11-28/SVM_integer_...
4,multilayer_perceptron,integer_encoded,none,0.343434,0.171966,0.728033,0.34,0.346939,0.586627,../../data/evaluations/2021-11-28/multilayer_p...


# Model statistics

In [5]:
# Average F1 and MCC per model, taken over every dataset/preprocessing combination.
eval_data[["model_name", "F1", "MCC"]].groupby("model_name").mean()

Unnamed: 0_level_0,F1,MCC
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SVM,0.327331,0.120377
gradient_boosting,0.212973,0.060155
logistic_regression,0.348812,0.138786
multilayer_perceptron,0.272503,0.103764
random_forest,0.302095,0.118951


In [6]:
# Best F1 and best MCC per model; each column is maximised independently,
# so the two values in a row may come from different evaluation runs.
eval_data[["model_name", "F1", "MCC"]].groupby("model_name").max()

Unnamed: 0_level_0,F1,MCC
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SVM,0.447552,0.270045
gradient_boosting,0.427586,0.23927
logistic_regression,0.459459,0.288307
multilayer_perceptron,0.40367,0.240111
random_forest,0.425197,0.243324


In [7]:
# Rows that reach the highest F1 score within their model (tied rows are all kept).
# `idx` is a boolean mask aligned with eval_data's index. The string "max" asks for
# pandas' own group-wise maximum; passing the Python builtin `max` instead is
# deprecated in recent pandas releases and emits a FutureWarning.
idx = eval_data.groupby("model_name")["F1"].transform("max") == eval_data["F1"]
eval_data[idx]

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
36,SVM,bert,none,0.447552,0.270045,0.669456,0.340426,0.653061,0.663373,../../data/evaluations/2021-11-28/SVM_bert.json
40,random_forest,seqvec,StandardScaler,0.425197,0.243324,0.694561,0.346154,0.55102,0.6413,../../data/evaluations/2021-11-28/random_fores...
62,gradient_boosting,pybiomed,under-sampling,0.427586,0.23927,0.65272,0.322917,0.632653,0.645274,../../data/evaluations/2021-11-28/gradient_boo...
70,logistic_regression,bert,under-sampling,0.459459,0.288307,0.665272,0.343434,0.693878,0.675886,../../data/evaluations/2021-11-28/logistic_reg...
109,multilayer_perceptron,seqvec,over-sampling,0.40367,0.231812,0.728033,0.366667,0.44898,0.62449,../../data/evaluations/2021-11-28/multilayer_p...


In [8]:
# Same best-F1 rows, without the long `filename` column, for a more compact table.
eval_data.loc[idx].drop(columns="filename")

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC
36,SVM,bert,none,0.447552,0.270045,0.669456,0.340426,0.653061,0.663373
40,random_forest,seqvec,StandardScaler,0.425197,0.243324,0.694561,0.346154,0.55102,0.6413
62,gradient_boosting,pybiomed,under-sampling,0.427586,0.23927,0.65272,0.322917,0.632653,0.645274
70,logistic_regression,bert,under-sampling,0.459459,0.288307,0.665272,0.343434,0.693878,0.675886
109,multilayer_perceptron,seqvec,over-sampling,0.40367,0.231812,0.728033,0.366667,0.44898,0.62449


In [9]:
# Export the best-F1 row(s) of every model.
# The mask is recomputed here instead of reusing `idx`: that name is rebound to the
# per-dataset mask further down, so re-running this cell later in the session would
# otherwise write the wrong rows into max_f1_by_model.csv.
best_f1_by_model = eval_data["F1"] == eval_data.groupby("model_name")["F1"].transform("max")
(
    eval_data[best_f1_by_model]
    .drop(columns="filename")
    # index=False: the row labels are not written to the file.
    .to_csv(path.join(DATA_DIR, "evaluations", "outputs", "max_f1_by_model.csv"), index=False)
)

# Data statistics

In [10]:
# Average F1 and MCC per dataset, taken over every model/preprocessing combination.
eval_data[["data", "F1", "MCC"]].groupby("data").mean()

Unnamed: 0_level_0,F1,MCC
data,Unnamed: 1_level_1,Unnamed: 2_level_1
bert,0.327564,0.149342
integer_encoded,0.281498,0.120685
onehot,0.306254,0.123155
protparam,0.207643,-0.022946
pybiomed,0.300015,0.141794
seqvec,0.332593,0.157081


In [11]:
# Best F1 and best MCC per dataset; each column is maximised independently,
# so the two values in a row may come from different evaluation runs.
eval_data[["data", "F1", "MCC"]].groupby("data").max()

Unnamed: 0_level_0,F1,MCC
data,Unnamed: 1_level_1,Unnamed: 2_level_1
bert,0.459459,0.288307
integer_encoded,0.406015,0.212265
onehot,0.394366,0.190154
protparam,0.304147,0.075893
pybiomed,0.427586,0.240111
seqvec,0.425197,0.243324


In [12]:
# Rows that reach the highest F1 score within their dataset (tied rows are all kept).
# `idx` is a boolean mask aligned with eval_data's index. The string "max" asks for
# pandas' own group-wise maximum; passing the Python builtin `max` instead is
# deprecated in recent pandas releases and emits a FutureWarning.
idx = eval_data.groupby("data")["F1"].transform("max") == eval_data["F1"]
eval_data[idx]

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC,filename
0,logistic_regression,integer_encoded,none,0.406015,0.212265,0.669456,0.321429,0.55102,0.62551,../../data/evaluations/2021-11-28/logistic_reg...
26,SVM,protparam,none,0.304147,-0.032739,0.368201,0.196429,0.673469,0.481472,../../data/evaluations/2021-11-28/SVM_protpara...
40,random_forest,seqvec,StandardScaler,0.425197,0.243324,0.694561,0.346154,0.55102,0.6413,../../data/evaluations/2021-11-28/random_fores...
54,random_forest,onehot,StandardScaler,0.394366,0.189891,0.640167,0.301075,0.571429,0.614662,../../data/evaluations/2021-11-28/random_fores...
62,gradient_boosting,pybiomed,under-sampling,0.427586,0.23927,0.65272,0.322917,0.632653,0.645274,../../data/evaluations/2021-11-28/gradient_boo...
70,logistic_regression,bert,under-sampling,0.459459,0.288307,0.665272,0.343434,0.693878,0.675886,../../data/evaluations/2021-11-28/logistic_reg...


In [13]:
# Same best-F1 rows, without the long `filename` column, for a more compact table.
eval_data.loc[idx].drop(columns="filename")

Unnamed: 0,model_name,data,preprocessing,F1,MCC,Acc,Precision,Recall,AUC
0,logistic_regression,integer_encoded,none,0.406015,0.212265,0.669456,0.321429,0.55102,0.62551
26,SVM,protparam,none,0.304147,-0.032739,0.368201,0.196429,0.673469,0.481472
40,random_forest,seqvec,StandardScaler,0.425197,0.243324,0.694561,0.346154,0.55102,0.6413
54,random_forest,onehot,StandardScaler,0.394366,0.189891,0.640167,0.301075,0.571429,0.614662
62,gradient_boosting,pybiomed,under-sampling,0.427586,0.23927,0.65272,0.322917,0.632653,0.645274
70,logistic_regression,bert,under-sampling,0.459459,0.288307,0.665272,0.343434,0.693878,0.675886


In [14]:
# Export the best-F1 row(s) of every dataset.
# The mask is recomputed here instead of reusing `idx`: that name is also assigned by
# the per-model cell earlier in the notebook, so relying on it makes the exported rows
# depend on which cell happened to run last.
best_f1_by_data = eval_data["F1"] == eval_data.groupby("data")["F1"].transform("max")
(
    eval_data[best_f1_by_data]
    .drop(columns="filename")
    # index=False: the row labels are not written to the file.
    .to_csv(path.join(DATA_DIR, "evaluations", "outputs", "max_f1_by_data.csv"), index=False)
)