## Evaluation of the models for the Price et al dataset

## Useful functions to evaluate the models

In [ ]:
import re
import pandas as pd


def get_unique_labels_by_level(dataset, level):
    final_dataset_test = dataset.copy()
    final_dataset_test = final_dataset_test.loc[:,level]
    final_dataset_test.fillna("0", inplace=True)
    values = pd.Series(final_dataset_test.values.reshape(-1)).str.split(";")
    list_of_unique_labels = np.unique(values.explode()).tolist()
    if "0" in list_of_unique_labels:
        list_of_unique_labels.remove("0")
    list_of_unique_labels_dict = dict(zip(list_of_unique_labels, range(len(list_of_unique_labels))))
    return list_of_unique_labels_dict

def get_final_labels(dataset, all_levels=False):

    if all_levels:
        unique_EC1 = get_unique_labels_by_level(dataset, "EC1")
        unique_EC2 = get_unique_labels_by_level(dataset, "EC2")
        unique_EC3 = get_unique_labels_by_level(dataset, "EC3")
        
        array_EC1 = np.zeros((len(dataset), len(unique_EC1)))
        array_EC2 = np.zeros((len(dataset), len(unique_EC2)))
        array_EC3 = np.zeros((len(dataset), len(unique_EC3)))

    unique_EC4 = get_unique_labels_by_level(dataset, "EC4")

    array_EC4 = np.zeros((len(dataset), len(unique_EC4)))
        
    dataset.fillna("0", inplace=True)

    if all_levels:
        for i, row in dataset.iterrows():
            for ec in ["EC1", "EC2", "EC3", "EC4"]:
                for EC in row[ec].split(";"):
                    if EC != "0":
                        if ec == "EC1":
                            array_EC1[i, unique_EC1[EC]] = 1
                        elif ec == "EC2":
                            array_EC2[i, unique_EC2[EC]] = 1
                        elif ec == "EC3":
                            array_EC3[i, unique_EC3[EC]] = 1
                        elif ec == "EC4":
                            array_EC4[i, unique_EC4[EC]] = 1
    else:
        for i, row in dataset.iterrows():
            for EC in row["EC4"].split(";"):
                if EC != "0":
                    array_EC4[i, unique_EC4[EC]] = 1
    if all_levels:
        array_EC1 = pd.DataFrame(array_EC1, columns=unique_EC1.keys())
        array_EC2 = pd.DataFrame(array_EC2, columns=unique_EC2.keys())
        array_EC3 = pd.DataFrame(array_EC3, columns=unique_EC3.keys())
    array_EC4 = pd.DataFrame(array_EC4, columns=unique_EC4.keys())

    if all_levels:
        dataset = pd.concat((dataset, array_EC1, array_EC2, array_EC3, array_EC4), axis=1)
    else:
        dataset = pd.concat((dataset, array_EC4), axis=1)
    return dataset


def get_ec_from_regex_match(match):
    if match is not None:
        EC = match.group()
        if EC is not None:
            return EC
    return None

def get_labels_based_on_list(dataset, labels, all_levels=True):
    array = np.zeros((len(dataset), len(labels)))
    labels_dataframe = pd.DataFrame(array, columns=labels)
    dataset.fillna("0", inplace=True)
    for i, row in dataset.iterrows():
        for label in row["EC4"].split(";"):
            if label != "0" and label in labels:
                labels_dataframe.at[i, label] = 1
        
        if all_levels:
            for ec in ["EC1", "EC2", "EC3"]:
                for label in row[ec].split(";"):
                    if label != "0" and label in labels:
                        labels_dataframe.at[i, label] = 1

    return pd.concat((dataset, labels_dataframe), axis=1)

def divide_labels_by_EC_level(final_dataset, ec_label):
    EC1_lst = []
    EC2_lst = []
    EC3_lst = []
    EC4_lst = []


    for _, row in final_dataset.iterrows():
        ECs = row[ec_label]
        ECs = ECs.split(";")
        # get the first 3 ECs with regular expression
        EC3 = []
        EC2 = []
        EC1 = []
        EC4 = []
        for EC in ECs:
            new_EC = re.search(r"^\d+.\d+.\d+.n*\d+", EC)
            new_EC = get_ec_from_regex_match(new_EC)
            if isinstance(new_EC, str):
                if new_EC not in EC4:
                    EC4.append(new_EC)

            new_EC = re.search(r"^\d+.\d+.\d+", EC)
            new_EC = get_ec_from_regex_match(new_EC)
            if isinstance(new_EC, str):
                if new_EC not in EC3:
                    EC3.append(new_EC)

            new_EC = re.search(r"^\d+.\d+", EC)
            new_EC = get_ec_from_regex_match(new_EC)
            if isinstance(new_EC, str):
                if new_EC not in EC2:
                    EC2.append(new_EC)

            new_EC = re.search(r"^\d+", EC)
            new_EC = get_ec_from_regex_match(new_EC)
            if isinstance(new_EC, str):
                if new_EC not in EC1:
                    EC1.append(new_EC)

        if len(EC4) == 0:
            EC4_lst.append(np.NaN)
        else:
            EC4_lst.append(";".join(EC4))
        if len(EC3) == 0:
            EC3_lst.append(np.NaN)
        else:
            EC3_lst.append(";".join(EC3))
        if len(EC2) == 0:
            EC2_lst.append(np.NaN)
        else:
            EC2_lst.append(";".join(EC2))
        if len(EC1) == 0:
            EC1_lst.append(np.NaN)
        else:
            EC1_lst.append(";".join(EC1))

    assert None not in EC1_lst
    assert None not in EC2_lst
    assert None not in EC3_lst
    assert None not in EC4_lst

    assert len(EC1_lst) == len(final_dataset)
    assert len(EC2_lst) == len(final_dataset)
    assert len(EC3_lst) == len(final_dataset)
    assert len(EC4_lst) == len(final_dataset)

    final_dataset["EC1"] = EC1_lst
    final_dataset["EC2"] = EC2_lst
    final_dataset["EC3"] = EC3_lst
    final_dataset["EC4"] = EC4_lst

    assert final_dataset["EC1"].isnull().sum() == 0
    print("EC1 is not null")

    return final_dataset

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def get_ec_levels(labels):
    level_1 = []
    level_2 = []
    level_3 = []
    level_4 = []
    for i, label in enumerate(labels):
        if re.match(r"^\d+.\d+.\d+.n*\d+$", label):
            level_4.append(i)
        elif re.match(r"^\d+.\d+.\d+$", label):
            level_3.append(i)
        elif re.match(r"^\d+.\d+$", label):
            level_2.append(i)
        elif re.match(r"^\d+$", label):
            level_1.append(i)
    return level_1, level_2, level_3, level_4

def get_metrics(y_true, predictions, labels, labels_to_remove, model_name):
    y_true_ = np.delete(y_true, labels_to_remove, axis=1)
    predictions_ = np.delete(predictions, labels_to_remove, axis=1)
    labels_ = np.delete(labels, labels_to_remove)
    level_1, level_2, level_3, level_4 = get_ec_levels(labels_)
    
    metrics = {}
    average = "weighted"
    metrics["f1 overall"] = f1_score(y_true_, predictions_, average=average)
    metrics["f1 level 1"] = f1_score(y_true_[:, level_1], predictions_[:, level_1], average=average)
    metrics["f1 level 2"] = f1_score(y_true_[:, level_2], predictions_[:, level_2], average=average)
    metrics["f1 level 3"] = f1_score(y_true_[:, level_3], predictions_[:, level_3], average=average)
    metrics["f1 level 4"] = f1_score(y_true_[:, level_4], predictions_[:, level_4], average=average)
    metrics["recall level 4"] = recall_score(y_true_[:, level_4], predictions_[:, level_4], average=average)
    metrics["precision level 4"] = precision_score(y_true_[:, level_4], predictions_[:, level_4], average=average)

    return pd.DataFrame(metrics, index=[model_name])

def get_models_predictions(model, dataset, labels, labels_to_remove, model_name):
    predictions = model.predict(dataset)
    y_true = dataset.y

    return get_metrics(y_true, predictions, labels, labels_to_remove, model_name)

def convert_predictions_into_format(predictions, labels_names, new_labels):
    new_predictions = np.zeros((len(predictions), len(new_labels)))
    labels_names = np.array(labels_names)
    new_labels = np.array(new_labels)
    for i, prediction in enumerate(predictions):
        indexes = np.where(prediction == 1)
        result = labels_names[indexes]
        for res in result:
            potential_result = new_labels[new_labels == res]
            if potential_result.size > 0:
                new_predictions[i, new_labels == res] = 1
    return new_predictions

## Load the dataset

In [3]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

test_dataset = SingleInputDataset.from_csv('/scratch/jribeiro/ec_number_prediction/final_data/price.csv',
                                            instances_ids_field="Entry", representation_field="Sequence", sep="\t")

In [4]:
price_dataset = pd.read_csv('/scratch/jribeiro/ec_number_prediction/final_data/price.csv', sep="\t")
price_dataset = divide_labels_by_EC_level(price_dataset, "EC number")
price_dataset_ = get_final_labels(price_dataset, all_levels=True)

EC1 is not null


## ESM2 3B model - predictions

In [5]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

labels_names = SingleInputDataset.from_csv('/scratch/jribeiro/ec_number_prediction/final_data/merged_dataset.csv',
                                            instances_ids_field="accession", representation_field="sequence",
                                            labels_field=slice(8, -1), nrows=2)._labels_names

In [6]:
from plants_sm.models.fc.fc import DNN
from plants_sm.models.pytorch_model import PyTorchModel
from torch import nn
import torch

model = PyTorchModel(model = DNN(2560, [2560], 5743, batch_norm=True), loss_function=nn.BCELoss())
model.model.load_state_dict(
    torch.load("/scratch/jribeiro/results/esm2_t36_3B_UR50D/DNN_esm2_t36_3B_UR50D/DNN_esm2_t36_3B_UR50D_optimization_set_2_all_data_all_data/pytorch_model_weights.pt", map_location=torch.device('cpu')))


  from .autonotebook import tqdm as notebook_tqdm


<All keys matched successfully>

In [7]:
import os
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

encoding = "esm2_t36_3B_UR50D"

if not os.path.exists("esm2_t36_3B_UR50D_features_price"):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ESMEncoder(esm_function=encoding, batch_size=1, num_gpus=4)]
    for transformer in transformers:
        halogenases_dataset = transformer.fit_transform(test_dataset)
    test_dataset.save_features("esm2_t36_3B_UR50D_features_price")
else:
    test_dataset.load_features("esm2_t36_3B_UR50D_features_price")

In [8]:
predictions = model.predict(test_dataset)
new_predictions_esm2_3b = convert_predictions_into_format(predictions, labels_names, price_dataset_.iloc[:, 7:].columns)
y_true = price_dataset_.iloc[:, 7:].values
metrics_evaluation = get_metrics(y_true, new_predictions_esm2_3b, price_dataset_.iloc[:, 7:].columns, [], "DNN ESM2 layer 36")
metrics_evaluation

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868


## ESM1b model - predictions

In [9]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

price_dataset = SingleInputDataset.from_csv('/scratch/jribeiro/ec_number_prediction/final_data/price.csv',
                                            instances_ids_field="Entry", representation_field="Sequence", sep="\t")

In [10]:
import os
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

encoding = "esm1b_t33_650M_UR50S"

if not os.path.exists("ESM1b_features_price"):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ESMEncoder(esm_function=encoding, batch_size=1, num_gpus=1)]
    for transformer in transformers:
        price_dataset = transformer.fit_transform(price_dataset)
    price_dataset.save_features("ESM1b_features_price")
else:
    price_dataset.load_features("ESM1b_features_price")

In [11]:
from plants_sm.models.pytorch_model import PyTorchModel

model = PyTorchModel(model = DNN(1280, [2560, 5120], 5743, batch_norm=True), loss_function=nn.BCELoss())
model.model.load_state_dict(
    torch.load("/scratch/jribeiro/results/esm1b_t33_650M_UR50S/DNN_esm1b_t33_650M_UR50S/DNN_esm1b_t33_650M_UR50S_optimization_set_4_all_data_all_data/pytorch_model_weights.pt", map_location=torch.device('cpu')))

<All keys matched successfully>

In [12]:
predictions = model.predict(price_dataset)
new_predictions_esm1b = convert_predictions_into_format(predictions, labels_names, price_dataset_.iloc[:, 7:].columns)
y_true = price_dataset_.iloc[:, 7:].values
metrics_evaluation = pd.concat((metrics_evaluation, get_metrics(y_true, new_predictions_esm1b, price_dataset_.iloc[:, 7:].columns, [], "DNN ESM1b")))
metrics_evaluation

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868
DNN ESM1b,0.789731,0.965597,0.922643,0.856848,0.421253,0.381579,0.586623


## ProtBert model - predictions

In [13]:
from plants_sm.models.pytorch_model import PyTorchModel

model = PyTorchModel(model = DNN(1024, [2560], 5743, batch_norm=True), loss_function=nn.BCELoss())
model.model.load_state_dict(
    torch.load("/scratch/jribeiro/results/prot_bert_vectors/DNN_prot_bert_vectors/DNN_prot_bert_vectors_optimization_set_2_all_data_all_data/pytorch_model_weights.pt", map_location=torch.device('cpu')))


import os
from plants_sm.featurization.proteins.bio_embeddings.prot_bert import ProtBert
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator

if not os.path.exists("protbert_features_price"):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ProtBert(device="cuda:0")]
    for transformer in transformers:
        price_dataset = transformer.fit_transform(price_dataset)
    price_dataset.save_features("protbert_features_price")
else:
    price_dataset.load_features("protbert_features_price")

In [14]:
predictions = model.predict(price_dataset)
new_predictions_protein_bert = convert_predictions_into_format(predictions, labels_names, price_dataset_.iloc[:, 7:].columns)
y_true = price_dataset_.iloc[:, 7:].values
metrics_evaluation = pd.concat((metrics_evaluation, get_metrics(y_true, new_predictions_protein_bert, price_dataset_.iloc[:, 7:].columns, [], "DNN ProtBERT")))
metrics_evaluation

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868
DNN ESM1b,0.789731,0.965597,0.922643,0.856848,0.421253,0.381579,0.586623
DNN ProtBERT,0.740511,0.924628,0.88092,0.80664,0.357566,0.315789,0.525658


## Models ensembles and blast results - predictions

In [15]:
# create an ensemble of the predictions
# make a voting classifier for the 3 models
import numpy as np

def determine_ensemble_predictions(threshold=3, *model_predictions):
    model_predictions = list(model_predictions)

    for i, model_prediction in enumerate(model_predictions):
        model_predictions[i] = np.array(model_prediction)


    predictions_voting = np.zeros_like(model_predictions[0])

    for i in range(model_predictions[0].shape[0]):
        # Combine conditions into a single array and sum along the second axis
        combined_conditions = np.sum(np.array([model_predictions[j][i] for j in range(len(model_predictions))]), axis=0)

        # Apply the threshold condition
        predictions_voting[i] = (combined_conditions >= threshold).astype(int)

    # If you want to ensure the resulting array is of integer type
    predictions_voting = predictions_voting.astype(int)
    return predictions_voting

In [16]:
predictions_ensemble = determine_ensemble_predictions(2, new_predictions_esm1b, new_predictions_protein_bert, new_predictions_esm2_3b)
metrics_evaluation = pd.concat((metrics_evaluation, get_metrics(y_true, predictions_ensemble, price_dataset_.iloc[:, 7:].columns, [], "Models Ensemble")))

  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
metrics_evaluation

Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868
DNN ESM1b,0.789731,0.965597,0.922643,0.856848,0.421253,0.381579,0.586623
DNN ProtBERT,0.740511,0.924628,0.88092,0.80664,0.357566,0.315789,0.525658
Models Ensemble,0.779691,0.962156,0.92667,0.845373,0.392363,0.361842,0.525219


# Include ensemble with blast

In [18]:
blast_results = pd.read_csv("price_blast_results.csv")
# Create a new column with the custom order as a categorical type
blast_results['CustomOrder'] = pd.Categorical(blast_results['qseqid'], categories=price_dataset_.Entry, ordered=True)
blast_results.sort_values('CustomOrder', inplace=True)
blast_results.drop(columns=["CustomOrder"], inplace=True)
blast_results

Unnamed: 0,qseqid,sequence,accession,pident,length,mismatch,gapopen,qstart,qend,sstart,evalue,bitscore,name,EC,EC1,EC2,EC3,EC4
5,WP_063460136,MNMLELTVMPKDEFRNPSFLLQHVRDTMRFYHPTVIDASGGFYHFF...,A0A0A1F806,58.313,403.0,160.0,1.0,1.0,403.0,6.0,1.550000e-172,491.0,A0A0A1F806_9BURK,5.3.1.7,5.0,5.3,5.3.1,5.3.1.7
6,WP_063462980,MNTSPNTKTPPRYRGIFPVVPTTFTETGELDLDSQKRAVDFMIDAG...,A0A6J5G3A1,81.973,294.0,53.0,0.0,1.0,294.0,20.0,3.320000e-179,499.0,A0A6J5G3A1_9BURK,4.2.1.43,4.0,4.2,4.2.1,4.2.1.43
7,WP_063462990,MKDSPQLSIHPSLKERTVFVTGGGSGIGAAIVAAFAAQGARVAFVD...,A0A0H2M6K5,81.641,256.0,45.0,2.0,24.0,279.0,4.0,9.330000e-138,391.0,A0A0H2M6K5_VARPD,1.1.1.175,1.0,1.1,1.1.1,1.1.1.175
8,WP_041412631,MKNKQVLRSAAWFGTTDKNGFMYRSWMKNQGIPDHEFQGKPIIGIC...,A0A446ZJH1,73.611,576.0,150.0,2.0,13.0,586.0,2.0,0.000000e+00,899.0,A0A446ZJH1_ACICA,4.2.1.25,4.0,4.2,4.2.1,4.2.1.25
9,WP_011717048,MLRFFSPFGISVGVLILGTLLLVSSTFAPALAADDSSSPATKKSMS...,D2QWX3,40.909,352.0,196.0,4.0,3.0,348.0,45.0,4.820000e-83,260.0,D2QWX3_PIRSD,5.1.3.3,5.0,5.1,5.1.3,5.1.3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,WP_010207013,MSLTYTAPLDEMRFTLSEVFDAKNFWQQHSQLNHVDIDTVEMILAE...,A0A448GX15,55.254,590.0,258.0,4.0,4.0,592.0,5.0,0.000000e+00,656.0,A0A448GX15_9GAMM,1.3.8.1,1.0,1.3,1.3.8,1.3.8.1
105,WP_010207016,MLEYKAPLRDIRFIIDEVLDSGAVYETLPGYEEATPDLMAAIIEEG...,A0A095XYM8,66.945,599.0,196.0,2.0,1.0,599.0,1.0,0.000000e+00,833.0,A0A095XYM8_9GAMM,1.3.8.7,1.0,1.3,1.3.8,1.3.8.7
91,WP_010207340,MSSNNPQTREWQALSSDHHLAPFSDFKQLKEKGPRIITNAKGVYLW...,A0A6M8MMA4,89.868,454.0,46.0,0.0,1.0,454.0,1.0,0.000000e+00,858.0,A0A6M8MMA4_9PSED,2.6.1.113,2.0,2.6,2.6.1,2.6.1.113
60,WP_010207341,MSVPLRAVQLTEPSLFLQEHPEVQFVDLLISDMNGVVRGKRIERNS...,A0A2R3IXT5,87.555,458.0,57.0,0.0,1.0,458.0,1.0,0.000000e+00,843.0,A0A2R3IXT5_PSEAI,6.3.1.11,6.0,6.3,6.3.1,6.3.1.11


In [19]:
blast_results["EC1"] = blast_results["EC1"].astype(str)
blast_results["EC2"] = blast_results["EC2"].astype(str)
blast_results["EC3"] = blast_results["EC3"].astype(str)
blast_results["EC4"] = blast_results["EC4"].astype(str)

In [20]:
# blast_results = divide_labels_by_EC_level(blast_results, "EC number")
blast_results.fillna("0", inplace=True)
blast_results_ = get_final_labels(blast_results, all_levels=True)
blast_results_.drop(columns=["nan"], inplace=True)
labels_names = [ec_number.replace(".0", "") for ec_number in blast_results_.columns[18:].tolist()]
labels_names

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '1.1',
 '1.13',
 '1.2',
 '1.3',
 '1.7',
 '2.3',
 '2.4',
 '2.6',
 '2.7',
 '2.8',
 '3.1',
 '3.2',
 '3.5',
 '3.7',
 '4.1',
 '4.2',
 '4.4',
 '5.1',
 '5.3',
 '6.2',
 '6.3',
 '1.1.1',
 '1.1.2',
 '1.1.3',
 '1.1.99',
 '1.13.12',
 '1.2.1',
 '1.3.8',
 '1.3.99',
 '1.7.3',
 '2.3.1',
 '2.4.1',
 '2.6.1',
 '2.7.1',
 '2.8.3',
 '3.1.1',
 '3.1.3',
 '3.2.1',
 '3.2.2',
 '3.5.1',
 '3.7.1',
 '4.1.2',
 '4.2.1',
 '4.4.1',
 '5.1.3',
 '5.3.1',
 '6.2.1',
 '6.3.1',
 '1.1.1.107',
 '1.1.1.108',
 '1.1.1.175',
 '1.1.1.203',
 '1.1.1.269',
 '1.1.1.292',
 '1.1.1.301',
 '1.1.1.361',
 '1.1.1.369',
 '1.1.1.376',
 '1.1.1.390',
 '1.1.1.60',
 '1.1.2.4',
 '1.1.3.15',
 '1.1.99.14',
 '1.13.12.3',
 '1.2.1.32',
 '1.2.1.4',
 '1.2.1.5',
 '1.3.8.1',
 '1.3.8.7',
 '1.7.3.3',
 '2.3.1.9',
 '2.4.1.19',
 '2.6.1.1',
 '2.6.1.11',
 '2.6.1.113',
 '2.6.1.39',
 '2.6.1.48',
 '2.6.1.57',
 '2.6.1.77',
 '2.7.1.101',
 '2.7.1.4',
 '2.7.1.51',
 '2.7.1.8',
 '2.8.3.19',
 '2.8.3.6',
 '2.8.3.8',
 '3.1.1.15',
 '3.1.1.99'

In [21]:
blast_results_.shape

(149, 138)

In [22]:
new_predictions_protein_bert.shape

(149, 108)

In [23]:
blast_predictions = np.array(blast_results_.iloc[:, 18:])
new_blast_predictions = convert_predictions_into_format(blast_predictions, labels_names, price_dataset_.iloc[:, 7:].columns)
predictions_ensemble = determine_ensemble_predictions(2, new_predictions_esm1b, new_predictions_protein_bert, new_predictions_esm2_3b, new_blast_predictions)
metrics_evaluation = pd.concat((metrics_evaluation, get_metrics(y_true, predictions_ensemble, price_dataset_.iloc[:, 7:].columns, [], "Models Ensemble + BLASTp")))
metrics_evaluation

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868
DNN ESM1b,0.789731,0.965597,0.922643,0.856848,0.421253,0.381579,0.586623
DNN ProtBERT,0.740511,0.924628,0.88092,0.80664,0.357566,0.315789,0.525658
Models Ensemble,0.779691,0.962156,0.92667,0.845373,0.392363,0.361842,0.525219
Models Ensemble + BLASTp,0.794037,0.958943,0.92667,0.853427,0.444152,0.407895,0.599781


In [24]:
metrics_evaluation = pd.concat((metrics_evaluation, get_metrics(y_true, new_blast_predictions, price_dataset_.iloc[:, 7:].columns, [], "BLASTp")))

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
metrics_evaluation

Unnamed: 0,f1 overall,f1 level 1,f1 level 2,f1 level 3,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.786082,0.962397,0.926765,0.828803,0.433464,0.407895,0.534868
DNN ESM1b,0.789731,0.965597,0.922643,0.856848,0.421253,0.381579,0.586623
DNN ProtBERT,0.740511,0.924628,0.88092,0.80664,0.357566,0.315789,0.525658
Models Ensemble,0.779691,0.962156,0.92667,0.845373,0.392363,0.361842,0.525219
Models Ensemble + BLASTp,0.794037,0.958943,0.92667,0.853427,0.444152,0.407895,0.599781
BLASTp,0.744674,0.939441,0.892753,0.781808,0.372193,0.328947,0.49859


In [25]:
metrics_evaluation.to_csv("metrics_evaluation_price.csv", index=False)

In [26]:
metrics_evaluation.iloc[:, 4:]

Unnamed: 0,f1 level 4,recall level 4,precision level 4
DNN ESM2 layer 36,0.433464,0.407895,0.534868
DNN ESM1b,0.421253,0.381579,0.586623
DNN ProtBERT,0.357566,0.315789,0.525658
Models Ensemble,0.392363,0.361842,0.525219
Models Ensemble + BLASTp,0.444152,0.407895,0.599781
BLASTp,0.372193,0.328947,0.49859
