# Test models on the halogenases dataset

In [1]:
# Root directory holding the data, cached features and model weights used by this notebook.
# Set the EC_NUMBER_DATA_PATH environment variable to run on another machine; the original
# location is kept as the fallback so existing runs behave exactly as before.
import os

data_path = os.environ.get(
    "EC_NUMBER_DATA_PATH",
    "/home/jcapela/ec_number_prediction_version_2/ec_numbers_prediction/required_data_ec_number_paper",
)

## Create useful functions to post-process the predictions

In [2]:
import re
import pandas as pd
import numpy as np


def get_unique_labels_by_level(dataset, level):
    """Map every distinct label found in ``dataset[level]`` to a column position.

    Cells hold ``;``-separated labels. Missing cells are treated as the
    placeholder ``"0"``, which is excluded from the result. Labels are ordered
    as ``np.unique`` sorts them (plain string order) and numbered from 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the requested column(s); it is not modified.
    level : column label (or list of labels) passed to ``DataFrame.loc``.

    Returns
    -------
    dict
        ``{label: position}`` for every distinct non-placeholder label.
    """
    level_values = dataset.loc[:, level].fillna("0")
    # Flatten first so that a multi-column selection is handled like one long column.
    flattened = pd.Series(level_values.values.reshape(-1))
    individual_labels = flattened.str.split(";").explode()
    ordered_labels = [
        label for label in np.unique(individual_labels).tolist() if label != "0"
    ]
    return {label: position for position, label in enumerate(ordered_labels)}

def get_final_labels(dataset, all_levels=False):
    """Append one 0/1 indicator column per EC label to ``dataset``.

    The columns ``EC1`` .. ``EC4`` (only ``EC4`` unless ``all_levels`` is true)
    hold ``;``-separated labels. For each level the distinct labels are
    obtained from ``get_unique_labels_by_level`` and turned into indicator
    columns, appended level by level to the right of the original columns.

    Rows are addressed by position and the indicator frames reuse the input's
    index. The previous implementation used the index *label* as the array
    row, which raised ``IndexError`` or misaligned rows whenever the frame did
    not carry a default ``RangeIndex`` (for example after ``drop_duplicates``
    or ``sort_values``). For a default index the result is unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the EC level column(s) as strings or missing values.
        The frame is no longer modified in place.
    all_levels : bool, default False
        Encode ``EC1``-``EC4`` when true, otherwise only ``EC4``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` with missing values replaced by ``"0"``,
        followed by the indicator columns (float 0.0 / 1.0).
    """
    levels = ["EC1", "EC2", "EC3", "EC4"] if all_levels else ["EC4"]

    # label -> column position, per level
    label_positions = {
        level: get_unique_labels_by_level(dataset, level) for level in levels
    }

    # Work on a filled copy so the caller's frame keeps its missing values.
    filled = dataset.fillna("0")

    indicators = {
        level: np.zeros((len(filled), len(label_positions[level])))
        for level in levels
    }

    # enumerate() gives the positional row; the index label is deliberately ignored.
    for row_position, (_, row) in enumerate(filled.iterrows()):
        for level in levels:
            positions = label_positions[level]
            for ec_number in row[level].split(";"):
                if ec_number != "0":  # "0" is the missing-value placeholder
                    indicators[level][row_position, positions[ec_number]] = 1

    # Sharing the index keeps concat from re-aligning (and NaN-padding) the rows.
    indicator_frames = [
        pd.DataFrame(
            indicators[level],
            columns=list(label_positions[level].keys()),
            index=filled.index,
        )
        for level in levels
    ]
    return pd.concat([filled, *indicator_frames], axis=1)


def get_ec_from_regex_match(match):
    """Return the text matched by ``match``, or ``None`` when there is no match.

    ``match`` is the result of a ``re`` search/match call (a match object or
    ``None``).
    """
    if match is None:
        return None
    return match.group()

def get_labels_based_on_list(dataset, labels, all_levels=True):
    """Append 0/1 indicator columns for a fixed list of EC labels.

    Unlike ``get_final_labels`` the set of columns is given by the caller:
    labels present in a row but absent from ``labels`` are ignored.

    Rows are addressed by position and the indicator frame reuses the input's
    index. The previous implementation wrote with ``.at[index_label, ...]``
    into a ``RangeIndex`` frame, which silently added rows and misaligned the
    final ``concat`` for any non-default index. For a default index the result
    is unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``EC4`` (and ``EC1``-``EC3`` when ``all_levels``) holding
        ``;``-separated labels or missing values. Not modified in place.
    labels : sequence of str
        Labels to create columns for, in the desired column order
        (expected to be unique).
    all_levels : bool, default True
        Also look for the labels in ``EC1``, ``EC2`` and ``EC3``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` with missing values replaced by ``"0"``,
        followed by one float 0.0 / 1.0 column per entry of ``labels``.
    """
    label_columns = list(labels)
    # Dict lookup replaces the repeated linear `label in labels` scan.
    column_of = {label: position for position, label in enumerate(label_columns)}
    ec_columns = ["EC4", "EC1", "EC2", "EC3"] if all_levels else ["EC4"]

    # Work on a filled copy so the caller's frame keeps its missing values.
    filled = dataset.fillna("0")
    indicators = np.zeros((len(filled), len(label_columns)))

    for row_position, (_, row) in enumerate(filled.iterrows()):
        for ec_column in ec_columns:
            for label in row[ec_column].split(";"):
                if label != "0" and label in column_of:
                    indicators[row_position, column_of[label]] = 1

    labels_dataframe = pd.DataFrame(
        indicators, columns=label_columns, index=filled.index
    )
    return pd.concat((filled, labels_dataframe), axis=1)

def divide_labels_by_EC_level(final_dataset, ec_label):
    """Split a column of EC numbers into cumulative ``EC1`` .. ``EC4`` columns.

    Each cell of ``final_dataset[ec_label]`` is a ``;``-separated string of EC
    numbers. For every EC number the prefixes of depth 1 to 4 that are fully
    numeric are extracted (``1.14.19.-`` yields ``1``, ``1.14`` and
    ``1.14.19`` but no level-4 entry). Per row and level the distinct prefixes
    are joined with ``;`` in order of first appearance; a level with no prefix
    is stored as NaN.

    Changes from the previous version:

    * dots in the patterns are escaped, so ``.`` no longer matches any
      character (``1.14.199.-`` used to be reported as a level-4 EC number);
    * the level-4 pattern accepts at most one ``n`` before the serial number
      (preliminary EC numbers such as ``2.5.1.n1``);
    * whitespace around each EC number is stripped, so ``"a; b"`` no longer
      silently drops ``b``;
    * ``np.nan`` replaces the ``np.NaN`` alias, which NumPy 2.0 removed.

    Parameters
    ----------
    final_dataset : pandas.DataFrame
        Modified in place: columns ``EC1``, ``EC2``, ``EC3`` and ``EC4`` are
        added or overwritten.
    ec_label : str
        Name of the column holding the EC numbers. Every value must be a
        string, since ``.split`` is called on it; fill missing values first.

    Returns
    -------
    pandas.DataFrame
        The same ``final_dataset`` object.

    Raises
    ------
    AssertionError
        If any row ends up without a level-1 class.
    """
    # One anchored pattern per level.
    level_patterns = {
        "EC1": re.compile(r"^\d+"),
        "EC2": re.compile(r"^\d+\.\d+"),
        "EC3": re.compile(r"^\d+\.\d+\.\d+"),
        "EC4": re.compile(r"^\d+\.\d+\.\d+\.n?\d+"),
    }
    level_values = {level: [] for level in level_patterns}

    for _, row in final_dataset.iterrows():
        ec_numbers = [ec_number.strip() for ec_number in row[ec_label].split(";")]
        for level, pattern in level_patterns.items():
            prefixes = []
            for ec_number in ec_numbers:
                match = pattern.search(ec_number)
                # Keep first-appearance order while dropping repeated prefixes.
                if match is not None and match.group() not in prefixes:
                    prefixes.append(match.group())
            level_values[level].append(";".join(prefixes) if prefixes else np.nan)

    # Lists are assigned positionally, so the frame's index does not matter here.
    for level in level_patterns:
        final_dataset[level] = level_values[level]

    assert final_dataset["EC1"].isnull().sum() == 0, "some rows have no EC level-1 class"
    print("EC1 is not null")

    return final_dataset

In [3]:
from sklearn.metrics import accuracy_score, confusion_matrix

def get_ec_levels(labels):
    """Group the positions of ``labels`` by the depth of the EC number.

    A label belongs to level *k* when it consists of exactly *k*
    dot-separated numeric fields (the fourth field may carry an ``n`` prefix,
    as in ``2.5.1.n1``). Labels matching no level are left out of all lists.

    The dots are escaped and the whole label must match. The previous
    patterns used a bare ``.``, which matches any character, so a label such
    as ``1.14.199`` was classified as level 4.

    Parameters
    ----------
    labels : iterable of str

    Returns
    -------
    tuple of four lists
        ``(level_1, level_2, level_3, level_4)``, each holding the positions
        (in input order) of the labels at that level.
    """
    level_patterns = (
        re.compile(r"\d+"),
        re.compile(r"\d+\.\d+"),
        re.compile(r"\d+\.\d+\.\d+"),
        re.compile(r"\d+\.\d+\.\d+\.n?\d+"),
    )
    level_1, level_2, level_3, level_4 = [], [], [], []
    positions_by_level = (level_1, level_2, level_3, level_4)

    for position, label in enumerate(labels):
        for pattern, level_positions in zip(level_patterns, positions_by_level):
            if pattern.fullmatch(label):
                level_positions.append(position)
                break  # the patterns are mutually exclusive
    return level_1, level_2, level_3, level_4

def get_metrics(y_true, predictions, labels, labels_to_remove, model_name):
    """Score predictions per EC level and run a binomial test per EC label.

    Parameters
    ----------
    y_true, predictions : 2-D arrays
        One row per sequence and one column per entry of ``labels``.
    labels : sequence of str
        EC number naming each column.
    labels_to_remove : list of int
        Column positions removed from ``y_true``, ``predictions`` and
        ``labels`` before anything is computed.
    model_name : str
        Stored in the ``model_name`` column of both results.

    Returns
    -------
    metrics : pandas.DataFrame
        One row with ``accuracy level 1`` .. ``accuracy level 4``:
        ``accuracy_score`` over the columns of each level. For multi-column
        indicator input scikit-learn documents this as subset accuracy.
    statistical_df : pandas.DataFrame
        One row (index 0) per label with at least one positive in ``y_true``:
        that column's accuracy, a one-sided binomial test of its correctly
        predicted positives against the proportion pooled over its level, and
        the matching confidence interval. Empty when no label qualifies.

    Notes
    -----
    ``ec_number`` is now read from the filtered label array. It used to index
    the unfiltered ``labels`` with a position from the filtered arrays, which
    reported the wrong EC number whenever ``labels_to_remove`` was non-empty.
    The unused ``statsmodels`` import was dropped.
    """
    from scipy.stats import binomtest

    y_true_ = np.delete(y_true, labels_to_remove, axis=1)
    predictions_ = np.delete(predictions, labels_to_remove, axis=1)
    labels_ = np.delete(labels, labels_to_remove)
    level_1, level_2, level_3, level_4 = get_ec_levels(labels_)
    print("level 1", len(level_1))
    print("level 2", len(level_2))
    print("level 3", len(level_3))
    print("level 4", len(level_4))

    metrics = pd.DataFrame({
        "model_name": [model_name],
        "accuracy level 1": accuracy_score(y_true_[:, level_1], predictions_[:, level_1]),
        "accuracy level 2": accuracy_score(y_true_[:, level_2], predictions_[:, level_2]),
        "accuracy level 3": accuracy_score(y_true_[:, level_3], predictions_[:, level_3]),
        "accuracy level 4": accuracy_score(y_true_[:, level_4], predictions_[:, level_4]),
    })

    # Ensure the matrices are binary (0 or 1) and of integer type, as required by `&` below.
    y_true_ = (y_true_ > 0).astype(int)
    predictions_ = (predictions_ > 0).astype(int)

    # One single-row frame per tested label, concatenated once at the end
    # (avoids re-copying the growing result on every iteration).
    per_label_frames = []

    for level_columns in (level_1, level_2, level_3, level_4):
        y_true_level = y_true_[:, level_columns]
        predictions_level = predictions_[:, level_columns]

        # Per column: positives predicted as positive, and all positives.
        correct_annotations = (y_true_level & predictions_level).sum(axis=0)
        total_annotations = y_true_level.sum(axis=0)

        # Proportion pooled over the whole level; the null hypothesis of each test.
        level_total = total_annotations.sum()
        if level_total:
            expected_proportion = correct_annotations.sum() / level_total
        else:
            # No annotation at this level: every column is skipped below.
            expected_proportion = float("nan")

        for i, ec in enumerate(level_columns):
            if total_annotations[i] == 0:
                print(f"    EC {ec}: No annotations to evaluate.")
                continue

            test_result = binomtest(
                int(correct_annotations[i]),
                int(total_annotations[i]),
                p=expected_proportion,
                alternative="greater",
            )
            ci = test_result.proportion_ci()

            per_label_frames.append(pd.DataFrame({
                "model_name": [model_name],
                # `ec` is a position in the filtered arrays, so name it from labels_.
                "ec_number": [labels_[ec]],
                "accuracy": [accuracy_score(y_true_[:, ec], predictions_[:, ec])],
                "ci_lower": [ci.low],
                "ci_upper": [ci.high],
                "expected_proportion": expected_proportion,
                "p_value": [test_result.pvalue],
                "significant": [test_result.pvalue < 0.05],  # Mark significant results
                "sample_size": [total_annotations[i]],
            }))

    if per_label_frames:
        statistical_df = pd.concat(per_label_frames, axis=0)
    else:
        statistical_df = pd.DataFrame()

    return metrics, statistical_df

def get_models_predictions(model, dataset, labels, labels_to_remove, model_name):
    """Predict with ``model`` on ``dataset`` and score the result with ``get_metrics``.

    The ground truth is read from ``dataset.y`` after the prediction has been
    made. The remaining arguments are forwarded unchanged, and the return
    value is the ``(metrics, statistical_df)`` pair from ``get_metrics``.
    """
    model_output = model.predict(dataset)
    return get_metrics(
        y_true=dataset.y,
        predictions=model_output,
        labels=labels,
        labels_to_remove=labels_to_remove,
        model_name=model_name,
    )

def convert_predictions_into_format(predictions, labels_names, new_labels):
    """Re-express predictions made over ``labels_names`` in the column order of ``new_labels``.

    For every row, each source column whose value equals 1 is looked up by
    name in ``new_labels``; every matching target column is set to 1.
    Predicted labels that do not occur in ``new_labels`` are dropped, and
    target labels that were never predicted stay 0.

    Parameters
    ----------
    predictions : sequence of 1-D arrays (e.g. a 2-D array), one per sample.
    labels_names : sequence naming the columns of ``predictions``.
    new_labels : sequence naming the columns of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(predictions), len(new_labels))``.
    """
    converted = np.zeros((len(predictions), len(new_labels)))
    source_labels = np.array(labels_names)
    target_labels = np.array(new_labels)

    for row_position, row in enumerate(predictions):
        predicted_labels = source_labels[np.where(row == 1)]
        for predicted_label in predicted_labels:
            target_mask = target_labels == predicted_label
            if target_mask.any():
                converted[row_position, target_mask] = 1
    return converted

## Read the dataset

In [4]:
# Raw tab-separated halogenase table.
# NOTE(review): `dataset` is not referenced by any later cell in this notebook — confirm it is still needed.
dataset = pd.read_csv(f'{data_path}/data/halogenase.csv', sep="\t")

In [5]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Load the halogenase sequences ("Entry" = id, "Sequence" = input) into the dataset object
# that the featurizers and models below consume.
# NOTE(review): the label frame and the ESM1b / ProtBERT sections read this file with nrows=36,
# but this call reads every row — confirm the file has exactly 36 rows, otherwise the ESM2 3B
# predictions and y_true will differ in length.
halogenases_dataset = SingleInputDataset.from_csv(f'{data_path}/data/halogenase.csv',
                                            instances_ids_field="Entry", representation_field="Sequence", sep="\t")

In [6]:
import numpy as np

# Build the ground truth from the first 36 rows of the halogenase table:
# 1) split the "EC number" column into EC1..EC4 columns,
# 2) append one 0/1 indicator column per EC label found at any level.
# Later cells take `h_dataset_.iloc[:, 7:]` as the label matrix and `h_dataset.Entry` as the row order.
h_dataset = pd.read_csv(f'{data_path}/data/halogenase.csv', sep="\t", nrows=36)
h_dataset = divide_labels_by_EC_level(h_dataset, "EC number")
h_dataset_ = get_final_labels(h_dataset, all_levels=True)

EC1 is not null


In [7]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Only the label names are needed here, so two rows of the merged dataset are enough.
# NOTE(review): this reads the private attribute `_labels_names`; presumably the columns selected by
# slice(8, -1) are the label columns the models were trained on — confirm against the CSV layout.
labels_names = SingleInputDataset.from_csv(f'{data_path}/data/merged_dataset.csv',
                                           instances_ids_field="accession", representation_field="sequence",
                                           labels_field=slice(8, -1), nrows=2)._labels_names

In [8]:
len(labels_names)

5743

# ESM2 3B - get predictions

In [9]:
from plants_sm.models.fc.fc import DNN
from plants_sm.models.pytorch_model import PyTorchModel
from torch import nn
import torch

# Rebuild the network and load the trained weights onto the CPU.
# The output size 5743 equals len(labels_names) shown above; 2560 is presumably the ESM2 3B
# embedding width and [2560] the hidden layer sizes — TODO confirm against the DNN signature.
model = PyTorchModel(model = DNN(2560, [2560], 5743, batch_norm=True), loss_function=nn.BCELoss())
model.model.load_state_dict(
    torch.load(f'{data_path}/models/DNN_esm2_t36_3B_UR50D_optimization_set_2_all_data/pytorch_model_weights.pt', map_location=torch.device('cpu')))


2025-01-28 14:21:31.043804: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-28 14:21:31.153880: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-28 14:21:31.154095: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-28 14:21:31.154250: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-28 14:21:31.172678: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

<All keys matched successfully>

In [10]:
import os
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

encoding = "esm2_t36_3B_UR50D"

# Compute the ESM2 3B features only when no cached copy exists on disk; otherwise load the cache.
# The pipeline standardizes the sequences, truncates them to 884 residues and then encodes them.
# NOTE(review): the encoder is configured with num_gpus=4 — the first run needs matching hardware.
if not os.path.exists(f'{data_path}/features/test_halogenases_esm2_3b'):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ESMEncoder(esm_function=encoding, batch_size=1, num_gpus=4)]
    for transformer in transformers:
        halogenases_dataset = transformer.fit_transform(halogenases_dataset)
    halogenases_dataset.save_features(f'{data_path}/features/test_halogenases_esm2_3b')
else:
    halogenases_dataset.load_features(f'{data_path}/features/test_halogenases_esm2_3b')

In [11]:
# Predict over the model's 5743 labels, then keep only the labels present in the halogenase ground truth.
# `h_dataset_.iloc[:, 7:]` is used as the indicator block appended by get_final_labels
# (assumes the first seven columns are the original columns plus EC1..EC4 — TODO confirm).
predictions = model.predict(halogenases_dataset)
new_predictions_esm2_3b = convert_predictions_into_format(predictions, labels_names, h_dataset_.iloc[:, 7:].columns)
y_true = h_dataset_.iloc[:, 7:].values
# First model evaluated: this call creates the two result tables that later cells append to.
metrics_evaluation, statistical_df = get_metrics(y_true, new_predictions_esm2_3b, h_dataset_.iloc[:, 7:].columns, [], "DNN ESM2 3B")
metrics_evaluation

level 1 3
level 2 4
level 3 6
level 4 7


Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556


In [12]:
statistical_df

Unnamed: 0,model_name,ec_number,accuracy,ci_lower,ci_upper,expected_proportion,p_value,significant,sample_size
0,DNN ESM2 3B,1,1.0,0.904966,1.0,0.918919,0.079125,False,30
0,DNN ESM2 3B,2,0.944444,0.472871,1.0,0.918919,0.713032,False,4
0,DNN ESM2 3B,3,0.916667,0.0,1.0,0.918919,1.0,False,3
0,DNN ESM2 3B,1.11,1.0,0.472871,1.0,0.918919,0.713032,False,4
0,DNN ESM2 3B,1.14,1.0,0.89117,1.0,0.918919,0.11097,False,26
0,DNN ESM2 3B,2.5,0.944444,0.472871,1.0,0.918919,0.713032,False,4
0,DNN ESM2 3B,3.13,0.916667,0.0,1.0,0.918919,1.0,False,3
0,DNN ESM2 3B,1.11.1,1.0,0.472871,1.0,0.621622,0.149315,False,4
0,DNN ESM2 3B,1.14.11,0.888889,0.605837,1.0,0.621622,0.061055,False,10
0,DNN ESM2 3B,1.14.19,0.805556,0.223955,1.0,0.621622,0.927946,False,13


In [13]:
len(labels_names)

5743

# ESM1b - get predictions

In [14]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Reload the sequences so the ESM1b features replace the ESM2 3B ones held by the previous object.
# Only the first 36 rows are read, matching the label frame `h_dataset`.
halogenases_dataset = SingleInputDataset.from_csv(f'{data_path}/data/halogenase.csv',
                                            instances_ids_field="Entry", representation_field="Sequence", sep="\t", nrows=36)

In [15]:
# ESM1b classifier: same 5743 outputs, different input width and hidden layers than the ESM2 3B model.
# `model` is rebound here, so the ESM2 3B network is no longer reachable after this cell.
model = PyTorchModel(model = DNN(1280, [2560, 5120], 5743, batch_norm=True), loss_function=nn.BCELoss(), device="cpu")
model.model.load_state_dict(
    torch.load(f'{data_path}/models/DNN_esm1b_t33_650M_UR50S_optimization_set_4_all_data/pytorch_model_weights.pt', map_location=torch.device('cpu')))


  torch.load(f'{data_path}/models/DNN_esm1b_t33_650M_UR50S_optimization_set_4_all_data/pytorch_model_weights.pt', map_location=torch.device('cpu')))


<All keys matched successfully>

In [16]:
import os
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

encoding = "esm1b_t33_650M_UR50S"

# Same cache-or-compute pattern as for ESM2 3B, with a separate cache directory for ESM1b.
if not os.path.exists(f'{data_path}/features/test_halogenases_esm1b'):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ESMEncoder(esm_function=encoding, batch_size=1, num_gpus=4)]
    for transformer in transformers:
        halogenases_dataset = transformer.fit_transform(halogenases_dataset)
    halogenases_dataset.save_features(f'{data_path}/features/test_halogenases_esm1b')
else:
    halogenases_dataset.load_features(f'{data_path}/features/test_halogenases_esm1b')

In [17]:
# Evaluate the ESM1b model exactly like the ESM2 3B one and append its rows to both result tables.
predictions = model.predict(halogenases_dataset)
new_predictions_esm1b = convert_predictions_into_format(predictions, labels_names, h_dataset_.iloc[:, 7:].columns)
y_true = h_dataset_.iloc[:, 7:].values
metrics, statistical_df_ = get_metrics(y_true, new_predictions_esm1b, h_dataset_.iloc[:, 7:].columns, [], "DNN ESM1b")
# NOTE: re-running this cell appends the same rows again; run the notebook top to bottom.
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))
metrics_evaluation

level 1 3
level 2 4
level 3 6
level 4 7


Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333


# ProtBERT - get predictions

In [18]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Reload the sequences once more so the ProtBERT features start from a fresh dataset object.
halogenases_dataset = SingleInputDataset.from_csv(f'{data_path}/data/halogenase.csv',
                                            instances_ids_field="Entry", representation_field="Sequence", sep="\t", nrows=36)

In [19]:
from plants_sm.models.pytorch_model import PyTorchModel

# ProtBERT classifier; the weights are mapped to the CPU when loaded.
model = PyTorchModel(model = DNN(1024, [2560], 5743, batch_norm=True), loss_function=nn.BCELoss())
model.model.load_state_dict(
    torch.load(f'{data_path}/models/DNN_prot_bert_vectors_optimization_set_2_all_data/pytorch_model_weights.pt', map_location=torch.device('cpu')))


import os
from plants_sm.featurization.proteins.bio_embeddings.prot_bert import ProtBert
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.truncation import Truncator

# Cache-or-compute the ProtBERT features.
# NOTE(review): when no cache exists the encoder is pinned to "cuda:0", although the model above is
# loaded on the CPU — the first run therefore requires a GPU.
if not os.path.exists(f'{data_path}/features/test_halogenases_prot_bert'):
    transformers = [ProteinStandardizer(), Truncator(max_length=884), ProtBert(device="cuda:0")]
    for transformer in transformers:
        halogenases_dataset = transformer.fit_transform(halogenases_dataset)
    halogenases_dataset.save_features(f'{data_path}/features/test_halogenases_prot_bert')
else:
    halogenases_dataset.load_features(f'{data_path}/features/test_halogenases_prot_bert')

  torch.load(f'{data_path}/models/DNN_prot_bert_vectors_optimization_set_2_all_data/pytorch_model_weights.pt', map_location=torch.device('cpu')))


In [20]:
# Evaluate the ProtBERT model and append its rows to both result tables.
predictions = model.predict(halogenases_dataset)
new_predictions_protein_bert = convert_predictions_into_format(predictions, labels_names, h_dataset_.iloc[:, 7:].columns)
y_true = h_dataset_.iloc[:, 7:].values
metrics, statistical_df_ = get_metrics(y_true, new_predictions_protein_bert, h_dataset_.iloc[:, 7:].columns, [], "DNN ProtBERT")
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))
metrics_evaluation

level 1 3
level 2 4
level 3 6
level 4 7


Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75


In [21]:
# Intermediate save with the three single models only; the last cell of the notebook
# writes the same file again once all methods have been added.
metrics_evaluation.to_csv("metrics_evaluation_halogenases.csv")

# Test ensemble models on the halogenases dataset

In [22]:
# create an ensemble of the predictions
# make a voting classifier for the 3 models
import numpy as np

def determine_ensemble_predictions(threshold=3, *model_predictions):
    """Combine binary predictions from several models by vote counting.

    A position is predicted positive when the sum of the models' values at
    that position is at least ``threshold``.

    The row-by-row loop was replaced by a single vectorised sum, which gives
    the same result for equally shaped inputs. Inputs are now validated: the
    old loop raised a bare ``IndexError`` when called without predictions and
    silently ignored surplus rows when later arrays were longer than the first.

    Parameters
    ----------
    threshold : int, default 3
        Minimum number of votes. Because the predictions are collected with
        ``*model_predictions``, it must always be passed as the first
        positional argument.
    *model_predictions : array-like
        One array per model, all of the same shape (samples x labels).

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 with the common input shape.

    Raises
    ------
    ValueError
        If no predictions are given or their shapes differ.
    """
    if not model_predictions:
        raise ValueError("at least one array of model predictions is required")

    vote_arrays = [np.array(model_prediction) for model_prediction in model_predictions]

    expected_shape = vote_arrays[0].shape
    for position, votes in enumerate(vote_arrays):
        if votes.shape != expected_shape:
            raise ValueError(
                f"model_predictions[{position}] has shape {votes.shape}, "
                f"expected {expected_shape}"
            )

    # Number of models voting for each (sample, label) position.
    vote_counts = np.sum(np.array(vote_arrays), axis=0)
    return (vote_counts >= threshold).astype(int)

In [23]:
# Majority vote of the three deep-learning models: a label is kept when at least 2 of the 3 predict it.
predictions_ensemble = determine_ensemble_predictions(2, new_predictions_esm1b, new_predictions_protein_bert, new_predictions_esm2_3b)
metrics, statistical_df_ = get_metrics(y_true, predictions_ensemble,  h_dataset_.iloc[:, 7:].columns, [], "Models Ensemble")
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))
metrics_evaluation

level 1 3
level 2 4
level 3 6
level 4 7


Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75
0,Models Ensemble,0.916667,0.916667,0.5,0.833333


In [24]:
# Load the BLASTp hits and keep the first row per query sequence.
blast_results = pd.read_csv(f'{data_path}/halogenase_blast_results.csv')
blast_results.drop_duplicates(subset=["qseqid"], inplace=True)
# Create a new column with the custom order as a categorical type
# so the rows can be sorted into the order of `h_dataset.Entry` (the order of y_true).
# Queries that are not listed in `h_dataset.Entry` become missing categories.
blast_results['CustomOrder'] = pd.Categorical(blast_results['qseqid'], categories=h_dataset.Entry, ordered=True)
blast_results.sort_values('CustomOrder', inplace=True)
blast_results.drop(columns=["CustomOrder"], inplace=True)

In [25]:
# Distinct percent-identity values of the retained BLASTp hits, sorted ascending.
# `blast_results` from the previous cell is reused on purpose: re-reading the CSV here threw away
# the ordering by `h_dataset.Entry` established there, so the BLASTp rows used in the following
# cells were no longer guaranteed to line up with the rows of `y_true`.
unique_ident = np.sort(blast_results["pident"].unique())

In [26]:
pd.Series(unique_ident).describe()

count     32.000000
mean      51.053813
std       20.666439
min       25.523000
25%       35.391000
50%       44.651500
75%       62.212000
max      100.000000
dtype: float64

In [27]:
# Cast the EC columns to strings so get_final_labels can call `.split(";")` on them.
# Side effect: missing values become the literal string "nan", so the `fillna("0")` in the next cell
# does not touch them and a "nan" indicator column appears (it is dropped there).
blast_results["EC1"] = blast_results["EC1"].astype(str)
blast_results["EC2"] = blast_results["EC2"].astype(str)
blast_results["EC3"] = blast_results["EC3"].astype(str)
blast_results["EC4"] = blast_results["EC4"].astype(str)

In [28]:
# One-hot encode the EC numbers transferred from the BLASTp hits.
blast_results.fillna("0", inplace=True)
blast_results_ = get_final_labels(blast_results, all_levels=True)
# "nan" is the string form of the missing ECs produced by astype(str) in the previous cell.
# NOTE(review): this raises KeyError when no EC value is missing.
blast_results_.drop(columns=["nan"], inplace=True)
# Strip ".0" from the column names — presumably because numeric EC columns were rendered as floats
# (e.g. "1.0") by astype(str); TODO confirm. The indicator columns are assumed to start at position 18.
labels_names_blast = [ec_number.replace(".0", "") for ec_number in blast_results_.columns[18:].tolist()]
labels_names_blast

['1',
 '2',
 '1.11',
 '1.14',
 '1.21',
 '2.5',
 '1.11.1',
 '1.14.11',
 '1.14.13',
 '1.14.14',
 '1.14.19',
 '1.14.20',
 '1.21.3',
 '2.5.1',
 '1.11.1.10',
 '1.14.11.16',
 '1.14.13.20',
 '1.14.19.56',
 '1.14.19.9',
 '1.14.20.15',
 '1.21.3.1',
 '2.5.1.63',
 '2.5.1.94']

In [29]:
# Map the BLASTp indicator columns onto the ground-truth label columns.
blast_predictions = np.array(blast_results_.iloc[:, 18:])
new_blast_predictions = convert_predictions_into_format(blast_predictions, labels_names_blast, h_dataset_.iloc[:, 7:].columns)
# BLASTp joins the vote as a fourth voter; the threshold stays at 2, i.e. 2 of 4 votes are enough.
# This relies on the BLASTp rows being in the same order as `y_true`.
predictions_ensemble = determine_ensemble_predictions(2, new_predictions_esm1b, new_predictions_protein_bert, new_predictions_esm2_3b, new_blast_predictions)
metrics, statistical_df_ = get_metrics(y_true, predictions_ensemble, h_dataset_.iloc[:, 7:].columns, [], "Models Ensemble + BLASTp")
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))

level 1 3
level 2 4
level 3 6
level 4 7


In [30]:
# BLASTp on its own, for comparison with the ensembles above.
metrics, statistical_df_ = get_metrics(y_true, new_blast_predictions, h_dataset_.iloc[:, 7:].columns, [], "BLASTp")
statistical_df = pd.concat((statistical_df, statistical_df_))


metrics_evaluation = pd.concat((metrics_evaluation, metrics))

level 1 3
level 2 4
level 3 6
level 4 7


In [31]:
metrics_evaluation

Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75
0,Models Ensemble,0.916667,0.916667,0.5,0.833333
0,Models Ensemble + BLASTp,0.916667,0.916667,0.555556,0.833333
0,BLASTp,0.75,0.638889,0.194444,0.583333


In [32]:
import pandas as pd

# Predictions of two external tools for the same proteins: one column per tool ("CLEAN", "ProteInfer").
clean_prot_infer_results = pd.read_csv(f'{data_path}/clean_prote_infer_halogenase_predictions.csv')
clean_prot_infer_results

Unnamed: 0,Name,Uniprot ID,CLEAN,ProteInfer
0,NapH1,A7KH27,1.11.1.10,1.-.-.-
1,MarH1,A0A0F7N9T7,1.11.1.10,1.-.-.-
2,MarH2,A0A559V0A1,1.11.1.10,3.1.-.-
3,MarH3,A0A559V0T8,1.11.1.10,2.4.1.-
4,KtzR,A8CF74,1.14.19.9,
5,StaI,Q8KLM0,1.14.19.-,1.14.14.-
6,Tjp10,A0A6H0DY41,1.14.19.9,1.1.-.-
7,VirX1,M4SKV1,1.14.19.9,1.-.-.-
8,PlBmp2,A0A162BNF2,1.14.19.-,1.-.-.-
9,HrmQ,C1IHU5,1.14.19.56,1.-.-.-


In [33]:
# Encode the CLEAN predictions the same way as the ground truth: split into EC levels, then one-hot.
# Partial predictions such as "1.14.19.-" only contribute the levels that are fully numeric.
clean_prot_infer_results = divide_labels_by_EC_level(clean_prot_infer_results, "CLEAN")
clean_prot_infer_results = get_final_labels(clean_prot_infer_results, all_levels=True)
# The indicator columns are taken to start at position 8 (4 original columns + EC1..EC4).
labels_names_clean = [ec_number.replace(".0", "") for ec_number in clean_prot_infer_results.columns[8:].tolist()]
labels_names_clean

EC1 is not null


['1',
 '2',
 '3',
 '1.11',
 '1.14',
 '1.3',
 '2.5',
 '3.13',
 '1.11.1',
 '1.14.11',
 '1.14.13',
 '1.14.19',
 '1.14.20',
 '1.3.7',
 '2.5.1',
 '3.13.1',
 '1.11.1.10',
 '1.14.11.18',
 '1.14.11.26',
 '1.14.11.46',
 '1.14.11.74',
 '1.14.13.209',
 '1.14.19.56',
 '1.14.19.9',
 '1.14.20.15',
 '1.3.7.6',
 '2.5.1.63',
 '2.5.1.94',
 '3.13.1.8']

In [34]:
# Sort the rows into the order of `h_dataset.Entry` so they line up with the rows of y_true.
clean_prot_infer_results['CustomOrder'] = pd.Categorical(clean_prot_infer_results['Uniprot ID'], categories=h_dataset.Entry, ordered=True)
clean_prot_infer_results.sort_values('CustomOrder', inplace=True)
clean_prot_infer_results.drop(columns=["CustomOrder"], inplace=True)
clean_prot_infer_results

Unnamed: 0,Name,Uniprot ID,CLEAN,ProteInfer,EC1,EC2,EC3,EC4,1,2,...,1.14.11.46,1.14.11.74,1.14.13.209,1.14.19.56,1.14.19.9,1.14.20.15,1.3.7.6,2.5.1.63,2.5.1.94,3.13.1.8
0,NapH1,A7KH27,1.11.1.10,1.-.-.-,1,1.11,1.11.1,1.11.1.10,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MarH1,A0A0F7N9T7,1.11.1.10,1.-.-.-,1,1.11,1.11.1,1.11.1.10,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,MarH2,A0A559V0A1,1.11.1.10,3.1.-.-,1,1.11,1.11.1,1.11.1.10,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MarH3,A0A559V0T8,1.11.1.10,2.4.1.-,1,1.11,1.11.1,1.11.1.10,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,KtzR,A8CF74,1.14.19.9,0,1,1.14,1.14.19,1.14.19.9,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,StaI,Q8KLM0,1.14.19.-,1.14.14.-,1,1.14,1.14.19,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Tjp10,A0A6H0DY41,1.14.19.9,1.1.-.-,1,1.14,1.14.19,1.14.19.9,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,VirX1,M4SKV1,1.14.19.9,1.-.-.-,1,1.14,1.14.19,1.14.19.9,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,PlBmp2,A0A162BNF2,1.14.19.-,1.-.-.-,1,1.14,1.14.19,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,HrmQ,C1IHU5,1.14.19.56,1.-.-.-,1,1.14,1.14.19,1.14.19.56,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
len(labels_names_clean)

29

In [36]:
# Evaluate CLEAN: map its indicator columns onto the ground-truth labels and append the scores.
clean_prot_infer_results_predictions = np.array(clean_prot_infer_results.iloc[:, 8:])
new_clean_prot_infer_results_predictions = convert_predictions_into_format(clean_prot_infer_results_predictions, labels_names_clean, h_dataset_.iloc[:, 7:].columns)
metrics, statistical_df_ = get_metrics(y_true, new_clean_prot_infer_results_predictions, h_dataset_.iloc[:, 7:].columns, [], "CLEAN")
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))
metrics_evaluation

level 1 3
level 2 4
level 3 6
level 4 7


Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75
0,Models Ensemble,0.916667,0.916667,0.5,0.833333
0,Models Ensemble + BLASTp,0.916667,0.916667,0.555556,0.833333
0,BLASTp,0.75,0.638889,0.194444,0.583333
0,CLEAN,1.0,0.972222,0.944444,0.944444


In [37]:
# Same pipeline as for CLEAN, applied to the "ProteInfer" column of a freshly loaded table.
# NOTE: the `clean_*` variable names are reused from the CLEAN cells but hold ProteInfer data from here on.
clean_prot_infer_results = pd.read_csv(f'{data_path}/clean_prote_infer_halogenase_predictions.csv')

# Missing predictions must be filled first because divide_labels_by_EC_level calls `.split` on every value.
# The filler "0" matches the level-1 pattern, so such rows get EC1 == "0", which get_final_labels
# treats as "no label".
clean_prot_infer_results.fillna("0", inplace=True)
clean_prot_infer_results = divide_labels_by_EC_level(clean_prot_infer_results, "ProteInfer")
clean_prot_infer_results = get_final_labels(clean_prot_infer_results, all_levels=True)
labels_names_clean = [ec_number.replace(".0", "") for ec_number in clean_prot_infer_results.columns[8:].tolist()]

# Sort the rows into the order of `h_dataset.Entry` so they line up with the rows of y_true.
clean_prot_infer_results['CustomOrder'] = pd.Categorical(clean_prot_infer_results['Uniprot ID'], categories=h_dataset.Entry, ordered=True)
clean_prot_infer_results.sort_values('CustomOrder', inplace=True)
clean_prot_infer_results.drop(columns=["CustomOrder"], inplace=True)

clean_prot_infer_results_predictions = np.array(clean_prot_infer_results.iloc[:, 8:])
new_clean_prot_infer_results_predictions = convert_predictions_into_format(clean_prot_infer_results_predictions, labels_names_clean, h_dataset_.iloc[:, 7:].columns)

metrics, statistical_df_ = get_metrics(y_true, new_clean_prot_infer_results_predictions, h_dataset_.iloc[:, 7:].columns, [], "ProteInfer")
statistical_df = pd.concat((statistical_df, statistical_df_))
metrics_evaluation = pd.concat((metrics_evaluation, metrics))

EC1 is not null
level 1 3
level 2 4
level 3 6
level 4 7


In [38]:
metrics_evaluation

Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75
0,Models Ensemble,0.916667,0.916667,0.5,0.833333
0,Models Ensemble + BLASTp,0.916667,0.916667,0.555556,0.833333
0,BLASTp,0.75,0.638889,0.194444,0.583333
0,CLEAN,1.0,0.972222,0.944444,0.944444
0,ProteInfer,0.611111,0.388889,0.083333,0.694444


In [39]:
metrics_evaluation

Unnamed: 0,model_name,accuracy level 1,accuracy level 2,accuracy level 3,accuracy level 4
0,DNN ESM2 3B,0.916667,0.916667,0.5,0.805556
0,DNN ESM1b,0.916667,0.916667,0.5,0.833333
0,DNN ProtBERT,0.805556,0.916667,0.527778,0.75
0,Models Ensemble,0.916667,0.916667,0.5,0.833333
0,Models Ensemble + BLASTp,0.916667,0.916667,0.555556,0.833333
0,BLASTp,0.75,0.638889,0.194444,0.583333
0,CLEAN,1.0,0.972222,0.944444,0.944444
0,ProteInfer,0.611111,0.388889,0.083333,0.694444


In [40]:
statistical_df[statistical_df["ec_number"].isin(["1","2","3"])]

Unnamed: 0,model_name,ec_number,accuracy,ci_lower,ci_upper,expected_proportion,p_value,significant,sample_size
0,DNN ESM2 3B,1,1.0,0.904966,1.0,0.918919,0.079125,False,30
0,DNN ESM2 3B,2,0.944444,0.472871,1.0,0.918919,0.713032,False,4
0,DNN ESM2 3B,3,0.916667,0.0,1.0,0.918919,1.0,False,3
0,DNN ESM1b,1,1.0,0.904966,1.0,0.918919,0.079125,False,30
0,DNN ESM1b,2,0.944444,0.472871,1.0,0.918919,0.713032,False,4
0,DNN ESM1b,3,0.916667,0.0,1.0,0.918919,1.0,False,3
0,DNN ProtBERT,1,0.888889,0.720385,1.0,0.810811,0.304605,False,30
0,DNN ProtBERT,2,0.944444,0.472871,1.0,0.810811,0.432193,False,4
0,DNN ProtBERT,3,0.916667,0.0,1.0,0.810811,1.0,False,3
0,Models Ensemble,1,1.0,0.904966,1.0,0.918919,0.079125,False,30


In [41]:
# Persist the per-EC binomial-test results for all evaluated methods (written to the working directory).
statistical_df.to_csv("halogenase_statistical_results.csv", index=False)

In [44]:
statistical_df[statistical_df["significant"]==True]

Unnamed: 0,model_name,ec_number,accuracy,ci_lower,ci_upper,expected_proportion,p_value,significant,sample_size
0,DNN ESM2 3B,1.11.1.10,1.0,0.472871,1.0,0.466667,0.047427,True,4
0,DNN ESM1b,1.14.11,0.916667,0.605837,1.0,0.567568,0.029898,True,10
0,DNN ProtBERT,1.14.11,0.888889,0.605837,1.0,0.540541,0.02023,True,10
0,DNN ProtBERT,2.5.1.63,1.0,0.368403,1.0,0.333333,0.037037,True,3
0,Models Ensemble,1.14.11,0.888889,0.605837,1.0,0.567568,0.029898,True,10
0,ProteInfer,2.5.1,0.944444,0.248605,1.0,0.108108,0.004644,True,4
0,ProteInfer,2.5.1.63,1.0,0.368403,1.0,0.2,0.008,True,3


In [43]:
# Final save of the per-level accuracies for all methods; overwrites the intermediate file written
# after the single-model evaluations.
metrics_evaluation.to_csv("metrics_evaluation_halogenases.csv")