# Generate predictions for the ensembles

In [5]:
# define the directory where the data is
# The location can be overridden with the EC_NUMBER_DATA_PATH environment variable so the
# notebook is not tied to one machine's home directory; when the variable is not set the
# original path is used unchanged.
import os

data_path = os.environ.get(
    "EC_NUMBER_DATA_PATH",
    "/home/jcapela/ec_numbers_prediction/required_data_ec_number_paper",
)

## Load models predictions

In [6]:
# Load the predictions stored in predictions_prot_bert.pkl.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

prot_bert_pickle_path = f"{data_path}/predictions/predictions_prot_bert.pkl"
with open(prot_bert_pickle_path, "rb") as pickle_file:
    predictions_prot_bert = pickle.load(pickle_file)

In [7]:
# Load the predictions stored in predictions_esm2_3b.pkl.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

esm2_3b_pickle_path = f"{data_path}/predictions/predictions_esm2_3b.pkl"
with open(esm2_3b_pickle_path, "rb") as pickle_file:
    predictions_esm2_3b = pickle.load(pickle_file)

In [8]:
# Load the predictions stored in predictions_esm1b.pkl.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

esm1b_pickle_path = f"{data_path}/predictions/predictions_esm1b.pkl"
with open(esm1b_pickle_path, "rb") as pickle_file:
    predictions_esm1b = pickle.load(pickle_file)

In [9]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Build the test dataset from test.csv: "accession" is passed as the instance-id field,
# "sequence" as the representation field and slice(8, 2779) as the labels field
# (presumably the column positions of the label columns -- confirm against the CSV layout).
test_csv_path = f'{data_path}/data/test.csv'
test_dataset = SingleInputDataset.from_csv(
    test_csv_path,
    instances_ids_field="accession",
    representation_field="sequence",
    labels_field=slice(8, 2779),
)

  return pd.read_csv(self.path, **self.kwargs)


## Create ensemble of predictions

In [10]:
# make a voting classifier for the 3 models
import numpy as np

def determine_ensemble_predictions(threshold=3, *model_predictions):
    """
    Combine the predictions of several models by voting.

    The predictions are summed element-wise across models and an entry of the
    result is 1 when that sum reaches ``threshold`` and 0 otherwise.

    Parameters
    ----------
    threshold : int or float
        Minimum element-wise sum required for a positive ensemble prediction.
        It is bound to the first positional argument, so it must always be
        passed explicitly before the prediction arrays.
    *model_predictions : array-like
        One prediction array per model (anything ``np.asarray`` accepts, e.g.
        lists, ndarrays or DataFrames). All arrays must have the same shape.

    Returns
    -------
    np.ndarray
        Integer array with the same shape as each input.

    Raises
    ------
    ValueError
        If no prediction array is given or the arrays differ in shape.
    """
    if not model_predictions:
        raise ValueError("At least one array of model predictions is required.")

    prediction_arrays = [np.asarray(model_prediction) for model_prediction in model_predictions]

    # Mismatched inputs would otherwise be truncated to the first model's rows or fail
    # with an unrelated error, so reject them up front with a clear message.
    shapes = {prediction_array.shape for prediction_array in prediction_arrays}
    if len(shapes) != 1:
        raise ValueError(f"All model predictions must have the same shape, got: {sorted(shapes)}")

    # Sum the votes of all models in a single vectorised operation.
    votes = np.sum(np.stack(prediction_arrays), axis=0)

    # Apply the threshold condition and return an integer array.
    return (votes >= threshold).astype(int)


In [11]:
# Vote over the three models with a threshold of 2.
predictions_voting = determine_ensemble_predictions(
    2,
    predictions_esm1b,
    predictions_prot_bert,
    predictions_esm2_3b,
)

In [12]:
# Persist the models-only voting predictions as a pickle file.
import pickle

voting_pickle_path = f"{data_path}/predictions/predictions_models_voting.pkl"
with open(voting_pickle_path, "wb") as pickle_file:
    pickle.dump(predictions_voting, pickle_file)

In [13]:
# Print macro-averaged F1, recall and precision (in that order) of the models-only ensemble.
from sklearn.metrics import f1_score, recall_score, precision_score

for score_function in (f1_score, recall_score, precision_score):
    print(score_function(test_dataset.y, predictions_voting, average='macro'))

0.9011672051082994
0.885065468521875
0.9274756923443451


# Get BLASTp predictions 

In [14]:
# Read the BLASTp predictions table from the data directory.
import pandas as pd

blast_csv_path = f"{data_path}/test_blast_predictions_right_format.csv"
blast_results = pd.read_csv(blast_csv_path)

  blast_results = pd.read_csv(f"{data_path}/test_blast_predictions_right_format.csv")


In [15]:
# Preview the first rows of the BLASTp table to inspect its column layout.
blast_results.head()

Unnamed: 0,qseqid,EC,EC1,EC2,EC3,EC4,1,2,3,4,...,7.4.2.5,7.4.2.8,7.5.2.11,7.6.2.1,7.6.2.11,7.6.2.13,7.6.2.2,7.6.2.3,7.6.2.5,7.6.2.8
0,A0A009IHW8,3.2.2.6,3,3.2,3.2.2,3.2.2.6,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A0A009YU83,2.3.3.5,2,2.3,2.3.3,2.3.3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A0A010ZGY3,3.7.1.12,3,3.7,3.7.1,3.7.1.12,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A0A011MZP2,2.6.1.50,2,2.6,2.6.1,2.6.1.50,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A0A011N9Z1,7.4.2.5,7,7.4,7.4.2,7.4.2.5,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# "accession" column of the test dataset's dataframe; it is used below as the
# category order when sorting the BLASTp rows.
accessions = test_dataset.dataframe["accession"]

In [17]:
# Reorder the BLASTp rows to follow the order of `accessions`: an ordered categorical
# built from qseqid is used as a temporary sort key and dropped again afterwards.
custom_order = pd.Categorical(blast_results['qseqid'], categories=accessions, ordered=True)
blast_results = (
    blast_results
    .assign(CustomOrder=custom_order)
    .sort_values('CustomOrder')
    .drop(columns=["CustomOrder"])
)
blast_results

Unnamed: 0,qseqid,EC,EC1,EC2,EC3,EC4,1,2,3,4,...,7.4.2.5,7.4.2.8,7.5.2.11,7.6.2.1,7.6.2.11,7.6.2.13,7.6.2.2,7.6.2.3,7.6.2.5,7.6.2.8
64342,Q7XQ85,4.4.1.14,4,4.4,4.4.1,4.4.1.14,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1935,A0A0P0WIY3,4.4.1.14,4,4.4,4.4.1,4.4.1.14,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54941,Q43309,4.4.1.14,4,4.4,4.4.1,4.4.1.14,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42238,P29535,4.4.1.14,4,4.4,4.4.1,4.4.1.14,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1932,A0A0P0UZP7,4.4.1.14,4,4.4,4.4.1,4.4.1.14,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,A0A0D0TJI9,1.14.15.13,1,1.14,1.14.15,1.14.15.13,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12834,A0A5C6EG74,1.3.99.26,1,1.3,1.3.99,1.3.99.26,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15759,A0A7C9CV01,2.4.1.-,2,2.4,2.4.1,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13578,A0A644YY06,3.1.4.57,3,3.1,3.1.4,3.1.4.57,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Persist the BLASTp table from column position 6 onwards as a pickle file
# (presumably the per-label indicator columns -- confirm against the table layout).
import pickle

blast_pickle_path = f"{data_path}/predictions/blast_predictions.pkl"
with open(blast_pickle_path, "wb") as pickle_file:
    blast_label_columns = blast_results.iloc[:, 6:]
    pickle.dump(blast_label_columns, pickle_file)

# Create ensemble with BLASTp and models 

In [19]:
# Load the pickled BLASTp predictions.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

blast_pickle_path = f"{data_path}/predictions/blast_predictions.pkl"
with open(blast_pickle_path, "rb") as pickle_file:
    blast_predictions = pickle.load(pickle_file)

In [20]:
# Vote over the three models plus BLASTp (four voters) with a threshold of 2.
voting_predictions_blast = determine_ensemble_predictions(
    2,
    predictions_esm1b,
    predictions_prot_bert,
    predictions_esm2_3b,
    blast_predictions,
)

In [21]:
# Persist the models + BLASTp voting predictions as a pickle file.
import pickle

voting_blast_pickle_path = f"{data_path}/predictions/predictions_models_voting_blast.pkl"
with open(voting_blast_pickle_path, "wb") as pickle_file:
    pickle.dump(voting_predictions_blast, pickle_file)

In [22]:
# Reload the models + BLASTp voting predictions from the pickle file.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

voting_blast_pickle_path = f"{data_path}/predictions/predictions_models_voting_blast.pkl"
with open(voting_blast_pickle_path, "rb") as pickle_file:
    voting_predictions_blast = pickle.load(pickle_file)

In [23]:
# Print macro-averaged F1, recall and precision (in that order) of the models + BLASTp ensemble.
from sklearn.metrics import f1_score, recall_score, precision_score

for score_function in (f1_score, recall_score, precision_score):
    print(score_function(test_dataset.y, voting_predictions_blast, average='macro'))

0.9090857401638838
0.9049003181814923
0.9204416806398337


## Generate metrics for all the ensembles

In [24]:
import re
from sklearn.metrics import f1_score, recall_score, precision_score


def get_ec_levels(labels):
    """
    Group label positions by the depth of their EC number.

    A label made of one, two, three or four dot-separated numeric fields is
    assigned to level 1, 2, 3 or 4 respectively. The fourth field may be
    prefixed with ``n`` (e.g. ``3.4.21.n2``). Labels that match none of these
    forms are ignored.

    The dots are matched literally: with an unescaped ``.`` a label such as
    ``"123"`` would be classified as level 2 and ``"1.14.151"`` as level 4.
    The whole label must match, so a trailing newline is not accepted.

    Parameters
    ----------
    labels : iterable of str
        Label names, in the same order as the label columns.

    Returns
    -------
    tuple of four lists of int
        ``(level_1, level_2, level_3, level_4)`` with the positions of the
        labels that belong to each level.
    """
    level_patterns = (
        re.compile(r"\d+"),
        re.compile(r"\d+\.\d+"),
        re.compile(r"\d+\.\d+\.\d+"),
        re.compile(r"\d+\.\d+\.\d+\.n*\d+"),
    )
    level_1, level_2, level_3, level_4 = [], [], [], []
    levels = (level_1, level_2, level_3, level_4)

    for i, label in enumerate(labels):
        # The patterns are mutually exclusive, so at most one level receives the index.
        for level_indices, pattern in zip(levels, level_patterns):
            if pattern.fullmatch(label):
                level_indices.append(i)
                break
    return level_1, level_2, level_3, level_4

def get_metrics(y_true_, predictions_, model_name, labels_):
    """
    Compute macro-averaged F1, precision and recall overall and per EC level.

    Parameters
    ----------
    y_true_ : 2-D array indexable as ``[:, columns]``
        True label matrix.
    predictions_ : 2-D array indexable as ``[:, columns]``
        Predicted label matrix, aligned with ``y_true_``.
    model_name : str
        Used as the index of the single row of the returned frame.
    labels_ : iterable of str
        Label names, passed to ``get_ec_levels`` to find the columns of each level.

    Returns
    -------
    pd.DataFrame
        One row with the columns ``"<metric> overall"`` and ``"<metric> level 1..4"``
        for each of mF1, mPrecision and mRecall, in that order.
    """
    level_1, level_2, level_3, level_4 = get_ec_levels(labels_)

    # None stands for "all columns"; the other entries select the columns of one level.
    column_subsets = (
        ("overall", None),
        ("level 1", level_1),
        ("level 2", level_2),
        ("level 3", level_3),
        ("level 4", level_4),
    )
    scorers = (
        ("mF1", f1_score),
        ("mPrecision", precision_score),
        ("mRecall", recall_score),
    )

    metrics = {}
    for metric_prefix, scorer in scorers:
        for subset_name, columns in column_subsets:
            if columns is None:
                y_true_subset, predictions_subset = y_true_, predictions_
            else:
                y_true_subset = y_true_[:, columns]
                predictions_subset = predictions_[:, columns]
            metrics[f"{metric_prefix} {subset_name}"] = scorer(
                y_true_subset, predictions_subset, average="macro"
            )
    return pd.DataFrame(metrics, index=[model_name])

In [25]:
# Keep the label names and the true label matrix for the metric computations below.
labels_ = test_dataset._labels_names
true_values = test_dataset.y
# Remove the `test_dataset` name; cells that read `test_dataset` cannot be re-run
# after this point without loading the dataset again.
del test_dataset

In [26]:
# Metrics of the models + BLASTp ensemble; the single result row is labelled "ensemble_blast".
results = get_metrics(true_values, voting_predictions_blast, "ensemble_blast", labels_)

In [27]:
# Name the index so that it is written to CSV under the header "ensemble".
results.index.name = "ensemble"

In [28]:
# Display the metrics table of the models + BLASTp ensemble.
results

Unnamed: 0_level_0,mF1 overall,mF1 level 1,mF1 level 2,mF1 level 3,mF1 level 4,mPrecision overall,mPrecision level 1,mPrecision level 2,mPrecision level 3,mPrecision level 4,mRecall overall,mRecall level 1,mRecall level 2,mRecall level 3,mRecall level 4
ensemble,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ensemble_blast,0.909086,0.984783,0.963836,0.943704,0.904108,0.920442,0.985747,0.969475,0.953077,0.915839,0.9049,0.983822,0.958839,0.936733,0.900196


In [29]:
# Write the metrics table (including its index) to the current working directory.
results.to_csv("results_ensemble_blast.csv", index=True)

In [30]:
# Load the models-only voting predictions from the pickle file and compute their metrics.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
import pickle

voting_pickle_path = f"{data_path}/predictions/predictions_models_voting.pkl"
with open(voting_pickle_path, "rb") as pickle_file:
    voting_predictions = pickle.load(pickle_file)

results = get_metrics(true_values, voting_predictions, "ensemble", labels_)

In [None]:
# Name the index "ensemble" and write the models-only metrics table
# (including its index) to the current working directory.
results.index.name = "ensemble"
results.to_csv("results_models_ensemble.csv", index=True)

In [None]:
# transform columns into rows
# Reshape the one-row metrics table in results_models_ensemble.csv into a long table
# with one row per metric and the columns model / metric / train / test.
import pandas as pd
import numpy as np

results = pd.read_csv("results_models_ensemble.csv")
results = results.set_index("ensemble")
results = results.transpose()
new_results = pd.DataFrame(columns=["model", "metric", "train", "test"])
# One row per metric actually present in the file, instead of a hard-coded count of 15.
new_results["model"] = ["models ensemble"] * len(results.index)
new_results["metric"] = results.index
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
new_results["train"] = np.nan
new_results["test"] = results["ensemble"].values
new_results.to_csv("results_models_ensemble2.csv", index=False)

In [None]:
# Reshape the one-row metrics table in results_ensemble_blast.csv into a long table
# with one row per metric and the columns model / metric / train / test.
results = pd.read_csv("results_ensemble_blast.csv")
results = results.set_index("ensemble")
results = results.transpose()
new_results = pd.DataFrame(columns=["model", "metric", "train", "test"])
# One row per metric actually present in the file, instead of a hard-coded count of 15.
new_results["model"] = ["models + BLASTp ensemble"] * len(results.index)
new_results["metric"] = results.index
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
new_results["train"] = np.nan
new_results["test"] = results["ensemble_blast"].values
new_results.to_csv("results_ensemble_blast2.csv", index=False)