# Setup

In [None]:
import os
import pickle
import random
import re
import shutil
import sys
from collections import defaultdict
from copy import deepcopy
from functools import partial
from itertools import product
from string import ascii_lowercase
from typing import Callable, List

import matplotlib.pyplot as plt
import numpy as np
import optuna
import seaborn as sns
import torch
import umap
from Bio import SeqIO
from Bio.pairwise2 import align
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

from src.logs import logger, delete_all_runs, tb_run, reinit_tensorboard_local
from src.metrics import f1_score, precision_score, matthews_corrcoef, recall_score, accuracy_score
from src.models import HybridModel, SimpleCNN
from src.parameters import ModelParameters, Hyperparameters
from src.processing import SequenceAugmentator, OneHotEncoder, get_single_seq_patches
from src.training import Trainer
from src.validation import nested_cv, NestedCVresults

# Load data

In [None]:
# Root data directory and the three FASTA inputs that live directly inside it
DATA_PATH = "data"
APIDAECINS_PATH, NOT_APIDAECINS_PATH, OTHER_PROTEINS_PATH = (
    os.path.join(DATA_PATH, fasta_name)
    for fasta_name in ("apidaecins.fasta", "not_apidaecins.fasta", "other_proteins.fasta")
)

In [None]:
# Load all apidaecin sequences (includes unfiltered peptides that do not belong to the group).
# Duplicates are dropped; sorting makes the list order reproducible, because the
# iteration order of a set of strings changes with the interpreter's hash seed.
apidaecins_sequences = sorted({str(rec.seq) for rec in SeqIO.parse(APIDAECINS_PATH, "fasta")})
len(apidaecins_sequences)

In [None]:
# Load protein non-AMP sequences.
# Duplicates are dropped; sorting makes the list order (and therefore the seeded
# train/test split built from it) reproducible across interpreter runs.
other_proteins_sequences = sorted({str(rec.seq) for rec in SeqIO.parse(OTHER_PROTEINS_PATH, "fasta")})
len(other_proteins_sequences)

In [None]:
# Load antipositive sequences: every record of the "not apidaecins" file whose
# description does not mention apidaecin (only the two spellings below are checked).
# Sorted so that the list order does not depend on set iteration order.
antipos_sequences = sorted({
    str(record.seq)
    for record in SeqIO.parse(NOT_APIDAECINS_PATH, "fasta")
    if "Apidaecin" not in record.description and "apidaecin" not in record.description
})
len(antipos_sequences)

# Filter data

In [None]:
# Pro-apidaecins are filtered apidaecins without signal peptides.
# The file is written by the (commented-out) SeqIO.write call in the filtering cell
# and read back by the cells that load the filtered sequences.
PRO_APIDAECINS_PATH = os.path.join(DATA_PATH, "pro-apidaecins.fasta")

In [None]:
# Signal peptides are cut based on alignment position of first linker.
# Each linker is listed once: repeated entries only repeated identical alignments.
linkers = ["RREAKPEAEP", "RREAEPEAEP", "RREAEPDP", "RREAGPEPEP", "RREPDPEP",
           "REAKPEPEPEP", "EADPAKP", "QAEPGKP"]

approved_records = []

# Trimmed sequences already accepted, used to keep approved_records free of duplicates
approved_seqs = set()
for record in SeqIO.parse(APIDAECINS_PATH, "fasta"):
    if "PREDICTED" in record.description:
        continue

    # For every linker keep the local alignment that starts earliest in the record
    seq_alignments = []
    for linker in linkers:
        alignments = align.localxs(record.seq, linker, -11, -1)
        if alignments:  # guard: min() below would raise on an empty result
            seq_alignments.append(min(alignments, key=lambda algn: algn.start))
    if not seq_alignments:
        continue

    # Among the top-scoring linker alignments take the one that starts first
    max_score = max(algn.score for algn in seq_alignments)
    best_alignment = min((algn for algn in seq_alignments if algn.score == max_score),
                         key=lambda algn: algn.start)

    # Accept only a sufficiently good linker hit located within positions 13..79
    if 12 < best_alignment.start < 80 and best_alignment.score > 6:
        new_seq = record.seq[best_alignment.start:]
        if new_seq not in approved_seqs:
            approved_seqs.add(new_seq)
            record.seq = new_seq
            approved_records.append(record)

# Uncomment line below to save filtered sequences to file
# SeqIO.write(approved_records, PRO_APIDAECINS_PATH, "fasta")

In [None]:
# Load filtered apidaecins.
# Duplicates are dropped; sorting fixes the list order, so `pro_apidaecins_sequences[0]`
# and the seeded splits built from this list are the same on every run.
pro_apidaecins_sequences = sorted({str(rec.seq) for rec in SeqIO.parse(PRO_APIDAECINS_PATH, "fasta")})
len(pro_apidaecins_sequences)

In [None]:
# Print the id and the sequence of every filtered apidaecin record
for rec in SeqIO.parse(PRO_APIDAECINS_PATH, "fasta"):
    print(f"{rec.id} {rec.seq}")

# Augmentations

## Test augmentations

In [None]:
# Set up augmentation: a BLOSUM45-based augmentator and the patches of the first
# filtered apidaecin, which the next cell uses as a small demo input
augmentator = SequenceAugmentator("BLOSUM45", replacement_proba_factor=50)
patches = get_single_seq_patches(pro_apidaecins_sequences[0])

In [None]:
# Try the augmentation on the first ten patches.
# Printed per patch: original, augmented, and whether they are equal ('False' means the patch was changed)
for original_patch in patches[:10]:
    augmented_patch = augmentator.apply_augmentation(original_patch)
    unchanged = original_patch == augmented_patch
    print(original_patch, augmented_patch, unchanged, sep="\n", end="\n\n")

# Train-test training

## Training setup

In [None]:
# Here you can set up adjustable training and model parameters
torch.manual_seed(42)

# Prefer the GPU, but fall back to CPU so the notebook also runs on machines without CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): n_classes=2 while the labels built below use 0/1/2 —
# presumably Trainer folds the anti-positive label into the negatives; confirm.
mp = ModelParameters(
    n_classes=2,
    embedding_size=20,
    conv_channels=64,
    conv_kernel_size=13,
    dropout_rate=0.2,

    blstm_output_size=128,
    lstm_output_size=128,

    activation="relu",
)

hp = Hyperparameters(
    device=DEVICE,
    batch_size=1000,
    patch_size=50,
    patch_stride=1,
    substitution_matrix="BLOSUM45",
    replacement_proba_factor=250,
    pos_proba=0.1,
    antipos_proba=0.1,

    model_parameters=mp,
    # The encoder is created on the same device that the hyperparameters name
    encoder=OneHotEncoder(alphabet="prot", device=DEVICE),

    criterion=nn.CrossEntropyLoss(),
    optimizer="adam",
    scheduler=None,
    lr=1e-4,

    metric_fns=(matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score),
)

In [None]:
# Assemble the dataset and hold out 20% of it for testing.
# Labels: 1 = pro-apidaecins, 0 = other proteins, 2 = anti-positive sequences.
pos_labels = len(pro_apidaecins_sequences) * [1]
neg_labels = len(other_proteins_sequences) * [0] + len(antipos_sequences) * [2]
raw_y = np.array([*pos_labels, *neg_labels])
raw_X = np.array([*pro_apidaecins_sequences, *other_proteins_sequences, *antipos_sequences])
X_train, X_test, y_train, y_test = train_test_split(
    raw_X, raw_y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Build the trainer for HybridModel from the train/test split and the hyperparameters above
trainer = Trainer(HybridModel, X_train, X_test,
                  y_train, y_test, hp, setup=True)

In [None]:
# Prepare TensorBoard for the "Model2" run
# (presumably (re)starts a local TensorBoard on that run directory — confirm against src.logs)
reinit_tensorboard_local(tb_run("Model2"))

In [None]:
# Start training
# Sequences passed to the trainer as `vis_seqs` during training
visualize_sequences = ["RREAEPEAEPGNNRPVYIPPPRPPHPRLRREAEPEAEPGNNRPVYIPQPRPPHPRTAVALVSASRSFFSFAVPRALEQQRFRYSHATKR",
                       "RRIRPRPPRLPRPRPRPLPFPRPGPRPIPRPLPFPRPGPRPIPRPLPFPRPGPRPIPRPL",
                       "MKKIYVAGGCFWGVQGFLKTIKGIKKTTVGYANSLLENPTYELVKSHVTDAVETVEVIYDENILSLKDIVKKLFAVIDPTARNYQGPDHGRQYRNGFYFVDQEDGVMLRELMLEFSKKYEKPLATEILPLDNYYLAEDYHQDYFDKHPNAVCHIKF"]

writer = SummaryWriter(tb_run("Model2"))
try:
    train_output = trainer.train(n_epochs=100, valid=1, writer=writer,
                                 vis_seqs=visualize_sequences, cache_embeddings=False)
finally:
    # Flush pending events and release the event file even if training fails
    writer.close()

# Keep the training result as the cell output, as the bare call did before
train_output

## Validate on anti-positive peptides

The code below lists anti-positive peptides that contain fragments predicted as apidaecin-like AMPs. Ideally, no proline-rich AMP other than an apidaecin should appear here.

In [None]:
# Report every record of the "not apidaecins" file that gets at least one positive position
for rec in SeqIO.parse(NOT_APIDAECINS_PATH, "fasta"):
    predicted_mask = trainer.predict_mask(str(rec.seq))
    if not sum(predicted_mask) > 0:
        continue
    print(rec.id)
    print(rec.seq)
    # The mask is printed as one unbroken string of per-position predictions
    print(*predicted_mask, sep="")

# Hyperparameter optimization & Nested CV

## Functions

Set up the objective function that runs a single Optuna trial: it suggests hyperparameters and evaluates them with cross-validation.

You can manually adjust suggested parameters and their boundaries

In [None]:
def objective(trial: optuna.Trial, trainer_preset: Callable, X: List[str], y: List[int],
              kfold: StratifiedKFold, mp_preset: Callable, hp_preset: Callable, outer_split: tuple,
              use_tensorboard: bool = False) -> float:
    """
    Optuna objective: run inner cross-validation for one trial and score it by precision.

    The return value is the *negated* mean precision over the inner splits, so the
    study that calls this function has to minimize it in order to maximize precision
    (the study itself is created outside this function).

    :param trial: Optuna single trial
    :type trial: optuna.Trial
    :param trainer_preset: Trainer class with preset arguments
    :type trainer_preset: Callable
    :param X: Sequences of any length (list or numpy array)
    :type X: List[str]
    :param y: Labels corresponding to each sequence in X (list or numpy array)
    :type y: List[int]
    :param kfold: Sklearn Kfold object producing splits for data
    :type kfold: StratifiedKfold
    :param mp_preset: ModelParameters class with preset parameters
    :type mp_preset: Callable
    :param hp_preset: Hyperparameters class with preset parameters
    :type hp_preset: Callable
    :param outer_split: Pair (current outer split, total number of outer splits), used for logging and run names
    :type outer_split: tuple
    :param use_tensorboard: Whether to use Tensorboard for logging
    :type use_tensorboard: bool
    :return: Negated mean "precision_score" over the inner splits
    :rtype: float
    """
    logger.info(f"Running nested CV, outer split {outer_split[0]}/{outer_split[1]} trial #{trial.number}")

    # The splits are applied with integer-array indexing, which plain lists do not support
    X = np.asarray(X)
    y = np.asarray(y)

    patch_size = trial.suggest_int("patch_size", 20, 60)
    patch_stride = trial.suggest_int("patch_stride", 1, patch_size, log=True)
    substitution_matrix = trial.suggest_categorical("substitution_matrix", ["BLOSUM45", "BLOSUM62", "BLOSUM90"])
    embedding_size = trial.suggest_int("embedding_size", 5, 40)
    conv_kernel_size = trial.suggest_int("conv_kernel_size", 2, patch_size, log=True)
    # suggest_float(log=True) is the non-deprecated spelling of suggest_loguniform
    lr = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
    optimizer = trial.suggest_categorical("optimizer", list(Trainer._optimizers_map.keys()))
    activation = trial.suggest_categorical("activation", list(Trainer._activations_map.keys()))
    # dropout_rate = trial.suggest_float("dropout_rate", 0.05, 0.6)
    # antipos_proba = trial.suggest_float("antipos_proba", 0.05, 0.2)
    # pos_proba = trial.suggest_float("pos_proba", 0.05, 0.2)
    # replacement_proba_factor = trial.suggest_int("replacement_proba_factor", 1, 1000)

    suggested_parameters = dict(patch_size=patch_size, patch_stride=patch_stride,
                                substitution_matrix=substitution_matrix, embedding_size=embedding_size,
                                conv_kernel_size=conv_kernel_size, lr=lr,
                                optimizer=optimizer, activation=activation)
    logger.info(f"Suggested parameters: {suggested_parameters}")

    mp = mp_preset(
        embedding_size=embedding_size,
        conv_kernel_size=conv_kernel_size,
        activation=activation,
        # dropout_rate=dropout_rate,
    )

    hp = hp_preset(
        substitution_matrix=substitution_matrix,
        patch_size=patch_size,
        optimizer=optimizer,
        lr=lr,
        patch_stride=patch_stride,
        # antipos_proba=antipos_proba,
        # pos_proba=pos_proba,
        # replacement_proba_factor=replacement_proba_factor,
        model_parameters=mp,
    )

    crossval_metrics = defaultdict(list)
    for current_split, (train_indices, val_indices) in enumerate(kfold.split(X, y)):
        logger.info(f"Running nested CV, outer split {outer_split[0]}/{outer_split[1]}, Training inner split {current_split+1}/{kfold.n_splits}")
        logger.debug(f"Performing kfold split. Train split sample: {train_indices[0:31:2]}, Val split sample: {val_indices[0:31:2]}")
        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        # Every inner split gets its own copy of the hyperparameters
        trainer = trainer_preset(X_train=X_train, X_val=X_val,
                                 y_train=y_train, y_val=y_val,
                                 hyperparameters=deepcopy(hp))
        if use_tensorboard:
            inner_split_tb_run = tb_run(f"nested_CV/outer_split_{outer_split[0]}/trial_{trial.number}/inner_split_{current_split+1}")
            reinit_tensorboard_local(inner_split_tb_run, clear_log=True)
            writer = SummaryWriter(log_dir=inner_split_tb_run)
        else:
            writer = None
        try:
            _, metrics = trainer.train(n_epochs=30, valid=3, writer=writer)
        finally:
            # One writer is opened per inner split; close it so event files are not leaked
            if writer is not None:
                writer.close()
        for metric, value in metrics.items():
            crossval_metrics[metric].append(value)

    crossval_metrics_mean = {metric: sum(values) / len(values)
                             for metric, values in crossval_metrics.items()}
    if use_tensorboard:
        hp_logdir = tb_run(f"nested_CV/outer_split_{outer_split[0]}/trial_{trial.number}/hyperparameters")
        reinit_tensorboard_local(hp_logdir, clear_log=True)
        hp_writer = SummaryWriter(log_dir=hp_logdir)
        try:
            hp_writer.add_hparams(suggested_parameters, crossval_metrics_mean)
            hp_writer.flush()
        finally:
            hp_writer.close()

    # Requires `precision_score` to be among the trainer's metric functions
    return -crossval_metrics_mean["precision_score"]

## Load data

In [None]:
# Full dataset for nested CV.
# Labels: 1 = pro-apidaecins, 0 = other proteins, 2 = anti-positive sequences.
pos_labels = len(pro_apidaecins_sequences) * [1]
neg_labels = len(other_proteins_sequences) * [0] + len(antipos_sequences) * [2]
raw_y = np.array([*pos_labels, *neg_labels])
raw_X = np.array([*pro_apidaecins_sequences, *other_proteins_sequences, *antipos_sequences])

## Setup fixed parameters

Here are specified fixed parameters that shall not change

In [None]:
# Prefer the GPU, but fall back to CPU so the notebook also runs on machines without CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Model parameters that stay fixed for every trial; the rest is filled in by `objective`
mp_preset = partial(
    ModelParameters,
    conv_channels=64,
    blstm_output_size=128,
    lstm_output_size=128,
)

# Hyperparameters that stay fixed for every trial.
# `objective` returns the negated "precision_score", so precision_score must stay in metric_fns.
hp_preset = partial(
    Hyperparameters,
    device=DEVICE,
    encoder=OneHotEncoder(device=DEVICE),
    metric_fns=(precision_score, recall_score),
)

# Trainer factory: data and hyperparameters are supplied per split
trainer_preset = partial(
    Trainer,
    model_class=HybridModel,
    setup=True,
)

In [None]:
# Size of the nested CV: Optuna trials per outer split, outer folds, inner folds
n_trials = 30
outer_k = 5
inner_k = 2
# One training run per (trial, outer split, inner split) combination
print(f"Total learning rounds: {n_trials * outer_k * inner_k}")

## Run nested cv

In [None]:
# Run nested cross-validation with the presets and the `objective` defined above.
# The result is pickled by the next cell and later iterated as a sequence of
# per-outer-split results exposing `.metrics` and `.best_params`.
nested_cv_results = nested_cv(
    trainer_preset=trainer_preset,
    objective=objective,
    X=raw_X,
    y=raw_y,
    hp_preset=hp_preset,
    mp_preset=mp_preset,
    n_trials=n_trials,
    outer_k=outer_k,
    inner_k=inner_k,
    random_state=42,
    use_tensorboard=True)

## Save nested cv results to file

In [None]:
# Make sure the output directory exists: open(..., "wb") does not create missing folders
os.makedirs("cv_results", exist_ok=True)
with open(os.path.join("cv_results", "nested_cv_results.pk"), "wb") as file:
    pickle.dump(nested_cv_results, file=file)

## Load nested cv results from file

In [None]:
# Read back the results written by the previous cell.
# NOTE: unpickling executes code stored in the file — only load files you created yourself.
with open(os.path.join("cv_results", "nested_cv_results.pk"), "rb") as file:
    loaded_cv_results = pickle.load(file)

In [None]:
# Print validation metrics and best hyperparameters of every outer split
for split_number, split_results in enumerate(loaded_cv_results, start=1):
    metrics_format = "\t".join(
        f"{metric}={value:.4f}" for metric, value in split_results.metrics.items()
    )
    hp_format = "\t".join(
        f"{parameter}={value}" for parameter, value in split_results.best_params.items()
    )
    print(f"Outer split {split_number}")
    print(f"Validation metrics: {metrics_format}")
    print(f"Best hyperparameters: {hp_format}")
    print()

In [None]:
# Same information in raw form: best parameters and metrics of each outer split
for result in loaded_cv_results:
    print(result.best_params, result.metrics)

# Evaluation

## Visualize latent representation

Build two-component UMAP projections of the latent data representation (taken before the FC layer)

In [None]:
# Instantiate ModelParameters without arguments so the notebook displays the resulting object
ModelParameters()

In [None]:
# Here you can set up adjustable training and model parameters

torch.manual_seed(42)

# Prefer the GPU, but fall back to CPU so the notebook also runs on machines without CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

mp = ModelParameters(
    n_classes=2,
    embedding_size=20,
    conv_channels=64,
    conv_kernel_size=13,
    dropout_rate=0.2,

    blstm_output_size=128,
    lstm_output_size=128,

    activation="relu",
)

hp = Hyperparameters(
    device=DEVICE,
    batch_size=1000,
    patch_size=50,
    patch_stride=1,
    substitution_matrix="BLOSUM45",
    replacement_proba_factor=250,
    pos_proba=0.1,
    antipos_proba=0.1,

    model_parameters=mp,
    # The encoder is created on the same device that the hyperparameters name
    encoder=OneHotEncoder(alphabet="prot", device=DEVICE),

    criterion=nn.CrossEntropyLoss(),
    optimizer="adam",
    scheduler=None,
    lr=1e-4,

    metric_fns=(matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score),
)

In [None]:
# Assemble the dataset and hold out 20% of it for testing.
# Labels: 1 = pro-apidaecins, 0 = other proteins, 2 = anti-positive sequences.
pos_labels = len(pro_apidaecins_sequences) * [1]
neg_labels = len(other_proteins_sequences) * [0] + len(antipos_sequences) * [2]
raw_y = np.array([*pos_labels, *neg_labels])
raw_X = np.array([*pro_apidaecins_sequences, *other_proteins_sequences, *antipos_sequences])
X_train, X_test, y_train, y_test = train_test_split(
    raw_X, raw_y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Build the trainer for HybridModel from the train/test split and the hyperparameters above
trainer = Trainer(HybridModel, X_train, X_test,
                  y_train, y_test, hp, setup=True)

❗The `cache_embeddings` option is very memory-consuming, especially if you use the SimpleCNN model❗

In [None]:
# Train without TensorBoard logging and with embedding caching switched on
# (the next cell reads the cached values from `trainer.embedding_cache`)
trainer.train(n_epochs=100, valid=1, writer=None, vis_seqs=None, cache_embeddings=True)

In [None]:
# Reduce the cached embeddings of every epoch with a freshly created UMAP model.
# Results are stored under `epoch - 1` together with the untouched labels.
embedding_history = {}
for epoch, cached in tqdm(trainer.embedding_cache.items()):
    epoch_embeddings, epoch_labels = cached
    reducer = umap.UMAP()
    embedding_history[epoch - 1] = (reducer.fit_transform(epoch_embeddings), epoch_labels)

Create UMAP images and save them to the `images` directory; file names get alphabetic suffixes (`aa`, `ab`, ...) so that alphabetical order matches epoch order

In [None]:
# Start from an empty `images` directory. The directory is always (re)created:
# removing it without creating it again would make every savefig call below fail.
if os.path.isdir("images"):
    shutil.rmtree("images")
os.makedirs("images", exist_ok=True)

plt.rc("figure", figsize=(15, 15))

# Two-letter suffixes "aa", "ab", ... keep the files in epoch order when sorted by name
suffixes = product(ascii_lowercase, repeat=2)

for epoch in sorted(embedding_history):
    embeddings, labels = embedding_history[epoch]
    # NOTE(review): every truthy label is shown as "Apidaecins" — confirm that the cached
    # labels are already binary (the raw dataset also uses label 2 for anti-positives).
    new_labels = ["Apidaecins" if label else "Other proteins" for label in labels]
    fig = plt.figure()
    # x/y are passed by keyword: newer seaborn releases reject them as positional arguments
    sns.scatterplot(x=embeddings[:, 0], y=embeddings[:, 1], hue=new_labels)
    plt.title(f"Epoch {epoch}")
    plt.axis("off")
    suffix = "".join(next(suffixes))
    plt.savefig(os.path.join("images", f"HybridModel_embeddings_{suffix}.png"))
    # One figure per epoch is created; close it once saved so they do not pile up in memory
    plt.close(fig)

## Final training

Below are the best hyperparameters selected by nested cross-validation with Optuna

In [None]:
torch.manual_seed(42)

# Prefer the GPU, but fall back to CPU so the notebook also runs on machines without CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

mp = ModelParameters(
    n_classes=2,
    embedding_size=39,
    conv_channels=128,
    conv_kernel_size=12,
    dropout_rate=0.2,

    blstm_output_size=256,
    lstm_output_size=256,

    activation="relu",
)

hp = Hyperparameters(
    device=DEVICE,
    batch_size=1000,
    patch_size=33,
    patch_stride=1,
    substitution_matrix="BLOSUM45",
    replacement_proba_factor=250,
    pos_proba=0.1,
    antipos_proba=0.1,

    model_parameters=mp,
    # The encoder is created on the same device that the hyperparameters name
    encoder=OneHotEncoder(alphabet="prot", device=DEVICE),

    criterion=nn.CrossEntropyLoss(),
    optimizer="adam",
    scheduler=None,
    lr=0.0047474,

    # No metric functions: the final model is trained on all data without validation
    metric_fns=(),
)

In [None]:
# Full dataset for the final training run.
# Labels: 1 = pro-apidaecins, 0 = other proteins, 2 = anti-positive sequences.
pos_labels = len(pro_apidaecins_sequences) * [1]
neg_labels = len(other_proteins_sequences) * [0] + len(antipos_sequences) * [2]
raw_y = np.array([*pos_labels, *neg_labels])
raw_X = np.array([*pro_apidaecins_sequences, *other_proteins_sequences, *antipos_sequences])

In [None]:
# Trainer for the final model: all data is used for training, no validation set is passed
trainer = Trainer(HybridModel, X_train=raw_X, X_val=None,
                  y_train=raw_y, y_val=None, hyperparameters=hp, setup=True)

In [None]:
# Log the final training run to its own, freshly cleared TensorBoard directory
run_name = tb_run("Final_training")
reinit_tensorboard_local(run_name, clear_log=True)
writer = SummaryWriter(log_dir=run_name)

try:
    train_output = trainer.train(n_epochs=100, writer=writer, valid=False)
finally:
    # Flush pending events and release the event file even if training fails
    writer.close()

# Keep the training result as the cell output, as the bare call did before
train_output

## Model save

Save the model weights and hyperparameters to files. The `models` directory contains two subdirectories: `weights` and `params`. `weights` holds the model weights saved by PyTorch, and `params` holds the saved model hyperparameters. Files in both subdirectories are versioned.

In [None]:
# This will produce `models/weights/HybridModel_vX.pt` and `models/params/HybridModel_vX.pk`
# (first argument: target directory, second argument: model name used in the file names)
trainer.save_model("models", "HybridModel")

## Model load

In [None]:
# Predictor is an instance of the Trainer class with a pretrained model.
# It uses the most recent version of the model found in the `models` directory.
predictor = Trainer.make_predictor("models", "HybridModel")

## Leave-one-out proteome validation

### Set up test data

In [None]:
# Species held out one at a time. Each name doubles as the proteome file name
# (`data/proteomes/<name>.faa`) and, with underscores replaced by spaces, as the
# text searched for in FASTA record descriptions.
test_species = [
    'Vespa_mandarinia',
    'Bombus_pyrosoma',
    'Megalopta_genalis',
    'Leptopilina_heterotoma',
    'Apis_mellifera_caucasica'
]

In [None]:
# For every test species collect its filtered apidaecins as {record id: sequence}
species_pro_apidaecins = {}
for species in test_species:
    # Descriptions are matched against the species name written with spaces
    species_name = species.replace("_", " ")
    species_pro_apidaecins[species] = {
        rec.id: str(rec.seq)
        for rec in SeqIO.parse(PRO_APIDAECINS_PATH, "fasta")
        if species_name in rec.description
    }
    print(species, "has", len(species_pro_apidaecins[species]), "selected apidaecins")

In [None]:
# For every test species collect its unfiltered apidaecins as {record id: sequence}
species_apidaecins = {}
for species in test_species:
    # Descriptions are matched against the species name written with spaces
    species_name = species.replace("_", " ")
    species_apidaecins[species] = {
        rec.id: str(rec.seq)
        for rec in SeqIO.parse(APIDAECINS_PATH, "fasta")
        if species_name in rec.description
    }
    print(species, "has", len(species_apidaecins[species]), "unfiltered apidaecins")

### Load true labels for proteomes

In [None]:
# Per-residue ground truth for every proteome record: 1 where the filtered
# (pro-)apidaecin sequence of that record is located, 0 everywhere else.
true_masks = defaultdict(dict)

for species, species_api_records_dict in species_pro_apidaecins.items():
    proteome_path = os.path.join("data", "proteomes", f"{species}.faa")
    for record in SeqIO.parse(proteome_path, "fasta"):
        protein = str(record.seq)
        mask = [0] * len(protein)
        peptide = species_api_records_dict.get(record.id)
        if peptide is not None:
            # Plain substring search: the peptide is literal text, not a regex pattern
            start = protein.find(peptide)
            if start >= 0:
                mask[start:start + len(peptide)] = [1] * len(peptide)
            else:
                # Keep the all-zero mask instead of crashing on a missing match
                logger.warning(f"Apidaecin sequence of {record.id} was not found in the {species} proteome record")
        true_masks[species][record.id] = mask

In [None]:
# Per-residue ground truth for every proteome record: 1 where the unfiltered
# apidaecin sequence of that record is located, 0 everywhere else.
true_masks_unfiltered = defaultdict(dict)

for species, species_api_records_dict in species_apidaecins.items():
    proteome_path = os.path.join("data", "proteomes", f"{species}.faa")
    for record in SeqIO.parse(proteome_path, "fasta"):
        protein = str(record.seq)
        mask = [0] * len(protein)
        peptide = species_api_records_dict.get(record.id)
        if peptide is not None:
            # Plain substring search: the peptide is literal text, not a regex pattern
            start = protein.find(peptide)
            if start >= 0:
                mask[start:start + len(peptide)] = [1] * len(peptide)
            else:
                # Keep the all-zero mask instead of crashing on a missing match
                logger.warning(f"Apidaecin sequence of {record.id} was not found in the {species} proteome record")
        true_masks_unfiltered[species][record.id] = mask

### Set up training

In [None]:
# Set hyperparameters

torch.manual_seed(42)

# Prefer the GPU, but fall back to CPU so the notebook also runs on machines without CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

mp = ModelParameters(
    n_classes=2,
    embedding_size=39,
    conv_channels=128,
    conv_kernel_size=12,
    dropout_rate=0.2,

    blstm_output_size=256,
    lstm_output_size=256,

    activation="relu",
)

hp = Hyperparameters(
    device=DEVICE,
    batch_size=1000,
    patch_size=33,
    patch_stride=1,
    substitution_matrix="BLOSUM45",
    replacement_proba_factor=250,
    pos_proba=0.1,
    antipos_proba=0.1,

    model_parameters=mp,
    # The encoder is created on the same device that the hyperparameters name
    encoder=OneHotEncoder(alphabet="prot", device=DEVICE),

    criterion=nn.CrossEntropyLoss(),
    optimizer="adam",
    scheduler=None,
    lr=0.0015,

    # No metric functions: models are trained without a validation set here
    metric_fns=(),
)

### Partial cross-validation

In [None]:
# Leave-one-species-out validation: for every test species train a model without
# that species' apidaecins, then predict a per-residue mask for its whole proteome.
predicted_masks = defaultdict(dict)

for species in test_species:
    # Load data.
    # Descriptions are matched against the species name written with spaces, exactly as in
    # the cells that select the species' apidaecins. Testing the underscore form would
    # exclude nothing and leak the held-out species into the training set.
    species_name = species.replace("_", " ")
    train_pro_apidaecins_sequences = [
        str(rec.seq)
        for rec in SeqIO.parse(PRO_APIDAECINS_PATH, "fasta")
        if species_name not in rec.description
    ]

    # Make training set (1 = pro-apidaecins, 0 = other proteins, 2 = anti-positive sequences)
    raw_X = np.array(train_pro_apidaecins_sequences + other_proteins_sequences + antipos_sequences)
    pos_labels = [1] * len(train_pro_apidaecins_sequences)
    neg_labels = [0] * len(other_proteins_sequences) + [2] * len(antipos_sequences)
    raw_y = np.array(pos_labels + neg_labels)

    # Train model
    trainer = Trainer(HybridModel, X_train=raw_X, X_val=None,
                      y_train=raw_y, y_val=None, hyperparameters=hp, setup=True)
    trainer.train(n_epochs=100, valid=False)

    # ====================== Validation ==============================================================
    proteome_records = list(SeqIO.parse(os.path.join("data", "proteomes", f"{species}.faa"), "fasta"))

    for record in tqdm(proteome_records):
        sequence = str(record.seq)
        # Coarse pass first; the expensive stride-1 pass runs only for proteins with a hit
        coarse_mask = trainer.predict_mask(sequence, stride=50)
        if coarse_mask.sum():
            mask = trainer.predict_mask(sequence, stride=1).tolist()
        else:
            mask = [0] * len(record)

        predicted_masks[species][record.id] = mask

### Positive predictions fractions

In [None]:
# Fraction of positively predicted residues for every protein with at least one
# positive position, plus the total number of proteins over all species
predicted_fractions = []
n_proteins = 0
for species_masks in predicted_masks.values():
    for protein_mask in species_masks.values():
        n_proteins += 1
        positive_fraction = sum(protein_mask) / len(protein_mask)
        if not positive_fraction:
            continue
        predicted_fractions.append(positive_fraction)

In [None]:
# Histogram of the per-protein positive fractions collected in the previous cell
plt.rc("figure", figsize=(15, 15))
plt.rc("font", size=22)
sns.histplot(predicted_fractions)
plt.title(f"Total proteins: {n_proteins}")
# Ticks every 0.1 over the whole range; linspace includes the 1.0 tick that arange(0, 1, 0.1) left out
_ = plt.xticks(np.linspace(0, 1, 11))
plt.xlabel("Positive predictions fraction per protein")

### Calculate overall metrics

In [None]:
# Per-species metrics over all residues of all proteins, against the filtered true masks
for species, pred_masks in predicted_masks.items():
    true_species_mask = []
    pred_species_mask = []
    for recid, protein_mask in pred_masks.items():
        true_species_mask.extend(true_masks[species][recid])
        pred_species_mask.extend(protein_mask)
    precision = precision_score(true_species_mask, pred_species_mask)
    recall = recall_score(true_species_mask, pred_species_mask)
    f1 = f1_score(true_species_mask, pred_species_mask)
    accuracy = accuracy_score(true_species_mask, pred_species_mask)
    print(f"Overall metrics for {species}: "
          f"precision={precision:.2f}, "
          f"recall={recall:.2f}, "
          f"f1={f1:.2f}, "
          f"accuracy={accuracy:.2f}")

### Calculate metrics on filtered predictions

In [None]:
# Per-species metrics restricted to proteins with more than half of their residues
# predicted positive, against the unfiltered true masks
for species, pred_masks in predicted_masks.items():
    true_species_mask = []
    pred_species_mask = []
    for recid, protein_mask in pred_masks.items():
        if sum(protein_mask) / len(protein_mask) <= 0.5:
            continue
        true_species_mask.extend(true_masks_unfiltered[species][recid])
        pred_species_mask.extend(protein_mask)
    precision = precision_score(true_species_mask, pred_species_mask)
    recall = recall_score(true_species_mask, pred_species_mask)
    f1 = f1_score(true_species_mask, pred_species_mask)
    accuracy = accuracy_score(true_species_mask, pred_species_mask)
    print(f"Metrics after filtering for {species}: "
          f"precision={precision:.2f}, "
          f"recall={recall:.2f}, "
          f"f1={f1:.2f}, "
          f"accuracy={accuracy:.2f}")

Note that the **recall** metric may be inaccurate, because true labels are assigned to the whole protein (including signal peptides, which do not participate in training)