# EFAM VPF prediction

Requires the EFAM sequence-to-embedding dictionary (download: https://storage.googleapis.com/viral_protein_family_plm_embeddings/efam/identifier_to_vector_protbert_bdf_11012022_dict.pkl) to be in the same directory as this notebook.


In [None]:
from tensorflow import keras
import pickle
from ast import literal_eval
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

In [None]:
# Plot colour for each functional-category label, keyed by the category name.
# NOTE(review): presumably these keys mirror the classifier's class labels --
# confirm against the label encoder before relying on it.
phrog_palette = dict([
    ('DNA, RNA and nucleotide metabolism', 'red'),
    ('connector', 'blue'),
    ('head and packaging', 'green'),
    ('integration and excision', 'pink'),
    ('lysis', 'gray'),
    ('moron, auxiliary metabolic gene and host takeover', 'brown'),
    ('other', 'purple'),
    ('tail', 'darkorange'),
    ('transcription regulation', 'cyan'),
    ('unknown', 'black'),
])

## load efam super annotations file

In [None]:
# Read the tab-separated annotation table. The 'Proteins' column is stored as
# the text of a Python literal, so each cell is parsed back into an object.
efam_super_annote = pd.read_csv(
    '../Final_Super_Condensed_Annotations-updated_efam.tsv',
    sep='\t',
)
efam_super_annote['Proteins'] = efam_super_annote['Proteins'].map(literal_eval)

## load trained model

In [None]:
# Restore the trained Keras model from its saved location on disk.
model = keras.models.load_model(
    'models/model_unknown_80_07092023/'
)

In [None]:
# Load the pickled object that accompanies the trained model. Its ``classes_``
# attribute is indexed later to turn an output position into a label
# (presumably a fitted scikit-learn label encoder/binarizer -- confirm).
# The file is opened in a ``with`` block so the handle is closed right after
# loading instead of being left to the garbage collector.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open('models/model_unknown_80_07092023_lb.pkl', 'rb') as lb_file:
    classes = pickle.load(lb_file)

## load necessary dictionaries for efam identifier and cluster lookups

In [None]:
# Build ``ci``: cluster identifier -> parsed 'Proteins' entry for that cluster.
#
# The two columns are walked in lockstep in a single pass. The previous
# approach re-filtered the whole table once per cluster, which is quadratic
# in the number of rows.
#
# That per-cluster lookup required exactly one row per cluster (it raised
# ValueError otherwise), so the same requirement is enforced up front here.
if efam_super_annote['Cluster'].duplicated().any():
    raise ValueError("'Cluster' column contains duplicate cluster identifiers")

ci = {}
for c, ps in tqdm(
    zip(efam_super_annote['Cluster'], efam_super_annote['Proteins']),
    total=len(efam_super_annote),
):
    ci[c] = ps

# Load ``iv``: the identifier -> embedding-vector dictionary named in the
# notebook header. ``with`` guarantees the file handle is closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open('identifier_to_vector_protbert_bdf_11012022_dict.pkl', 'rb') as iv_file:
    iv = pickle.load(iv_file)

## make predictions

In [None]:
# Cluster identifiers to score, in the dictionary's insertion order.
cs = list(ci)

In [None]:
# Score every cluster: run the model on the embedding of each member that has
# one, average the model outputs over those members, and keep the class with
# the highest mean output.
#
# cs_preds rows:                (cluster, label, mean output of the winning
#                                class, std of that class's output, number of
#                                embedded members)
# cs_prediction_per_class rows: (cluster, mean output for every class)
cs_preds = []
cs_prediction_per_class = []
for c in tqdm(cs):

    # Embeddings of the members present in the lookup; others are dropped.
    vs = [iv[i] for i in ci[c] if i in iv]
    if len(vs) < len(ci[c]):
        # Report clusters for which at least one member has no embedding.
        print(c)
    if not vs:
        # Nothing to score. Checked unconditionally so that a cluster whose
        # member list is itself empty is skipped instead of crashing in
        # np.vstack (which rejects an empty sequence).
        continue

    vs = np.vstack(vs)

    # assumes predict returns one row per input and one column per class
    # -- TODO confirm against the model's output layer
    pred = model.predict(vs, verbose=0)

    pred_mean = np.mean(pred, axis=0)
    pred_sd_all = np.std(pred, axis=0)  # population std (NumPy default ddof=0)

    # Index of the winning class, computed once and reused below.
    top = pred_mean.argmax()
    pred_score = pred_mean[top]
    pred_sd = pred_sd_all[top]
    pred_c = classes.classes_[top]

    cs_preds.append((c, pred_c, pred_score, pred_sd, len(vs)))
    # Unpack all class means rather than indexing 0..9 so the row width follows
    # the model's number of outputs; identical to before for ten classes.
    cs_prediction_per_class.append((c, *pred_mean))

In [None]:
# Tabulate the winning prediction for every scored cluster and write it out
# as a tab-separated file without the index column.
cs_assignment = pd.DataFrame(
    data=cs_preds,
    columns=[
        'cluster',
        'label',
        'average_prob',
        'sd_prob',
        'number_of_sequences',
    ],
)
cs_assignment.to_csv(
    'efam_clusters_predictions.tsv',
    index=False,
    sep='\t',
)

In [None]:
# Column headers: the cluster identifier followed by one column per entry of
# ``classes.classes_``, in that object's order. Unpacking replaces the
# hard-coded indices 0..9 so the header adapts to however many classes the
# encoder holds; the result is identical when there are exactly ten.
# NOTE(review): assumes the order of ``classes_`` matches the order of the
# model's outputs -- confirm.
cols = ('cluster', *classes.classes_)

# One row per scored cluster with the mean model output for every class,
# written as a tab-separated file without the index column.
cs_assignment_1 = pd.DataFrame(cs_prediction_per_class, columns=cols)
cs_assignment_1.to_csv('efam_clusters_all_probabilities.tsv', sep='\t', index=False)