# PHROG data

PHROG_data must contain:  

protbert_bfd_embeddings_phrog: download all PHROG family embedding pkl objects from Google Cloud. Follow the instructions in the repo README.

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras import backend as K
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import random
import os
from tqdm import tqdm

In [None]:
# Root folder of the downloaded PHROG data; the embedding pickles are read
# from its protbert_bfd_embeddings_phrog/ sub-folder in the cells below.
phrog_data_dir = 'PHROG_data/'

In [None]:
# PHROG index table; the cells below use its '#phrog' (family id) and
# 'Category' (functional label) columns.
phrog_metadata = pd.read_csv('../PHROG_index_revised_v4_10292022.csv')

In [None]:
# Upper bound on how many embeddings are sampled from any single family when
# the training sets are assembled below.
sequence_number_per_family = 1000000

### still only consider the labels from PHROGs v3

In [None]:
# Number of index rows per 'Category' value.
phrog_metadata['Category'].value_counts()

In [None]:
# Annotated families only: rows that have a category at all and whose
# category is not the 'unknown function' placeholder.
has_category = phrog_metadata['Category'].notna()
is_placeholder = phrog_metadata['Category'].isin(['unknown function'])
phrog_known = phrog_metadata[has_category & ~is_placeholder]
len(phrog_known)

In [None]:
# Distinct category labels present among the annotated families.
cs = set(phrog_known['Category'])

In [None]:
## dict for family:label -> {fl}
## dict for family:vectors -> {fv}
## dict for label:families -> {lf}
fl = {}
fv = {}
lf = {}

for c in cs:
    # family ids annotated with category c
    ps = phrog_known[phrog_known['Category'] == c]['#phrog']
    for p in ps:
        fl[p] = c
        pkl_path = '{0}/protbert_bfd_embeddings_phrog/{1}.pkl'.format(phrog_data_dir, p)
        try:
            # Context manager so the handle is closed as soon as the pickle
            # is read instead of lingering for the rest of the session.
            # NOTE: unpickling executes code from the file; only load
            # embedding files obtained from a trusted source.
            with open(pkl_path, 'rb') as handle:
                fv[p] = pickle.load(handle)
        except FileNotFoundError:
            # A missing file is the only failure that is expected and safe to
            # skip; corrupt pickles or a keyboard interrupt should surface
            # rather than be reported as "not found".
            print('{0} embeddings not found'.format(p))
    # only families whose embeddings were actually loaded count for the label
    lf[c] = list(set(ps).intersection(set(fv.keys())))

### building the training and testing splits for PHROG family leave out

In [None]:
from typing import List, Dict

In [None]:
def subset_training_data(
    vectors: Dict,
    labels: Dict,
    tr_families: List,
    num_train_seq: int):
    """Sample up to ``num_train_seq`` embeddings per family and label them.

    Args:
        vectors: maps a family id to its collection of embedding vectors
            (anything supporting iteration and ``len``).
        labels: maps a family id to its label.
        tr_families: family ids to draw from; rows of the result follow this
            order, family by family.
        num_train_seq: maximum number of embeddings drawn (without
            replacement, via ``random.sample``) from each family.

    Returns:
        A ``(tr_vectors, tr_label)`` pair: ``tr_vectors`` is the row-wise
        stack of all sampled embeddings and ``tr_label`` is a list holding
        the family label of each row, in the same order.

    Raises:
        ValueError: if no embedding at all was selected (no families given,
            or every family is empty), since there is nothing to stack.
    """
    sampled = []
    tr_label = []
    for f in tr_families:
        members = list(vectors[f])
        n_take = min(num_train_seq, len(members))
        if n_take == 0:
            # An empty draw cannot be stacked next to real rows (np.vstack
            # would fail on the shape mismatch), so the family is skipped.
            continue
        sampled.append(random.sample(members, n_take))
        tr_label.extend([labels[f]] * n_take)

    if not sampled:
        raise ValueError('no embeddings available for the requested families')

    tr_vectors = np.vstack(sampled)
    return tr_vectors, tr_label

In [None]:
# Train on every family whose embeddings were loaded into fv.
train_families = list(set(fv.keys()))

In [None]:
# Build the training matrix and the per-row category labels from all loaded
# families, capped at sequence_number_per_family embeddings per family.
train_x, train_y = subset_training_data(
    vectors=fv, 
    labels=fl, 
    tr_families=train_families, 
    num_train_seq=sequence_number_per_family)

In [None]:
# Number of training rows per category label.
np.unique(np.array(train_y), return_counts=True)

## feed forward neural network

In [None]:
# Encode the category labels for the network: the binarizer learns the set of
# label values and turns every label into an indicator vector.
trainX = train_x
lb = LabelBinarizer().fit(train_y)
trainY = lb.transform(train_y)

In [None]:
# model architecture
# Input and output widths are read from the prepared data instead of being
# hard-coded, so the network keeps matching the embeddings and the label set
# if either one changes.
n_features = trainX.shape[1]
n_classes = trainY.shape[1]

model = Sequential()
# Only the first layer needs the input shape; later layers infer theirs.
model.add(Dense(512, input_shape=(n_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(128, activation="relu"))
# one softmax output per label column
model.add(Dense(n_classes, activation="softmax"))

In [None]:
# Train the first classifier; H keeps the per-epoch loss/accuracy history
# that the next cell plots.
n_epoch = 5
opt = Adam(0.0001)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
H = model.fit(trainX, trainY, batch_size=60, epochs=n_epoch)

In [None]:
# Training curves of the first model: loss and accuracy per epoch, drawn on
# one shared axis.
epochs = np.arange(0, n_epoch)
plt.style.use("ggplot")
plt.figure()
for key, name in (("loss", "train_loss"), ("accuracy", "train_acc")):
    plt.plot(epochs, H.history[key], label=name)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()



## test the PHROG unknown protein families

In [None]:
# Families explicitly labelled 'unknown function'. Rows with a missing
# category never match isin(), so no separate NaN filter is required.
unknown_mask = phrog_metadata['Category'].isin(['unknown function'])
phrog_unknown = phrog_metadata[unknown_mask]

In [None]:
# family:vectors dict for the 'unknown function' families
ufv = {}
for p in phrog_unknown['#phrog']:
    pkl_path = '{0}/protbert_bfd_embeddings_phrog/{1}.pkl'.format(phrog_data_dir, p)
    try:
        # Context manager so the handle is closed as soon as the pickle is
        # read instead of lingering for the rest of the session.
        # NOTE: unpickling executes code from the file; only load embedding
        # files obtained from a trusted source.
        with open(pkl_path, 'rb') as handle:
            ufv[p] = pickle.load(handle)
    except FileNotFoundError:
        # A missing file is the only failure that is expected and safe to
        # skip; corrupt pickles or a keyboard interrupt should surface rather
        # than be reported as "not found".
        print('{0} embeddings not found'.format(p))

In [None]:
# Split the 'unknown function' families by how sure the first model is about
# them: a family is "confident" when, after averaging the predictions over
# all of its sequences, at least one class scores above the threshold.
confidence = 0.8
confident_unknown = []
unconfident_unknown = []
for f in tqdm(ufv.keys()):
    pred_f = model.predict(ufv[f], verbose=0).mean(axis=0)
    bucket = confident_unknown if (pred_f > confidence).any() else unconfident_unknown
    bucket.append(f)

In [None]:
# Number of 'unknown function' families without a confident prediction.
len(unconfident_unknown)

In [None]:
# Collect embeddings of the families the first model was unsure about; they
# become the examples of the extra 'unknown' class.
ufv_vectors = []
for f in unconfident_unknown:
    n_take = min(sequence_number_per_family, len(ufv[f]))
    ufv_vectors.append(random.sample(list(ufv[f]), n_take))
ufv_vectors = np.vstack(ufv_vectors)
# one 'unknown' label per stacked row
ufv_label = ['unknown' for _ in range(len(ufv_vectors))]

In [None]:
# Total number of embeddings collected for the 'unknown' class.
len(ufv_vectors)

### train a new model with an 'unknown' class

In [None]:
# Append the 'unknown' examples after the annotated training rows; vectors
# and labels are extended in the same order so they stay aligned.
vectors = np.concatenate([train_x, ufv_vectors], axis=0)
label = np.concatenate([train_y, ufv_label], axis=0)

In [None]:
# Rows per label after adding the 'unknown' examples.
np.unique(np.array(label), return_counts=True)

In [None]:
# Rebind the training inputs/labels to the extended data set; trainY still
# holds the raw label values here and is binarized in the next cell.
trainX = vectors
trainY = label

In [None]:
# Re-fit the binarizer on the extended label set (now including 'unknown')
# and replace the raw labels with their indicator vectors.
lb = LabelBinarizer().fit(trainY)
trainY = lb.transform(trainY)

In [None]:
# model architecture
# Input and output widths are read from the prepared data instead of being
# hard-coded, so the network keeps matching the embeddings and the label set
# (including the added 'unknown' class) if either one changes.
n_features = trainX.shape[1]
n_classes = trainY.shape[1]

model2 = Sequential()
# Only the first layer needs the input shape; later layers infer theirs.
model2.add(Dense(512, input_shape=(n_features,), activation="relu"))
model2.add(Dropout(0.2))
model2.add(Dense(256, activation="relu"))
model2.add(Dropout(0.2))
model2.add(Dense(128, activation="relu"))
# one softmax output per label column
model2.add(Dense(n_classes, activation="softmax"))

In [None]:
# Train the second classifier (with the 'unknown' class); H2 keeps the
# per-epoch loss/accuracy history that the next cell plots.
n_epoch = 5
opt = Adam(0.0001)
model2.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
H2 = model2.fit(trainX, trainY, batch_size=60, epochs=n_epoch)

In [None]:
# Training curves of the second model: loss and accuracy per epoch, drawn on
# one shared axis of an 8x8 figure.
plt.rcParams["figure.figsize"] = (8, 8)
epochs = np.arange(0, n_epoch)
plt.style.use("ggplot")
plt.figure()
for key, name in (("loss", "train_loss"), ("accuracy", "train_acc")):
    plt.plot(epochs, H2.history[key], label=name)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()



## SAVE MODEL for external testing on protein sequence

In [None]:
# Save model2 together with the LabelBinarizer fitted on its training labels;
# both are written under models/ with a shared file-name prefix.
# Create the output folder up front (a no-op when it already exists) so the
# cell works on a fresh checkout as well as on re-runs.
os.makedirs('models', exist_ok=True)
model2.save('models/model_unknown_80_07092023')
# Context manager so the pickle is flushed and closed before the cell ends.
with open('models/model_unknown_80_07092023_lb.pkl', 'wb') as handle:
    pickle.dump(lb, handle)