In [1]:
# global libs
import torch as pt
from glob import glob
import os
import numpy as np
# import sys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# project libs
from src.data_encoding import encode_res, all_resnames, selected_locations, encode_location
from config import config_data, config_runtime
from model import RNN
from src.dataset import fasta_to_vector, read_fasta, SeqDataset

# Load model

In [3]:
# Sizes and hyper-parameters, read from the project configuration.
device = pt.device("cpu")
nres = config_data['sequence_max_length']
n_letters = len(all_resnames)
n_categories = len(selected_locations)
n_hidden = config_runtime['hidden_size']
n_layers = config_runtime['layers']
learning_rate = config_runtime['learning_rate']

# Build the network and restore the saved weights, mapping storages to `device`.
model = RNN(nres, n_hidden, n_layers, n_categories, device)
model.load_state_dict(
    pt.load("model.pt", map_location=device)
)

# Switch to inference mode, then place the model on `device`.
model.eval()
model = model.to(device)

# Softmax applied along dim 1 of the model output.
sm = pt.nn.Softmax(dim=1)

In [4]:
# Build the evaluation dataset from the configured file, passing the
# configured maximum sequence length as the second argument.
n_seq = config_data['sequence_max_length']
dataset = SeqDataset(
    config_data['dataset_filepath'],
    n_seq,
)

In [5]:
# Display the location labels; later cells index the model output in this order.
selected_locations

['membrane', 'nucle']

In [6]:

# Accumulate per-location confusion-matrix counts over the whole dataset.
# Each counter has one slot per entry of `selected_locations`.
n_locations = len(selected_locations)
TP = np.zeros(n_locations)  # predicted 1, label 1
FP = np.zeros(n_locations)  # predicted 1, label 0
TN = np.zeros(n_locations)  # predicted 0, label 0
FN = np.zeros(n_locations)  # predicted 0, label 1
P = np.zeros(n_locations)   # label 1
N = np.zeros(n_locations)   # label 0
threshold = 0.5  # a location counts as predicted when its softmax score exceeds this

# Iterate over every sample rather than a hard-coded count (previously 34607),
# so the loop stays correct if the dataset file changes.
# no_grad: this is pure inference, so no autograd graph is needed per sample.
with pt.no_grad():
    for i in range(len(dataset)):
        x, y = dataset[i]
        X = x[None, :]  # add a leading axis of size 1 before calling the model
        results = sm(model(X))[0].detach().numpy()

        bin_results = (results > threshold).astype(int)
        # assumes y holds 0/1 labels, one per location — TODO confirm against SeqDataset
        bin_y = y.detach().numpy().astype(int)

        TP += (bin_results == 1) & (bin_y == 1)
        FP += (bin_results == 1) & (bin_y == 0)
        TN += (bin_results == 0) & (bin_y == 0)
        FN += (bin_results == 0) & (bin_y == 1)
        P += (bin_y == 1)
        N += (bin_y == 0)


In [7]:
# Per-location metrics derived from the accumulated confusion counts.
ACC = np.divide(TP + TN, P + N)  # accuracy: correct predictions over all samples
TPR = np.divide(TP, P)           # true positive rate
TNR = np.divide(TN, N)           # true negative rate

In [8]:
# Balanced accuracy: the mean of the true positive and true negative rates.
BA = (TPR + TNR) / 2
BA

array([0.72878642, 0.75851449])

In [13]:
# Display the accumulated false-negative counts, one per location.
FN

array([1124., 1980.])

In [14]:
# Softmax along dim 1 of the model output, used to turn scores into probabilities.
sm = pt.nn.Softmax(dim=1)

# Directory that is scanned for example input files.
data_path = "../examples/"

In [15]:
# Collect every file in data_path whose name contains a dot.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order of the printed predictions reproducible across machines and runs.
pdb_filepaths = sorted(glob(os.path.join(data_path, "*.*")))

In [16]:
# Run the model on each example file and print one probability per location.
# NOTE(review): all_vec and all_names are initialised here but never filled
# in this cell — confirm whether they were meant to collect the results.
all_vec = []
all_names = []
for fasta_file in pdb_filepaths:
    # presumably a is the sequence and b the entry name (b is printed as the header) — verify
    a, b = read_fasta(fasta_file)
    v = fasta_to_vector(a, nres)

    # Skip files for which no input vector was produced.
    if v is None:
        continue

    with pt.no_grad():
        results = sm(model(v))[0]

    print()
    print(f'===== {b: ^15} ====')
    j = results.detach().numpy()

    for il, l in enumerate(selected_locations):
        print(f'{l: <15} p={j[il]:>5.2f}')


=====   SAR1_YEAST    ====
membrane        p= 0.57
nucle           p= 0.43

=====   CCL20_MOUSE   ====
membrane        p= 0.81
nucle           p= 0.19

=====   GID4_MOUSE    ====
membrane        p= 1.00
nucle           p= 0.00

=====   HDAC3_DANRE   ====
membrane        p= 0.91
nucle           p= 0.09

=====    XCL1_RAT     ====
membrane        p= 0.76
nucle           p= 0.24

=====   ICP27_HHV1E   ====
membrane        p= 0.23
nucle           p= 0.77

=====   MCL1_MOUSE    ====
membrane        p= 0.95
nucle           p= 0.05

=====   FTSZ_BACSU    ====
membrane        p= 0.35
nucle           p= 0.65

=====   MIM2_SCHPO    ====
membrane        p= 0.63
nucle           p= 0.37

=====    ACKR2_RAT    ====
membrane        p= 1.00
nucle           p= 0.00

=====   CLD1_MOUSE    ====
membrane        p= 0.91
nucle           p= 0.09

=====    DELE1_RAT    ====
membrane        p= 0.33
nucle           p= 0.67

=====    CCR2_RAT     ====
membrane        p= 1.00
nucle           p= 0.00

=====    IL