In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']="0"
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
from itertools import chain
from collections import namedtuple
import pickle
import os.path
import shutil
import inspect 
from glob import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils
import torch.utils.data

from pytorch_utils import *
from pas_utils import *


In [2]:
# Locations of saved model checkpoints and of the preprocessed data.
MODEL_DIR = "./pytorch_models"
PROCESSED_DIR = "./APA_ML/processed"

# Hyper-parameters and runtime settings handed to the model and helpers.
params = dict(
    batch_size=32,
    lr=1e-3,
    beta=1e-3,
    net_type="Multi-Conv-Net",
    conv1d_kernel_size=12,
    conv1d_out_dim_1=40,
    pool_size_1=3,
    conv1d_out_dim_2=40,
    pool_size_2=4,
    linear1_dim=200,
    seq_len=455,
    lstm_output_size=100,
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # Checkpoint files are resolved relative to MODEL_DIR.
    parental_model_file=os.path.join(MODEL_DIR, "parental_model_bl-multi.pt"),
    f1_model_file=os.path.join(MODEL_DIR, "f1_model_from_bl-multi.pt"),
    dropout_rate=0.7,
    fold=5,
)

In [3]:

# Build the network from the hyper-parameter dict, then move it onto the
# configured device. The bare `.to(...)` call stays the last expression of the
# cell so the notebook echoes the module structure.
model = APAModel(params)
model.to(torch.device(params["device"]))


APAModel(
  (conv1d_1): Conv1d(4, 40, kernel_size=(12,), stride=(1,))
  (batchnorm_1): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1d_2): Conv1d(40, 40, kernel_size=(12,), stride=(1,))
  (batchnorm_2): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=1360, out_features=200, bias=True)
  (lstm): LSTM(200, 50, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.7, inplace=False)
  (linear2): Linear(in_features=100, out_features=1, bias=True)
)

In [4]:

# Restore the trained weights from the checkpoint named in params.
print("reload the best model and test")
# map_location remaps the stored tensors onto the configured device, so a
# checkpoint written from a GPU session still loads when params["device"] has
# fallen back to "cpu".
model.load_state_dict(
    torch.load(params["parental_model_file"], map_location=params["device"])
)


reload the best model and test


<All keys matched successfully>

In [20]:

def predict(logits, pas_numbers):
    """Softmax over the last axis after masking out unused positions.

    For each row ``i`` in ``range(len(pas_numbers))``, the entries
    ``logits[i, pas_numbers[i]:]`` are replaced by ``-1e32`` before the
    softmax, so they receive (numerically) zero probability.

    The masking is applied to a copy: the ``logits`` tensor passed in by the
    caller is left unmodified.

    Args:
        logits: tensor indexed as ``[row, position]``.
        pas_numbers: per-row count of leading positions to keep.

    Returns:
        Tensor of the same shape as ``logits`` holding the softmax values.
    """
    # Clone first: writing the mask into the argument would silently corrupt
    # the caller's raw scores.
    masked = logits.clone()
    for i in range(len(pas_numbers)):
        masked[i, pas_numbers[i]:] = -1e32
    return F.softmax(masked, -1)

def model_predict(model, data_set, params, softmax=True, up_to_pas_number=1):
    """Run ``model`` over every batch of ``data_set`` and collect its outputs.

    The model is switched to eval mode (and left there) and gradients are
    disabled for the duration of the call.

    Args:
        model: callable invoked as ``model(batch, pas_numbers)``; must also
            provide ``eval()``.
        data_set: iterable yielding ``(batch, pas_numbers)`` pairs whose
            elements support ``.to(device)``.
        params: dict; only ``params["device"]`` is read here.
        softmax: when true, the model output is passed through ``predict``;
            otherwise the raw output is used.
        up_to_pas_number: number of leading entries kept from each output row.

    Returns:
        ``np.ndarray`` built from one list per input row, each holding the
        first ``up_to_pas_number`` entries of that row's output.
    """
    collected = []
    model.eval()
    with torch.no_grad():
        for batch, pas_counts in data_set:
            batch = batch.to(params["device"])
            pas_counts = pas_counts.to(params["device"])
            outputs = model(batch, pas_counts)
            scores = predict(outputs, pas_counts) if softmax else outputs
            # One entry per row of the batch, truncated to the requested width.
            collected.extend(
                scores[row][:up_to_pas_number].tolist()
                for row in range(len(pas_counts))
            )
    return np.array(collected)


In [9]:
# Read the tab-separated table of sequence pairs that predictions are made for.
pair_data_path = '../PolyApredictors/apa_leslie_derti_apadb_pair_data_df_pair.csv'
df = pd.read_csv(pair_data_path, sep='\t')


In [10]:
# Choose the model input sequences: the wide extended proximal / distal
# sequences are copied unchanged into 'seq_p' / 'seq_d'. No slicing is applied
# here (the earlier .str.slice(175-70, 175-70+205-5) step is disabled); the
# flanking pads are added later, when the sequences are encoded.
df['seq_p'], df['seq_d'] = df['wide_seq_ext_prox'], df['wide_seq_ext_dist']


In [11]:
#Create sequence feature encodings of required shape

# Fixed flanking sequences put before / after every input sequence. They do
# not depend on the row, so they are defined once outside the loop.
up_pad = "TTTTTATGCTGATAGAAGCA"
dn_pad = "ACTCAAAGTGTTTCTAGGGGTTAAAAAGGTCCCATCCAGAGAGGATAGAGGCAGTGGTCTTCTGTCCCACCACCTGAGA"

xs = []
for seq_p, seq_d in zip(df['seq_p'], df['seq_d']) :

    # "X" characters are rewritten to "O" before encoding
    # (presumably a symbol dna_one_hot treats as "no base" -- TODO confirm).
    seq_prox = up_pad + seq_p.replace("X", "O") + dn_pad
    seq_dist = up_pad + seq_d.replace("X", "O") + dn_pad

    # Stack the proximal and distal encodings along a new leading axis of
    # size 2 (proximal first, distal second).
    xs.append(np.stack([dna_one_hot(seq_prox), dna_one_hot(seq_dist)], axis=0))

# One entry per dataframe row along axis 0.
x = np.stack(xs, axis=0)

n_signals = 2

# Per-row signal count. np.int was removed in NumPy 1.24, so the integer dtype
# is named explicitly; int64 matches the LongTensor built from it below.
s = np.full((x.shape[0],), n_signals, dtype=np.int64)

# A single "batch" holding the whole table, shaped like the
# (batch, pas_numbers) pairs that model_predict iterates over.
data_set = [[
    torch.FloatTensor(x),
    torch.LongTensor(s)
]]


In [21]:
#Make proximal isoform predictions

# Softmax-normalised output, keeping only the first column of each row.
y_pred = model_predict(model, data_set, params, softmax=True, up_to_pas_number=1)
# Raw (un-normalised) model outputs, keeping the first two columns of each row.
logodds_pred = model_predict(model, data_set, params, up_to_pas_number=2, softmax=False)


In [26]:
#Store predictions in df

# Column 0 of the softmax output becomes the predicted proximal usage.
df['usage_prox'] = y_pred[:, 0]

# Columns 0 and 1 of the raw outputs become the proximal / distal scores.
df['score_prox'], df['score_dist'] = logodds_pred[:, 0], logodds_pred[:, 1]


In [27]:
#Re-save prediction df

# Write only the identifier, sequence and prediction columns, tab-separated
# and without the index.
df.to_csv(
    "apa_leslie_derti_apadb_pair_data_df_pair_deerect_apa.csv",
    columns=['gene_id', 'seq_prox', 'seq_dist', 'usage_prox', 'score_prox', 'score_dist'],
    sep='\t',
    index=False,
)
