In [1]:
!pip install PyTDC

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.optim as optim
from tdc.multi_pred import MTI

In [26]:
# Load the 'miRTarBase' dataset through TDC's MTI loader and call
# neg_sample(frac=1) on it (presumably this adds sampled negative pairs --
# confirm against the TDC documentation).
data = MTI(name='miRTarBase').neg_sample(frac=1)

# get_split() is indexed below by 'train', 'valid' and 'test'.
split = data.get_split()
train, valid, test = (split[part] for part in ('train', 'valid', 'test'))

Found local copy...
Loading...
Done!


In [61]:
# Nucleotide vocabularies: an integer index and a 4-wide one-hot vector per base.
btoidx = {base: pos for pos, base in enumerate('UCAG')}
bto1hot = {base: [int(slot == hot) for slot in range(4)] for base, hot in btoidx.items()}


def _one_hot_encode(seqs):
    """Map every sequence in `seqs` to a list of one-hot vectors (one per base)."""
    return [[bto1hot[base] for base in seq] for seq in seqs]


def _index_encode(seqs):
    """Map every sequence in `seqs` to a list of integer base indices."""
    return [[btoidx[base] for base in seq] for seq in seqs]


# Encode the miRNA column of each split in both representations.
ucag1hot = _one_hot_encode(train['miRNA'])
ucag = _index_encode(train['miRNA'])
ucag1hot_valid = _one_hot_encode(valid['miRNA'])
ucag_valid = _index_encode(valid['miRNA'])
ucag1hot_test = _one_hot_encode(test['miRNA'])
ucag_test = _index_encode(test['miRNA'])

In [67]:
# Per-split miRNA sequence lengths, in train / valid / test order.
_seq_lens = [[len(seq) for seq in part] for part in (ucag, ucag_valid, ucag_test)]

print('Min seq lengths: ', *(min(lens) for lens in _seq_lens))
print('Mean seq lengths: ', *(sum(lens) / len(lens) for lens in _seq_lens))
print('Max seq lengths: ', *(max(lens) for lens in _seq_lens))

Min seq lengths:  16 16 16
Mean seq lengths:  21.69915285253921 21.70384673065387 21.702298900851698
Max seq lengths:  28 28 28


In [65]:
# Build the residue -> integer index vocabulary from the training targets.
#
# Indices are assigned in order of first appearance (row order, then position
# within the sequence). The previous version stopped as soon as 23 symbols had
# been collected, which silently truncates the vocabulary if the data contains
# more than 23 distinct symbols and makes later lookups raise KeyError.
# Here the whole column is covered instead; to keep that cheap, each distinct
# target string is visited once (Series.unique() keeps first-appearance order)
# and dict.fromkeys() collapses repeated residues while preserving their order,
# so symbols receive the same indices as with a full character-by-character scan.
#
# NOTE(review): the vocabulary is built from `train` only; a symbol that occurs
# exclusively in another split would still be missing -- confirm this is intended.
aatoidx = {}
for p in train['Target'].unique():
  for c in dict.fromkeys(p):
    if c not in aatoidx:
      aatoidx[c] = len(aatoidx)

# Kept for backward compatibility: `i` is the number of symbols collected.
i = len(aatoidx)

In [69]:
def _encode_residues(seqs):
    """Translate each sequence in `seqs` into a list of `aatoidx` indices."""
    return [[aatoidx[residue] for residue in seq] for seq in seqs]


# Index-encode the 'Target' column of every split.
aa = _encode_residues(train['Target'])
aa_valid = _encode_residues(valid['Target'])
aa_test = _encode_residues(test['Target'])

In [70]:
# Per-split target sequence lengths, in train / valid / test order.
_aa_lens = [[len(seq) for seq in part] for part in (aa, aa_valid, aa_test)]

print('Min protein lengths: ', *(min(lens) for lens in _aa_lens))
print('Mean protein lengths: ', *(sum(lens) / len(lens) for lens in _aa_lens))
print('Max protein lengths: ', *(max(lens) for lens in _aa_lens))

Min protein lengths:  24 24 24
Mean protein lengths:  607.473874115137 606.6637672465507 606.0150468965776
Max protein lengths:  34350 8799 34350


In [71]:
class miRTarBase(Dataset):
    """Map-style dataset of (miRNA sequence, target sequence) pairs with a label.

    Each item is ``((mirna_idx, target_idx), label)`` where ``mirna_idx`` and
    ``target_idx`` are 1-D integer tensors of per-character vocabulary indices
    (their lengths follow the underlying strings, so they vary per item) and
    ``label`` is a 0-dim float tensor.

    Args:
        labels: sequence of labels (pandas Series or any indexable sequence).
        miRNAseqs: sequence of miRNA strings, aligned with ``labels``.
        AAseqs: sequence of target strings, aligned with ``labels``.
        base_vocab: optional character -> index mapping for miRNA sequences.
            Defaults to the notebook-level ``btoidx``, looked up at access time.
        aa_vocab: optional character -> index mapping for target sequences.
            Defaults to the notebook-level ``aatoidx``, looked up at access time.

    Raises:
        KeyError: from ``__getitem__`` if a sequence contains a character that
            is missing from the corresponding vocabulary.
    """

    def __init__(self, labels, miRNAseqs, AAseqs, base_vocab=None, aa_vocab=None):
        self.labels = labels
        self.miRNAseqs = miRNAseqs
        self.AAseqs = AAseqs
        self.base_vocab = base_vocab
        self.aa_vocab = aa_vocab

    @staticmethod
    def _at(seq, idx):
        """Return the element at *position* ``idx``.

        pandas objects are indexed by label with ``[]``; ``.iloc`` is used when
        available so that all three columns are addressed positionally and stay
        aligned even if the index is not 0..n-1.
        """
        return seq.iloc[idx] if hasattr(seq, 'iloc') else seq[idx]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Resolve the vocabularies lazily so the defaults track the notebook globals.
        base_vocab = btoidx if self.base_vocab is None else self.base_vocab
        aa_vocab = aatoidx if self.aa_vocab is None else self.aa_vocab

        mirna = torch.tensor([base_vocab[c] for c in self._at(self.miRNAseqs, idx)])
        # Look up each residue `aa` (previously the sample index was used as the key).
        target = torch.tensor([aa_vocab[aa] for aa in self._at(self.AAseqs, idx)])
        label = torch.tensor(self._at(self.labels, idx), dtype=torch.float)
        return (mirna, target), label

In [73]:
class Model(nn.Module):
  """Placeholder network: registers no layers and its forward pass yields None."""

  def __init__(self):
    super(Model, self).__init__()

  def forward(self, x):
    # Stub: the input is ignored and nothing is computed.
    return None

In [74]:
# cuDNN switches: keep the backend enabled and turn on its benchmark mode.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Running on {device}')

Running on cpu


In [75]:
labels = train['Y']

data = miRTarBase(labels, train['miRNA'], train['Target'])

# Hold out 30% of the training split for per-epoch evaluation. The generator is
# seeded so the partition is the same on every run of the notebook.
SPLIT_SEED = 0
n_total = labels.shape[0]
n_fit = int(n_total * 0.7)
train_test = random_split(
    data,
    [n_fit, n_total - n_fit],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Sequences have different lengths, so the default collate function cannot
# stack them into a batch. Pad each batch to its longest sequence instead.
# The pad index is one past the last real vocabulary index so it can never be
# confused with a real base/residue (e.g. nn.Embedding(len(vocab) + 1, dim,
# padding_idx=len(vocab)) on the model side).
MIRNA_PAD_IDX = len(btoidx)
AA_PAD_IDX = len(aatoidx)


def pad_collate(batch):
    """Collate ``((mirna, target), label)`` items into padded batch tensors.

    Args:
        batch: list of items as produced by ``miRTarBase.__getitem__``.

    Returns:
        ``((mirna_batch, target_batch), labels)`` where the two sequence
        tensors have shape (batch, longest sequence in this batch) and
        ``labels`` has shape (batch,). The nesting matches what the default
        collate function produces for equal-length items.
    """
    pairs, targets = zip(*batch)
    mirnas, proteins = zip(*pairs)
    mirna_batch = nn.utils.rnn.pad_sequence(
        list(mirnas), batch_first=True, padding_value=MIRNA_PAD_IDX)
    protein_batch = nn.utils.rnn.pad_sequence(
        list(proteins), batch_first=True, padding_value=AA_PAD_IDX)
    return (mirna_batch, protein_batch), torch.stack(targets)


batch_size = 16
train_loader = DataLoader(train_test[0], batch_size=batch_size, shuffle=True,
                          collate_fn=pad_collate)
test_loader = DataLoader(train_test[1], batch_size=batch_size, shuffle=False,
                         collate_fn=pad_collate)

In [None]:
# Training schedule: optimizer_cutoff is the first epoch index (0-based) at
# which the training loop stops using optimizer1.
n_epochs = 3
optimizer_cutoff = int(3 * n_epochs / 4)

# Build the network and place it on the selected device (Module.to returns the module).
model = Model().to(device)

loss_fn = nn.CrossEntropyLoss()

# Two optimizers over the same parameters: Adam with its default settings,
# and plain SGD with a fixed learning rate.
optimizer1 = optim.Adam(model.parameters())
optimizer2 = optim.SGD(model.parameters(), lr=0.001)

In [None]:
def _to_device(batch_inputs):
    """Move a tensor, or every tensor in a tuple/list of tensors, to `device`.

    The loaders yield the model inputs as a pair of tensors (miRNA, target),
    which has no `.to` method of its own.
    """
    if torch.is_tensor(batch_inputs):
        return batch_inputs.to(device)
    return tuple(t.to(device) for t in batch_inputs)


tic = time.time()

for epoch in range(n_epochs):
    # Use optimizer1 for the first `optimizer_cutoff` epochs, then optimizer2.
    optimizer = optimizer1 if epoch < optimizer_cutoff else optimizer2

    # ---- training pass ----
    model.train()
    for inputs, labs in train_loader:
        inputs = _to_device(inputs)
        # CrossEntropyLoss expects integer class indices; the dataset yields floats.
        labs = labs.to(device).long()

        optimizer.zero_grad()

        # assumes the model returns class scores of shape (batch, n_classes) -- TODO confirm
        outputs = model(inputs)
        loss = loss_fn(outputs, labs)
        loss.backward()
        optimizer.step()

    # ---- evaluation pass on the held-out part of the split ----
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labs in test_loader:
            inputs = _to_device(inputs)
            labs = labs.to(device).long()
            outputs = model(inputs)
            # Predicted class = index of the highest score along the class axis.
            preds = outputs.argmax(dim=1)
            n_correct += (preds == labs).sum().item()
            n_seen += labs.numel()
    # Normalise by the number of evaluated samples, not the training subset size.
    acc = n_correct / n_seen if n_seen else float('nan')
    print(f'Epoch {epoch+1} Accuracy: {acc}')

toc = time.time()
print(f'Training Runtime: {toc-tic}s')