In [70]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import bokeh.io
import bokeh.plotting
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.optim as optim
from tdc.multi_pred import PPI

# Wall-clock timestamp taken right after the imports; the training cell
# subtracts it from a later timestamp to print the 'Setup Time'.
ts = time.time()
# Send Bokeh figures to inline notebook output.
bokeh.io.output_notebook()

In [27]:
# Load the HuRI protein-protein interaction data, apply negative sampling
# with frac = 1, and unpack the resulting train / valid / test frames.
data = PPI(name='HuRI').neg_sample(frac=1)
split = data.get_split()
train, valid, test = (split[part] for part in ('train', 'valid', 'test'))

Found local copy...
Loading...
Done!


In [28]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1


In [101]:
def build_aa_index(*sequence_columns):
    """Assign a consecutive integer index to every distinct residue character.

    Indices are handed out in first-appearance order, scanning the columns in
    the order given and each column from its first sequence to its last.

    Parameters
    ----------
    *sequence_columns : iterables of str
        One or more collections of sequence strings.

    Returns
    -------
    dict
        Maps residue character -> integer index in ``range(n_symbols)``.
    """
    index = {}
    for column in sequence_columns:
        # dict.fromkeys de-duplicates while preserving order, so a sequence
        # that occurs in many rows is scanned only once.
        for sequence in dict.fromkeys(column):
            for residue in dict.fromkeys(sequence):
                # len(index) is evaluated before insertion, i.e. the next free index.
                index.setdefault(residue, len(index))
    return index


# Both protein columns contribute to the alphabet.  The earlier version tested
# the stale Protein1 loop variable while walking Protein2 (so Protein2 never
# added a symbol) and stopped after a hard-coded 23 symbols.
aatoidx = build_aa_index(train['Protein1'], train['Protein2'])

# Inverse lookup: integer index -> residue character.
idxtoaa = {idx: residue for residue, idx in aatoidx.items()}

In [33]:
def _encode(sequences):
    """Return, for each sequence, the list of `aatoidx` codes of its characters."""
    return [[aatoidx[residue] for residue in sequence] for sequence in sequences]


# Integer-encoded Protein1 / Protein2 columns of the train, valid and test frames.
aa1, aa2 = _encode(train['Protein1']), _encode(train['Protein2'])
aav1, aav2 = _encode(valid['Protein1']), _encode(valid['Protein2'])
aat1, aat2 = _encode(test['Protein1']), _encode(test['Protein2'])

In [39]:
# Number of encoded sequences per column, in the order train, valid, test.
tuple(len(encoded) for encoded in (aa1, aa2, aav1, aav2, aat1, aat2))

(73316, 73316, 10474, 10474, 20948, 20948)

In [98]:
# Report min / mean / max sequence length for every encoded column.
_length_rows = (
    ('\tTrain 1:', aa1),
    ('\tTrain 2:', aa2),
    ('\tValid 1:', aav1),
    ('\tValid 2:', aav2),
    ('\tTest 1:', aat1),
    ('\tTest 2:', aat2),
)
_length_stats = (
    ('Min protein lengths:', min),
    ('Mean protein lengths:', lambda lengths: sum(lengths) / len(lengths)),
    ('Max protein lengths:', max),
)
for heading, statistic in _length_stats:
    print(heading)
    for row_label, encoded in _length_rows:
        print(row_label, statistic([len(s) for s in encoded]))

Min protein lengths:
	Train 1: 26
	Train 2: 26
	Valid 1: 26
	Valid 2: 36
	Test 1: 26
	Test 2: 26
Mean protein lengths:
	Train 1: 1404.8688826449888
	Train 2: 1338.772450761089
	Valid 1: 1390.0171854114951
	Valid 2: 1313.020908917319
	Test 1: 1404.0047737254154
	Test 2: 1332.4466297498568
Max protein lengths:
	Train 1: 33472
	Train 2: 33472
	Valid 1: 33472
	Valid 2: 33472
	Test 1: 33472
	Test 2: 29856


In [99]:
# Empirical CDFs of sequence length for each protein column of each split.
# The sorted length lists are built here so the cell runs on a fresh kernel;
# they were previously referenced without being defined anywhere in the notebook.
# NOTE(review): assumes the missing lists were the sorted sequence lengths,
# which is what the (i+1)/n CDF ordinate implies -- confirm.
lens, lens2 = sorted(map(len, aa1)), sorted(map(len, aa2))
lensv, lensv2 = sorted(map(len, aav1)), sorted(map(len, aav2))
lenst, lenst2 = sorted(map(len, aat1)), sorted(map(len, aat2))

p = bokeh.plotting.figure(title='Sequence Length CDFs',
                          x_axis_type='log', y_axis_type='log',
                          x_axis_label='Sequence length (residues)',
                          y_axis_label='Fraction of sequences')

# (sorted lengths, line colour, legend entry) for each curve.
cdf_series = (
    (lens, 'red', 'Train 1'),
    (lens2, 'orange', 'Train 2'),
    (lensv, 'green', 'Valid 1'),
    (lensv2, 'cyan', 'Valid 2'),
    (lenst, 'blue', 'Test 1'),
    (lenst2, 'purple', 'Test 2'),
)
for sorted_lengths, colour, label in cdf_series:
    n = len(sorted_lengths)
    # The k-th smallest length (1-based) has cumulative fraction k / n.
    p.line(sorted_lengths, [(rank + 1) / n for rank in range(n)],
           color=colour, legend_label=label, line_width=3, alpha=0.5)

p.legend.location = 'top_left'
bokeh.io.show(p)

In [100]:
class HuRI(Dataset):
    """Dataset of integer-encoded protein pairs with one label per pair.

    Parameters
    ----------
    labels : pandas Series, ndarray or other sequence of length n_samples
        One label per pair.  A Series is always read by position, never by
        its index labels.
    p1s : list of lists of int
        Integer-encoded amino acids of protein 1, one inner list per sample.
    p2s : list of lists of int
        Integer-encoded amino acids of protein 2, one inner list per sample.

    Raises
    ------
    ValueError
        If `labels`, `p1s` and `p2s` do not all have the same length.
    """

    def __init__(self, labels, p1s, p2s):
        if not (len(labels) == len(p1s) == len(p2s)):
            raise ValueError(
                'labels, p1s and p2s must have equal lengths, got '
                f'{len(labels)}, {len(p1s)} and {len(p2s)}'
            )
        self.labels = labels
        self.p1s = p1s
        self.p2s = p2s

    def __len__(self):
        """Number of protein pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``((protein1, protein2), label)`` for the pair at position `idx`.

        The proteins are integer tensors of per-sample length; the label is a
        0-d float tensor.
        """
        # Use .iloc when available (pandas) so lookup is positional even with
        # a non-default index; plain arrays and lists are indexed directly.
        positional = getattr(self.labels, 'iloc', self.labels)
        label = positional[idx]
        return ((torch.tensor(self.p1s[idx]), torch.tensor(self.p2s[idx])),
                torch.tensor(label, dtype=torch.float))

In [31]:


class Model(nn.Module):
    """Placeholder network: it registers no layers and has no parameters yet."""

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        """Stub forward pass: `x` is ignored and None is returned."""
        return None

In [32]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Running on {device}')

# Turn cuDNN on and enable its benchmark mode.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Running on cpu


In [102]:
labels = train['Y']

data = HuRI(labels, aa1, aa2)

# Padding code placed just past the largest amino-acid index so it cannot be
# confused with a real residue.
PAD_IDX = len(aatoidx)
# Fixed seed so the 75/25 split below is the same on every run.
SPLIT_SEED = 0


def pad_collate(batch):
    """Collate variable-length protein pairs into padded batch tensors.

    The default collate function stacks samples and therefore fails when the
    sequences in a batch differ in length; this one right-pads every sequence
    with `PAD_IDX` to the longest sequence in the batch.

    Parameters
    ----------
    batch : list of ``((protein1, protein2), label)`` samples from `HuRI`.

    Returns
    -------
    ((p1_batch, p2_batch), label_batch)
        `p1_batch` / `p2_batch` have shape (batch, longest length in batch);
        `label_batch` has shape (batch,).
    """
    first, second, targets = [], [], []
    for (p1, p2), y in batch:
        first.append(p1)
        second.append(p2)
        targets.append(y)
    pad = nn.utils.rnn.pad_sequence
    return ((pad(first, batch_first=True, padding_value=PAD_IDX),
             pad(second, batch_first=True, padding_value=PAD_IDX)),
            torch.stack(targets))


# 75 % of the samples for fitting, the remainder held out.
n_total = labels.shape[0]
n_fit = int(n_total * 0.75)
train_test = random_split(data, [n_fit, n_total - n_fit],
                          generator=torch.Generator().manual_seed(SPLIT_SEED))

batch_size = 16
train_loader = DataLoader(train_test[0], batch_size=batch_size, shuffle=True,
                          collate_fn=pad_collate)
test_loader = DataLoader(train_test[1], batch_size=batch_size, shuffle=False,
                         collate_fn=pad_collate)

In [None]:
# Instantiate the network and move it to the selected device.
model = Model().to(device)

# Cross-entropy summed (not averaged) over each batch.
loss_fn = nn.CrossEntropyLoss(reduction='sum')

# Two optimizers over the same parameters; the training loop uses the first
# for epochs below `optimizer_cutoff` and the second from then on.
optimizer1 = optim.Adam(model.parameters())
optimizer2 = optim.SGD(model.parameters(), lr=0.001)

n_epochs = 3
# With the 4/4.0 factor the cutoff equals n_epochs.
optimizer_cutoff = int(n_epochs * 4 / 4.0)

In [None]:
# Train `model` for `n_epochs`; after each epoch report hold-out accuracy,
# per-sample train / hold-out loss and the epoch's wall-clock time.
tic = time.time()
print(f'Setup Time: {tic - ts}')
te = tic

# random_split yields Subset objects, which support len() but have no .shape.
n_train = len(train_test[0])
n_test = len(train_test[1])


def _to_device(batch_inputs):
    """Move a tensor, or a tuple/list of tensors (the protein pair), to `device`."""
    if torch.is_tensor(batch_inputs):
        return batch_inputs.to(device)
    return tuple(t.to(device) for t in batch_inputs)


for epoch in range(n_epochs):
    train_loss = 0.0
    test_loss = 0.0
    optimizer = optimizer1 if epoch < optimizer_cutoff else optimizer2

    model.train()
    for inputs, labs in train_loader:
        inputs = _to_device(inputs)
        # CrossEntropyLoss takes class indices as an integer (long) tensor.
        labs = labs.to(device).long()

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labs)
        loss.backward()
        optimizer.step()
        # .item() detaches the value so each batch's graph can be freed.
        train_loss += loss.item() / n_train

    acc = 0.0
    model.eval()
    with torch.no_grad():
        for inputs, labs in test_loader:
            inputs = _to_device(inputs)
            labs = labs.to(device).long()
            outputs = model(inputs)
            test_loss += loss_fn(outputs, labs).item() / n_test
            # Reduce over the class dimension: (max logit, predicted class) per sample.
            ps, preds = torch.max(outputs, dim=1)
            acc += (preds == labs).sum().item() / n_test
    print(f'Epoch {epoch+1} Accuracy: {acc*100},\tTrain Loss: {train_loss},', end='')
    print(f'\tTest Loss: {test_loss},\tRuntime: {time.time()-te}')
    te = time.time()

toc = time.time()
print(f'Training Runtime: {toc-tic}s')