In [1]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import bokeh.io
import bokeh.plotting
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.optim as optim
from tdc.multi_pred import PPI
from tqdm import tqdm

ts = time.time()
bokeh.io.output_notebook()

In [2]:
data = PPI(name = 'HuRI')
data = data.neg_sample(frac = 1)
split = data.get_split()
train = split['train']
valid = split['valid']
test = split['test']

Found local copy...
Loading...
Done!


In [3]:
# remove data where protein sequences are longer than 800 amino acids
train = train.loc[train["Protein1"].str.len()<1000]
train = train.loc[train["Protein2"].str.len()<1000]
# valid = valid.loc[valid["Protein1"].str.len()<1000]
# valid = valid.loc[valid["Protein2"].str.len()<1000]
# test = test.loc[test["Protein1"].str.len()<1000]
# test = test.loc[test["Protein2"].str.len()<1000]

In [4]:
train.head(2)

Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1


In [5]:
aatoidx = {}
i = 1
skip = False
for p1 in train['Protein1']:
    for c1 in p1:
        if c1 not in aatoidx.keys():
            aatoidx[c1] = i
            i += 1
    if i == 24:
        skip = True
        break
if not skip:
    for p2 in train['Protein2']:
        for c2 in p2:
            if c1 not in aatoidx.keys():
                aatoidx[c1] = i
                i += 1
        if i == 23:
            break
aatoidx['0'] = 0

idxtoaa = {}
for key in aatoidx.keys():
    idxtoaa[aatoidx[key]] = key
    
aato1hot = {}
for aa in aatoidx.keys():
    aato1hot[aa] = F.one_hot(torch.tensor(aatoidx[aa]), num_classes=len(aatoidx))
aato1hot['0'] = torch.cat((torch.tensor(1).unsqueeze(-1), torch.zeros(23))).type(torch.long)

In [6]:
max_len = 1000

aa1 = torch.empty((len(train['Protein1']), max_len, len(aato1hot)))
aa2 = torch.empty((len(train['Protein2']), max_len, len(aato1hot)))
for p in range(len(train['Protein1'])):
    prot1 = train['Protein1'].iloc[p]
    prot2 = train['Protein2'].iloc[p]
    for i in range(max_len):
        if i < len(prot1):
            aa1[p,i,:] = aato1hot[prot1[i]]
        else:
            aa1[p,i,:] = aato1hot['0']
        if i < len(prot2):
            aa2[p,i,:] = aato1hot[prot2[i]]
        else:
            aa2[p,i,:] = aato1hot['0']

In [25]:
# aa1 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(1000)] for s in train['Protein1']]
# aa2 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(1000)] for s in train['Protein2']]
# aav1 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(800)] for s in valid['Protein1']]
# aav2 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(800)] for s in valid['Protein2']]
# aat1 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(800)] for s in test['Protein1']]
# aat2 = [[aatoidx[s[i]] if i < len(s) else 0 for i in range(800)] for s in test['Protein2']]

In [26]:
len(aa1), len(aa2) #, len(aav1), len(aav2), len(aat1), len(aat2)

(24694, 24694)

In [27]:
# print('Min protein lengths:')
# print('\tTrain 1:', min([len(s) for s in aa1]))
# print('\tTrain 2:', min([len(s) for s in aa2]))
# print('\tValid 1:', min([len(s) for s in aav1]))
# print('\tValid 2:', min([len(s) for s in aav2]))
# print('\tTest 1:', min([len(s) for s in aat1]))
# print('\tTest 2:', min([len(s) for s in aat2]))
# print('Mean protein lengths:')
# print('\tTrain 1:', sum([len(s) for s in aa1])/len(aa1))
# print('\tTrain 2:', sum([len(s) for s in aa2])/len(aa2))
# print('\tValid 1:', sum([len(s) for s in aav1])/len(aav1))
# print('\tValid 2:', sum([len(s) for s in aav2])/len(aav2))
# print('\tTest 1:', sum([len(s) for s in aat1])/len(aat1))
# print('\tTest 2:', sum([len(s) for s in aat2])/len(aat2))
# print('Max protein lengths:')
# print('\tTrain 1:', max([len(s) for s in aa1]))
# print('\tTrain 2:', max([len(s) for s in aa2]))
# print('\tValid 1:', max([len(s) for s in aav1]))
# print('\tValid 2:', max([len(s) for s in aav2]))
# print('\tTest 1:', max([len(s) for s in aat1]))
# print('\tTest 2:', max([len(s) for s in aat2]))

Min protein lengths:
	Train 1: 1000
	Train 2: 1000
	Valid 1: 36
	Valid 2: 36
	Test 1: 26
	Test 2: 26
Mean protein lengths:
	Train 1: 1000.0
	Train 2: 1000.0
	Valid 1: 1382.5512698109605
	Valid 2: 1342.351346190567
	Test 1: 1403.1202978804658
	Test 2: 1343.2868531602062
Max protein lengths:
	Train 1: 1000
	Train 2: 1000
	Valid 1: 29856
	Valid 2: 33472
	Test 1: 33472
	Test 2: 27986


In [29]:
# lens1 = [len(aa1[i]) for i in range(len(aa1))]
# lens2 = [len(aa2[i]) for i in range(len(aa2))]
# lensv1 = [len(aav1[i]) for i in range(len(aav1))]
# lensv2 = [len(aav2[i]) for i in range(len(aav2))]
# lenst1 = [len(aat1[i]) for i in range(len(aat1))]
# lenst2 = [len(aat2[i]) for i in range(len(aat2))]
# lens1.sort()
# lens2.sort()
# lensv1.sort()
# lensv2.sort()
# lenst1.sort()
# lenst2.sort()

# p = bokeh.plotting.figure(title='Sequence Length CDFs', x_axis_type='log', y_axis_type='log')
# p.line(lens1, [(i+1)/len(aa1) for i in range(len(aa1))],
#        color='red', legend_label='Train 1', line_width=3, alpha=0.5)
# p.line(lens2, [(i+1)/len(aa2) for i in range(len(aa2))],
#        color='orange', legend_label='Train 2', line_width=3, alpha=0.5)
# p.line(lensv1, [(i+1)/len(aav1) for i in range(len(aav1))],
#        color='green', legend_label='Valid 1', line_width=3, alpha=0.5)
# p.line(lensv2, [(i+1)/len(aav2) for i in range(len(aav2))],
#        color='cyan', legend_label='Valid 2', line_width=3, alpha=0.5)
# p.line(lenst1, [(i+1)/len(aat1) for i in range(len(aat1))],
#        color='blue', legend_label='Test 1', line_width=3, alpha=0.5)
# p.line(lenst2, [(i+1)/len(aat2) for i in range(len(aat2))],
#        color='purple', legend_label='Test 2', line_width=3, alpha=0.5)
# p.legend.location = 'top_left'
# bokeh.io.show(p)

In [7]:
class HuRI(Dataset):
    def __init__(self, labels, p1s, p2s):
        # labels is an (n_samples,)-long ndarray
        # p1s is a list of lists of integers representing protein 1 amino acids
        # p2s is a list of lists of integers representing protein 2 amino acids
        self.labels = labels
        self.p1s = p1s
        self.p2s = p2s
    
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, idx):
        return (torch.tensor(self.p1s[idx]).type(torch.float), 
                torch.tensor(self.p2s[idx]).type(torch.float)), \
               torch.tensor(self.labels.iloc[idx]).type(torch.float)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Running on {device}')

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

Running on cpu


In [10]:
labels = train['Y']

data = HuRI(labels, aa1, aa2)

train_test = random_split(data, [int(labels.shape[0]*0.8), labels.shape[0]-int(labels.shape[0]*0.8)])

batch_size = 16
train_loader = DataLoader(train_test[0], batch_size=batch_size, shuffle=True)
test_loader = DataLoader(train_test[1], batch_size=batch_size, shuffle=False)

In [104]:
# labels = test['Y']

# data = HuRI(labels, aat1, aat2)

# batch_size = 18282
# test_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

In [21]:
class ConvolutionalClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(0,0,1)
    
    def forward(self, p1, p2):
        return F.softmax(torch.randn(2))

In [22]:
# # Define the neural network model
# class BinaryClassifier(nn.Module):
#     def __init__(self, n_samples = 5):
#         super(BinaryClassifier, self).__init__()
#         self.fc1 = nn.Linear(800, n_samples)   # Input layer
#         # Layers???
#         self.fc2 = nn.Linear(800, n_samples)
#         self.gelu = nn.GELU()
#         self.fc3 = nn.Linear(10, 1)    
#         self.sig = nn.Sigmoid() # Output layer        

#     def forward(self, x1, x2):
#         out1 = self.gelu(self.fc1(x1))
#         out2 = self.gelu(self.fc2(x2))
#         out = torch.cat((out1, out2), dim=1)
#         out = self.fc3(out)
#         out = self.sig(out)

#         return out


In [23]:
# Initialize the model and optimizer
model = ConvolutionalClassifier()

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4)

loss_fn = nn.BCELoss(reduction='sum')

In [24]:
num_epochs = 30

train_loss_tracker = np.zeros(num_epochs)
test_loss_tracker = np.zeros(num_epochs)

tic = time.time()
print(f'Setup Time: {tic - ts}')
te = tic

# Train the model
for epoch in tqdm(range(num_epochs)):
    for i, (train_data, train_labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(train_data[0].to(device), train_data[1].to(device)).to(device)
        train_labels = train_labels.unsqueeze(1).to(device)
        train_loss_tracker[i] += loss_fn(outputs, train_labels) / len(train_test[0].shape[0])
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
                                                                      
    acc = 0
    with torch.no_grad():
        for i, (test_data, test_labels) in enumerate(test_loader):
            outputs = model(test_data[0].to(device), test_data[1].to(device)).to(device)
            test_labels = test_labels.unsqueeze(1).to(device)
            test_loss_tracker[i] += loss_fn(outputs, test_labels) / train_test[1].shape[0]
            ps, preds = torch.max(outputs)
            acc += sum(preds == test_labels) / train_test[1].shape[0]
    print(f'Epoch {epoch+1} Accuracy: {acc*100},\tTrain Loss: {train_loss},', end='')
    print(f'\tTest Loss: {test_loss},\tRuntime: {time.time()-te}')
    te = time.time()
    
toc = time.time()
print(f'Training Runtime: {toc-tic}s')

Setup Time: 1719.8332335948944


  0%|                                                                                         | 0/30 [00:00<?, ?it/s]


ValueError: Using a target size (torch.Size([16, 1])) that is different to the input size (torch.Size([2])) is deprecated. Please ensure they have the same size.