In [2]:
%matplotlib inline
import os
import sys
from datetime import datetime
import logging
logging.basicConfig(format='%(asctime)s %(message)s', stream=sys.stdout)

import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import numpy as np
from tqdm import tqdm_notebook as tqdm

from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [3]:
class Log():
    """Tiny console logger that prefixes every message with the wall-clock time."""

    def __init__(self):
        # The logger is stateless; nothing to set up.
        pass

    def print(self, msg):
        """Write `msg` to stdout, preceded by the current time as HH:MM:SS."""
        stamp = datetime.now().strftime("%H:%M:%S")
        print(f'{stamp} {msg}')


# Shared logger instance used by the rest of the notebook.
L = Log()

In [4]:
def read_dataset(model: str, task: str, split: str,  basepath='/scratch/fl1092/ml_protein_data'):
    """Load the stored object for one (model, task, split) combination.

    The file is looked up at `{basepath}/{model}/{task}/{task}_{split}.p`.
    For the 'transformer' model the 'test' split is read from the
    'test_fold_holdout' file instead.

    Returns whatever `np.load` yields for that file.
    """
    use_holdout = (model == 'transformer' and split == 'test')
    split_name = 'test_fold_holdout' if use_holdout else split

    path_to_file = f'{basepath}/{model}/{task}/{task}_{split_name}.p'

    # NOTE(review): allow_pickle=True unpickles the file, which can execute
    # arbitrary code -- only point this at trusted data.
    return np.load(path_to_file, allow_pickle=True)

def dict_2_arr(data_dict, labels, avgr=lambda x: np.mean(x, axis=0)):
    """Turn a dict of per-key embeddings into a feature matrix and label vector.

    Rows are emitted in sorted key order so that proteins coming from
    different models are indexed identically. Only keys present in BOTH
    `data_dict` and `labels` produce a row; the outputs are sized to exactly
    that number of rows (previously they were sized by `len(data_dict)`,
    which left trailing all-zero rows with label 0 whenever an embedding had
    no label).

    Args:
        data_dict: mapping key -> array; the last axis is the embedding width.
        labels: mapping key -> numeric label.
        avgr: reducer applied to each array to obtain one row
            (default: mean over axis 0).

    Returns:
        (X, y) with X of shape (n_shared, emb_width) and y of shape (n_shared,).
        An empty `data_dict` yields arrays of shape (0, 0) and (0,).
    """
    shared_keys = sorted(key for key in labels if key in data_dict)

    # Embedding width is taken from the first stored array, as before.
    emb_width = next(iter(data_dict.values())).shape[-1] if data_dict else 0

    X = np.zeros((len(shared_keys), emb_width))
    y = np.zeros(len(shared_keys))

    for row, key in enumerate(shared_keys):
        X[row] = avgr(data_dict[key])
        y[row] = labels[key]

    return X, y

def ensemble_append_mean_reps(dicts, labels, LEN, average=True, normalize=True):
    """Concatenate each key's embeddings from several models along the feature axis.

    Only keys present in every dictionary of `dicts` are used. Rows are
    emitted in sorted key order so the row order is reproducible between runs
    (set iteration order is not).

    Args:
        dicts: list of dicts, one per model, mapping key -> 2-D array whose
            axis 0 is the sequence position and axis 1 the feature.
        labels: mapping key -> numeric label; must contain every shared key.
        LEN: number of sequence positions to pad/truncate to when `average`
            is False. -1 means "pad to the longest shared sequence".
            Ignored when `average` is True.
        average: if True, mean-pool every array over axis 0 first; if False,
            keep the 2-D arrays (zero-padded / truncated to LEN rows).
        normalize: if True, scale each model's embedding (pooled or padded)
            to unit L2 norm, computed over all of its entries, before
            concatenation. The embedding keeps its shape.

    Returns:
        (X, y): X has shape (n, total_features) when averaging and
        (n, LEN, total_features) otherwise; y has shape (n,). With no shared
        keys, X is an empty array of shape (0, 0) or (0, 0, 0).

    Raises:
        KeyError: if a shared key is missing from `labels`.
    """
    # Keep only the keys every model has an embedding for.
    shared = set(dicts[0].keys())
    for other in dicts[1:]:
        shared &= set(other.keys())
    keys = sorted(shared)

    if not average and LEN == -1:
        # No explicit length requested: pad everything to the longest sequence
        # so the per-key arrays can be stacked into one tensor.
        LEN = max((d[key].shape[0] for d in dicts for key in keys), default=0)

    rows = []
    for key in keys:
        parts = []
        for d in dicts:
            emb = d[key]

            # 1d (pooled) or 2d (fixed-length) representation
            if average:
                emb = np.mean(emb, axis=0)
            elif emb.shape[0] < LEN:
                padding = np.zeros((LEN - emb.shape[0], emb.shape[1]))
                emb = np.concatenate([emb, padding], axis=0)
            elif emb.shape[0] > LEN:
                emb = emb[:LEN]

            if normalize:
                # sklearn expects a 2-D (samples x features) input; flatten to a
                # single sample, then restore the original layout.
                unit = preprocessing.normalize(emb.reshape(1, -1), norm='l2')
                emb = unit.reshape(emb.shape)

            parts.append(emb)
        rows.append(np.concatenate(parts, axis=-1))

    if rows:
        X = np.zeros((len(rows),) + rows[0].shape)
        for i, row in enumerate(rows):
            X[i] = row
    else:
        X = np.zeros((0, 0) if average else (0, 0, 0))

    y = np.zeros(len(keys))
    for i, key in enumerate(keys):
        y[i] = labels[key]

    print('concatenated embedding size: ', X.shape)

    return X, y

In [5]:
class DataSet(T.utils.data.Dataset):
    """Map-style dataset of concatenated multi-model embeddings for one task.

    On construction the requested split is read for every model in
    `self.models`, concatenated with `ensemble_append_mean_reps`, and kept in
    memory as a float32 tensor (features) plus the label array returned by
    that function. `get_split` switches the active split and loads it on
    first use.

    Args:
        task: task name forwarded to `read_dataset`.
        split: split to load and make active ('train' or 'test').
        average, normalize: forwarded to `ensemble_append_mean_reps`.
        conv: if True, insert a singleton channel axis after the batch axis.
        seq_len: forwarded to `ensemble_append_mean_reps` as `LEN`.
    """

    def __init__(self, task, split, average=True, normalize=True, conv=True, seq_len=-1):
        self.models = ['elmo','unirep', 'transformer']
        self.splits = ['train', 'test']
        self.x_data = {}
        self.y_data = {}
        self.LEN = seq_len
        self.split = split # the split of the current data

        # Remember the loading options so other splits can be loaded later.
        self._task = task
        self._average = average
        self._normalize = normalize
        self._conv = conv

        self._load_split(split)

    def _load_split(self, split):
        """Read, concatenate and cache the features and labels of `split`."""
        # load y
        labels = read_dataset('label', self._task, split)

        # load x, one dict per model
        to_append = [read_dataset(model, self._task, split) for model in self.models]

        # concatenate
        X_app, y = ensemble_append_mean_reps(
            to_append, labels, self.LEN, self._average, self._normalize)
        if self._conv:
            # Add the channel axis for convolutional nets. expand_dims works for
            # both pooled 2-D and un-pooled 3-D features; the former
            # `reshape(..., X_app.shape[2])` raised IndexError for 2-D input,
            # i.e. with the constructor's own defaults.
            X_app = np.expand_dims(X_app, axis=1)
        print('Shape after appending: ', X_app.shape)

        self.x_data[split] = T.tensor(X_app, dtype=T.float32)
        self.y_data[split] = y

    def get_split(self, split):
        """Make `split` the active split, loading it first if necessary."""
        assert(split in self.splits)
        if split not in self.x_data:
            # Previously this only flipped self.split, so a later __len__ or
            # __getitem__ failed with KeyError for a split that was never read.
            self._load_split(split)
        self.split = split

    def lenLongestSeq(self, task):
        """Return the largest axis-0 length over all models and splits (-1 if none).

        Prints the per-model, per-split maximum along the way.
        """
        # TODO: trim and pad everyting to 95% interval
        LEN = -1
        for split in self.splits:
            for model in self.models:
                data = read_dataset(model, task, split)

                length = max((embedding.shape[0] for embedding in data.values()), default=-1)

                print(f"Longest sequence in {model} {split}: {length}")

                LEN = max(LEN, length)

        return LEN

    def lenRangeSeq(self, task, p=0.05):
        """Return the sequence length at the upper p-th percentile.

        That is the (1 - p) quantile of all axis-0 lengths over every model
        and split, truncated to an int. Returns -1 when there are no
        sequences, mirroring `lenLongestSeq`. (The original collected the
        lengths but returned None.)
        """
        lens = []

        for split in self.splits:
            for model in self.models:
                data = read_dataset(model, task, split)
                lens.extend(embedding.shape[0] for embedding in data.values())

        if not lens:
            return -1
        return int(np.percentile(lens, 100 * (1 - p)))

    def __len__(self):
        """Number of samples in the active split."""
        return len(self.x_data[self.split])

    def __getitem__(self, idx):
        """Return the (features, label) pair(s) at `idx` from the active split."""
        if T.is_tensor(idx):
            idx = idx.tolist()

        x = self.x_data[self.split][idx, ]
        y = self.y_data[self.split][idx]

        return x, y

In [6]:
### encoder and decoder with two convolutional layers ###

class ConvEncoder(nn.Module):
    """Two strided convolutions followed by a linear projection to the latent space.

    Args:
        c: number of channels produced by the first convolution (the second
            produces 2*c).
        latent_dims: size of the output latent vector.
        input_size: (height, width) of the single-channel input. The default
            (256, 3692) reproduces the previously hard-coded 16 x 231 feature
            map in front of the linear layer.

    Input:  (batch, 1, height, width)
    Output: (batch, latent_dims)
    """

    def __init__(self, c=10, latent_dims=512, input_size=(256, 3692)):
        super(ConvEncoder, self).__init__()
        # Spatial size after the two convolutions, derived from input_size
        # instead of being baked into the linear layer.
        height, width = input_size
        for _ in range(2):
            height, width = self._conv_out(height), self._conv_out(width)

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=c, kernel_size=4, stride=4, padding=1) # out: c x H1 x W1
        self.conv2 = nn.Conv2d(in_channels=c, out_channels=c*2, kernel_size=4, stride=4, padding=1) # out: 2c x H2 x W2
        self.fc = nn.Linear(in_features=c*2*height*width, out_features=latent_dims)

    @staticmethod
    def _conv_out(size, kernel_size=4, stride=4, padding=1):
        """Output length of one conv layer along one axis (floor formula, dilation 1)."""
        return (size + 2 * padding - kernel_size) // stride + 1

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1) # flatten batch of multi-channel feature maps to a batch of feature vectors
        x = self.fc(x)
        return x

class ConvDecoder(nn.Module):
    """Linear layer plus two transposed convolutions mapping a latent vector back to an image.

    Args:
        c: channel count of the intermediate feature map (the first feature
            map has 2*c channels).
        latent_dims: size of the input latent vector.
        input_size: (height, width) of the reconstructed output. The default
            (256, 3692) reproduces the previously hard-coded sizes
            16 x 231 -> 64 x 923 -> 256 x 3692.

    Input:  (batch, latent_dims)
    Output: (batch, 1, height, width), squashed to [-1, 1] by tanh.
    """

    def __init__(self, c=10, latent_dims=512, input_size=(256, 3692)):
        super(ConvDecoder, self).__init__()
        self.c = c
        # Spatial sizes of every stage, obtained by running the matching
        # encoder's conv arithmetic forward. Transposed convs are ambiguous
        # about their output size, so these are passed as explicit targets.
        self._out_size = tuple(input_size)
        self._mid_size = tuple(self._conv_out(s) for s in self._out_size)
        self._in_size = tuple(self._conv_out(s) for s in self._mid_size)

        self.fc = nn.Linear(in_features=latent_dims, out_features=c*2*self._in_size[0]*self._in_size[1])
        self.conv2 = nn.ConvTranspose2d(in_channels=c*2, out_channels=c, kernel_size=4, stride=4, padding=1)
        self.conv1 = nn.ConvTranspose2d(in_channels=c, out_channels=1, kernel_size=4, stride=4, padding=1)

    @staticmethod
    def _conv_out(size, kernel_size=4, stride=4, padding=1):
        """Output length of one (forward) conv layer along one axis."""
        return (size + 2 * padding - kernel_size) // stride + 1

    def forward(self, x):
        x = self.fc(x)
        # unflatten batch of feature vectors to a batch of multi-channel feature maps
        x = x.view(x.size(0), self.c*2, *self._in_size)
        # Only the spatial target is given, so no batch size is hard-coded.
        x = F.relu(self.conv2(x, output_size=list(self._mid_size)))
        # last layer before output is tanh, since the images are normalized and 0-centered
        x = T.tanh(self.conv1(x, output_size=list(self._out_size)))
        return x

In [7]:
### encoder and decoder with fully connected layers only ###

class Encoder(nn.Module):
    """Single fully connected layer with ReLU: input_shape -> latent_dims.

    The former `fc2` / `fc3` layers were never called in `forward`; they only
    added untrained parameters to the optimizer, so they are no longer
    created. NOTE: this notebook defines `Encoder` again in a later cell, and
    that definition is the one in effect when the cells run in file order.

    Args:
        input_shape: number of input features.
        latent_dims: number of latent features produced.
    """

    def __init__(self, input_shape, latent_dims):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(in_features=input_shape, out_features=latent_dims)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return x

class Decoder(nn.Module):
    """Single fully connected layer with ReLU: latent_dims -> input_shape.

    NOTE: this notebook defines `Decoder` again in a later cell.
    NOTE(review): the ReLU makes every reconstructed value non-negative --
    confirm that this matches the range of the inputs being reconstructed.
    """

    def __init__(self, input_shape, latent_dims):
        super().__init__()
        # Kept under the name `fc3` so parameter names stay the same.
        self.fc3 = nn.Linear(latent_dims, input_shape)

    def forward(self, x):
        projected = self.fc3(x)
        return F.relu(projected)

In [21]:
### encoder and decoder with fully connected layers only ###

class Encoder(nn.Module):
    """One linear layer followed by ReLU, mapping input_shape features to latent_dims."""

    def __init__(self, input_shape, latent_dims):
        super().__init__()
        self.fc1 = nn.Linear(input_shape, latent_dims)

    def forward(self, x):
        latent = self.fc1(x)
        return F.relu(latent)

class Decoder(nn.Module):
    """One linear layer followed by ReLU, mapping latent_dims features back to input_shape.

    NOTE(review): because of the ReLU the reconstruction is never negative --
    confirm that this matches the range of the inputs being reconstructed.
    """

    def __init__(self, input_shape, latent_dims):
        super().__init__()
        # Named `fc3` (not `fc1`) to keep the existing parameter names.
        self.fc3 = nn.Linear(latent_dims, input_shape)

    def forward(self, x):
        reconstruction = self.fc3(x)
        return F.relu(reconstruction)

In [31]:
class DecoupledAutoencoder(nn.Module):
    """Three independent autoencoders, one per block of the concatenated input.

    The input's last axis is treated as three consecutive feature blocks of
    widths `input_shape = (dim1, dim2, dim3)`. Block i is encoded by `e{i}`
    and decoded by `d{i}`; nothing is shared between the three.

    Args:
        input_shape: the three block widths (dim1, dim2, dim3).
        conv: use ConvEncoder/ConvDecoder instead of the fully connected
            Encoder/Decoder.
        latent_dims: latent size of each of the three autoencoders.

    NOTE(review): in conv mode the ConvEncoder/ConvDecoder are built with
    their default geometry rather than per-block widths -- confirm that the
    sliced blocks match what those modules expect.
    """

    def __init__(self, input_shape, conv=False, latent_dims=512):
        super(DecoupledAutoencoder, self).__init__()
        self.latent_dims = latent_dims
        self.dim1, self.dim2, self.dim3 = input_shape

        if conv:
            L.print('Convolutional neural network')
            self.e1 = ConvEncoder(latent_dims=latent_dims)
            self.d1 = ConvDecoder(latent_dims=latent_dims)

            self.e2 = ConvEncoder(latent_dims=latent_dims)
            self.d2 = ConvDecoder(latent_dims=latent_dims)

            self.e3 = ConvEncoder(latent_dims=latent_dims)
            self.d3 = ConvDecoder(latent_dims=latent_dims)
        else:
            L.print('Fully connected neural network')
            self.e1 = Encoder(self.dim1, latent_dims)
            self.d1 = Decoder(self.dim1, latent_dims)

            self.e2 = Encoder(self.dim2, latent_dims)
            self.d2 = Decoder(self.dim2, latent_dims)

            self.e3 = Encoder(self.dim3, latent_dims)
            self.d3 = Decoder(self.dim3, latent_dims)

    def _split(self, x):
        """Cut `x` into its three feature blocks along the LAST axis."""
        end1 = self.dim1
        end2 = end1 + self.dim2
        end3 = end2 + self.dim3
        # `...` keeps every leading axis. Slicing axis 1 (as before) is the
        # same thing for 2-D input but hits the channel axis of 4-D conv
        # input, which left the second and third blocks empty.
        return x[..., :end1], x[..., end1:end2], x[..., end2:end3]

    def encode(self, x):
        """Return the three latent codes (l1, l2, l3) for input `x`."""
        x1, x2, x3 = self._split(x)

        l1 = self.e1(x1)
        l2 = self.e2(x2)
        l3 = self.e3(x3)

        return l1, l2, l3

    def reconstruct(self, l1, l2, l3):
        """Decode each latent code with its own decoder."""
        r1 = self.d1(l1)
        r2 = self.d2(l2)
        r3 = self.d3(l3)

        return r1, r2, r3

    def forward(self, x, d1=None, d2=None, d3=None):
        """Encode then decode `x`, returning the three reconstructions.

        `d1`, `d2`, `d3` are accepted for backward compatibility only and are
        ignored: the block widths come from the constructor. (They are
        unrelated to the decoder sub-modules `self.d1`..`self.d3`.)
        """
        l1, l2, l3 = self.encode(x)
        r1, r2, r3 = self.reconstruct(l1, l2, l3)

        return r1, r2, r3

In [29]:
def caemeLoss(x, r1, r2, r3, d1, d2, d3, l1=1, l2=1, l3=1):
    """Weighted sum of the three per-block reconstruction MSEs.

    `x` is split along its LAST axis into consecutive blocks of widths
    d1, d2, d3, and block i is compared with reconstruction r_i. Slicing the
    last axis is identical to the former axis-1 slicing for 2-D input, and
    additionally works when leading (e.g. channel) axes are present, where
    axis-1 slicing produced empty blocks and a NaN loss.

    Args:
        x: input batch whose last axis has at least d1 + d2 + d3 entries.
        r1, r2, r3: reconstructions of the three blocks.
        d1, d2, d3: block widths.
        l1, l2, l3: weights of the three MSE terms.

    Returns:
        Scalar tensor l1*MSE(x1, r1) + l2*MSE(x2, r2) + l3*MSE(x3, r3).
    """
    mse = nn.MSELoss()
    end1, end2, end3 = d1, d1 + d2, d1 + d2 + d3
    x1, x2, x3 = x[..., :end1], x[..., end1:end2], x[..., end2:end3]

    return l1*mse(x1, r1) + l2*mse(x2, r2) + l3*mse(x3, r3)

In [47]:
def metaEmbedding(data, model, device, conv=False):
    """Encode every sample of `data` and concatenate its three latent codes.

    Args:
        data: indexable collection with `len()`, yielding (x, y) pairs where
            x is a tensor without a batch axis.
        model: object exposing `encode(x) -> (l1, l2, l3)`.
        device: device the sample is moved to before encoding.
        conv: kept for backward compatibility; the batch axis is now added
            the same way for every input rank, so the flag has no effect.

    Returns:
        (X_latent, Y): a list of 1-D numpy arrays (l1, l2, l3 concatenated
        per sample) and the list of the corresponding labels.
    """
    X_latent = []
    Y = []

    # Pure inference: do not record an autograd graph for every sample.
    with T.no_grad():
        for i in range(len(data)):
            x, y = data[i]

            # Prepend a batch axis of size 1 (same result as the former
            # rank-specific reshapes).
            x = x.unsqueeze(0).to(device)

            l1, l2, l3 = model.encode(x)
            parts = [latent.cpu().detach().numpy()[0] for latent in (l1, l2, l3)]

            X_latent.append(np.concatenate(parts, axis=0))
            Y.append(y)

    return X_latent, Y

In [11]:
%%time
# --- experiment configuration -------------------------------------------------
task='remote_homology'
conv=False
normalize=True
latent_dimension=170
seq_len=-1
num_epochs=30
learning_rate=1e-3
batch_size=128
loss_function=caemeLoss

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (a hard-coded 'cuda' fails on CPU-only machines).
device = T.device('cuda' if T.cuda.is_available() else 'cpu')

# Convolutional models consume un-pooled sequences; fully connected models
# consume mean-pooled embeddings.
average = not conv

if conv:
    # TODO: remove this
    assert(normalize==False)

# --- data ---------------------------------------------------------------------
L.print('Loading data ...')
train_data = DataSet(task, 'train', average, normalize, conv, seq_len)
test_data = DataSet(task, 'test', average, normalize, conv, seq_len)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

04:54:50 Loading data ...
concatenated embedding size:  (12305, 3692)
Shape after appending:  (12305, 3692)
concatenated embedding size:  (718, 3692)
Shape after appending:  (718, 3692)
CPU times: user 28.9 s, sys: 33 s, total: 1min 1s
Wall time: 1min 3s


In [43]:
%%time
# Train the decoupled autoencoder on the concatenated training embeddings.
#
# d1, d2, d3 are the widths of the three feature blocks inside each input row;
# they sum to 3692, the concatenated width printed when the data was loaded.
# NOTE(review): presumably elmo / unirep / transformer in that order (the order
# of DataSet.models) -- confirm against the embedding files.
d1, d2, d3 = 1024, 1900, 768

model = DecoupledAutoencoder([d1, d2, d3], conv = conv, latent_dims=latent_dimension).to(device)
optimizer = T.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=1e-7)

L.print('Training autoencoder...')
for epoch in tqdm(range(num_epochs)):
    # Running sum of the mini-batch losses of this epoch.
    loss = 0
    # Labels are not needed for the reconstruction objective.
    for batch_features, _ in train_dataloader:

        batch_features = batch_features.to(device) # load data to GPU
        optimizer.zero_grad() # reset the gradients back to zero

        r1, r2, r3 = model(batch_features, d1, d2, d3) # compute reconstructions

        train_loss = loss_function(batch_features, r1, r2, r3, d1, d2, d3) # compute training loss
        train_loss.backward() # compute accumulated gradients
        optimizer.step() # perform parameter update based on current gradients
        loss += train_loss.item() # add the mini-batch training loss to epoch loss

    loss = loss / len(train_dataloader) # compute the epoch training loss

    # Report every third epoch (epochs 1, 4, 7, ... in 1-based numbering).
    if epoch %3 == 0:
        L.print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, num_epochs, loss))

05:18:12 Fully connected neural network
05:18:12 Training autoencoder...


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

05:18:12 epoch : 1/30, loss = 0.002179
05:18:14 epoch : 4/30, loss = 0.001797
05:18:15 epoch : 7/30, loss = 0.001736
05:18:16 epoch : 10/30, loss = 0.001694
05:18:18 epoch : 13/30, loss = 0.001659
05:18:19 epoch : 16/30, loss = 0.001642
05:18:21 epoch : 19/30, loss = 0.001626
05:18:22 epoch : 22/30, loss = 0.001615
05:18:23 epoch : 25/30, loss = 0.001607
05:18:25 epoch : 28/30, loss = 0.001602

CPU times: user 13.7 s, sys: 202 ms, total: 14 s
Wall time: 14 s


In [48]:
%%time

# Encode both splits with the trained autoencoder and save the results.
# NOTE: the log message below says "Training classifier", but this cell only
# computes and stores the latent embeddings; the classifier is fit in the
# next cell.
L.print('Training classifier ...')
    
# Each call returns (list of concatenated latent vectors, list of labels).
test_x_latent, test_y = metaEmbedding(test_data, model, device, conv)
train_x_latent, train_y = metaEmbedding(train_data, model, device, conv)

# Persist features and labels as .npy files in the current working directory;
# the file names encode the `conv` flag and the latent size.
with open(f'train_decoupled_{conv}_{latent_dimension}.npy', 'wb') as f:
    np.save(f, np.array(train_x_latent))

with open(f'test_decoupled_{conv}_{latent_dimension}.npy', 'wb') as f:
    np.save(f, np.array(test_x_latent))
    
with open(f'train_decoupled_{conv}_{latent_dimension}_y.npy', 'wb') as f:
    np.save(f, np.array(train_y))
    
with open(f'test_decoupled_{conv}_{latent_dimension}_y.npy', 'wb') as f:
    np.save(f, np.array(test_y))

05:19:11 Training classifier ...
CPU times: user 5.22 s, sys: 169 ms, total: 5.39 s
Wall time: 5.44 s


In [49]:
%%time
# Fit a standardize-then-logistic-regression pipeline on the latent training
# embeddings and report its score on the latent test embeddings.
# NOTE: in the recorded run this cell took about 48 minutes of wall time and
# scored ~0.249 on the test set.
clf = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='auto'))
clf.fit(train_x_latent, train_y)

# `score` of a classifier pipeline is the mean accuracy on the given data.
score = clf.score(test_x_latent, test_y)

L.print(f"Score on test set: {score}")

06:08:12 Score on test set: 0.24930362116991645
CPU times: user 47min 59s, sys: 36.3 s, total: 48min 36s
Wall time: 48min 43s
