In [1]:
%matplotlib inline
import os

import torch as T
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def read_dataset(model: str, task: str, split: str, basepath='/scratch/fl1092/ml_protein_data'):
    """Load one stored split of a task for a given embedding model.

    The file is looked up at ``{basepath}/{model}/{task}/{task}_{split}.p``.

    Args:
        model: Name of the sub-directory under ``basepath``.
        task: Task name; used both as a directory and as the file-name prefix.
        split: Split name; used as the file-name suffix before ``.p``.
        basepath: Root directory that holds the per-model directories.

    Returns:
        The object produced by ``np.load`` for that file.
    """
    # NOTE: allow_pickle=True will unpickle arbitrary objects, so this must
    # only ever be pointed at trusted files.
    return np.load(f'{basepath}/{model}/{task}/{task}_{split}.p', allow_pickle=True)

def dict_2_arr(data_dict, labels, avgr=lambda x: np.mean(x, axis=0)):
    """Reduce per-key embeddings to one row each and stack them with labels.

    Args:
        data_dict: Mapping from key to an array; the last dimension of the
            first value fixes the number of columns in ``X``.
        labels: Mapping from key to a numeric label. Its keys (sorted) drive
            the row order so different models are indexed identically.
        avgr: Callable that reduces one entry of ``data_dict`` to a single
            row; by default the mean over axis 0.

    Returns:
        ``(X, y)``: ``X`` has ``len(data_dict)`` rows and ``y`` has
        ``len(data_dict)`` entries. Rows that are never filled stay zero.
    """
    first_embedding = list(data_dict.values())[0]
    n_rows = len(data_dict)

    X = np.zeros((n_rows, first_embedding.shape[-1]))
    y = np.zeros(n_rows)

    # The key 'd1smyc_' is always left out of the output.
    kept_keys = [key for key in sorted(labels.keys()) if key != 'd1smyc_']
    for row, key in enumerate(kept_keys):
        X[row] = avgr(data_dict[key])
        y[row] = labels[key]

    return X, y

def ensemble_append_mean_reps(dicts, labels):
    """Concatenate L2-normalised mean embeddings from several models.

    For every key of ``dicts[0]`` the entry of each dict is averaged over
    axis 0, L2-normalised, and the results are joined along the feature axis.

    Args:
        dicts: Sequence of mappings from key to an array; every mapping must
            contain all keys of the first one.
        labels: Mapping from key to a numeric label.

    Returns:
        ``(X, y)``: ``X`` has one row of concatenated features per key (in the
        iteration order of ``dicts[0]``) and ``y`` holds the matching labels.
    """
    combined = {}
    for key in dicts[0].keys():
        parts = [
            preprocessing.normalize([np.mean(source[key], axis=0)], norm='l2')
            for source in dicts
        ]
        combined[key] = np.concatenate(parts, axis=1)

    n_features = list(combined.values())[0].shape[1]
    X = np.zeros((len(combined), n_features))
    y = np.zeros(len(combined))

    for row, key in enumerate(combined):
        X[row] = combined[key]
        y[row] = labels[key]

    return X, y

In [3]:
class DataSet(T.utils.data.Dataset):
    """Map-style dataset of concatenated 'elmo' and 'unirep' mean embeddings.

    Attributes:
        elmo_train: Raw object loaded for the 'elmo' model.
        unirep_train: Raw object loaded for the 'unirep' model.
        x_data: float32 tensor of concatenated features, placed on ``device``.
        y_data: Array of labels aligned with the rows of ``x_data``.
    """

    def __init__(self, task, split, device):
        """Load both embedding sets and the labels for ``task``/``split``."""
        self.elmo_train = read_dataset('elmo', task, split)
        self.unirep_train = read_dataset('unirep', task, split)
        label_lookup = read_dataset('label', task, split)

        features, self.y_data = ensemble_append_mean_reps(
            [self.elmo_train, self.unirep_train], label_lookup
        )
        self.x_data = T.tensor(features, dtype=T.float32).to(device)

    def __len__(self):
        """Number of samples, i.e. rows of ``x_data``."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair(s) selected by ``idx``."""
        # Tensor indices are converted to plain Python ints/lists first.
        index = idx.tolist() if T.is_tensor(idx) else idx
        return self.x_data[index, ], self.y_data[index]

In [4]:
# encoder and decoder with two convolutional layers

class ConvEncoder(nn.Module):
    """1-D convolutional encoder: two strided convolutions and a linear layer.

    Input is expected as ``(batch, 1, length)``. Each convolution (kernel 4,
    stride 2, padding 1) halves the length, and the linear layer is sized for
    ``2 * c * 49`` features, so the input length must reduce to 49 after two
    halvings (e.g. length 196).

    Args:
        channels: Number of channels ``c`` after the first convolution.
            Defaults to the notebook-level ``capacity``.
        latent_size: Size of the produced latent vector. Defaults to the
            notebook-level ``latent_dims``.
    """

    def __init__(self, channels=None, latent_size=None):
        # The original called super(Encoder, self), naming an unrelated class,
        # which makes instantiation fail; the zero-argument form is correct.
        super().__init__()
        c = capacity if channels is None else channels
        latent = latent_dims if latent_size is None else latent_size
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=c, kernel_size=4, stride=2, padding=1)  # out: c x L/2
        self.conv2 = nn.Conv1d(in_channels=c, out_channels=c*2, kernel_size=4, stride=2, padding=1)  # out: 2c x L/4
        self.fc = nn.Linear(in_features=c*2*7*7, out_features=latent)

    def forward(self, x):
        """Encode a ``(batch, 1, length)`` tensor to ``(batch, latent)``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # flatten the multi-channel feature maps to one vector per sample
        x = self.fc(x)
        return x

class ConvDecoder(nn.Module):
    """1-D transposed-convolution decoder matching ``ConvEncoder``'s layer sizes.

    A linear layer expands the latent vector to ``2 * c * 49`` features, which
    are reshaped to ``(batch, 2c, 49)`` and upsampled twice (kernel 4,
    stride 2, padding 1), each step doubling the length: 49 -> 98 -> 196.

    Args:
        channels: Number of channels ``c`` before the last transposed
            convolution. Defaults to the notebook-level ``capacity``.
        latent_size: Size of the incoming latent vector. Defaults to the
            notebook-level ``latent_dims``.
    """

    def __init__(self, channels=None, latent_size=None):
        # The original called super(Decoder, self), naming an unrelated class,
        # which makes instantiation fail; the zero-argument form is correct.
        super().__init__()
        c = capacity if channels is None else channels
        latent = latent_dims if latent_size is None else latent_size
        self.fc = nn.Linear(in_features=latent, out_features=c*2*7*7)
        self.conv2 = nn.ConvTranspose1d(in_channels=c*2, out_channels=c, kernel_size=4, stride=2, padding=1)
        self.conv1 = nn.ConvTranspose1d(in_channels=c, out_channels=1, kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        """Decode a ``(batch, latent)`` tensor to ``(batch, 1, 196)``."""
        x = self.fc(x)
        # ConvTranspose1d needs (batch, channels, length); the original 4-D
        # view (batch, 2c, 7, 7) is rejected by 1-D layers.
        x = x.view(x.size(0), self.conv2.in_channels, -1)
        x = F.relu(self.conv2(x))
        # torch is imported as T in this notebook; tanh bounds the output to [-1, 1]
        x = T.tanh(self.conv1(x))
        return x

In [5]:
# encoder and decoder with fully connected layers only

class Encoder(nn.Module):
    """Fully connected encoder: ``input_shape`` -> 1024 -> 128 -> 128.

    Args:
        input_shape: Number of input features per sample.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape, 1024)
        self.fc2 = nn.Linear(1024, 128)
        self.fc3 = nn.Linear(128, 128)

    def forward(self, x):
        """Map input features to a 128-dimensional code (no final activation)."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class Decoder(nn.Module):
    """Fully connected decoder: 128 -> 128 -> 1024 -> ``input_shape``.

    Args:
        input_shape: Number of features per reconstructed sample.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 1024)
        self.fc3 = nn.Linear(1024, input_shape)

    def forward(self, x):
        """Map a 128-dimensional code back to feature space (no final activation)."""
        return self.fc3(F.relu(self.fc2(F.relu(self.fc1(x)))))

In [6]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder built from ``Encoder`` and ``Decoder``.

    Args:
        input_shape: Number of features per sample; passed to both halves.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.encoder = Encoder(input_shape)
        self.decoder = Decoder(input_shape)

    def forward(self, x):
        """Return the reconstruction of ``x`` after a pass through the code."""
        return self.decoder(self.encoder(x))

# Load data

In [7]:
import numpy as np
from sklearn import preprocessing

In [8]:
from torch.utils.data import DataLoader

In [9]:
# Model-size settings read by the convolutional encoder/decoder.
latent_dims, capacity = 10, 64

# Mini-batch size used when building the training DataLoader.
batch_size = 128

# Initial optimisation settings; both are re-assigned before the fully
# connected autoencoder is trained.
num_epochs, learning_rate = 20, 1e-3

# Switch for running on the GPU.
use_gpu = True

In [10]:
# Honour the `use_gpu` switch and fall back to the CPU when CUDA is not
# available, so later `.to(device)` calls do not fail on CPU-only machines.
device = T.device('cuda' if use_gpu and T.cuda.is_available() else 'cpu')

In [11]:
%%time
# Build the training split of the 'remote_homology' task; the feature tensor
# is moved to `device` inside DataSet.__init__.
train_data = DataSet('remote_homology', 'train', device)

CPU times: user 16.2 s, sys: 14.4 s, total: 30.6 s
Wall time: 30.9 s


In [12]:
# Shuffled mini-batches over the training split.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)

## example data

In [13]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

In [14]:
# Example data: MNIST images converted to tensors. The datasets are downloaded
# to ~/torch_datasets when they are not already present.
transform = transforms.Compose([transforms.ToTensor()])

train_dataset = MNIST(root="~/torch_datasets", train=True, transform=transform, download=True)
test_dataset = MNIST(root="~/torch_datasets", train=False, transform=transform, download=True)

# Shuffled training batches of 128 with pinned memory; ordered test batches of 32.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

# auto-encoder

## fully connected

In [15]:
# Settings for the fully connected autoencoder run; these replace the values
# assigned in the earlier configuration cell.
num_epochs, learning_rate = 40, 1e-2

In [16]:
# Size the autoencoder from the feature width of the training data instead of
# a hard-coded constant, so it keeps working if the embeddings change.
model = Autoencoder(train_data.x_data.shape[1]).to(device)

# create an optimizer object
# Adam optimizer; the step size is taken from `learning_rate`
optimizer = T.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=1e-7)

# mean-squared error reconstruction loss
criterion = nn.MSELoss()

In [17]:
# Train the autoencoder to reconstruct its own input. The labels yielded by
# the loader are ignored; the reported value is the mean mini-batch loss.
for epoch in range(num_epochs):
    loss = 0
    for batch_features, _ in train_dataloader:
        # make sure the mini-batch is on the active device
        # (DataSet already creates its feature tensor on the device it is given)
        batch_features = batch_features.to(device)

        # reset the gradients back to zero
        # PyTorch accumulates gradients on subsequent backward passes
        optimizer.zero_grad()

        # compute reconstructions
        outputs = model(batch_features)

        # compute training reconstruction loss against the input itself
        train_loss = criterion(outputs, batch_features)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer.step()

        # add the mini-batch training loss to epoch loss
        loss += train_loss.item()

    # compute the epoch training loss (average over mini-batches)
    loss = loss / len(train_dataloader)

    # display the epoch training loss every third epoch
    if epoch %3 == 0:
        print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, num_epochs, loss))

epoch : 1/40, loss = 0.000789
epoch : 4/40, loss = 0.000238
epoch : 7/40, loss = 0.000223
epoch : 10/40, loss = 0.000217
epoch : 13/40, loss = 0.000215
epoch : 16/40, loss = 0.000212
epoch : 19/40, loss = 0.000211
epoch : 22/40, loss = 0.000210
epoch : 25/40, loss = 0.000209
epoch : 28/40, loss = 0.000210
epoch : 31/40, loss = 0.000209
epoch : 34/40, loss = 0.000209
epoch : 37/40, loss = 0.000210
epoch : 40/40, loss = 0.000209


## encode to latent space

In [18]:
from tqdm import tqdm_notebook as tqdm

In [19]:
%%time
# Build the test split of the 'remote_homology' task on the same device.
test_data = DataSet('remote_homology', 'test', device)

CPU times: user 739 ms, sys: 703 ms, total: 1.44 s
Wall time: 1.45 s


In [20]:
# number of samples in the training and test splits
len(train_data), len(test_data)

(12305, 718)

In [21]:
# Encode the training split in mini-batches instead of one sample at a time.
# no_grad() skips autograd bookkeeping, which is not needed for inference.
# Row i of the result still corresponds to train_data[i]; the outputs are now
# NumPy arrays rather than Python lists.
with T.no_grad():
    train_x_latent = np.concatenate([
        model.encoder(chunk).cpu().numpy()
        for chunk in tqdm(T.split(train_data.x_data, batch_size))
    ])
train_y = np.asarray(train_data.y_data)

HBox(children=(IntProgress(value=0, max=12305), HTML(value='')))




In [22]:
# Encode the test split in mini-batches instead of one sample at a time.
# no_grad() skips autograd bookkeeping, which is not needed for inference.
# Row i of the result still corresponds to test_data[i]; the outputs are now
# NumPy arrays rather than Python lists.
with T.no_grad():
    test_x_latent = np.concatenate([
        model.encoder(chunk).cpu().numpy()
        for chunk in tqdm(T.split(test_data.x_data, batch_size))
    ])
test_y = np.asarray(test_data.y_data)

HBox(children=(IntProgress(value=0, max=718), HTML(value='')))




## train classifier

In [23]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [24]:
# One-vs-rest linear SVM trained on the latent codes of the training split.
clf = OneVsRestClassifier(estimator=LinearSVC())
clf.fit(X=train_x_latent, y=train_y)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None)

In [25]:
# Score the classifier on the latent codes of the test split
# (scikit-learn's default classifier score is mean accuracy).
clf.score(test_x_latent, test_y)

0.0905292479108635