In [1]:
import os
import numpy as np
import pandas as pd

PATH_ROOT = os.getcwd().replace("\\","/").replace("/notebooks","")

Code sample from SherlockLiao https://github.com/L1aoXingyu/pytorch-beginner/blob/master/08-AutoEncoder/simple_autoencoder.py

In [39]:
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

class EmbeddingDataset(Dataset):
    def __init__(self, embedding_matrix, transform=None):
        self.embedding_matrix = embedding_matrix
        self.transform = transform

    def __len__(self):
        return len(self.embedding_matrix)

    def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()
        sample = self.embedding_matrix[idx]
        if self.transform:
            sample = self.transform(sample)
        return torch.from_numpy(sample)

class autoencoder(nn.Module):
    def __init__(self,input_dim):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 12),
            nn.ReLU(True),
            nn.Linear(12, 2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2, 12),
            nn.ReLU(True),
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, input_dim),
            nn.Tanh()
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x):
        x = x.float()
        x = x.to(self.device)
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [78]:
num_epochs = 10
learning_rate = 1e-3
batch_size = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train_autoencoder(embedding_to_train,model_save_path):   
    model = autoencoder(len(embedding_to_train[0])).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    embedding_dataset = EmbeddingDataset(embedding_to_train)
    embedding_dataloader = DataLoader(embedding_dataset,batch_size=batch_size, shuffle=True)
    best_model_dict = None
    best_loss = float('inf')
    for epoch in range(num_epochs):
        for data in embedding_dataloader:
            data = data.to(device)
            # ===================forward=====================
            output = model(data)
            output = output.float()
            data = data.float()
            loss = criterion(output, data)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch + 1, num_epochs, loss.data))
        if (loss.data < best_loss):
            best_model_dict = model.state_dict()
            best_loss = loss.data

    torch.save(best_model_dict, model_save_path)
    
def autoencode_embedding(autoencoder_model,embedding):
    encoded = []
    for row in embedding:
        row = torch.from_numpy(row).float()
        embedded = autoencoder_model.encoder.forward(row)
        embedded = embedded.detach().numpy()
        encoded.append(embedded)
    return np.array(encoded)

In [43]:
proteomeHD_path = f"{PATH_ROOT}/data_sources/ProteomeHD/ProteomeHD_v1_1.csv"
proteomeHD_df = pd.read_csv(proteomeHD_path)
proteomeHD_feature_matrix = proteomeHD_df.iloc[:,4:].fillna(0).to_numpy()

pQTL_protein_path = f"{PATH_ROOT}/data_sources/pQTL/pQTL_protein_converted.csv"
pQTL_protein_df = pd.read_csv(pQTL_protein_path)
pQTL_protein_feature_matrix = pQTL_protein_df.iloc[:,2:].fillna(0).to_numpy()

nikolai_protein_path = f"{PATH_ROOT}/data_sources/Nikolai/Proteins-processed.csv"
nikolai_protein_df = pd.read_csv(nikolai_protein_path)
nikolai_protein_feature_matrix = nikolai_protein_df.iloc[:,1:].fillna(0).to_numpy()

proteomeHD_autoencoder_path = f"{PATH_ROOT}/models/autoencoders/proteomeHD_autoencoder.pth"
pQTL_autoencoder_path = f"{PATH_ROOT}/models/autoencoders/pQTL_autoencoder.pth"
nikolai_autoencoder_path = f"{PATH_ROOT}/models/autoencoders/nikolai_autoencoder.pth"

In [92]:
train_autoencoder(nikolai_protein_feature_matrix,nikolai_autoencoder_path)

epoch [1/10], loss:0.0809
epoch [2/10], loss:0.0304
epoch [3/10], loss:0.0845
epoch [4/10], loss:0.0951
epoch [5/10], loss:0.0832
epoch [6/10], loss:0.0668
epoch [7/10], loss:0.0506
epoch [8/10], loss:0.0335
epoch [9/10], loss:0.0552
epoch [10/10], loss:0.0893


In [93]:
nikolai_model = autoencoder(len(nikolai_protein_feature_matrix[0]))
nikolai_model.load_state_dict(torch.load(nikolai_autoencoder_path))
nikolai_model.eval()

autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=1018, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=64, out_features=12, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=12, out_features=2, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=2, out_features=12, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=12, out_features=64, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=64, out_features=128, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=128, out_features=1018, bias=True)
    (7): Tanh()
  )
)

In [94]:
nikolai_encoded = autoencode_embedding(nikolai_model,nikolai_protein_feature_matrix)

In [95]:
len(nikolai_encoded)

2772

In [96]:
embedding_save_path = f"{PATH_ROOT}/embeddings/autoencoder/nikolai_autoencoder_embedding.npy"
np.save(embedding_save_path,nikolai_encoded)