# Prototype reccomender system with cell lines and drug data embedded via simple autoencoder using KINOMEscan drug data and GDSC cell line data

## Setup

In [1]:
# Standard imports
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import dill

from scipy.stats import pearsonr

import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader


from sklearn import metrics

In [2]:
# Custom utilities imports
sys.path.append("/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Recommender System Approach/Scripts/Modules")
from modeling import Dataset

## Load the and preprocess dataset

In [3]:
filepath = "../../Data/Preprocessed Datasets/"
with open(filepath + "GDSC-KINOMEscan_proteins_intersection_+_remaining_GDSC_target_genes_dataset.pkl", "rb") as f:
    full_dataset = dill.load(f)
print(full_dataset.name, type(full_dataset))
print()
print(full_dataset.description)

Kinases Dataset + Remaning GDSC drug's putative targets <class 'modeling.Dataset'>

Dataset containing 74 common drugs of GDSC and HMS LINCS Kinome scan dataset.
Cell lines data types: expression, coding variant and tissue type. Expressions and coding variants are 
present only for proteins present in both GDSC and KINOMEscan data, resulting in expression of 188 genes and
nmutations in 18 genes.
In addition, expressions and mutations (17 new features) of remaining target genes from GDSC are included
Tissue types are dummy encoded GDSC Tissue Descriptions 1 (18 features).
Drugs representation: inhibition scores (% control) of 294 proteins. Set of proteins is the intersection of 
proteins screened for each of 74 drugs.
Drug response data: drug reponse data contains AUC metrics across cell lines for 74 drugs considered.


In [4]:
# Establish response data for samples (drug-cell line pairs)
response_df = full_dataset.response_data.copy()

# Establish cell line features data
cell_line_data_df = full_dataset.full_cell_lines_data.copy()

# Search for cell lines present in response data, but missing the genomic features
missing_cell_lines = []
for cosmic_id in response_df.COSMIC_ID.unique():
    if cosmic_id not in cell_line_data_df.cell_line_id.unique():
        missing_cell_lines.append(cosmic_id)
# Put cell line IDs into index and drop cell line IDs columns
cell_line_data_df.index = cell_line_data_df.cell_line_id
cell_line_data_df = cell_line_data_df.drop("cell_line_id", axis=1)

# Extract response only for cell lines for which features are present
response_df = response_df[~response_df.COSMIC_ID.isin(missing_cell_lines)]

# Establish drug features data
drug_data_df = full_dataset.drugs_data.copy()

# Convert drug index from LINCS name to GDSC drug ID
drug_data_df.index = drug_data_df.index.map(full_dataset.kinomescan_name_to_gdsc_id_mapper)
print(cell_line_data_df.shape, drug_data_df.shape, response_df.shape)

(922, 241) (74, 294) (52730, 3)


## Models' definitions

In [150]:
# Simple linear autoencoder
class LinearAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LinearAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh())
        self.decoder = nn.Linear(hidden_dim, input_dim)
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
# Simple linear autoencoder
class LinearAutoencoderWithoutActivation(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LinearAutoencoderWithoutActivation, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
# Basic deep autoencoder
class DeepAutoencoder(nn.Module):
    def __init__(self, input_dim, middle_dim, hidden_dim):
        super(DeepAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, hidden_dim),
            nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, input_dim))
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Sample network
class SampleNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SampleNetwork, self).__init__()
        self.layer1 = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        x = self.layer1(x)
        return x

class RecSystemWithAutoencoders:
    pass

## Check basic autoencoder alone on drug data

#### Instianianate the model and data

In [35]:
# Determine which data to use
data_train = drug_data_df.copy()

In [7]:
# Shuffle the data
# data_train = data_train.sample(frac=1.)
# print(data_train.shape)

In [37]:
# Scale the data
means = data_train.mean()
stds = data_train.std()
data_train = (data_train - means) / stds
print(data_train.mean().sum(), data_train.std().sum(), data_train.min().min(), data_train.max().max())

6.456547008749136e-15 294.0 -6.81641338878148 1.2330512316262772


In [38]:
# Take a sample of training data
data_train = data_train.iloc[:, :10]
print(data_train.shape)

(74, 10)


In [39]:
data_train = data_train.sample(n=10, random_state=11)
print(data_train.shape, data_train.sum().sum())

(10, 10) 20.590186153530404


In [40]:
# Instatianate the model
hidden_dim = 2

torch.manual_seed(11)
autoencoder = LinearAutoencoder(10, 2)
autoencoder

LinearAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=10, out_features=2, bias=True)
    (1): Tanh()
  )
  (decoder): Linear(in_features=2, out_features=10, bias=True)
)

In [41]:
for p in autoencoder.parameters():
    print(p.shape)

torch.Size([2, 10])
torch.Size([2])
torch.Size([10, 2])
torch.Size([10])


#### Training

In [42]:
# Specify training parameters
num_epochs = 1000
batch_size = 8
learning_rate = 0.001
no_batches = data_train.shape[0] // batch_size + 1

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(
    autoencoder.parameters(), lr=learning_rate)

In [78]:
# Batch gradient descent
losses = []
for epoch in range(num_epochs):
    # Preprocess the training data
    data_batch = data_train.values
    batch_input = torch.from_numpy(data_batch)
    # Perform forward pass
    batch_output = autoencoder(batch_input.float())
    loss = criterion(batch_output, batch_input.float())
    losses.append(loss)
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print("Epoch: {}, batch loss: {}".format(
        epoch + 1, loss.item()))

Epoch: 1, batch loss: 0.20876051485538483
Epoch: 2, batch loss: 0.20875924825668335
Epoch: 3, batch loss: 0.2087579369544983
Epoch: 4, batch loss: 0.20875662565231323
Epoch: 5, batch loss: 0.20875529944896698
Epoch: 6, batch loss: 0.20875407755374908
Epoch: 7, batch loss: 0.20875275135040283
Epoch: 8, batch loss: 0.20875145494937897
Epoch: 9, batch loss: 0.2087501734495163
Epoch: 10, batch loss: 0.20874889194965363
Epoch: 11, batch loss: 0.20874756574630737
Epoch: 12, batch loss: 0.20874625444412231
Epoch: 13, batch loss: 0.20874498784542084
Epoch: 14, batch loss: 0.20874370634555817
Epoch: 15, batch loss: 0.2087424099445343
Epoch: 16, batch loss: 0.20874111354351044
Epoch: 17, batch loss: 0.20873983204364777
Epoch: 18, batch loss: 0.2087385207414627
Epoch: 19, batch loss: 0.20873723924160004
Epoch: 20, batch loss: 0.20873592793941498
Epoch: 21, batch loss: 0.2087346464395523
Epoch: 22, batch loss: 0.20873335003852844
Epoch: 23, batch loss: 0.20873206853866577
Epoch: 24, batch loss: 0.

Epoch: 193, batch loss: 0.20851285755634308
Epoch: 194, batch loss: 0.2085115760564804
Epoch: 195, batch loss: 0.20851030945777893
Epoch: 196, batch loss: 0.20850902795791626
Epoch: 197, batch loss: 0.2085077315568924
Epoch: 198, batch loss: 0.20850643515586853
Epoch: 199, batch loss: 0.20850513875484467
Epoch: 200, batch loss: 0.2085038721561432
Epoch: 201, batch loss: 0.20850256085395813
Epoch: 202, batch loss: 0.20850127935409546
Epoch: 203, batch loss: 0.2084999829530716
Epoch: 204, batch loss: 0.20849870145320892
Epoch: 205, batch loss: 0.20849743485450745
Epoch: 206, batch loss: 0.20849616825580597
Epoch: 207, batch loss: 0.2084948718547821
Epoch: 208, batch loss: 0.20849357545375824
Epoch: 209, batch loss: 0.20849229395389557
Epoch: 210, batch loss: 0.2084909975528717
Epoch: 211, batch loss: 0.20848970115184784
Epoch: 212, batch loss: 0.20848841965198517
Epoch: 213, batch loss: 0.2084871232509613
Epoch: 214, batch loss: 0.20848588645458221
Epoch: 215, batch loss: 0.2084845751523

Epoch: 406, batch loss: 0.20823991298675537
Epoch: 407, batch loss: 0.20823867619037628
Epoch: 408, batch loss: 0.20823736488819122
Epoch: 409, batch loss: 0.20823609828948975
Epoch: 410, batch loss: 0.20823483169078827
Epoch: 411, batch loss: 0.2082335501909256
Epoch: 412, batch loss: 0.20823225378990173
Epoch: 413, batch loss: 0.20823097229003906
Epoch: 414, batch loss: 0.20822973549365997
Epoch: 415, batch loss: 0.2082284390926361
Epoch: 416, batch loss: 0.20822715759277344
Epoch: 417, batch loss: 0.20822590589523315
Epoch: 418, batch loss: 0.20822462439537048
Epoch: 419, batch loss: 0.2082233428955078
Epoch: 420, batch loss: 0.20822204649448395
Epoch: 421, batch loss: 0.20822077989578247
Epoch: 422, batch loss: 0.208219513297081
Epoch: 423, batch loss: 0.20821824669837952
Epoch: 424, batch loss: 0.20821696519851685
Epoch: 425, batch loss: 0.20821566879749298
Epoch: 426, batch loss: 0.20821437239646912
Epoch: 427, batch loss: 0.20821313560009003
Epoch: 428, batch loss: 0.20821183919

Epoch: 644, batch loss: 0.20793725550174713
Epoch: 645, batch loss: 0.20793598890304565
Epoch: 646, batch loss: 0.20793470740318298
Epoch: 647, batch loss: 0.2079334706068039
Epoch: 648, batch loss: 0.20793218910694122
Epoch: 649, batch loss: 0.20793095231056213
Epoch: 650, batch loss: 0.2079296112060547
Epoch: 651, batch loss: 0.20792840421199799
Epoch: 652, batch loss: 0.2079271376132965
Epoch: 653, batch loss: 0.20792585611343384
Epoch: 654, batch loss: 0.20792455971240997
Epoch: 655, batch loss: 0.20792333781719208
Epoch: 656, batch loss: 0.2079220414161682
Epoch: 657, batch loss: 0.20792075991630554
Epoch: 658, batch loss: 0.20791950821876526
Epoch: 659, batch loss: 0.20791824162006378
Epoch: 660, batch loss: 0.2079170197248459
Epoch: 661, batch loss: 0.20791572332382202
Epoch: 662, batch loss: 0.20791444182395935
Epoch: 663, batch loss: 0.20791317522525787
Epoch: 664, batch loss: 0.20791195333003998
Epoch: 665, batch loss: 0.20791062712669373
Epoch: 666, batch loss: 0.20790937542

Epoch: 894, batch loss: 0.20762163400650024
Epoch: 895, batch loss: 0.20762036740779877
Epoch: 896, batch loss: 0.20761914551258087
Epoch: 897, batch loss: 0.2076178342103958
Epoch: 898, batch loss: 0.20761661231517792
Epoch: 899, batch loss: 0.20761536061763763
Epoch: 900, batch loss: 0.20761407911777496
Epoch: 901, batch loss: 0.20761284232139587
Epoch: 902, batch loss: 0.2076115757226944
Epoch: 903, batch loss: 0.2076103240251541
Epoch: 904, batch loss: 0.20760908722877502
Epoch: 905, batch loss: 0.20760780572891235
Epoch: 906, batch loss: 0.20760653913021088
Epoch: 907, batch loss: 0.2076053023338318
Epoch: 908, batch loss: 0.20760402083396912
Epoch: 909, batch loss: 0.20760279893875122
Epoch: 910, batch loss: 0.20760154724121094
Epoch: 911, batch loss: 0.20760025084018707
Epoch: 912, batch loss: 0.20759902894496918
Epoch: 913, batch loss: 0.2075977325439453
Epoch: 914, batch loss: 0.20759649574756622
Epoch: 915, batch loss: 0.20759522914886475
Epoch: 916, batch loss: 0.20759397745

In [79]:
batch_input[0]

tensor([ 0.2737,  0.5788, -1.2772,  0.6633,  0.5301, -2.0779,  0.5586,  0.5047,
         0.5440,  0.7036], dtype=torch.float64)

In [80]:
batch_output[0]

tensor([ 0.1628,  0.2995, -0.2047,  0.7543,  0.3145, -0.8917,  0.7201,  0.0270,
         1.2516,  1.2944], grad_fn=<SelectBackward>)

In [81]:
pearsonr(batch_input.numpy().flatten(), batch_output.detach().numpy().flatten())

(0.7979613580926649, 2.826950390151984e-23)

In [82]:
pearsonr(batch_input[0].numpy(), batch_output[0].detach().numpy())

(0.8030690570010074, 0.005149130754613381)

In [83]:
batch_input[0].numpy().shape

(10,)

## Check deep autoencoder alone on drug data

#### Instianianate the model and data

In [208]:
# Determine which data to use
data_train = drug_data_df.copy()

In [209]:
# Shuffle the data
# data_train = data_train.sample(frac=1.)
# print(data_train.shape)

In [210]:
# Scale the data
means = data_train.mean()
stds = data_train.std()
data_train = (data_train - means) / stds
print(data_train.mean().sum(), data_train.std().sum(), data_train.min().min(), data_train.max().max())

6.456547008749136e-15 294.0 -6.81641338878148 1.2330512316262772


In [200]:
# Take a sample of training data
data_train = data_train.iloc[:, :10]
print(data_train.shape)

(74, 10)


In [201]:
data_train = data_train.sample(n=10, random_state=11)
print(data_train.shape, data_train.sum().sum())

(10, 10) 20.590186153530404


In [212]:
print(data_train.shape, data_train.sum().sum())

(74, 294) 4.777844786474361e-13


In [213]:
# Instatianate the model
torch.manual_seed(11)
autoencoder = DeepAutoencoder(data_train.shape[1], 128, 5)
autoencoder

DeepAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=294, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=5, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=294, bias=True)
  )
)

In [214]:
for p in autoencoder.parameters():
    print(p.shape)

torch.Size([128, 294])
torch.Size([128])
torch.Size([5, 128])
torch.Size([5])
torch.Size([128, 5])
torch.Size([128])
torch.Size([294, 128])
torch.Size([294])


#### Training

In [216]:
# Specify training parameters
num_epochs = 1000
batch_size = 8
learning_rate = 0.001
no_batches = data_train.shape[0] // batch_size + 1

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    autoencoder.parameters(), lr=learning_rate, weight_decay=1e-5)

In [217]:
# Mini-batch gradient descent
for epoch in range(num_epochs):
    for batch in range(no_batches):
        # Extract and preprocess training data batch
        if batch != no_batches:
            data_batch = data_train.iloc[batch * batch_size:(batch + 1) * batch_size]
        else:
            data_batch = data_train.iloc[batch * batch_size:]
        batch_input = torch.from_numpy(data_batch.values)
        
        # Perform forward pass
        batch_output = autoencoder(batch_input.float())
        loss = criterion(batch_output, batch_input.float())
        losses.append(loss)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    val_data = torch.from_numpy(data_train.values)
    val_preds = autoencoder(val_data.float())
    val_loss = criterion(val_preds, val_data.float())
    
    print("Epoch: {}, batch loss: {}, val loss: {}".format(
        epoch + 1, loss.item(), val_loss.item()))

Epoch: 1, batch loss: 0.6508458256721497, val loss: 0.900892436504364
Epoch: 2, batch loss: 0.5716217756271362, val loss: 0.7733346223831177
Epoch: 3, batch loss: 0.4439236521720886, val loss: 0.6871200203895569
Epoch: 4, batch loss: 0.3903379440307617, val loss: 0.6516532301902771
Epoch: 5, batch loss: 0.3820187449455261, val loss: 0.6448106169700623
Epoch: 6, batch loss: 0.3750286102294922, val loss: 0.6418816447257996
Epoch: 7, batch loss: 0.36380457878112793, val loss: 0.638709545135498
Epoch: 8, batch loss: 0.3465559184551239, val loss: 0.6334574222564697
Epoch: 9, batch loss: 0.30373111367225647, val loss: 0.6206653714179993
Epoch: 10, batch loss: 0.2623368501663208, val loss: 0.6033711433410645
Epoch: 11, batch loss: 0.23159562051296234, val loss: 0.5850980281829834
Epoch: 12, batch loss: 0.21829161047935486, val loss: 0.5725172162055969
Epoch: 13, batch loss: 0.21471241116523743, val loss: 0.5577261447906494
Epoch: 14, batch loss: 0.21089133620262146, val loss: 0.54805177450180

Epoch: 119, batch loss: 0.02147871069610119, val loss: 0.18544691801071167
Epoch: 120, batch loss: 0.02592608705163002, val loss: 0.18157969415187836
Epoch: 121, batch loss: 0.018921369686722755, val loss: 0.1789424866437912
Epoch: 122, batch loss: 0.020701857283711433, val loss: 0.18012721836566925
Epoch: 123, batch loss: 0.03025292418897152, val loss: 0.176066592335701
Epoch: 124, batch loss: 0.02524167113006115, val loss: 0.18112923204898834
Epoch: 125, batch loss: 0.024260610342025757, val loss: 0.17969195544719696
Epoch: 126, batch loss: 0.0332634411752224, val loss: 0.17708835005760193
Epoch: 127, batch loss: 0.023056194186210632, val loss: 0.17891374230384827
Epoch: 128, batch loss: 0.030866840854287148, val loss: 0.18010304868221283
Epoch: 129, batch loss: 0.036470234394073486, val loss: 0.1751330941915512
Epoch: 130, batch loss: 0.026048777624964714, val loss: 0.1816917359828949
Epoch: 131, batch loss: 0.03171103447675705, val loss: 0.1832435131072998
Epoch: 132, batch loss: 0

Epoch: 229, batch loss: 0.022017929702997208, val loss: 0.11124210059642792
Epoch: 230, batch loss: 0.020934494212269783, val loss: 0.11442101001739502
Epoch: 231, batch loss: 0.02643008530139923, val loss: 0.10964150726795197
Epoch: 232, batch loss: 0.026246383786201477, val loss: 0.10622750967741013
Epoch: 233, batch loss: 0.020680174231529236, val loss: 0.11256536841392517
Epoch: 234, batch loss: 0.02836301364004612, val loss: 0.11456210166215897
Epoch: 235, batch loss: 0.025639399886131287, val loss: 0.10924576967954636
Epoch: 236, batch loss: 0.017469732090830803, val loss: 0.11072894930839539
Epoch: 237, batch loss: 0.021990865468978882, val loss: 0.11341742426156998
Epoch: 238, batch loss: 0.030506527051329613, val loss: 0.10698077082633972
Epoch: 239, batch loss: 0.018403945490717888, val loss: 0.10411001741886139
Epoch: 240, batch loss: 0.03423056751489639, val loss: 0.10986312478780746
Epoch: 241, batch loss: 0.052869461476802826, val loss: 0.11349359899759293
Epoch: 242, bat

Epoch: 341, batch loss: 0.01807287335395813, val loss: 0.08218368142843246
Epoch: 342, batch loss: 0.016935614868998528, val loss: 0.08289841562509537
Epoch: 343, batch loss: 0.01784176379442215, val loss: 0.08317013084888458
Epoch: 344, batch loss: 0.016503265127539635, val loss: 0.08316757529973984
Epoch: 345, batch loss: 0.01915113627910614, val loss: 0.08412788063287735
Epoch: 346, batch loss: 0.017186565324664116, val loss: 0.08461830765008926
Epoch: 347, batch loss: 0.018833424896001816, val loss: 0.08607728034257889
Epoch: 348, batch loss: 0.01932700350880623, val loss: 0.08380240201950073
Epoch: 349, batch loss: 0.018389031291007996, val loss: 0.08025672286748886
Epoch: 350, batch loss: 0.019839327782392502, val loss: 0.08382752537727356
Epoch: 351, batch loss: 0.022241979837417603, val loss: 0.08370348066091537
Epoch: 352, batch loss: 0.01746547222137451, val loss: 0.07920020818710327
Epoch: 353, batch loss: 0.016581658273935318, val loss: 0.08004280179738998
Epoch: 354, batch

Epoch: 453, batch loss: 0.024149786680936813, val loss: 0.07088125497102737
Epoch: 454, batch loss: 0.02120434120297432, val loss: 0.06744296848773956
Epoch: 455, batch loss: 0.016972053796052933, val loss: 0.06334961950778961
Epoch: 456, batch loss: 0.01649653911590576, val loss: 0.06590437144041061
Epoch: 457, batch loss: 0.017306147143244743, val loss: 0.06649699807167053
Epoch: 458, batch loss: 0.019292663782835007, val loss: 0.06555013358592987
Epoch: 459, batch loss: 0.017061742022633553, val loss: 0.06380265206098557
Epoch: 460, batch loss: 0.01706847734749317, val loss: 0.06402112543582916
Epoch: 461, batch loss: 0.022561796009540558, val loss: 0.06683125346899033
Epoch: 462, batch loss: 0.021770283579826355, val loss: 0.06625639647245407
Epoch: 463, batch loss: 0.017375880852341652, val loss: 0.06262026727199554
Epoch: 464, batch loss: 0.015782969072461128, val loss: 0.062323492020368576
Epoch: 465, batch loss: 0.01583617925643921, val loss: 0.06391012668609619
Epoch: 466, bat

Epoch: 564, batch loss: 0.01611226052045822, val loss: 0.05332384258508682
Epoch: 565, batch loss: 0.01653781160712242, val loss: 0.056835658848285675
Epoch: 566, batch loss: 0.015585048124194145, val loss: 0.05454682558774948
Epoch: 567, batch loss: 0.017924586310982704, val loss: 0.05415690317749977
Epoch: 568, batch loss: 0.017064807936549187, val loss: 0.05691493675112724
Epoch: 569, batch loss: 0.01612292416393757, val loss: 0.060459837317466736
Epoch: 570, batch loss: 0.02219797484576702, val loss: 0.05967460200190544
Epoch: 571, batch loss: 0.018763430416584015, val loss: 0.057196471840143204
Epoch: 572, batch loss: 0.01766269840300083, val loss: 0.060201723128557205
Epoch: 573, batch loss: 0.01703929901123047, val loss: 0.05974898487329483
Epoch: 574, batch loss: 0.022642986848950386, val loss: 0.061126574873924255
Epoch: 575, batch loss: 0.025153392925858498, val loss: 0.05680697038769722
Epoch: 576, batch loss: 0.023395376279950142, val loss: 0.056302640587091446
Epoch: 577, 

Epoch: 673, batch loss: 0.015292756259441376, val loss: 0.04921158403158188
Epoch: 674, batch loss: 0.024441203102469444, val loss: 0.04958178475499153
Epoch: 675, batch loss: 0.018913907930254936, val loss: 0.04854346811771393
Epoch: 676, batch loss: 0.028848152607679367, val loss: 0.05019121617078781
Epoch: 677, batch loss: 0.015440833754837513, val loss: 0.04701235890388489
Epoch: 678, batch loss: 0.018885470926761627, val loss: 0.04623383656144142
Epoch: 679, batch loss: 0.013291221112012863, val loss: 0.04454343765974045
Epoch: 680, batch loss: 0.014894652180373669, val loss: 0.0445123128592968
Epoch: 681, batch loss: 0.012692160904407501, val loss: 0.043458469212055206
Epoch: 682, batch loss: 0.01699916273355484, val loss: 0.044243473559617996
Epoch: 683, batch loss: 0.013980250805616379, val loss: 0.04338051378726959
Epoch: 684, batch loss: 0.01449243351817131, val loss: 0.04485935717821121
Epoch: 685, batch loss: 0.014522185549139977, val loss: 0.04360548034310341
Epoch: 686, b

Epoch: 784, batch loss: 0.013715453445911407, val loss: 0.04378189146518707
Epoch: 785, batch loss: 0.012762699276208878, val loss: 0.042725883424282074
Epoch: 786, batch loss: 0.016053758561611176, val loss: 0.0421537309885025
Epoch: 787, batch loss: 0.013492272235453129, val loss: 0.04232446849346161
Epoch: 788, batch loss: 0.013701386749744415, val loss: 0.04285944998264313
Epoch: 789, batch loss: 0.012052183039486408, val loss: 0.04141533747315407
Epoch: 790, batch loss: 0.014357276260852814, val loss: 0.04059750959277153
Epoch: 791, batch loss: 0.012620002031326294, val loss: 0.039660610258579254
Epoch: 792, batch loss: 0.013122293166816235, val loss: 0.03944021090865135
Epoch: 793, batch loss: 0.013551592826843262, val loss: 0.03832681477069855
Epoch: 794, batch loss: 0.01279059611260891, val loss: 0.03834763169288635
Epoch: 795, batch loss: 0.010644965805113316, val loss: 0.03781331703066826
Epoch: 796, batch loss: 0.012914972379803658, val loss: 0.037465017288923264
Epoch: 797,

Epoch: 893, batch loss: 0.013447637669742107, val loss: 0.04215581342577934
Epoch: 894, batch loss: 0.01734970137476921, val loss: 0.04414539784193039
Epoch: 895, batch loss: 0.014472388662397861, val loss: 0.041878774762153625
Epoch: 896, batch loss: 0.015989523380994797, val loss: 0.04346844181418419
Epoch: 897, batch loss: 0.016826801002025604, val loss: 0.04369572550058365
Epoch: 898, batch loss: 0.014290599152445793, val loss: 0.04482750967144966
Epoch: 899, batch loss: 0.012268603779375553, val loss: 0.045618705451488495
Epoch: 900, batch loss: 0.016688259318470955, val loss: 0.04535169154405594
Epoch: 901, batch loss: 0.02055279351770878, val loss: 0.044996511191129684
Epoch: 902, batch loss: 0.018813226372003555, val loss: 0.04447184130549431
Epoch: 903, batch loss: 0.015387882478535175, val loss: 0.042840518057346344
Epoch: 904, batch loss: 0.013047504238784313, val loss: 0.04125746339559555
Epoch: 905, batch loss: 0.013656701892614365, val loss: 0.04243861883878708
Epoch: 906

In [206]:
# Batch gradient descent
losses = []
for epoch in range(num_epochs):
    # Preprocess the training data
    data_batch = data_train.values
    batch_input = torch.from_numpy(data_batch)
    # Perform forward pass
    batch_output = autoencoder(batch_input.float())
    loss = criterion(batch_output, batch_input.float())
    losses.append(loss)
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print("Epoch: {}, batch loss: {}".format(
        epoch + 1, loss.item()))

Epoch: 1, batch loss: 0.2800905108451843
Epoch: 2, batch loss: 0.2796255052089691
Epoch: 3, batch loss: 0.27705979347229004
Epoch: 4, batch loss: 0.2742094397544861
Epoch: 5, batch loss: 0.27015721797943115
Epoch: 6, batch loss: 0.2672477960586548
Epoch: 7, batch loss: 0.2645040452480316
Epoch: 8, batch loss: 0.2621996998786926
Epoch: 9, batch loss: 0.26083800196647644
Epoch: 10, batch loss: 0.2592487633228302
Epoch: 11, batch loss: 0.25840842723846436
Epoch: 12, batch loss: 0.25732147693634033
Epoch: 13, batch loss: 0.2563292682170868
Epoch: 14, batch loss: 0.25555136799812317
Epoch: 15, batch loss: 0.25454434752464294
Epoch: 16, batch loss: 0.2540058195590973
Epoch: 17, batch loss: 0.25331050157546997
Epoch: 18, batch loss: 0.25296857953071594
Epoch: 19, batch loss: 0.25276505947113037
Epoch: 20, batch loss: 0.25261396169662476
Epoch: 21, batch loss: 0.25269976258277893
Epoch: 22, batch loss: 0.2526402175426483
Epoch: 23, batch loss: 0.2527793347835541
Epoch: 24, batch loss: 0.252808

Epoch: 249, batch loss: 0.25174713134765625
Epoch: 250, batch loss: 0.25174716114997864
Epoch: 251, batch loss: 0.25174716114997864
Epoch: 252, batch loss: 0.25174716114997864
Epoch: 253, batch loss: 0.25174713134765625
Epoch: 254, batch loss: 0.25174716114997864
Epoch: 255, batch loss: 0.25174716114997864
Epoch: 256, batch loss: 0.25174716114997864
Epoch: 257, batch loss: 0.25174716114997864
Epoch: 258, batch loss: 0.25174716114997864
Epoch: 259, batch loss: 0.25174716114997864
Epoch: 260, batch loss: 0.25174716114997864
Epoch: 261, batch loss: 0.25174713134765625
Epoch: 262, batch loss: 0.25174716114997864
Epoch: 263, batch loss: 0.25174716114997864
Epoch: 264, batch loss: 0.25174716114997864
Epoch: 265, batch loss: 0.25174716114997864
Epoch: 266, batch loss: 0.25174713134765625
Epoch: 267, batch loss: 0.251747190952301
Epoch: 268, batch loss: 0.25174716114997864
Epoch: 269, batch loss: 0.25174716114997864
Epoch: 270, batch loss: 0.25174716114997864
Epoch: 271, batch loss: 0.25174710

Epoch: 488, batch loss: 0.25174713134765625
Epoch: 489, batch loss: 0.25174710154533386
Epoch: 490, batch loss: 0.25174716114997864
Epoch: 491, batch loss: 0.25174713134765625
Epoch: 492, batch loss: 0.25174716114997864
Epoch: 493, batch loss: 0.25174713134765625
Epoch: 494, batch loss: 0.25174713134765625
Epoch: 495, batch loss: 0.25174716114997864
Epoch: 496, batch loss: 0.25174713134765625
Epoch: 497, batch loss: 0.25174716114997864
Epoch: 498, batch loss: 0.25174716114997864
Epoch: 499, batch loss: 0.25174716114997864
Epoch: 500, batch loss: 0.25174716114997864
Epoch: 501, batch loss: 0.25174710154533386
Epoch: 502, batch loss: 0.25174713134765625
Epoch: 503, batch loss: 0.25174716114997864
Epoch: 504, batch loss: 0.25174716114997864
Epoch: 505, batch loss: 0.25174713134765625
Epoch: 506, batch loss: 0.25174716114997864
Epoch: 507, batch loss: 0.25174710154533386
Epoch: 508, batch loss: 0.25174710154533386
Epoch: 509, batch loss: 0.25174713134765625
Epoch: 510, batch loss: 0.251747

Epoch: 696, batch loss: 0.25174713134765625
Epoch: 697, batch loss: 0.25174716114997864
Epoch: 698, batch loss: 0.25174716114997864
Epoch: 699, batch loss: 0.25174713134765625
Epoch: 700, batch loss: 0.25174713134765625
Epoch: 701, batch loss: 0.25174713134765625
Epoch: 702, batch loss: 0.25174716114997864
Epoch: 703, batch loss: 0.25174716114997864
Epoch: 704, batch loss: 0.25174710154533386
Epoch: 705, batch loss: 0.25174713134765625
Epoch: 706, batch loss: 0.25174713134765625
Epoch: 707, batch loss: 0.25174713134765625
Epoch: 708, batch loss: 0.25174710154533386
Epoch: 709, batch loss: 0.25174713134765625
Epoch: 710, batch loss: 0.25174713134765625
Epoch: 711, batch loss: 0.25174713134765625
Epoch: 712, batch loss: 0.25174713134765625
Epoch: 713, batch loss: 0.25174716114997864
Epoch: 714, batch loss: 0.25174716114997864
Epoch: 715, batch loss: 0.25174713134765625
Epoch: 716, batch loss: 0.25174713134765625
Epoch: 717, batch loss: 0.25174716114997864
Epoch: 718, batch loss: 0.251747

Epoch: 917, batch loss: 0.25174713134765625
Epoch: 918, batch loss: 0.25174713134765625
Epoch: 919, batch loss: 0.25174716114997864
Epoch: 920, batch loss: 0.25174713134765625
Epoch: 921, batch loss: 0.25174716114997864
Epoch: 922, batch loss: 0.25174713134765625
Epoch: 923, batch loss: 0.25174716114997864
Epoch: 924, batch loss: 0.25174716114997864
Epoch: 925, batch loss: 0.25174716114997864
Epoch: 926, batch loss: 0.25174713134765625
Epoch: 927, batch loss: 0.25174716114997864
Epoch: 928, batch loss: 0.25174716114997864
Epoch: 929, batch loss: 0.25174713134765625
Epoch: 930, batch loss: 0.25174713134765625
Epoch: 931, batch loss: 0.25174713134765625
Epoch: 932, batch loss: 0.25174716114997864
Epoch: 933, batch loss: 0.25174713134765625
Epoch: 934, batch loss: 0.25174713134765625
Epoch: 935, batch loss: 0.25174713134765625
Epoch: 936, batch loss: 0.25174713134765625
Epoch: 937, batch loss: 0.25174713134765625
Epoch: 938, batch loss: 0.25174713134765625
Epoch: 939, batch loss: 0.251747

#### Evaluate the model

In [218]:
val_data = torch.from_numpy(data_train.values)
val_preds = autoencoder(val_data.float())

val_loss = criterion(val_preds, val_data.float())
val_corr = pearsonr(val_data.flatten(), val_preds.detach().numpy().flatten())
print("Val RMSE and correlation:", val_loss.item(), val_corr)

rmses_per_drug = []
corrs_per_drug = []
for d in range(val_data.shape[0]):
    original_input = val_data[d].numpy()
    predicted_input = val_preds[d].detach().numpy()
    rmses_per_drug.append(metrics.mean_squared_error(original_input, predicted_input) ** 0.5)
    corrs_per_drug.append(pearsonr(original_input, predicted_input)[0])
    
results_per_drug_df = pd.DataFrame()
results_per_drug_df["Drug ID"] = drug_data_df.index
results_per_drug_df["Training RMSE"] = rmses_per_drug
results_per_drug_df["Training correlation"] = corrs_per_drug

Val RMSE and correlation: 0.03093569166958332 (0.9842515965397906, 0.0)


In [219]:
print(results_per_drug_df.shape)

(74, 3)


In [228]:
results_per_drug_df["Training correlation"].iloc[10]

0.9821717378646001

In [226]:
val_data[10, :10]

tensor([ 0.2737, -3.0208, -1.6157, -2.5957, -0.2376, -0.3696, -1.2811, -0.3108,
         0.5440, -1.5509], dtype=torch.float64)

In [227]:
val_preds[10, :10]

tensor([-0.0160, -2.9993, -1.4550, -2.6885, -0.2197, -0.0436, -1.6678, -0.4368,
         0.7624, -1.4990], grad_fn=<SliceBackward>)

## Check deep autoencoder alone on cell line data

## Incorporate autoencoder into main model

## Load the and preprocess dataset

In [4]:
filepath = "../../Data/Preprocessed Datasets/"
with open(filepath + "GDSC-KINOMEscan_proteins_intersection_+_remaining_GDSC_target_genes_dataset.pkl", "rb") as f:
    full_dataset = dill.load(f)
print(full_dataset.name, type(full_dataset))
print()
print(full_dataset.description)

Kinases Dataset + Remaning GDSC drug's putative targets <class 'modeling.Dataset'>

Dataset containing 74 common drugs of GDSC and HMS LINCS Kinome scan dataset.
Cell lines data types: expression, coding variant and tissue type. Expressions and coding variants are 
present only for proteins present in both GDSC and KINOMEscan data, resulting in expression of 188 genes and
nmutations in 18 genes.
In addition, expressions and mutations (17 new features) of remaining target genes from GDSC are included
Tissue types are dummy encoded GDSC Tissue Descriptions 1 (18 features).
Drugs representation: inhibition scores (% control) of 294 proteins. Set of proteins is the intersection of 
proteins screened for each of 74 drugs.
Drug response data: drug reponse data contains AUC metrics across cell lines for 74 drugs considered.


In [5]:
# Establish response data for samples (drug-cell line pairs)
response_df = full_dataset.response_data.copy()

# Establish cell line features data
cell_line_data_original_df = full_dataset.full_cell_lines_data.copy()

# Search for cell lines present in response data, but missing the genomic features
missing_cell_lines = []
for cosmic_id in response_df.COSMIC_ID.unique():
    if cosmic_id not in cell_line_data_original_df.cell_line_id.unique():
        missing_cell_lines.append(cosmic_id)
# Put cell line IDs into index and drop cell line IDs columns
cell_line_data_original_df.index = cell_line_data_original_df.cell_line_id
cell_line_data_original_df = cell_line_data_original_df.drop("cell_line_id", axis=1)

# Extract response only for cell lines for which features are present
response_df = response_df[~response_df.COSMIC_ID.isin(missing_cell_lines)]

# Establish drug features data
drug_data_original_df = full_dataset.drugs_data.copy()

# Convert drug index from LINCS name to GDSC drug ID
drug_data_original_df.index = drug_data_original_df.index.map(full_dataset.kinomescan_name_to_gdsc_id_mapper)
print(cell_line_data_original_df.shape, drug_data_original_df.shape, response_df.shape)

(922, 241) (74, 294) (52730, 3)


Model class definition

In [6]:
class Model:
    def __init__(self, name, network):
        self.name = name
        self.network = network
        
    def train(self, train_samples, cell_line_features, drug_features,
             batch_size, optimizer, criterion, reg_lambda=0, log=True):
        """Perform training process by looping over training set in batches (one epoch) of the
        training."""
        no_batches = train_samples.shape[0] // batch_size + 1
        
        # Training the model
        self.network.train()
        for batch in range(no_batches):
            # Separate response variable batch
            if batch != no_batches:
                samples_batch = train_samples.iloc[batch * batch_size:(batch + 1) * batch_size]
            else:
                samples_batch = train_samples.iloc[batch * batch_size:]

            # Extract output variable batch
            y_batch = torch.from_numpy(samples_batch["AUC"].values).view(-1, 1)

            # Extract cell lines IDs for which data shall be extracted
            cl_ids = samples_batch["COSMIC_ID"].values
            # Extract corresponding cell line data
            cell_line_input_batch = cell_line_features.loc[cl_ids].values
            cell_line_input_batch = torch.from_numpy(cell_line_input_batch)

            # Extract drug IDs for which data shall be extracted
            drug_ids = samples_batch["DRUG_ID"].values
            # Extract corresponding drug data
            drug_input_batch = drug_features.loc[drug_ids].values
            drug_input_batch = torch.from_numpy(drug_input_batch)

            # Clear gradient buffers because we don't want to accummulate gradients 
            optimizer.zero_grad()

            # Perform forward pass
            batch_output = self.network(drug_input_batch.float(), cell_line_input_batch.float())

            reg_sum = 0
            for param in self.network.parameters():
                reg_sum += 0.5 * (param ** 2).sum()  # L2 norm

            # Compute the loss for this batch
            loss = criterion(batch_output, y_batch.float()) + reg_lambda * reg_sum
            # Get the gradients w.r.t. the parameters
            loss.backward()
            # Update the parameters
            optimizer.step()
        return loss
    
    def predict(self, samples, cell_line_features, drug_features):
        """Predict response on a given set of samples"""
        y_true = samples["AUC"].values

        cl_input = cell_line_features.loc[samples["COSMIC_ID"].values].values
        drug_input = drug_features.loc[samples["DRUG_ID"].values].values

        self.network.eval()
        with torch.no_grad():
            predicted = self.network(torch.from_numpy(drug_input).float(), 
                             torch.from_numpy(cl_input).float())
        return predicted, y_true
    
    @staticmethod
    def per_drug_performance_df(samples, predicted, mean_training_auc=None):
        """Compute evaluation metrics per drug and return them in a DataFrame"""
        sample_with_predictions = samples.copy()
        sample_with_predictions["Predicted AUC"] = predicted.numpy()

        drugs = []
        model_corrs = []
        model_rmses = []
        dummy_corrs = []
        dummy_rmses = []
        no_samples = []

        for drug in sample_with_predictions.DRUG_ID.unique():
            df = sample_with_predictions[sample_with_predictions.DRUG_ID == drug]
            if df.shape[0] < 2:
                continue
            if mean_training_auc:
                dummy_preds = [mean_training_auc] * df.shape[0]
            else:
                dummy_preds = [df["AUC"].mean()] * df.shape[0]
            dummy_rmse = metrics.mean_squared_error(df["AUC"], dummy_preds) ** 0.5
            dummy_corr = pearsonr(df["AUC"], dummy_preds)

            try:
                model_rmse = metrics.mean_squared_error(df["AUC"], df["Predicted AUC"]) ** 0.5
                model_corr = pearsonr(df["AUC"], df["Predicted AUC"])
            except ValueError:
                model_rmse, model_corr = np.nan, (np.nan, np.nan)

            drugs.append(drug)
            dummy_rmses.append(dummy_rmse)
            dummy_corrs.append(dummy_corr[0])

            model_rmses.append(model_rmse)
            model_corrs.append(model_corr[0])

            no_samples.append(df.COSMIC_ID.nunique())

        performance_per_drug = pd.DataFrame()
        performance_per_drug["Drug ID"] = drugs
        performance_per_drug["Model RMSE"] = model_rmses
        performance_per_drug["Model correlation"] = model_corrs

        performance_per_drug["Dummy RMSE"] = dummy_rmses
        performance_per_drug["Dummy correlation"] = dummy_corrs
        performance_per_drug["No. samples"] = no_samples

        return performance_per_drug
        
    @staticmethod
    def evaluate_predictions(y_true, preds):
        """Compute RMSE and correlation with true values for model predictions"""
        return metrics.mean_squared_error(y_true, preds) ** 0.5, pearsonr(y_true, preds)

Network definition

In [7]:
# Network definition
class LinearMatrixFactorizationWithFeatures(torch.nn.Module):
    def __init__(self, drug_input_dim, cell_line_input_dim, output_dim, 
                 out_activation_func=None,
                 drug_bias=True,
                 cell_line_bias=True):
        super(LinearMatrixFactorizationWithFeatures, self).__init__()
        self.drug_linear = torch.nn.Linear(drug_input_dim, output_dim, bias=drug_bias)
        self.cell_line_linear = torch.nn.Linear(cell_line_input_dim, output_dim, bias=cell_line_bias)
        self.out_activation = out_activation_func
        
    def forward(self, drug_features, cell_line_features):
        drug_outputs = self.drug_linear(drug_features)
        cell_line_outputs = self.cell_line_linear(cell_line_features)
        
        final_outputs = torch.sum(torch.mul(drug_outputs, cell_line_outputs), dim=1).view(-1, 1)
        if self.out_activation:
            return self.out_activation(final_outputs)
        return final_outputs

### Linear model

#### Further data preprocessing

In [10]:
# Split data into train/val/test sets
num_val_cell_lines = 100
num_test_cell_lines = 100
split_seed = 11
samples_train, samples_val, samples_test, cell_lines_test, cell_lines_val = Dataset.samples_train_test_split(
                                                                        response_df,
                                                                        num_val_cell_lines,
                                                                        num_test_cell_lines,
                                                                        split_seed,
                                                                        shuffle=True)
# Normalize the data
# Cell line data
cols_subset = [col for col in list(cell_line_data_original_df) if col.endswith("_exp")]
rows_subset = [x for x in cell_line_data_original_df.index if x not in cell_lines_test + cell_lines_val]

cell_line_data_df = Dataset.standardize_data(cell_line_data_original_df, cols_subset=cols_subset,
                                            rows_subset=rows_subset)
# Drug data
drug_data_df = Dataset.standardize_data(drug_data_original_df)

#### Instantianate the model

In [11]:
# Parameters of the model
drug_dim, cell_line_dim = drug_data_df.shape[1], cell_line_data_df.shape[1]
drug_bias, cell_line_bias = True, True
out_activation_func = torch.sigmoid
hidden_dim = 5

In [12]:
network = LinearMatrixFactorizationWithFeatures(drug_dim, cell_line_dim, hidden_dim,
                                                 drug_bias=drug_bias,
                                                 cell_line_bias=drug_bias,
                                                 out_activation_func=out_activation_func)

#### Training

In [13]:
# Specify training parameters
num_epochs = 50
batch_size = 128
learning_rate = 0.001
no_batches = samples_train.shape[0] // batch_size + 1
reg_lambda = 1e-5

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    network.parameters(), lr=learning_rate, weight_decay=1e-5)

In [14]:
# Mini-batch gradient descent
network.train()
for epoch in range(num_epochs):
    for batch in range(no_batches):
        # Separate response variable batch
        if batch != no_batches:
            samples_batch = samples_train.iloc[batch * batch_size:(batch + 1) * batch_size]
        else:
            samples_batch = samples_train.iloc[batch * batch_size:]

        # Extract output variable batch
        y_batch = torch.from_numpy(samples_batch["AUC"].values).view(-1, 1)

        # Extract cell lines IDs for which data shall be extracted
        cl_ids = samples_batch["COSMIC_ID"].values
        # Extract corresponding cell line data
        cell_line_input_batch = cell_line_data_df.loc[cl_ids].values
        cell_line_input_batch = torch.from_numpy(cell_line_input_batch)

        # Extract drug IDs for which data shall be extracted
        drug_ids = samples_batch["DRUG_ID"].values
        # Extract corresponding drug data
        drug_input_batch = drug_data_df.loc[drug_ids].values
        drug_input_batch = torch.from_numpy(drug_input_batch)

        # Clear gradient buffers because we don't want to accummulate gradients 
        optimizer.zero_grad()

        # Perform forward pass
        batch_output = network(drug_input_batch.float(), cell_line_input_batch.float())

        reg_sum = 0
        for param in network.parameters():
            reg_sum += 0.5 * (param ** 2).sum()  # L2 norm

        # Compute the loss for this batch
        loss = criterion(batch_output, y_batch.float()) + reg_lambda * reg_sum
        # Get the gradients w.r.t. the parameters
        loss.backward()
        # Update the parameters
        optimizer.step()
        
    # Evaluate on validation set
    validation_pairs = samples_train.iloc[:100]
    validation_true_responses = torch.from_numpy(validation_pairs["AUC"].values).view(-1, 1)
    # Extract cell lines IDs for which data shall be extracted
    cl_ids = validation_pairs["COSMIC_ID"].values
    # Extract corresponding cell line data
    cell_line_input_validation = cell_line_data_df.loc[cl_ids].values
    cell_line_input_validation = torch.from_numpy(cell_line_input_validation)
    # Extract drug IDs for which data shall be extracted
    drug_ids = validation_pairs["DRUG_ID"].values
    # Extract corresponding drug data
    drug_input_validation = drug_data_df.loc[drug_ids].values
    drug_input_validation = torch.from_numpy(drug_input_validation)
    
    network.eval()
    validation_output = network(drug_input_validation.float(), cell_line_input_validation.float())
    
    val_rmse, val_corr = Model.evaluate_predictions(validation_true_responses.numpy().flatten(), 
                                                    validation_output.detach().numpy().flatten())
    
    print("Epoch: {}, batch loss: {}, val_loss: {}, val_corr: {}".format(
        epoch + 1, loss.item(), val_rmse, val_corr[0]))

Epoch: 1, batch loss: 0.02867189608514309, val_loss: 0.1880162827121493, val_corr: 0.6301681074304405
Epoch: 2, batch loss: 0.01765073649585247, val_loss: 0.1319277847301788, val_corr: 0.6895879230655876
Epoch: 3, batch loss: 0.019665133208036423, val_loss: 0.13272284354148725, val_corr: 0.682123831877956
Epoch: 4, batch loss: 0.01923956535756588, val_loss: 0.12984282335650055, val_corr: 0.6973952805015904
Epoch: 5, batch loss: 0.01693965494632721, val_loss: 0.12740322727643005, val_corr: 0.7099484253363766
Epoch: 6, batch loss: 0.014819617383182049, val_loss: 0.12616778199531395, val_corr: 0.7168634422249307
Epoch: 7, batch loss: 0.01356389932334423, val_loss: 0.12559166710524233, val_corr: 0.7207460785752255
Epoch: 8, batch loss: 0.012879186309874058, val_loss: 0.125116283070562, val_corr: 0.7241535722294504
Epoch: 9, batch loss: 0.01247566007077694, val_loss: 0.12454829465312152, val_corr: 0.7278920286016871
Epoch: 10, batch loss: 0.012217331677675247, val_loss: 0.12391884623742921,

### Model with autoencoders

#### Model's definitions

In [55]:
# Linear system
class LinearMatrixFactorizationWithFeatures(torch.nn.Module):
    def __init__(self, drug_input_dim, cell_line_input_dim, output_dim, 
                 out_activation_func=None,
                 drug_bias=True,
                 cell_line_bias=True):
        super(LinearMatrixFactorizationWithFeatures, self).__init__()
        self.drug_linear = torch.nn.Linear(drug_input_dim, output_dim, bias=drug_bias)
        self.cell_line_linear = torch.nn.Linear(cell_line_input_dim, output_dim, bias=cell_line_bias)
        self.out_activation = out_activation_func
        
    def forward(self, drug_features, cell_line_features):
        drug_outputs = self.drug_linear(drug_features)
        cell_line_outputs = self.cell_line_linear(cell_line_features)
        
        final_outputs = torch.sum(torch.mul(drug_outputs, cell_line_outputs), dim=1).view(-1, 1)
        if self.out_activation:
            return self.out_activation(final_outputs)
        return final_outputs

class RecSystemWithAutoencoders(torch.nn.Module):
    def __init__(self, 
                 drug_autoencoder,
                 cell_line_autoencoder,
                 out_activation=None):
        
        super(RecSystemWithAutoencoders, self).__init__()
        self.drug_autoencoder = drug_autoencoder
        self.cell_line_autoencoder = cell_line_autoencoder
        self.out_activation = out_activation
        
    def forward(self, drug_features, cell_line_features):
        drug_code, drug_reconstruction = self.drug_autoencoder(drug_features)
        cell_line_code, cell_line_reconstruction = self.cell_line_autoencoder(cell_line_features)
        
        final_outputs = torch.sum(torch.mul(drug_code, cell_line_code), dim=1).view(-1, 1)
        if self.out_activation:
            return self.out_activation(final_outputs)
        return final_outputs, drug_reconstruction, cell_line_reconstruction

Autoencoders definitions

In [47]:
# Deep autoencoder with one hidden layer
class DeepAutoencoderOneHiddenLayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, code_dim, activation_func=nn.ReLU):
        super(DeepAutoencoderOneHiddenLayer, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            activation_func(),
            nn.Linear(hidden_dim, code_dim),
            activation_func())
        self.decoder = nn.Sequential(
            nn.Linear(code_dim, hidden_dim),
            activation_func(),
            nn.Linear(hidden_dim, input_dim))
        
    def forward(self, x):
        x = self.encoder(x)
        code = x
        x = self.decoder(x)
        return code, x


# Simple linear autoencoder
class LinearAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LinearAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh())
        self.decoder = nn.Linear(hidden_dim, input_dim)
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
# Simple linear autoencoder
class LinearAutoencoderWithoutActivation(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LinearAutoencoderWithoutActivation, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

#### Further data preprocessing

In [48]:
# Split data into train/val/test sets
num_val_cell_lines = 100
num_test_cell_lines = 100
split_seed = 11
samples_train, samples_val, samples_test, cell_lines_test, cell_lines_val = Dataset.samples_train_test_split(
                                                                        response_df,
                                                                        num_val_cell_lines,
                                                                        num_test_cell_lines,
                                                                        split_seed,
                                                                        shuffle=True)
# Normalize the data
# Cell line data
cols_subset = [col for col in list(cell_line_data_df) if col.endswith("_exp")]
rows_subset = [x for x in cell_line_data_df.index if x not in cell_lines_test + cell_lines_val]

cell_line_data_df = Dataset.standardize_data(cell_line_data_original_df, cols_subset=cols_subset,
                                            rows_subset=rows_subset)
# Drug data
drug_data_df = Dataset.standardize_data(drug_data_original_df)

print(drug_data_df.mean().sum(), drug_data_df.std().sum(), 
      cell_line_data_df.mean().sum(), cell_line_data_df.std().sum())

6.456547008749136e-15 294.0 0.30624881699008477 206.979763195284


#### Instantianate the models

In [56]:
# Instantiate drug and cell line autoencoders
drug_input_dim, cell_line_input_dim = drug_data_df.shape[1], cell_line_data_df.shape[1]
code_dim = 5    # Dimension of lower-dimensional drugs and cell lines representation
drug_hidden_dim = 128   # Dimension of middle (hidden layer) in encoding network
cell_line_hidden_dim = 128

drug_autoencoder = DeepAutoencoderOneHiddenLayer(drug_input_dim, drug_hidden_dim, code_dim,
                                                activation_func=nn.ReLU)

cell_line_autoencoder = DeepAutoencoderOneHiddenLayer(cell_line_input_dim, cell_line_hidden_dim, code_dim,
                                                activation_func=nn.ReLU)

In [57]:
# Instantiate complete model
rec_system = RecSystemWithAutoencoders(drug_autoencoder, cell_line_autoencoder,
                                      out_activation=torch.sigmoid)

In [58]:
for p in rec_system.parameters():
    print(p.shape, p.requires_grad)

torch.Size([128, 294]) True
torch.Size([128]) True
torch.Size([5, 128]) True
torch.Size([5]) True
torch.Size([128, 5]) True
torch.Size([128]) True
torch.Size([294, 128]) True
torch.Size([294]) True
torch.Size([128, 241]) True
torch.Size([128]) True
torch.Size([5, 128]) True
torch.Size([5]) True
torch.Size([128, 5]) True
torch.Size([128]) True
torch.Size([241, 128]) True
torch.Size([241]) True


#### Test the forward prop

In [59]:
drug_feats_test = drug_data_df.iloc[0].values.reshape(-1, 294)
drug_feats_test = torch.from_numpy(drug_feats_test)
print(drug_feats_test.shape)

torch.Size([1, 294])


In [60]:
cl_feats_test = cell_line_data_df.iloc[0].values.reshape(-1, 241)
cl_feats_test = torch.from_numpy(cl_feats_test)
print(cl_feats_test.shape)

torch.Size([1, 241])


In [62]:
output = rec_system(drug_feats_test.float(), cl_feats_test.float())

In [63]:
output.shape

torch.Size([1, 1])

#### Training

In [129]:
func = nn.ReLU

In [130]:
func(torch.tensor([1.]))

ReLU(inplace=True)

In [131]:
func

torch.nn.modules.activation.ReLU

In [132]:
nn.ReLU(torch.tensor([1.]))

ReLU(inplace=True)