In [None]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.autograd import Variable
from torch.optim import lr_scheduler

from sklearn.metrics import r2_score
from sklearn import preprocessing
#import scipy.io as sio

from models import VAE,AEBase,Predictor,PretrainedPredictor
from models import DNN
import numpy as np
import pandas as pd
import models
import utils as ut
import copy

from scipy import stats

In [None]:
from scipy.stats import pearsonr

# Parameters

In [None]:
# Experiment configuration shared by the cells below.
epochs = 500  # alternatives noted by the author: 200, 500, 1000
# dim_au_in = 20049  (disabled; kept for reference)
dim_au_out = 512  # alternatives noted by the author: 8, 16, 32, 64, 128, 256, 512
dim_dnn_in = dim_au_out  # predictor input width mirrors dim_au_out
dim_dnn_out = 1
select_drug = 'Tamoxifen'  # column of the label table used as the regression target
na = 1  # value substituted for missing labels, then used to filter those rows out

# Import data

In [None]:
# Raw inputs; the first CSV column is used as the row index of each table.
# NOTE(review): presumably rows are samples/cell lines shared by both tables — confirm.
data_r=pd.read_csv('data/GDSC2_expression.csv',index_col=0)
label_r=pd.read_csv('data/GDSC2_label_9drugs.csv',index_col=0)

In [None]:
# Replace missing labels with the sentinel `na`; rows equal to `na` are filtered
# out further down via `!= na`.
# NOTE(review): a genuine label equal to `na` is indistinguishable from a missing
# one after this step — consider filtering with `.notna()` before filling.
label_r=label_r.fillna(na)

In [None]:
# Project helper returning a gene selector (`hvg`) and an annotated object (`adata`).
# NOTE(review): `hvg` is presumably a boolean mask over the columns of data_r
# (it is summed and used as a `.loc` column selector below) — confirm in utils.
hvg,adata = ut.highly_variable_genes(data_r)

In [None]:
# Boolean row mask: True where the selected drug's label differs from the `na`
# sentinel (i.e. was not filled in as missing by the fillna step).
selected_idx = label_r.loc[:,select_drug]!=na

In [None]:
# Overwrite the expression table's column labels with adata's variable names.
# NOTE(review): assumes adata.var_names has one entry per data_r column, in the
# same order — confirm against ut.highly_variable_genes.
data_r.columns = adata.var_names

In [None]:
# Display the sum of `hvg` — presumably the number of selected genes, assuming
# `hvg` is a boolean mask (TODO confirm).
hvg.sum()

# Yours is gene-by-cell; mine is cell-by-gene

In [None]:
# Alternative kept for reference: keep every column instead of the `hvg` subset.
#data = data_r.loc[selected_idx,:]
# Keep the rows flagged by `selected_idx` and the columns selected by `hvg`.
data = data_r.loc[selected_idx,hvg]

In [None]:
# Labels of the selected drug for the retained rows.
label = label_r.loc[selected_idx, select_drug]

# Min-max scale the features. `data` becomes a NumPy array from here on.
# NOTE(review): both scalers are fit on the full table, before the train/test
# split performed later in the notebook, so test-set ranges influence the scaling.
mmscaler = preprocessing.MinMaxScaler()
data = mmscaler.fit_transform(data)

# Min-max scale the labels as a single-column 2-D array; `lbscaler` is kept so
# predictions can be mapped back to the original label range if needed.
lbscaler = preprocessing.MinMaxScaler()
label = lbscaler.fit_transform(label.to_numpy().reshape(-1, 1))

In [None]:
# Global standard deviation and mean of the scaled feature matrix, one per line.
print(np.std(data), np.mean(data), sep="\n")

In [None]:
# Per-column mean of the scaled feature matrix (reduction over axis 0).
data.mean(axis=0)

In [None]:
# Global maximum and minimum of the scaled feature matrix, one per line.
print(data.max(), data.min(), sep="\n")

In [None]:
# Shape of the filtered, scaled feature matrix.
data.shape

In [None]:
# Shape of the full label table (label_r itself is never row-filtered; the
# filtered labels live in `label`).
label_r.shape

# Train / validation / test split

In [None]:
from sklearn.model_selection import train_test_split
# First hold out 20% of the samples as the test set.
X_train_all, X_test, Y_train_all, Y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)
# Then hold out 20% of the remaining samples as the validation set.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_all,
    Y_train_all,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity-check the shapes of the full arrays and of the train / test partitions.
print(data.shape, label.shape, sep="\n")
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Value range of the training features, maximum first, one per line.
print(X_train.max(), X_train.min(), sep="\n")

# AE MODEL

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# torch.cuda.set_device only accepts CUDA devices and raises on "cpu", so it is
# called only when a GPU was actually selected. This keeps the CPU fallback usable.
if device.type == "cuda":
    torch.cuda.set_device(device)

# Add all data to AE

In [None]:
# Feature arrays as float tensors on the selected device (train, validation,
# test, and the full matrix).
X_trainTensor, X_validTensor, X_testTensor, X_allTensor = (
    torch.FloatTensor(features).to(device)
    for features in (X_train, X_valid, X_test, data)
)

# Label arrays for the train and validation partitions, on the same device.
Y_trainTensor, Y_validTensor = (
    torch.FloatTensor(targets).to(device)
    for targets in (Y_train, Y_valid)
)

# Reconstruction-style datasets: each sample's target is its own input.
train_dataset, valid_dataset, test_dataset, all_dataset = (
    TensorDataset(features, features)
    for features in (X_trainTensor, X_validTensor, X_testTensor, X_allTensor)
)

# Shuffled mini-batch loaders (200 samples per batch) for the train, validation
# and full datasets; no loader is created for the test dataset.
X_trainDataLoader, X_validDataLoader, X_allDataLoader = (
    DataLoader(dataset=split, batch_size=200, shuffle=True)
    for split in (train_dataset, valid_dataset, all_dataset)
)

In [None]:
# Alias for the training loader; not referenced again in the visible cells.
dataloader = X_trainDataLoader

In [None]:
# Number of training samples: first dimension of the feature tensor held by
# the training loader's dataset.
X_trainDataLoader.dataset.tensors[0].shape[0]

# The model

In [None]:
# Predictor initialised from previously saved autoencoder weights; freezed=False
# is passed so the pretrained part is presumably fine-tuned too (confirm in models).
# NOTE(review): input_dim and latent_dim are hard-coded and must agree with both
# the number of selected genes and the architecture stored in the weights file.
model = PretrainedPredictor(
    input_dim=5116,
    latent_dim=512,
    hidden_dims=[2048, 1024],
    hidden_dims_predictor=[256, 128],
    pretrained_weights='saved/models/GDSCnew_ae.pkl',
    freezed=False,
)

In [None]:
# Show the module structure of the predictor.
print(model)

In [None]:
# Place the model on the selected device; `device` already resolves to the GPU
# when CUDA is available, so a separate `.cuda()` call is unnecessary.
model.to(device)

# Adam over all model parameters, trained against a mean-squared-error objective.
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss_function = nn.MSELoss()

# Learning-rate schedule driven by a monitored metric (library defaults).
# NOTE(review): this scheduler must be stepped with that metric inside the
# training helper — confirm in ut.train_predictor_model.
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

In [None]:
# Supervised datasets for the predictor: features paired with the scaled labels.
# The feature and label tensors were already created and moved to the device in
# the tensor-construction cell above.
trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

trainDataLoader_p = DataLoader(dataset=trainreducedDataset, batch_size=200, shuffle=True)
# The validation loader must iterate over the held-out validation dataset.
# Building it from the training dataset would make the validation loss (and
# anything selected on it, such as LR scheduling or checkpoints) a training metric.
validDataLoader_p = DataLoader(dataset=validreducedDataset, batch_size=200, shuffle=True)

In [None]:
# Phase name -> loader mapping handed to the training helper.
dataloaders_train = dict(train=trainDataLoader_p, val=validDataLoader_p)

In [None]:
# Run the project's training helper. It returns the model together with a report
# object and is given a path under saved/models/ for persisting the model.
model, report = ut.train_predictor_model(
    model,
    dataloaders_train,
    optimizer,
    loss_function,
    epochs,
    exp_lr_scheduler,
    save_path="saved/models/pre_pre_model.pkl",
)

In [None]:
# Display the raw forward-pass output for the test features.
# NOTE(review): runs with autograd enabled and in whatever train/eval mode the
# training helper left the model in.
model(X_testTensor)

In [None]:
# Display the held-out targets; these are the min-max scaled labels, since
# scaling was applied before the split.
Y_test

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predict on the held-out test features in inference mode: eval() switches
# layers such as dropout / batch-norm (if the model has any) to their
# deterministic behaviour, and no_grad() avoids building an autograd graph.
model.eval()
with torch.no_grad():
    dl_result = model(X_testTensor).cpu().numpy()

In [None]:
# R^2 on the test set. scikit-learn's signature is r2_score(y_true, y_pred) and
# the score is not symmetric, so the ground truth must come first.
r2_score(Y_test, dl_result)

In [None]:
# Pearson correlation (statistic and p-value) between the flattened predictions
# and the flattened test targets.
pearsonr(dl_result.ravel(), Y_test.ravel())

In [None]:
# Mean squared error on the test set, written in scikit-learn's
# (y_true, y_pred) order; the value does not depend on the argument order.
mean_squared_error(Y_test, dl_result)