In [2]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc
import json

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cross_decomposition import PLSRegression

import altair as alt
from altair_saver import save as alt_save

In [3]:
# Workflow parameters read from the `snakemake` object (not imported in the visible cells;
# presumably injected by the Snakemake notebook runner).
# `metmap_tissues` is iterated over below as the list of tissues to model.
metmap_tissues = snakemake.params['metmap_tissues']
# `tm_to_metmap` is indexed by tissue below to build the "metp500.<name>" Excel sheet name.
tm_to_metmap = snakemake.params['tm_to_metmap']

In [4]:
# Fold handled by this run, parsed from the "fold" wildcard; compared against the
# "fold" column of the k-fold index table below.
curr_fold = int(snakemake.wildcards["fold"])
curr_fold

In [5]:
# Load the k-fold assignments and keep the train / test rows of the current fold.
kfold_df = pd.read_csv(snakemake.input['kfold_indices'])
in_curr_fold = kfold_df["fold"] == curr_fold
kfold_train_df = kfold_df.loc[in_curr_fold & (kfold_df["set"] == "train")]
kfold_test_df = kfold_df.loc[in_curr_fold & (kfold_df["set"] == "test")]

In [6]:
# Preview the first training rows of the current fold.
kfold_train_df.head()

In [7]:
# Load the expression AnnData; it is indexed below by cell line and gene name.
ccle_adata = read_h5ad(snakemake.input['ccle_exp'])

In [8]:
# Preprocess the gene expression data according to the "gexp_transform" wildcard.
# The three recognised values build on one another (normalise -> log1p -> scale);
# any other value leaves the expression matrix untouched.
gexp_transform = snakemake.wildcards["gexp_transform"]
if gexp_transform in ("tpm", "log1p_tpm", "log1p_tpm_scale"):
    sc.pp.normalize_total(ccle_adata, target_sum=1e4)
if gexp_transform in ("log1p_tpm", "log1p_tpm_scale"):
    sc.pp.log1p(ccle_adata)
if gexp_transform == "log1p_tpm_scale":
    sc.pp.scale(ccle_adata, max_value=10)

In [9]:
# Load the "metp500.all5" sheet of the metastatic-potential workbook,
# using its first column as the index.
mm_all_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name="metp500.all5",
    index_col=0,
)

In [10]:
# Cell-line identifiers of each split, in k-fold table order; used below to
# index both the expression matrix and the metastatic-potential sheets.
train_celllines = kfold_train_df["cellline"].tolist()
test_celllines = kfold_test_df["cellline"].tolist()

In [11]:
# Take the union, across tissues, of the significantly differentially expressed
# genes reported by the per-tissue DESeq result files.

# The first len(metmap_tissues) positional inputs are paired with the tissues in order.
deseq_files = dict(zip(metmap_tissues, snakemake.input[:len(metmap_tissues)]))
deseq_dfs = {}

significance_level = 0.05
fc_threshold = float(snakemake.wildcards["fc_threshold"])

deseq_significant_union = set()

for tissue, deseq_file in deseq_files.items():
    tissue_deseq_df = pd.read_csv(deseq_file, index_col=0)
    # Vectorised significance test; rows with NaN padj / log2FoldChange compare
    # as False, exactly as they did with the previous row-wise check.
    tissue_deseq_df["significant"] = (
        (tissue_deseq_df["padj"] <= significance_level)
        & (tissue_deseq_df["log2FoldChange"].abs() >= fc_threshold)
    )
    # Filter to keep only the significantly differentially expressed genes
    tissue_deseq_df = tissue_deseq_df.loc[tissue_deseq_df["significant"]]

    deseq_dfs[tissue] = tissue_deseq_df

    deseq_significant_union.update(tissue_deseq_df.index.tolist())

# Sort so the gene (feature column) order is reproducible across runs;
# iterating a set of strings directly gives an arbitrary order.
deseq_signficant_genes = sorted(deseq_significant_union)
len(deseq_signficant_genes)

In [12]:
# Expression matrices for each split, restricted to the selected genes.
tissue_train_test_X = {
    split: ccle_adata[celllines, deseq_signficant_genes].X
    for split, celllines in (("train", train_celllines), ("test", test_celllines))
}

Metastatic potential: "Data are presented on a log10 scale, range from -4 ~ 4."
- <= -4: non-metastatic
- -4~-2: (weakly) metastatic, but with low confidence
- \>= -2: metastatic, with higher confidence



In [25]:
def met_potential_encode(arr):
    """Bin log10 metastatic potential values into three ordinal classes.

    Classes:
        0 -- non-metastatic (value <= -4)
        1 -- weakly metastatic (-4 < value < -2)
        2 -- metastatic (value >= -2)

    Args:
        arr: 1-D array-like of metastatic potential values.

    Returns:
        A new float array of the same shape holding the class codes. The input
        is left unmodified. Entries matching none of the ranges (NaN) are
        passed through unchanged.
    """
    values = np.asarray(arr, dtype=float)
    # Write into a copy so the caller's array is not overwritten.
    encoded = values.copy()
    encoded[values <= -4] = 0
    encoded[(values > -4) & (values < -2)] = 1
    encoded[values >= -2] = 2
    return encoded

def met_potential_decode(arr):
    """Map encoded values back before scoring; currently the identity.

    It is applied to both predictions and targets before the metrics are
    computed, so those metrics are evaluated on the encoded values as-is.

    Args:
        arr: array of encoded values.

    Returns:
        `arr`, unchanged (the same object).
    """
    return arr

In [96]:
# Build the response matrices for PLSRegression: one encoded metastatic-potential
# column per tissue (in metmap_tissues order), stacked along the last axis.
# Rows follow the order of train_celllines / test_celllines, i.e. the ordering of
# cell lines in kfold_train_df and kfold_test_df.

train_y, test_y = [], []
for tissue in metmap_tissues:
    sheet = f"metp500.{tm_to_metmap[tissue]}"
    potential_df = pd.read_excel(snakemake.input['mm_potential'], sheet_name=sheet, index_col=0)

    train_potential = potential_df.loc[train_celllines, "mean"].values
    test_potential = potential_df.loc[test_celllines, "mean"].values

    train_y.append(met_potential_encode(train_potential))
    test_y.append(met_potential_encode(test_potential))

tissue_train_test_y = dict(
    train=np.stack(train_y, axis=-1),
    test=np.stack(test_y, axis=-1),
)

In [97]:
# Number of PLS components, taken from the "num_pc" wildcard.
n_components = int(snakemake.wildcards["num_pc"])

## PLSRegression

In [253]:
# Unpack the per-split feature and response matrices.
X_train, Y_train = tissue_train_test_X["train"], tissue_train_test_y["train"]
X_test, Y_test = tissue_train_test_X["test"], tissue_train_test_y["test"]

# Fit one PLS model with all tissue columns as joint responses.
pls2 = PLSRegression(n_components=n_components).fit(X_train, Y_train)

# Predict on test (held out) data
Y_pred = pls2.predict(X_test)

In [254]:
# Score the held-out predictions separately for each tissue.
# Column i of Y_pred / Y_test corresponds to metmap_tissues[i].
model_results = {}
for tissue_i, tissue in enumerate(metmap_tissues):
    Y_pred_tissue = met_potential_decode(Y_pred.T[tissue_i])
    Y_test_tissue = met_potential_decode(Y_test.T[tissue_i])
    model_results[tissue] = {
        # sklearn metrics take (y_true, y_pred). R^2 is normalised by the variance
        # of y_true, so the argument order changes the reported value.
        "r2": r2_score(Y_test_tissue, Y_pred_tissue),
        "mse": mean_squared_error(Y_test_tissue, Y_pred_tissue)
    }

In [255]:
# Display the per-tissue metrics on the held-out split.
model_results

In [256]:
# Write the held-out (test) metrics as JSON to the "model_test_results" output path.
with open(snakemake.output["model_test_results"], "w") as f:
    json.dump(model_results, f)

In [257]:
# Predict on training data (scored below to compare against the held-out metrics)
Y_train_pred = pls2.predict(X_train)

In [258]:
# Score the training-set predictions separately for each tissue.
# Column i of Y_train_pred / Y_train corresponds to metmap_tissues[i].
model_train_results = {}
for tissue_i, tissue in enumerate(metmap_tissues):
    Y_pred_tissue = met_potential_decode(Y_train_pred.T[tissue_i])
    Y_train_tissue = met_potential_decode(Y_train.T[tissue_i])
    model_train_results[tissue] = {
        # sklearn metrics take (y_true, y_pred). R^2 is normalised by the variance
        # of y_true, so the argument order changes the reported value.
        "r2": r2_score(Y_train_tissue, Y_pred_tissue),
        "mse": mean_squared_error(Y_train_tissue, Y_pred_tissue)
    }

In [259]:
# Display the per-tissue metrics on the training split.
model_train_results

In [260]:
# Write the training-set metrics as JSON to the "model_train_results" output path.
with open(snakemake.output["model_train_results"], "w") as f:
    json.dump(model_train_results, f)

## Basic deep learning model

In [261]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [262]:
# Convert the splits to tensors: float32 features for the network input,
# int64 class labels as targets for the classification loss.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_test = torch.tensor(Y_test, dtype=torch.long)

In [263]:
# Inspect the integer label tensor (one column per tissue).
Y_train

In [264]:
class GeneExpressionDataset(Dataset):
    """Dataset pairing each sample's features with its label for a single tissue.

    Follows the custom-dataset pattern described at
    https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files
    """

    def __init__(self, X, Y, tissue_i):
        # X: indexable per-sample features; Y: indexable per-sample label rows;
        # tissue_i: position within each label row that is exposed as the target.
        self.X, self.Y, self.tissue_i = X, Y, tissue_i

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the (features, label) pair for sample `i`."""
        features = self.X[i]
        label = self.Y[i][self.tissue_i]
        return features, label
    

In [265]:
# Network input width = number of feature columns in X_train.
num_features = X_train.shape[1]
# Three output classes, matching the 0 / 1 / 2 codes produced by met_potential_encode.
num_classes = 3

In [280]:
# Mini-batch loaders over the label column at index 0 (the first tissue) only.
# NOTE(review): the evaluation loader is shuffled as well; test() only sums over
# batches, so this looks harmless but unnecessary -- confirm it is intended.
train_dataloader = DataLoader(GeneExpressionDataset(X_train, Y_train, 0), batch_size=16, shuffle=True)
test_dataloader = DataLoader(GeneExpressionDataset(X_test, Y_test, 0), batch_size=16, shuffle=True)

In [281]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

In [304]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier with two 512-unit hidden layers.

    The network returns raw (unbounded) logits. No activation follows the last
    linear layer because nn.CrossEntropyLoss applies log-softmax itself; a
    trailing ReLU would clamp every logit to be non-negative.

    Args:
        in_features: width of the input layer. Defaults to the notebook-level
            `num_features` when omitted.
        out_classes: number of output logits. Defaults to the notebook-level
            `num_classes` when omitted.
    """

    def __init__(self, in_features=None, out_classes=None):
        super().__init__()
        # Fall back to the notebook globals so `NeuralNetwork()` keeps working.
        if in_features is None:
            in_features = num_features
        if out_classes is None:
            out_classes = num_classes
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, out_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch of feature vectors `x`."""
        logits = self.linear_relu_stack(x)
        return logits

In [311]:
# Instantiate the network on the selected device and print its layer structure.
model = NeuralNetwork().to(device)
print(model)

In [312]:
# Multi-class cross-entropy objective, optimised with plain SGD (learning rate 1e-3).
# Note: test() reads `loss_fn` from this notebook-level name rather than a parameter.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [313]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`, printing the loss periodically.

    Args:
        dataloader: yields (features, labels) mini-batches.
        model: the network to optimise; moved data uses the notebook-level `device`.
        loss_fn: callable computing the loss from (predictions, labels).
        optimizer: optimizer stepping the model's parameters.
    """
    size = len(dataloader.dataset)
    # test() switches the model to eval mode, so put it back into training
    # mode at the start of every pass.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every 8 batches.
        if batch % 8 == 0:
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [314]:
def test(dataloader, model):
    """Evaluate `model` on `dataloader` and print accuracy and average loss.

    Uses the notebook-level `loss_fn` and `device`.

    Args:
        dataloader: yields (features, labels) mini-batches.
        model: the network to evaluate; switched to eval mode.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    # Each accumulated term is already a per-batch mean (the loss's default
    # reduction), so average over batches; accuracy is averaged over samples.
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [315]:
# Alternate one training pass and one evaluation pass per epoch.
epochs = 30
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)
print("Done!")

In [316]:
# TODO:
# train models for each organ individually
# visualize the accuracy and loss over each epoch
# train single model for all organs together, with 3*num_organs nodes in the output layer