In [2]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc
import json

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cross_decomposition import PLSRegression

import altair as alt
from altair_saver import save as alt_save

In [3]:
# Rule parameters handed in by Snakemake:
#   metmap_tissues - the tissues iterated over throughout this notebook
#   tm_to_metmap   - lookup from each tissue to the suffix of its "metp500.*" sheet
rule_params = snakemake.params
metmap_tissues = rule_params['metmap_tissues']
tm_to_metmap = rule_params['tm_to_metmap']

In [4]:
# The cross-validation fold handled by this run comes from the "fold" wildcard,
# converted to an integer so it can be compared with the fold column below.
fold_wildcard = snakemake.wildcards["fold"]
curr_fold = int(fold_wildcard)
curr_fold

In [5]:
# Load the k-fold assignment table and keep only the rows of the current fold,
# split into its "train" and "test" sets.
kfold_df = pd.read_csv(snakemake.input['kfold_indices'])
in_curr_fold = kfold_df["fold"] == curr_fold
kfold_train_df = kfold_df.loc[in_curr_fold & (kfold_df["set"] == "train")]
kfold_test_df = kfold_df.loc[in_curr_fold & (kfold_df["set"] == "test")]

In [6]:
# Preview the first rows of the training split for the current fold.
kfold_train_df.head()

In [7]:
# Load the AnnData object stored at the 'ccle_exp' input; its expression
# values are transformed in place by the preprocessing step that follows.
ccle_exp_path = snakemake.input['ccle_exp']
ccle_adata = read_h5ad(ccle_exp_path)

In [8]:
# Preprocess the gene expression data according to the "gexp_transform" wildcard.
# The recognised values are cumulative:
#   "tpm"             -> normalize_total(target_sum=1e4)
#   "log1p_tpm"       -> normalize_total, then log1p
#   "log1p_tpm_scale" -> normalize_total, log1p, then scale(max_value=10)
# Any other value leaves ccle_adata unchanged.
gexp_transform = snakemake.wildcards["gexp_transform"]
if gexp_transform in ("tpm", "log1p_tpm", "log1p_tpm_scale"):
    sc.pp.normalize_total(ccle_adata, target_sum=1e4)
    if gexp_transform != "tpm":
        sc.pp.log1p(ccle_adata)
        if gexp_transform == "log1p_tpm_scale":
            sc.pp.scale(ccle_adata, max_value=10)

In [9]:
# Read the "metp500.all5" sheet of the metastatic-potential workbook,
# using its first column as the index.
mm_all_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name="metp500.all5",
    index_col=0,
)

In [10]:
# Cell line identifiers of each split, in the row order of the fold tables.
train_celllines, test_celllines = (
    split_df["cellline"].values.tolist()
    for split_df in (kfold_train_df, kfold_test_df)
)

In [11]:
# Feature selection: take the union, across tissues, of the genes that are
# significantly differentially expressed in the training set.
#
# The per-tissue DESeq result files are taken to be the first
# len(metmap_tissues) positional inputs of the rule, paired with
# metmap_tissues by position.
# NOTE(review): this relies on the input ordering in the Snakefile - confirm.

deseq_files = dict(zip(metmap_tissues, snakemake.input[:len(metmap_tissues)]))
deseq_dfs = {}

# A gene is kept when padj <= significance_level and
# |log2FoldChange| >= fc_threshold (the latter comes from the wildcard).
significance_level = 0.05
fc_threshold = float(snakemake.wildcards["fc_threshold"])

deseq_significant_union = set()

for tissue, deseq_file in deseq_files.items():
    tissue_deseq_df = pd.read_csv(deseq_file, index_col=0)
    # Vectorised mask instead of a row-wise apply: same result (comparisons
    # against NaN are False either way), much faster, and well-defined even
    # when the table has no rows.
    is_significant = (
        (tissue_deseq_df["padj"] <= significance_level)
        & (tissue_deseq_df["log2FoldChange"].abs() >= fc_threshold)
    )
    tissue_deseq_df["significant"] = is_significant
    # Filter to keep only the significantly differentially expressed genes
    tissue_deseq_df = tissue_deseq_df.loc[is_significant]

    deseq_dfs[tissue] = tissue_deseq_df

    deseq_significant_union.update(tissue_deseq_df.index.values.tolist())

# Sort the union so the gene (feature column) order is reproducible between
# runs; iterating a set of strings depends on per-process hash randomisation.
deseq_signficant_genes = sorted(deseq_significant_union)
len(deseq_signficant_genes)

In [12]:
# Predictor matrices: the expression values of the selected genes for the
# cell lines of each split, keyed by "train" / "test".
tissue_train_test_X = {
    split_name: ccle_adata[split_celllines, deseq_signficant_genes].X
    for split_name, split_celllines in (
        ("train", train_celllines),
        ("test", test_celllines),
    )
}

In [13]:
# Build the response variables for PLSRegression: for every tissue, the "mean"
# metastatic potential of each cell line, read from that tissue's
# "metp500.<name>" sheet. Rows are selected with train_celllines /
# test_celllines, so they follow the ordering of kfold_train_df and
# kfold_test_df; the per-tissue vectors are stacked as columns.

mm_potential_path = snakemake.input['mm_potential']

train_y, test_y = [], []
for tissue in metmap_tissues:
    sheet = f"metp500.{tm_to_metmap[tissue]}"
    mm_tissue_df = pd.read_excel(mm_potential_path, sheet_name=sheet, index_col=0)

    tissue_mean = mm_tissue_df["mean"]
    train_y.append(tissue_mean.loc[train_celllines].values)
    test_y.append(tissue_mean.loc[test_celllines].values)

tissue_train_test_y = {
    split_name: np.stack(split_columns, axis=-1)
    for split_name, split_columns in (("train", train_y), ("test", test_y))
}

In [14]:
# Number of PLS components to fit, taken from the "num_pc" wildcard.
num_pc_wildcard = snakemake.wildcards["num_pc"]
n_components = int(num_pc_wildcard)

In [15]:
# Unpack predictors (X) and responses (Y) for both splits.
X_train, Y_train = tissue_train_test_X["train"], tissue_train_test_y["train"]
X_test, Y_test = tissue_train_test_X["test"], tissue_train_test_y["test"]

# Fit a single PLS regression on the training split; Y holds one column per tissue.
pls2 = PLSRegression(n_components=n_components)
pls2.fit(X_train, Y_train)

# Predictions for the held-out (test) cell lines
Y_pred = pls2.predict(X_test)

In [18]:
# Score the held-out predictions separately for each tissue.
# Column i of Y_test / Y_pred belongs to metmap_tissues[i] (the responses were
# stacked in that order).
#
# scikit-learn metrics take (y_true, y_pred). The order matters for R^2, which
# is normalised by the variance of y_true, so the observed values go first.
# MSE is symmetric but uses the same order for consistency.
model_results = {}
for tissue_i, tissue in enumerate(metmap_tissues):
    Y_test_tissue = Y_test.T[tissue_i]
    Y_pred_tissue = Y_pred.T[tissue_i]
    model_results[tissue] = {
        "r2": r2_score(Y_test_tissue, Y_pred_tissue),
        "mse": mean_squared_error(Y_test_tissue, Y_pred_tissue)
    }

In [19]:
# Write the per-tissue test-set metrics to the "model_test_results" output as JSON.
test_results_path = snakemake.output["model_test_results"]
with open(test_results_path, "w") as results_file:
    json.dump(model_results, results_file)

In [20]:
# Predict on training data
Y_train_pred = pls2.predict(X_train)

In [21]:
# Score the training-set predictions separately for each tissue.
# Column i of Y_train / Y_train_pred belongs to metmap_tissues[i] (the
# responses were stacked in that order).
#
# scikit-learn metrics take (y_true, y_pred). The order matters for R^2, which
# is normalised by the variance of y_true, so the observed values go first.
# MSE is symmetric but uses the same order for consistency.
model_train_results = {}
for tissue_i, tissue in enumerate(metmap_tissues):
    Y_train_tissue = Y_train.T[tissue_i]
    Y_pred_tissue = Y_train_pred.T[tissue_i]
    model_train_results[tissue] = {
        "r2": r2_score(Y_train_tissue, Y_pred_tissue),
        "mse": mean_squared_error(Y_train_tissue, Y_pred_tissue)
    }

In [22]:
# Write the per-tissue training-set metrics to the "model_train_results" output as JSON.
train_results_path = snakemake.output["model_train_results"]
with open(train_results_path, "w") as results_file:
    json.dump(model_train_results, results_file)