In [4]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc
import json
from scipy.io import mmwrite

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cross_decomposition import PLSRegression

import altair as alt
from altair_saver import save as alt_save

In [2]:
# `snakemake` is not defined anywhere in this notebook; it is expected to be
# injected by the workflow runner that executes it.
smk_params = snakemake.params
# Tissues to iterate over, and the mapping from each tissue to the suffix of
# its "metp500.<suffix>" sheet in the metastatic-potential workbook.
metmap_tissues = smk_params["metmap_tissues"]
tm_to_metmap = smk_params["tm_to_metmap"]

NameError: name 'snakemake' is not defined

In [None]:
# The "fold" wildcard is cast to int so it can be compared against the
# "fold" column of the k-fold index table.
fold_wildcard = snakemake.wildcards["fold"]
curr_fold = int(fold_wildcard)
curr_fold

In [None]:
# Load the k-fold assignment table and keep only the rows of the current fold,
# split into its "train" and "test" subsets.
kfold_df = pd.read_csv(snakemake.input["kfold_indices"])

in_curr_fold = kfold_df["fold"] == curr_fold
is_train_row = kfold_df["set"] == "train"
is_test_row = kfold_df["set"] == "test"

kfold_train_df = kfold_df.loc[in_curr_fold & is_train_row]
kfold_test_df = kfold_df.loc[in_curr_fold & is_test_row]

In [None]:
# Preview the first rows of the training subset for the current fold.
kfold_train_df.head(n=5)

In [None]:
# Expression data; indexed further down as [cell lines, genes].
ccle_exp_path = snakemake.input["ccle_exp"]
ccle_adata = read_h5ad(ccle_exp_path)

In [None]:
# Sheet "metp500.all5" of the metastatic-potential workbook; the first column
# becomes the index.
mm_all_df = pd.read_excel(
    snakemake.input["mm_potential"],
    sheet_name="metp500.all5",
    index_col=0,
)

In [None]:
# Cell-line identifiers of the current fold, as plain Python lists. Their order
# defines the row order of the X and Y matrices built below.
train_celllines = kfold_train_df["cellline"].tolist()
test_celllines = kfold_test_df["cellline"].tolist()

In [None]:
# Feature selection: take the union, across tissues, of the genes that are
# significantly differentially expressed in the training set.

# NOTE: this relies on the first len(metmap_tissues) positional Snakemake inputs
# being the per-tissue DESeq result files, in the same order as metmap_tissues.
deseq_files = dict(zip(metmap_tissues, snakemake.input[:len(metmap_tissues)]))
deseq_dfs = {}

# A gene is kept when padj <= significance_level and |log2FoldChange| >= fc_level.
significance_level = 0.01
fc_level = 2

deseq_significant_union = set()

for tissue, deseq_file in deseq_files.items():
    tissue_deseq_df = pd.read_csv(deseq_file, index_col=0)
    # Vectorised mask instead of a row-wise apply: same result (NaN values
    # compare as False), but faster and well-defined on an empty table.
    tissue_deseq_df["significant"] = (
        (tissue_deseq_df["padj"] <= significance_level)
        & (tissue_deseq_df["log2FoldChange"].abs() >= fc_level)
    )
    # Filter to keep only the significantly differentially expressed genes
    tissue_deseq_df = tissue_deseq_df.loc[tissue_deseq_df["significant"]]

    deseq_dfs[tissue] = tissue_deseq_df

    deseq_significant_union.update(tissue_deseq_df.index.tolist())

# Sort so the gene (column) order is reproducible: iterating a set of strings
# can yield a different order on every interpreter run.
# (The misspelled name is kept because later cells reference it.)
deseq_signficant_genes = sorted(deseq_significant_union)
len(deseq_signficant_genes)

In [None]:
# Predictor data per split: the expression object subset to the split's cell
# lines (rows) and the selected differentially expressed genes (columns).
tissue_train_test_X = {
    split_name: ccle_adata[split_celllines, deseq_signficant_genes]
    for split_name, split_celllines in (
        ("train", train_celllines),
        ("test", test_celllines),
    )
}

In [3]:
# Build the response matrices (metastatic potential) for the train and test splits.
# These will become the response variables for PLSRegression.
# Rows follow the ordering of cell lines in kfold_train_df / kfold_test_df
# (via train_celllines / test_celllines); there is one column per tissue, in
# metmap_tissues order.

train_y = []
test_y = []

# Read every required sheet in a single call so the workbook is opened and
# parsed once, rather than once per tissue.
mm_sheet_names = {
    tissue: f"metp500.{tm_to_metmap[tissue]}" for tissue in metmap_tissues
}
mm_tissue_dfs = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name=list(dict.fromkeys(mm_sheet_names.values())),  # de-duplicated, order kept
    index_col=0,
)

for tissue in metmap_tissues:
    mm_tissue_df = mm_tissue_dfs[mm_sheet_names[tissue]]

    # .loc with a list of labels raises KeyError if any cell line is missing.
    mm_tissue_train_df = mm_tissue_df.loc[train_celllines]
    mm_tissue_test_df = mm_tissue_df.loc[test_celllines]

    # Duplicate index labels in the sheet would return extra rows and silently
    # misalign Y against the rows of X, so fail loudly instead.
    if (len(mm_tissue_train_df) != len(train_celllines)
            or len(mm_tissue_test_df) != len(test_celllines)):
        raise ValueError(
            f"Sheet '{mm_sheet_names[tissue]}' does not yield exactly one row per "
            "requested cell line (duplicate index labels?)."
        )

    train_y.append(mm_tissue_train_df["mean"].values)
    test_y.append(mm_tissue_test_df["mean"].values)

# Stack the per-tissue vectors along the last axis.
tissue_train_test_y = {
    "train": np.stack(train_y, axis=-1),
    "test": np.stack(test_y, axis=-1)
}

NameError: name 'metmap_tissues' is not defined

In [13]:
# Unpack the per-split predictors (X) and responses (Y) into flat names.
X_train, Y_train = tissue_train_test_X["train"], tissue_train_test_y["train"]
X_test, Y_test = tissue_train_test_X["test"], tissue_train_test_y["test"]

In [None]:
def _as_matrix(data):
    """Return the underlying matrix (`.X`) of an AnnData object; pass anything else through."""
    return data.X if isinstance(data, AnnData) else data


# Write the train/test predictors and responses in Matrix Market format.
# X_train / X_test are slices of `ccle_adata`, i.e. AnnData objects, but mmwrite
# accepts only dense array-likes or scipy sparse matrices, so the expression
# matrix is extracted first. Y_* are already NumPy arrays and pass through.
# NOTE(review): row (cell line) and column (gene) labels are not written here —
# confirm that downstream consumers do not need them.
mmwrite(snakemake.output["X_train"], _as_matrix(X_train))
mmwrite(snakemake.output["Y_train"], _as_matrix(Y_train))
mmwrite(snakemake.output["X_test"], _as_matrix(X_test))
mmwrite(snakemake.output["Y_test"], _as_matrix(Y_test))