In [38]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc

In [39]:
# Run settings injected by Snakemake: the cross-validation fold handled by this
# run (wildcards arrive as strings, hence the int cast) and the tissue that
# selects the `metp500.<tissue>` sheet further down.
curr_fold = int(snakemake.wildcards["fold"])
metmap_tissue = snakemake.params["metmap_tissue"]

This notebook mirrors `nonmetastatic_deseq_preparation.py.ipynb`, except that the differential expression data is restricted to the training set of the current fold.

In [40]:
# Load the fold assignments, then keep only the rows marked as training data
# for the fold this run is responsible for.
kfold_all = pd.read_csv(snakemake.input["kfold_indices"])
is_current_train = kfold_all["fold"].eq(curr_fold) & kfold_all["set"].eq("train")
kfold_df = kfold_all.loc[is_current_train]
kfold_df.head()

In [41]:
# Read the expression AnnData object supplied under the `ccle_exp` input.
ccle_exp_path = snakemake.input["ccle_exp"]
ccle_adata = read_h5ad(ccle_exp_path)

In [42]:
# Open the workbook once and read both sheets from the same handle instead of
# re-opening the file for every `read_excel` call. The first column of each
# sheet becomes the index. Only the tissue sheet name needs formatting.
with pd.ExcelFile(snakemake.input["mm_potential"]) as mm_workbook:
    mm_all_df = pd.read_excel(mm_workbook, sheet_name="metp500.all5", index_col=0)
    mm_tissue_df = pd.read_excel(mm_workbook, sheet_name=f"metp500.{metmap_tissue}", index_col=0)

In [43]:
# Plain Python list of the cell line identifiers in this fold's training rows.
fold_celllines = kfold_df["cellline"].to_numpy().tolist()

Keep only the cell lines that belong to the training set of the current fold.

In [44]:
# Boolean mask over the observations: True where the observation's index label
# is one of this fold's training cell lines. `isin` does the membership test in
# one vectorised pass instead of scanning the Python list once per row.
mm_rows = ccle_adata.obs.index.to_series().isin(fold_celllines)
# Slicing an AnnData object yields a view. Take an explicit copy so that the
# `.obs` edits made later act on a standalone object rather than triggering an
# implicit copy-on-modify of the view.
ccle_adata = ccle_adata[mm_rows, :].copy()

In [45]:
# Attach the tissue-specific sheet's columns to the observation table, matching
# on the index of both frames. A left join keeps every observation, so rows
# without a match in the sheet get NaN in the added columns.
obs_with_potential = pd.merge(
    ccle_adata.obs,
    mm_tissue_df,
    how="left",
    left_index=True,
    right_index=True,
)
ccle_adata.obs = obs_with_potential

In [46]:
# Flag an observation as metastatic when its penetrance is strictly positive.
# The comparison is vectorised; a missing (NaN) penetrance compares as False,
# so such rows are flagged False, exactly as with the element-wise version.
ccle_adata.obs["metastatic"] = ccle_adata.obs["penetrance"] > 0.0

In [47]:
# Preview the annotated observation table; the first rows are enough to check
# that the new columns are present.
ccle_adata.obs.head()

In [48]:
# Expression matrix as a DataFrame (observations as rows, variables as columns).
# `to_df()` labels the axes from obs_names / var_names and densifies `X` when it
# is stored sparse, so the DataFrame constructor never has to handle a sparse matrix.
# Axis names are cleared so the CSV header is identical to one built from plain
# label lists.
counts_df = ccle_adata.to_df().rename_axis(index=None, columns=None)

In [49]:
# Per-observation condition table: the `mean` and `penetrance` columns plus the
# derived `metastatic` flag.
condition_columns = ["mean", "penetrance", "metastatic"]
conditions_df = ccle_adata.obs[condition_columns]

In [50]:
# Write both tables to the CSV paths Snakemake declared for this rule.
for output_key, frame in (("counts", counts_df), ("conditions", conditions_df)):
    frame.to_csv(snakemake.output[output_key])