In [15]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData

from sklearn.model_selection import StratifiedKFold

In [16]:
# Number of cross-validation folds, taken from the Snakemake rule's `params`.
num_folds = snakemake.params["num_folds"]

In [17]:
# Read the AnnData object from the file registered as the `ccle_exp` input.
ccle_exp_path = snakemake.input['ccle_exp']
ccle_adata = read_h5ad(ccle_exp_path)

In [18]:
# Load the `metp500.all5` sheet of the `mm_potential` workbook; the sheet's
# first column becomes the DataFrame index.
mm_all_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name="metp500.all5",
    index_col=0,
)

In [19]:
# Index labels of the spreadsheet table and of the AnnData observations.
mm_celllines = mm_all_df.index.values.tolist()
ccle_celllines = ccle_adata.obs.index.values.tolist()

# Labels present in both sources, and how many of them there are.
cellline_intersect = set(ccle_celllines) & set(mm_celllines)
num_cellines = len(cellline_intersect)
num_cellines

In [20]:
# Left-join the spreadsheet columns onto the observation table by index label,
# then install the widened table back on the AnnData object.
obs_with_mm = ccle_adata.obs.merge(
    mm_all_df,
    how="left",
    left_index=True,
    right_index=True,
)
ccle_adata.obs = obs_with_mm

In [21]:
# Boolean mask (indexed by observation label) marking rows whose label is in
# the shared set of cell lines.
mm_rows = ccle_adata.obs.index.to_series().isin(cellline_intersect)

# Subsetting an AnnData object yields a view of the parent. The next step
# assigns a new column into `.obs`, so take an explicit copy here instead of
# relying on anndata's implicit copy-on-modify (which emits a warning).
ccle_adata = ccle_adata[mm_rows, :].copy()

In [22]:
# Boolean label: True where `penetrance` is strictly greater than zero.
ccle_adata.obs["metastatic"] = ccle_adata.obs["penetrance"] > 0.0

In [23]:
# Display the observation table (now including the merged columns and the
# `metastatic` label) for a visual check; this cell has no side effects.
ccle_adata.obs

In [24]:
# Fixed seed passed to the shuffled StratifiedKFold splitter so that fold
# assignments are reproducible between runs.
random_state = 2445

In [25]:
# Stratified splitter: `num_folds` folds, shuffled with the fixed seed.
skf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=random_state,
)

In [31]:
# Observation labels in row order; the integer positions produced by the
# splitter are looked up in this list.
indices = ccle_adata.obs.index.values.tolist()

# Stratification labels, one per observation row.
y = ccle_adata.obs["metastatic"].values

# Placeholder feature array: only its length matters to the splitter. It is
# sized from `y` (the actual row count) rather than from the size of the
# label intersection set, which would be smaller than the row count if any
# observation label occurred more than once and would make the split fail.
X = np.zeros(len(y))

In [32]:
# Column order of the fold-assignment table.
out_cols = ["fold", "cellline", "cellline_index", "set"]

# Collect one record per (fold, cell line) membership and build the frame
# once at the end. `DataFrame.append` was removed in pandas 2.0, and growing
# a frame row by row copies it on every iteration.
fold_records = []
for fold_i, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    # Within each fold, training rows come first, then test rows.
    for set_name, member_indices in (("train", train_indices), ("test", test_indices)):
        fold_records.extend(
            {
                "fold": fold_i,
                "cellline": indices[member_i],
                "cellline_index": int(member_i),
                "set": set_name,
            }
            for member_i in member_indices
        )

# Passing `columns` keeps the expected header even if no records were produced.
out_df = pd.DataFrame(fold_records, columns=out_cols)



In [33]:
# Write the fold-assignment table as CSV to the rule's first declared output.
out_df.to_csv(snakemake.output[0])