In [2]:
# start coding here

In [136]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc

In [137]:
# Workflow settings injected by Snakemake.
# metmap_tissue selects the "metp500.<tissue>" sheet read from the metastasis-potential workbook.
metmap_tissue = snakemake.params['metmap_tissue']
# expression_scale picks the transform applied after total-count normalization;
# only "log" and "binary" are handled explicitly further down.
expression_scale = snakemake.wildcards['expression_scale']

In [138]:
# Expression inputs: the Tabula Muris pseudobulk object and the CCLE expression object.
tm_adata = read_h5ad(snakemake.input['tm_pseudobulk'])
ccle_adata = read_h5ad(snakemake.input['ccle_exp'])

# Tissue-specific metastasis potential: one workbook sheet per tissue, first column as index.
mm_potential_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name=f"metp500.{metmap_tissue}",
    index_col=0,
)

# Interaction table (tab-separated), kept only where partner A or partner B
# has at least one mouse ortholog.
i_df = pd.read_csv(snakemake.input['interactions'], sep='\t').loc[
    lambda interactions: interactions["a_or_b_have_orthologs"]
]

In [139]:
# Row labels of the metastasis-potential sheet (used as cell-line identifiers below).
mm_celllines = mm_potential_df.index.tolist()

In [140]:
# Cell lines present both as CCLE observations and in the metastasis-potential sheet.
cellline_intersect = set(ccle_adata.obs_names) & set(mm_celllines)
len(cellline_intersect)

In [141]:
# Keep only the part of each CCLE gene ID before the first "." (drops the version suffix).
ccle_adata.var['human_gene_ensembl'] = [
    versioned_id.split(".")[0] for versioned_id in ccle_adata.var.index
]
# Use the same column name for gene symbols as the interaction table does.
ccle_adata.var = ccle_adata.var.rename(columns={'Description': "human_gene"})
ccle_adata.var.head()

In [142]:
# Align the Tabula Muris gene-symbol column name with the interaction table.
tm_adata.var = tm_adata.var.rename({'name': "mouse_gene"}, axis="columns")

In [143]:
# Distinct version-stripped gene IDs measured in CCLE.
ccle_genes = set(ccle_adata.var['human_gene_ensembl'])

In [144]:
# Get the set of mouse genes present in the interaction table
i_mouse_genes = set(i_df.dropna(subset=['mouse_gene'])['mouse_gene'].unique().tolist())
i_mouse_genes_ensembl = set(i_df.dropna(subset=['mouse_gene_ensembl'])['mouse_gene_ensembl'].unique().tolist())

In [145]:
# Get the set of human genes present in the interaction table
i_human_genes = set(i_df.dropna(subset=['human_gene'])['human_gene'].unique().tolist())
i_human_genes_ensembl = set(i_df.dropna(subset=['human_gene_ensembl'])['human_gene_ensembl'].unique().tolist())

In [146]:
# Distinct variable (gene) labels of the Tabula Muris object.
tm_genes = set(tm_adata.var_names)

In [147]:
def _column_lookup(frame, key_col, value_col):
    """Return a ``{key_col value: value_col value}`` dict built row by row from ``frame``.

    When the same key occurs on several rows, the value from the last such row wins.
    """
    return dict(zip(frame[key_col].tolist(), frame[value_col].tolist()))


# Each lookup is built only from rows where both columns of the pair are present.

# Mouse symbol <-> human symbol
i_mh_df = i_df.dropna(subset=["mouse_gene", "human_gene"])
gene_mouse_to_human = _column_lookup(i_mh_df, 'mouse_gene', 'human_gene')
gene_human_to_mouse = _column_lookup(i_mh_df, 'human_gene', 'mouse_gene')

# Mouse Ensembl ID <-> human Ensembl ID
i_mhe_df = i_df.dropna(subset=["mouse_gene_ensembl", "human_gene_ensembl"])
gene_mouse_ensembl_to_human_ensembl = _column_lookup(i_mhe_df, 'mouse_gene_ensembl', 'human_gene_ensembl')
gene_human_ensembl_to_mouse_ensembl = _column_lookup(i_mhe_df, 'human_gene_ensembl', 'mouse_gene_ensembl')

# Human symbol <-> human Ensembl ID
i_he_df = i_df.dropna(subset=["human_gene", "human_gene_ensembl"])
gene_human_to_human_ensembl = _column_lookup(i_he_df, 'human_gene', 'human_gene_ensembl')
gene_human_ensembl_to_human = _column_lookup(i_he_df, 'human_gene_ensembl', 'human_gene')

# Mouse symbol <-> mouse Ensembl ID
i_me_df = i_df.dropna(subset=["mouse_gene", "mouse_gene_ensembl"])
gene_mouse_to_mouse_ensembl = _column_lookup(i_me_df, 'mouse_gene', 'mouse_gene_ensembl')
gene_mouse_ensembl_to_mouse = _column_lookup(i_me_df, 'mouse_gene_ensembl', 'mouse_gene')

In [148]:
# Flag genes that occur in the interaction table: mouse genes are matched by symbol,
# human genes by version-stripped Ensembl ID.
tm_adata.var['in_interaction'] = tm_adata.var['mouse_gene'].isin(i_mouse_genes)
ccle_adata.var['in_interaction'] = ccle_adata.var['human_gene_ensembl'].isin(i_human_genes_ensembl)

In [149]:
# Number of Tabula Muris genes flagged as present in the interaction table.
tm_adata.var['in_interaction'].sum()

In [150]:
# Number of CCLE genes flagged as present in the interaction table.
ccle_adata.var['in_interaction'].sum()

In [151]:
# Keep only the genes that occur in the interaction table.
# Slicing an AnnData returns a view of the parent object; later cells add `.var` columns to
# these subsets and normalize them in place, so take an explicit copy here instead of relying
# on the implicit view-to-copy conversion (which emits ImplicitModificationWarning).
tm_cpdb_adata = tm_adata[:, tm_adata.var['in_interaction']].copy()
ccle_cpdb_adata = ccle_adata[:, ccle_adata.var['in_interaction']].copy()

In [152]:
# Inspect the gene annotation of the CCLE subset restricted to interaction genes.
ccle_cpdb_adata.var

In [162]:
# Standardize the gene names and add ensembl gene names where they are missing.
# Add orthologous gene names and ensembl gene names to each data frame.

In [156]:
# Human gene symbol: prefer the symbol recorded in the interaction table for this Ensembl ID.
# An ID can be flagged as "in interaction" while appearing only on rows without a symbol; a
# plain dict lookup would raise KeyError for it, so map (missing -> NaN) and fall back to the
# symbol already present in `.var` when there is one.
_ccle_symbols = ccle_cpdb_adata.var['human_gene_ensembl'].map(gene_human_ensembl_to_human)
if 'human_gene' in ccle_cpdb_adata.var.columns:
    _ccle_symbols_fallback = ccle_cpdb_adata.var['human_gene'].astype(object).values
    # Positional selection, so no index alignment is involved.
    ccle_cpdb_adata.var['human_gene'] = np.where(
        _ccle_symbols.isna().values, _ccle_symbols_fallback, _ccle_symbols.values
    )
else:
    ccle_cpdb_adata.var['human_gene'] = _ccle_symbols.values


def get_mouse_gene_ensembl_from_human_gene_ensembl(ens):
    """Return the mouse Ensembl ID paired with human Ensembl ID ``ens``.

    Returns ``np.nan`` when ``ens`` has no entry in ``gene_human_ensembl_to_mouse_ensembl``.
    """
    return gene_human_ensembl_to_mouse_ensembl.get(ens, np.nan)


ccle_cpdb_adata.var['mouse_gene_ensembl'] = ccle_cpdb_adata.var['human_gene_ensembl'].apply(get_mouse_gene_ensembl_from_human_gene_ensembl)


def get_mouse_gene_from_mouse_gene_ensembl(ens):
    """Return the mouse gene symbol for mouse Ensembl ID ``ens``.

    Returns ``np.nan`` when ``ens`` has no entry in ``gene_mouse_ensembl_to_mouse``
    (this includes the NaN produced by a failed ortholog lookup).
    """
    return gene_mouse_ensembl_to_mouse.get(ens, np.nan)


ccle_cpdb_adata.var['mouse_gene'] = ccle_cpdb_adata.var['mouse_gene_ensembl'].apply(get_mouse_gene_from_mouse_gene_ensembl)

In [159]:
# Mouse Ensembl ID for every Tabula Muris gene symbol. The lookup is strict: a symbol that is
# missing from gene_mouse_to_mouse_ensembl raises KeyError.
tm_cpdb_adata.var['mouse_gene_ensembl'] = tm_cpdb_adata.var['mouse_gene'].apply(gene_mouse_to_mouse_ensembl.__getitem__)


def get_human_gene_ensembl_from_mouse_gene_ensembl(ens):
    """Return the human Ensembl ID paired with mouse Ensembl ID ``ens``, or ``np.nan`` if unmapped."""
    return gene_mouse_ensembl_to_human_ensembl.get(ens, np.nan)


tm_cpdb_adata.var['human_gene_ensembl'] = tm_cpdb_adata.var['mouse_gene_ensembl'].apply(get_human_gene_ensembl_from_mouse_gene_ensembl)


def get_human_gene_from_human_gene_ensembl(ens):
    """Return the human gene symbol for human Ensembl ID ``ens``, or ``np.nan`` if unmapped."""
    return gene_human_ensembl_to_human.get(ens, np.nan)


tm_cpdb_adata.var['human_gene'] = tm_cpdb_adata.var['human_gene_ensembl'].apply(get_human_gene_from_human_gene_ensembl)

In [163]:
# Check the CCLE gene annotation after adding symbol and ortholog columns.
ccle_cpdb_adata.var.head()

In [164]:
# Check the Tabula Muris gene annotation after adding Ensembl and ortholog columns.
tm_cpdb_adata.var.head()

In [127]:
# Plot the 20 highest-expressed genes of the CCLE subset, labelled by human gene symbol.
# This cell sits before the normalization cell, so it shows the data as loaded.
sc.pl.highest_expr_genes(ccle_cpdb_adata, n_top=20, gene_symbols="human_gene")

## Preprocessing of expression data

In [128]:
# Plot the 20 highest-expressed genes of the Tabula Muris subset, labelled by mouse gene symbol.
# This cell sits before the normalization cell, so it shows the data as loaded.
sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20, gene_symbols="mouse_gene")

In [129]:
# Scale every observation to a total of 1e4 across the retained (interaction) genes.
for _adata in (tm_cpdb_adata, ccle_cpdb_adata):
    sc.pp.normalize_total(_adata, target_sum=1e4)

# Optional second transform chosen by the `expression_scale` wildcard; any value other than
# "log" or "binary" leaves the normalized values untouched.
if expression_scale == "log":
    for _adata in (tm_cpdb_adata, ccle_cpdb_adata):
        sc.pp.log1p(_adata)
elif expression_scale == "binary":
    # Expressed / not expressed.
    for _adata in (tm_cpdb_adata, ccle_cpdb_adata):
        _adata.X = (_adata.X > 0)

In [130]:
# Same plot as before for the CCLE subset, now placed after normalization / rescaling.
sc.pl.highest_expr_genes(ccle_cpdb_adata, n_top=20, gene_symbols="human_gene")

In [131]:
# Same plot as before for the Tabula Muris subset, now placed after normalization / rescaling.
sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20, gene_symbols="mouse_gene")

In [165]:
# Genes-by-observations expression tables, indexed by Ensembl ID so they can be joined onto
# the interaction table: human IDs for CCLE, mouse IDs for Tabula Muris.
ccle_cpdb_X_df = pd.DataFrame(
    ccle_cpdb_adata.X.T,
    index=ccle_cpdb_adata.var['human_gene_ensembl'].tolist(),
    columns=ccle_cpdb_adata.obs_names.tolist(),
)
tm_cpdb_X_df = pd.DataFrame(
    tm_cpdb_adata.X.T,
    index=tm_cpdb_adata.var['mouse_gene_ensembl'].tolist(),
    columns=tm_cpdb_adata.obs_names.tolist(),
)

In [166]:
# Preview the CCLE table: rows are human Ensembl IDs, columns are CCLE observations.
ccle_cpdb_X_df.head()

In [167]:
# Preview the Tabula Muris table: rows are mouse Ensembl IDs, columns are Tabula Muris observations.
tm_cpdb_X_df.head()

## Compute co-expression

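Each interaction should be scored in both orientations:
- `a_and_b`: partner A is expressed in the human cell line and partner B is expressed in Tabula Muris
- `b_and_a`: partner A is expressed in Tabula Muris and partner B is expressed in the human cell line

Note: the cells below currently compute only the `a_and_b` orientation (flagged as `human_a_mouse_b`).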

In [98]:
# NOTE(review): `o_in_tm_and_ccle_df` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel unless the name is defined elsewhere.
# TODO: restore the cell that builds it. Presumably it is the interaction table restricted to
# pairs whose genes are present in both datasets — verify.
# Double brackets keep each selection a one-column DataFrame (human Ensembl IDs of partners A/B).
cpdb_human_a_df = o_in_tm_and_ccle_df[['ensembl_a']]
cpdb_human_b_df = o_in_tm_and_ccle_df[['ensembl_b']]

# Mouse Ensembl ID columns for partners A and B.
cpdb_mouse_a_df = o_in_tm_and_ccle_df[['ensembl_gene_mouse_a']]
cpdb_mouse_b_df = o_in_tm_and_ccle_df[['ensembl_gene_mouse_b']]

In [99]:
# Preview the partner-A human Ensembl ID column before expression values are joined on.
cpdb_human_a_df.head()

In [100]:
def _attach_expression(key_col, expression_df):
    """Left-join per-gene expression onto one key column of ``o_in_tm_and_ccle_df``.

    The join always starts from the interaction table itself rather than from the previous
    value of the output variable, so re-running this cell gives the same result instead of
    merging the expression columns a second time.

    Parameters
    ----------
    key_col : name of the Ensembl ID column in ``o_in_tm_and_ccle_df`` to join on.
    expression_df : genes-by-observations frame indexed by Ensembl ID.

    Returns
    -------
    DataFrame with ``key_col`` followed by one column per observation; rows without a match
    in ``expression_df`` hold NaN.

    Raises
    ------
    ValueError
        If the join changes the number of rows (duplicate IDs in ``expression_df.index``).
        Downstream code pairs rows by position, so the row count must be preserved.
    """
    keys_df = o_in_tm_and_ccle_df[[key_col]]
    merged_df = keys_df.merge(expression_df, how='left', left_on=key_col, right_index=True)
    if len(merged_df) != len(keys_df):
        raise ValueError(
            f"Joining expression on '{key_col}' changed the row count from {len(keys_df)} "
            f"to {len(merged_df)}; the expression index contains duplicated gene IDs."
        )
    return merged_df


cpdb_human_a_df = _attach_expression("ensembl_a", ccle_cpdb_X_df)
cpdb_human_b_df = _attach_expression("ensembl_b", ccle_cpdb_X_df)

cpdb_mouse_a_df = _attach_expression("ensembl_gene_mouse_a", tm_cpdb_X_df)
cpdb_mouse_b_df = _attach_expression("ensembl_gene_mouse_b", tm_cpdb_X_df)

In [101]:
# Preview partner-B mouse expression: one row per interaction, one column per Tabula Muris observation.
cpdb_mouse_b_df.head()

In [102]:
# Observation labels of the Tabula Muris subset; they are used below as column names of the
# mouse expression table.
cell_ontology_ids = tm_cpdb_adata.obs_names.tolist()

In [103]:
# Accumulators for the pairwise co-expression pass:
# one vector per (Tabula Muris observation, cell line) pair ...
coexp_arr = []
# ... and one metadata row describing each of those vectors.
coexp_obs_df = pd.DataFrame(
    columns=["cell_ontology_id", "cell_line", "human_a_mouse_b"],
)

In [104]:
# Co-expression of human partner A (cell line) with mouse partner B (Tabula Muris observation):
# for every interaction, the smaller of the two expression values.
#
# Both accumulators are rebuilt from scratch so that re-running this cell cannot append
# duplicate entries. Metadata rows are collected in a list and turned into a DataFrame once,
# instead of calling DataFrame.append per iteration (quadratic, and removed in pandas 2.0).
coexp_arr = []
_coexp_obs_records = []

# Sets have no stable iteration order; sort so the output row order is reproducible.
_cell_lines = sorted(cellline_intersect)

# Metastasis-potential annotations depend only on the cell line, so look them up once.
_met_potential = {
    cell_line: {
        "metmap_tissue": metmap_tissue,
        "met_potential_ci_05": mm_potential_df.at[cell_line, 'CI.05'],
        "met_potential_ci_95": mm_potential_df.at[cell_line, 'CI.95'],
        "met_potential_mean": mm_potential_df.at[cell_line, 'mean'],
        "met_potential_penetrance": mm_potential_df.at[cell_line, 'penetrance'],
    }
    for cell_line in _cell_lines
}

for cell_ontology_id in cell_ontology_ids:
    # The mouse expression vector is the same for every cell line paired with it.
    mouse_exp = cpdb_mouse_b_df[cell_ontology_id].values
    for cell_line in _cell_lines:
        human_exp = cpdb_human_a_df[cell_line].values
        # Element-wise minimum over interactions; NaN (gene missing on either side) propagates.
        coexp_arr.append(np.minimum(human_exp, mouse_exp))
        _coexp_obs_records.append({
            "cell_ontology_id": cell_ontology_id,
            "cell_line": cell_line,
            "human_a_mouse_b": True,
            **_met_potential[cell_line],
        })

# One row per entry of coexp_arr, in the same order.
coexp_obs_df = pd.DataFrame(
    _coexp_obs_records,
    columns=[
        "cell_ontology_id",
        "cell_line",
        "human_a_mouse_b",
        "metmap_tissue",
        "met_potential_ci_05",
        "met_potential_ci_95",
        "met_potential_mean",
        "met_potential_penetrance",
    ],
)

In [105]:
# Stack the per-pair vectors as rows: shape (number of pairs, number of interactions).
coexp_X = np.stack(coexp_arr, axis=0)

In [106]:
# Package the result: rows are (Tabula Muris observation, cell line) pairs described by
# coexp_obs_df, columns are the interactions described by o_in_tm_and_ccle_df.
coexp_adata = AnnData(
    X=coexp_X,
    obs=coexp_obs_df,
    var=o_in_tm_and_ccle_df,
)

In [107]:
# Persist the co-expression AnnData to the first output path declared by the Snakemake rule.
coexp_adata.write(snakemake.output[0])