In [38]:
# start coding here

In [39]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc

In [40]:
# Workflow parameters. `snakemake` is not defined in this notebook; presumably it is
# injected by the Snakemake notebook runner (TODO confirm).
# Tissue name; selects the "metp500.<tissue>" sheet of the metastasis-potential workbook.
metmap_tissue = snakemake.params['metmap_tissue']
# Expression transform applied after normalization ("log" and "binary" are handled explicitly).
expression_scale = snakemake.wildcards['expression_scale']

In [41]:
# --- Load workflow inputs ---
tm_adata = read_h5ad(snakemake.input['tm_pseudobulk'])
ccle_adata = read_h5ad(snakemake.input['ccle_exp'])

# Tissue-specific metastasis potential (one workbook sheet per tissue)
mm_potential_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name=f"metp500.{metmap_tissue}",
    index_col=0,
)

# Interaction table, restricted to interactions for which partner A or partner B
# has at least one mouse ortholog.
i_df = pd.read_csv(snakemake.input['interactions'], sep='\t').loc[
    lambda interactions: interactions["a_or_b_have_orthologs"]
]

In [42]:
# Row labels of the metastasis-potential sheet, as a plain Python list
mm_celllines = mm_potential_df.index.tolist()

In [43]:
# Cell lines present in both the CCLE expression data and the metastasis-potential table.
# Sorted into a list so that every later loop over it (and therefore the row order of the
# output) is reproducible: iteration order of a set of strings can change between
# interpreter runs because of hash randomization.
cellline_intersect = sorted(set(ccle_adata.obs.index).intersection(mm_celllines))
len(cellline_intersect)

In [44]:
# The var index holds gene IDs that may carry a ".<suffix>"; keep only the part before the first dot.
ccle_adata.var['human_gene_ensembl'] = [
    versioned_id.split(".")[0] for versioned_id in ccle_adata.var.index
]
# Use the same column name for gene symbols as the interaction table.
ccle_adata.var = ccle_adata.var.rename(columns={'Description': "human_gene"})
ccle_adata.var.head()

In [45]:
# Rename the `name` column to `mouse_gene`, the column name used by the interaction table.
tm_adata.var = tm_adata.var.rename(columns={'name': "mouse_gene"})

In [46]:
# Distinct human Ensembl gene IDs measured in CCLE
ccle_genes = set(ccle_adata.var['human_gene_ensembl'])

In [47]:
# Mouse gene symbols and Ensembl IDs referenced by the interaction table (missing values excluded)
i_mouse_genes = set(i_df['mouse_gene'].dropna())
i_mouse_genes_ensembl = set(i_df['mouse_gene_ensembl'].dropna())

In [48]:
# Human gene symbols and Ensembl IDs referenced by the interaction table (missing values excluded)
i_human_genes = set(i_df['human_gene'].dropna())
i_human_genes_ensembl = set(i_df['human_gene_ensembl'].dropna())

In [49]:
# Distinct var-index labels of the Tabula Muris pseudobulk data
tm_genes = set(tm_adata.var.index)

In [50]:
def _paired_lookups(first_col, second_col):
    """Build both lookup directions between two columns of `i_df`.

    Rows where either column is missing are dropped first. Returns a tuple
    (filtered rows, first->second dict, second->first dict); when a key occurs
    more than once, the last row wins.
    """
    complete_df = i_df.dropna(subset=[first_col, second_col])
    first_vals = complete_df[first_col].values.tolist()
    second_vals = complete_df[second_col].values.tolist()
    return complete_df, dict(zip(first_vals, second_vals)), dict(zip(second_vals, first_vals))


# mouse symbol <-> human symbol
i_mh_df, gene_mouse_to_human, gene_human_to_mouse = _paired_lookups("mouse_gene", "human_gene")

# mouse Ensembl ID <-> human Ensembl ID
i_mhe_df, gene_mouse_ensembl_to_human_ensembl, gene_human_ensembl_to_mouse_ensembl = _paired_lookups(
    "mouse_gene_ensembl", "human_gene_ensembl"
)

# human symbol <-> human Ensembl ID
i_he_df, gene_human_to_human_ensembl, gene_human_ensembl_to_human = _paired_lookups(
    "human_gene", "human_gene_ensembl"
)

# mouse symbol <-> mouse Ensembl ID
i_me_df, gene_mouse_to_mouse_ensembl, gene_mouse_ensembl_to_mouse = _paired_lookups(
    "mouse_gene", "mouse_gene_ensembl"
)

In [51]:
# Flag genes that take part in at least one interaction:
# Tabula Muris genes are matched by mouse symbol, CCLE genes by human Ensembl ID.
tm_adata.var['in_interaction'] = tm_adata.var['mouse_gene'].isin(i_mouse_genes)
ccle_adata.var['in_interaction'] = ccle_adata.var['human_gene_ensembl'].isin(i_human_genes_ensembl)

In [52]:
# Sanity check: number of Tabula Muris genes flagged as taking part in an interaction
tm_adata.var['in_interaction'].sum()

In [53]:
# Sanity check: number of CCLE genes flagged as taking part in an interaction
ccle_adata.var['in_interaction'].sum()

In [54]:
# Keep only the genes that take part in an interaction.
# Slicing an AnnData object yields a view; these subsets are modified below
# (new `.var` columns, in-place normalization, replacing `.X`), so take explicit copies
# instead of relying on anndata's implicit copy-on-modify (which emits a warning).
tm_cpdb_adata = tm_adata[:, tm_adata.var['in_interaction']].copy()
ccle_cpdb_adata = ccle_adata[:, ccle_adata.var['in_interaction']].copy()

In [55]:
# Display the gene annotation table of the interaction-restricted CCLE data
ccle_cpdb_adata.var

In [56]:
# Standardize the gene names and add ensembl gene names where they are missing.
# Add orthologous gene names and ensembl gene names to each data frame.

In [57]:
def get_mouse_gene_ensembl_from_human_gene_ensembl(ens):
    """Return the mouse Ensembl ID mapped to a human Ensembl ID, or NaN if there is no mapping."""
    return gene_human_ensembl_to_mouse_ensembl.get(ens, np.nan)


def get_mouse_gene_from_mouse_gene_ensembl(ens):
    """Return the mouse gene symbol for a mouse Ensembl ID, or NaN if there is no mapping."""
    return gene_mouse_ensembl_to_mouse.get(ens, np.nan)


# Human symbol: strict lookup, so a human Ensembl ID missing from the mapping raises KeyError.
ccle_cpdb_adata.var['human_gene'] = ccle_cpdb_adata.var['human_gene_ensembl'].map(
    gene_human_ensembl_to_human.__getitem__
)
# Mouse ortholog Ensembl ID, then its symbol; both are NaN where no ortholog is recorded.
ccle_cpdb_adata.var['mouse_gene_ensembl'] = ccle_cpdb_adata.var['human_gene_ensembl'].map(
    get_mouse_gene_ensembl_from_human_gene_ensembl
)
ccle_cpdb_adata.var['mouse_gene'] = ccle_cpdb_adata.var['mouse_gene_ensembl'].map(
    get_mouse_gene_from_mouse_gene_ensembl
)

In [58]:
def get_human_gene_ensembl_from_mouse_gene_ensembl(ens):
    """Return the human Ensembl ID mapped to a mouse Ensembl ID, or NaN if there is no mapping."""
    return gene_mouse_ensembl_to_human_ensembl.get(ens, np.nan)


def get_human_gene_from_human_gene_ensembl(ens):
    """Return the human gene symbol for a human Ensembl ID, or NaN if there is no mapping."""
    return gene_human_ensembl_to_human.get(ens, np.nan)


# Mouse Ensembl ID: strict lookup, so a mouse symbol missing from the mapping raises KeyError.
tm_cpdb_adata.var['mouse_gene_ensembl'] = tm_cpdb_adata.var['mouse_gene'].map(
    gene_mouse_to_mouse_ensembl.__getitem__
)
# Human ortholog Ensembl ID, then its symbol; both are NaN where no ortholog is recorded.
tm_cpdb_adata.var['human_gene_ensembl'] = tm_cpdb_adata.var['mouse_gene_ensembl'].map(
    get_human_gene_ensembl_from_mouse_gene_ensembl
)
tm_cpdb_adata.var['human_gene'] = tm_cpdb_adata.var['human_gene_ensembl'].map(
    get_human_gene_from_human_gene_ensembl
)

In [59]:
# Preview the CCLE gene annotations after adding the symbol/ortholog columns
ccle_cpdb_adata.var.head()

In [60]:
# Preview the Tabula Muris gene annotations after adding the Ensembl/ortholog columns
tm_cpdb_adata.var.head()

In [61]:
#sc.pl.highest_expr_genes(ccle_cpdb_adata, n_top=20, gene_symbols="human_gene")

Preprocessing of expression data

In [62]:
#sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20, gene_symbols="mouse_gene")

In [63]:
# Normalize both datasets with scanpy's normalize_total (target_sum=1e4).
sc.pp.normalize_total(tm_cpdb_adata, target_sum=1e4)
sc.pp.normalize_total(ccle_cpdb_adata, target_sum=1e4)

# Optional transform selected by the `expression_scale` wildcard; any other value
# leaves the normalized values untouched.
if expression_scale == "log":
    sc.pp.log1p(tm_cpdb_adata)
    sc.pp.log1p(ccle_cpdb_adata)
elif expression_scale == "binary":
    # Store the expressed/not-expressed indicator as 0/1 in the matrix's numeric dtype
    # rather than as booleans: the later steps introduce NaN for missing genes and rely
    # on np.nan_to_num, which only acts on floating-point arrays.
    tm_cpdb_adata.X = (tm_cpdb_adata.X > 0).astype(tm_cpdb_adata.X.dtype)
    ccle_cpdb_adata.X = (ccle_cpdb_adata.X > 0).astype(ccle_cpdb_adata.X.dtype)

In [64]:
#sc.pl.highest_expr_genes(ccle_cpdb_adata, n_top=20, gene_symbols="human_gene")

In [65]:
#sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20, gene_symbols="mouse_gene")

In [66]:
def _expression_frame(adata, gene_id_col):
    """Return the transposed expression matrix as a DataFrame.

    Rows are labelled with `adata.var[gene_id_col]`, columns with the obs index.
    """
    return pd.DataFrame(
        data=adata.X.T,
        index=adata.var[gene_id_col].values.tolist(),
        columns=adata.obs.index.values.tolist(),
    )


ccle_cpdb_X_df = _expression_frame(ccle_cpdb_adata, 'human_gene_ensembl')
tm_cpdb_X_df = _expression_frame(tm_cpdb_adata, 'mouse_gene_ensembl')

In [67]:
# Preview: rows are human Ensembl gene IDs, columns are the CCLE obs labels
ccle_cpdb_X_df.head()

In [68]:
# Preview: rows are mouse Ensembl gene IDs, columns are the Tabula Muris obs labels
tm_cpdb_X_df.head()

## Compute co-expression

We want to consider both orientations of each interaction:
- `a_and_b`: partner A is expressed in human cell line, partner B is expressed in tabula muris (B must have at least one mouse ortholog)
- `b_and_a`: partner A is expressed in tabula muris, partner B is expressed in human cell line (A must have at least one mouse ortholog)

For each cell line and `a_and_b` interaction:
- Compute the minimum expression of all partner A genes for each CCLE cell line

For each cell line and `b_and_a` interaction:
- Compute the minimum expression of all partner B genes for each CCLE cell line

For each tabula muris cell type and `a_and_b` interaction:
- Compute the minimum expression of all partner B gene mouse orthologs for each tabula muris cell type

For each tabula muris cell type and `b_and_a` interaction:
- Compute the minimum expression of all partner A gene mouse orthologs for each tabula muris cell type

In [69]:
# Obs labels of the Tabula Muris pseudobulk data, as a plain list
cell_ontology_ids = tm_cpdb_adata.obs.index.tolist()

In [70]:
# Nested store: dataset ("ccle" / "tm") -> case ("a_and_b" / "b_and_a") -> {sample: array}.
# Every innermost dict is a separate, initially empty object.
min_expression = {
    dataset: {case: {} for case in ("a_and_b", "b_and_a")}
    for dataset in ("ccle", "tm")
}

In [71]:
# Preview the interaction table
i_df.head()

In [72]:
# Always do a left join onto the interaction alignment dataframe so that the expression dataframes can be aligned correctly.
i_align_df = pd.DataFrame(index=list(i_df["interaction_id"].unique().tolist()))

In [73]:
def _ccle_min_partner_expression(partner):
    """Minimum human partner-gene expression per interaction, for all shared cell lines at once.

    Parameters
    ----------
    partner : str
        Value of the `a_or_b` column selecting which side of the interaction to use.

    Returns
    -------
    pandas.DataFrame
        Rows follow `i_align_df.index`; one column per entry of `cellline_intersect`.
        Interactions without any partner gene present in `ccle_cpdb_X_df` are NaN.
    """
    cell_lines = list(cellline_intersect)
    # Only the human gene IDs matter here, since these are human cell lines;
    # de-duplicate (interaction, gene) pairs once instead of once per cell line.
    partner_genes_df = (
        i_df.loc[i_df["a_or_b"] == partner, ["interaction_id", "human_gene_ensembl"]]
        .drop_duplicates()
    )
    # Inner join: partner genes that are absent from the expression matrix are dropped.
    partner_exp_df = partner_genes_df.merge(
        ccle_cpdb_X_df[cell_lines], left_on="human_gene_ensembl", right_index=True
    )
    # Minimum over the partner genes of each interaction (NaN values are skipped),
    # then align to the common interaction order.
    return (
        partner_exp_df.groupby("interaction_id")[cell_lines]
        .min()
        .reindex(i_align_df.index)
    )


# a_and_b: partner A is the human side; b_and_a: partner B is the human side.
for case, partner in (("a_and_b", "a"), ("b_and_a", "b")):
    ccle_min_df = _ccle_min_partner_expression(partner)
    for cell_line in cellline_intersect:
        # Interactions with no measured partner gene count as not expressed (0).
        min_expression["ccle"][case][cell_line] = np.nan_to_num(
            ccle_min_df[cell_line].to_numpy(), nan=0.0
        )

In [74]:
def _tm_min_partner_expression(partner):
    """Minimum mouse-ortholog expression per interaction, for all Tabula Muris cell types at once.

    Parameters
    ----------
    partner : str
        Value of the `a_or_b` column selecting which side of the interaction to use.

    Returns
    -------
    pandas.DataFrame
        Rows follow `i_align_df.index`; one column per entry of `cell_ontology_ids`.
        Interactions without any partner ortholog present in `tm_cpdb_X_df` are NaN.
    """
    cell_types = list(cell_ontology_ids)
    # Repeated (interaction, gene) pairs cannot change a minimum, so they are dropped up front.
    partner_genes_df = (
        i_df.loc[i_df["a_or_b"] == partner, ["interaction_id", "mouse_gene_ensembl"]]
        .drop_duplicates()
    )
    # Inner join: partner genes that are absent from the expression matrix are dropped.
    partner_exp_df = partner_genes_df.merge(
        tm_cpdb_X_df[cell_types], left_on="mouse_gene_ensembl", right_index=True
    )
    # Minimum over the partner genes of each interaction (NaN values are skipped),
    # then align to the common interaction order.
    return (
        partner_exp_df.groupby("interaction_id")[cell_types]
        .min()
        .reindex(i_align_df.index)
    )


# a_and_b: partner B is the mouse side; b_and_a: partner A is the mouse side.
for case, partner in (("a_and_b", "b"), ("b_and_a", "a")):
    tm_min_df = _tm_min_partner_expression(partner)
    for cell_type in cell_ontology_ids:
        # Interactions with no measured partner ortholog count as not expressed (0).
        min_expression["tm"][case][cell_type] = np.nan_to_num(
            tm_min_df[cell_type].to_numpy(), nan=0.0
        )

In [75]:
# Display only: the result is not assigned, so i_align_df itself is unchanged.
i_align_df.reset_index()

In [76]:
# Variable annotation for the output: the full interaction list repeated once per case,
# all "a_and_b" rows first, followed by all "b_and_a" rows.
case_df = pd.DataFrame({"case": ["a_and_b", "b_and_a"]})
interaction_ids_df = i_align_df.reset_index().rename(columns={"index": "interaction_id"})
i_align_dup_df = case_df.merge(interaction_ids_df, how="cross")
i_align_dup_df

In [77]:
# Accumulators for the co-expression result:
# one observation row per (cell_ontology_id, cell_line) pair ...
coexp_obs_df = pd.DataFrame(columns=["cell_ontology_id", "cell_line"])
# ... and the matching co-expression vector for each pair, appended in the same order.
coexp_arr = []

In [78]:
# Co-expression of every (Tabula Muris cell type, CCLE cell line) pair.
# Both accumulators are (re)initialized here so that re-running this cell cannot append
# duplicates, and the observation table is built once from a list of records:
# DataFrame.append is quadratic in a loop and was removed in pandas 2.0.
coexp_arr = []
coexp_obs_records = []

for cell_ontology_id in cell_ontology_ids:
    for cell_line in cellline_intersect:
        # Human side: partner A for the a_and_b case, partner B for the b_and_a case.
        human_exp = np.concatenate((
            min_expression["ccle"]["a_and_b"][cell_line],
            min_expression["ccle"]["b_and_a"][cell_line],
        ))
        # Mouse side: partner B for the a_and_b case, partner A for the b_and_a case.
        mouse_exp = np.concatenate((
            min_expression["tm"]["a_and_b"][cell_ontology_id],
            min_expression["tm"]["b_and_a"][cell_ontology_id],
        ))

        # Co-expression of an interaction is the smaller of its two sides.
        coexp_arr.append(np.minimum(human_exp, mouse_exp))
        coexp_obs_records.append({
            "cell_ontology_id": cell_ontology_id,
            "cell_line": cell_line,
            "metmap_tissue": metmap_tissue,
            "met_potential_ci_05": mm_potential_df.at[cell_line, 'CI.05'],
            "met_potential_ci_95": mm_potential_df.at[cell_line, 'CI.95'],
            "met_potential_mean": mm_potential_df.at[cell_line, 'mean'],
            "met_potential_penetrance": mm_potential_df.at[cell_line, 'penetrance'],
        })

# Explicit column list keeps the schema stable even when there are no pairs.
coexp_obs_df = pd.DataFrame(
    coexp_obs_records,
    columns=[
        "cell_ontology_id",
        "cell_line",
        "metmap_tissue",
        "met_potential_ci_05",
        "met_potential_ci_95",
        "met_potential_mean",
        "met_potential_penetrance",
    ],
)

In [79]:
# Stack the per-pair vectors as columns, then transpose so that each row holds the
# co-expression vector of one (cell type, cell line) pair, in append order.
coexp_X = np.stack(coexp_arr, axis=-1).T

In [80]:
# Assemble the result: rows of X are annotated by coexp_obs_df (obs),
# columns by the case-by-interaction table i_align_dup_df (var).
coexp_adata = AnnData(X=coexp_X, obs=coexp_obs_df, var=i_align_dup_df)

In [81]:
# Persist the result to the first output path declared for this notebook in the workflow.
coexp_adata.write(snakemake.output[0])