In [2]:
# Build an AnnData of pairwise co-expression (mouse pseudobulk profile x CCLE cell line) over ortholog gene pairs.

In [3]:
import pandas as pd
import numpy as np
from anndata import read_h5ad, AnnData
import scanpy as sc

In [4]:
# Read the four inputs handed over by the Snakemake rule.
tm_adata = read_h5ad(snakemake.input["tm_pseudobulk"])
ccle_adata = read_h5ad(snakemake.input["ccle_exp"])

# Only the "metp500.all5" sheet is used; its first column becomes the row index.
mm_potential_df = pd.read_excel(
    snakemake.input["mm_potential"],
    sheet_name="metp500.all5",
    index_col=0,
)

# Tab-separated ortholog table (columns such as ensembl_a / gene_mouse_a are read below).
o_df = pd.read_csv(snakemake.input["cpdb_orthologs"], sep="\t")

In [5]:
# Row labels of the potential table; intersected with the CCLE observation names below.
mm_celllines = mm_potential_df.index.tolist()

In [6]:
# Identifiers that occur both as CCLE observation names and in the potential table.
ccle_obs_names = set(ccle_adata.obs.index)
cellline_intersect = ccle_obs_names & set(mm_celllines)
len(cellline_intersect)

In [7]:
# Keep only the part of each var label that precedes the first "." (e.g. strips a version suffix).
ccle_adata.var["ensembl"] = [
    var_label.partition(".")[0] for var_label in ccle_adata.var.index
]

In [8]:
# Set of suffix-stripped CCLE gene identifiers for fast membership tests.
ccle_genes = set(ccle_adata.var["ensembl"])

In [9]:
# Distinct mouse gene labels on each side (a / b) of the ortholog table.
mouse_gene_a, mouse_gene_b = (
    set(o_df[column].unique()) for column in ("gene_mouse_a", "gene_mouse_b")
)

In [10]:
# Distinct human identifiers on each side (a / b) of the ortholog table.
human_gene_a, human_gene_b = (
    set(o_df[column].unique()) for column in ("ensembl_a", "ensembl_b")
)

In [11]:
# All genes that appear on either side of a pair, per species.
cpdb_mouse_genes = mouse_gene_a | mouse_gene_b
cpdb_human_genes = human_gene_a | human_gene_b

In [12]:
# Var labels of the mouse dataset as a set for fast membership tests.
tm_genes = set(tm_adata.var.index)

In [13]:
# Flag, per pair, whether each mouse partner is among the mouse dataset's var labels.
o_df["a_in_tm"] = o_df["gene_mouse_a"].isin(tm_genes)
o_df["b_in_tm"] = o_df["gene_mouse_b"].isin(tm_genes)
# A pair is usable on the mouse side only when both partners were found.
o_df["both_in_tm"] = o_df[["a_in_tm", "b_in_tm"]].all(axis=1)

In [14]:
# Fraction of pairs with both partners found in the mouse dataset (mean of a boolean column).
o_df["both_in_tm"].mean()

In [15]:
# Flag, per pair, whether each human partner is among the suffix-stripped CCLE identifiers.
o_df["a_in_ccle"] = o_df["ensembl_a"].isin(ccle_genes)
o_df["b_in_ccle"] = o_df["ensembl_b"].isin(ccle_genes)
# A pair is usable on the human side only when both partners were found.
o_df["both_in_ccle"] = o_df[["a_in_ccle", "b_in_ccle"]].all(axis=1)

In [16]:
# Fraction of pairs with both partners found in CCLE (mean of a boolean column).
o_df["both_in_ccle"].mean()

In [17]:
# A pair is kept only if it is fully covered by both datasets.
o_df["both_in_tm_and_ccle"] = o_df[["both_in_tm", "both_in_ccle"]].all(axis=1)
# Fraction of pairs that survive both filters.
o_df["both_in_tm_and_ccle"].mean()

In [18]:
# Restrict the ortholog table to pairs flagged as present in both datasets.
o_in_tm_and_ccle_df = o_df[o_df["both_in_tm_and_ccle"]]


def _a_then_b(column_a, column_b):
    """Return the values of `column_a` followed by those of `column_b` as one list."""
    return (
        o_in_tm_and_ccle_df[column_a].tolist()
        + o_in_tm_and_ccle_df[column_b].tolist()
    )


def _lookup(keys, values):
    """Pair `keys` with `values` positionally; a repeated key keeps its last value."""
    return {key: value for key, value in zip(keys, values)}


# These four lists are position-aligned with each other and CONTAIN DUPLICATES,
# because a gene can take part in more than one pair.
genes_in_tm_and_ccle_as_human = _a_then_b("gene_name_a", "gene_name_b")
genes_in_tm_and_ccle_as_human_ensembl = _a_then_b("ensembl_a", "ensembl_b")

genes_in_tm_and_ccle_as_mouse = _a_then_b("gene_mouse_a", "gene_mouse_b")
genes_in_tm_and_ccle_as_mouse_ensembl = _a_then_b("ensembl_gene_mouse_a", "ensembl_gene_mouse_b")

# The dictionaries below can have DIFFERENT SIZES: repeated keys collapse into one
# entry, so a forward map and its reverse need not contain the same number of items.
gene_mouse_to_human = _lookup(genes_in_tm_and_ccle_as_mouse, genes_in_tm_and_ccle_as_human)
gene_human_to_mouse = _lookup(genes_in_tm_and_ccle_as_human, genes_in_tm_and_ccle_as_mouse)

gene_human_to_human_ensembl = _lookup(genes_in_tm_and_ccle_as_human, genes_in_tm_and_ccle_as_human_ensembl)
gene_human_ensembl_to_human = _lookup(genes_in_tm_and_ccle_as_human_ensembl, genes_in_tm_and_ccle_as_human)

gene_mouse_to_mouse_ensembl = _lookup(genes_in_tm_and_ccle_as_mouse, genes_in_tm_and_ccle_as_mouse_ensembl)
gene_mouse_ensembl_to_mouse = _lookup(genes_in_tm_and_ccle_as_mouse_ensembl, genes_in_tm_and_ccle_as_mouse)

In [19]:
# Preview the first rows of the filtered ortholog table.
o_in_tm_and_ccle_df.head()

In [20]:
# Materialise each lookup set once; building it per element would make the
# membership test scale with (number of genes) x (length of the gene list).
mouse_names_in_pairs = set(genes_in_tm_and_ccle_as_mouse)
human_ensembl_in_pairs = set(genes_in_tm_and_ccle_as_human_ensembl)

# Boolean var columns marking genes that take part in at least one retained pair.
# The mouse side is matched on var['name'], the human side on the suffix-stripped id.
tm_adata.var['in_cpdb'] = tm_adata.var['name'].isin(mouse_names_in_pairs)
ccle_adata.var['in_cpdb'] = ccle_adata.var['ensembl'].isin(human_ensembl_in_pairs)

In [21]:
# Number of mouse-dataset genes flagged as part of a retained pair.
tm_adata.var['in_cpdb'].sum()

In [22]:
# Number of CCLE genes flagged as part of a retained pair.
ccle_adata.var['in_cpdb'].sum()

In [23]:
# Keep only the flagged genes. Slicing an AnnData yields a view of the parent;
# the following cells assign new .var columns and normalise X in place, so take
# an explicit copy here rather than relying on an implicit copy when the view is
# first modified.
tm_cpdb_adata = tm_adata[:, tm_adata.var['in_cpdb']].copy()
ccle_cpdb_adata = ccle_adata[:, ccle_adata.var['in_cpdb']].copy()

In [24]:
# Attach the human gene name looked up from each identifier; an identifier missing
# from the lookup raises KeyError.
ccle_cpdb_adata.var["name"] = [
    gene_human_ensembl_to_human[ensembl_id]
    for ensembl_id in ccle_cpdb_adata.var["ensembl"]
]

In [25]:
# Attach the mouse identifier looked up from each gene name; a name missing from
# the lookup raises KeyError.
tm_cpdb_adata.var["ensembl"] = [
    gene_mouse_to_mouse_ensembl[gene_name]
    for gene_name in tm_cpdb_adata.var["name"]
]

In [27]:
# Plot the 20 highest-expressed genes of the mouse subset, before any normalisation is applied.
sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20)

Preprocessing of expression data

In [28]:
# Apply the same two in-place steps to both subsets: scale every observation to a
# total of 1e4, then take log(1 + x).
for expression_adata in (tm_cpdb_adata, ccle_cpdb_adata):
    sc.pp.normalize_total(expression_adata, target_sum=1e4)

for expression_adata in (tm_cpdb_adata, ccle_cpdb_adata):
    sc.pp.log1p(expression_adata)

In [29]:
# Same plot for the mouse subset, now on the normalised, log-transformed values.
sc.pl.highest_expr_genes(tm_cpdb_adata, n_top=20)

In [30]:
# Same plot for the CCLE subset; genes are labelled via the 'name' column of .var.
sc.pl.highest_expr_genes(ccle_cpdb_adata, n_top=20, gene_symbols='name')

In [31]:
# Inspect the gene annotations of the mouse subset (now including 'in_cpdb' and 'ensembl').
tm_cpdb_adata.var

In [32]:
# Inspect the gene annotations of the CCLE subset (now including 'ensembl', 'in_cpdb' and 'name').
ccle_cpdb_adata.var

In [33]:
# Genes x observations tables: rows are labelled by the 'ensembl' var column,
# columns by the observation names.
ccle_cpdb_X_df = pd.DataFrame(
    ccle_cpdb_adata.X.T,
    index=ccle_cpdb_adata.var["ensembl"].tolist(),
    columns=ccle_cpdb_adata.obs.index.tolist(),
)
tm_cpdb_X_df = pd.DataFrame(
    tm_cpdb_adata.X.T,
    index=tm_cpdb_adata.var["ensembl"].tolist(),
    columns=tm_cpdb_adata.obs.index.tolist(),
)

In [34]:
# Preview the CCLE genes x cell-lines expression table.
ccle_cpdb_X_df.head()

In [35]:
# Preview the mouse genes x observations expression table.
tm_cpdb_X_df.head()

In [36]:
# One single-column frame per partner and species; each keeps the row order and
# index of the filtered ortholog table.
cpdb_human_a_df = o_in_tm_and_ccle_df.loc[:, ["ensembl_a"]]
cpdb_human_b_df = o_in_tm_and_ccle_df.loc[:, ["ensembl_b"]]

cpdb_mouse_a_df = o_in_tm_and_ccle_df.loc[:, ["ensembl_gene_mouse_a"]]
cpdb_mouse_b_df = o_in_tm_and_ccle_df.loc[:, ["ensembl_gene_mouse_b"]]

In [37]:
# Preview the human partner-a key column.
cpdb_human_a_df.head()

In [38]:
# Attach, for every retained pair, the expression row of each partner gene.
# Each frame is rebuilt from the filtered ortholog table rather than from its own
# previous value, so re-running this cell cannot merge the expression columns a
# second time (which would produce suffixed duplicate columns).
# NOTE(review): how='left' keeps one row per pair only if the expression tables'
# row labels are unique — confirm.
cpdb_human_a_df = o_in_tm_and_ccle_df[['ensembl_a']].merge(ccle_cpdb_X_df, how='left', left_on="ensembl_a", right_index=True)
cpdb_human_b_df = o_in_tm_and_ccle_df[['ensembl_b']].merge(ccle_cpdb_X_df, how='left', left_on="ensembl_b", right_index=True)

cpdb_mouse_a_df = o_in_tm_and_ccle_df[['ensembl_gene_mouse_a']].merge(tm_cpdb_X_df, how='left', left_on="ensembl_gene_mouse_a", right_index=True)
cpdb_mouse_b_df = o_in_tm_and_ccle_df[['ensembl_gene_mouse_b']].merge(tm_cpdb_X_df, how='left', left_on="ensembl_gene_mouse_b", right_index=True)

In [39]:
# Preview the mouse partner-b keys with their merged expression columns.
cpdb_mouse_b_df.head()

In [40]:
# Observation names of the mouse subset; used below as the outer loop of the pairing.
cell_ontology_ids = tm_cpdb_adata.obs.index.tolist()

In [41]:
# Accumulators for the pairing loop: one value vector and one metadata row per
# (cell_ontology_id, cell_line) combination.
coexp_arr = []
coexp_obs_df = pd.DataFrame(
    columns=["cell_ontology_id", "cell_line", "human_a_mouse_b"],
)

In [42]:
# Compute, for every (mouse observation, human cell line) combination, the
# per-pair co-expression: the smaller of human partner-a and mouse partner-b
# expression.
#
# Start from empty accumulators so re-running this cell cannot append duplicates.
coexp_arr = []
coexp_obs_records = []

# Iterate the cell lines in sorted order: the iteration order of a set of strings
# can differ between interpreter runs, which would make the row order of the
# output file non-reproducible.
cell_lines_sorted = sorted(cellline_intersect)

for cell_ontology_id in cell_ontology_ids:
    # Mouse partner-b expression is the same for every cell line; fetch it once.
    mouse_exp = cpdb_mouse_b_df[cell_ontology_id].values
    for cell_line in cell_lines_sorted:
        human_exp = cpdb_human_a_df[cell_line].values
        # Element-wise minimum over the pairs (NaN in either input gives NaN).
        coexp_arr.append(np.minimum(human_exp, mouse_exp))
        coexp_obs_records.append({
            "cell_ontology_id": cell_ontology_id,
            "cell_line": cell_line,
            "human_a_mouse_b": True,
        })

# Build the metadata table in one go; DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every call.
coexp_obs_df = pd.DataFrame(
    coexp_obs_records,
    columns=["cell_ontology_id", "cell_line", "human_a_mouse_b"],
)

In [43]:
# Stack the per-combination vectors as rows: shape (n_combinations, n_pairs).
coexp_X = np.stack(coexp_arr, axis=0)

In [44]:
# Assemble the result: rows are the (cell_ontology_id, cell_line) combinations
# described by coexp_obs_df, columns are the retained pairs of o_in_tm_and_ccle_df.
coexp_adata = AnnData(X=coexp_X, obs=coexp_obs_df, var=o_in_tm_and_ccle_df)

In [45]:
# Persist the co-expression AnnData to the first output path declared by the Snakemake rule.
coexp_adata.write(snakemake.output[0])