In [46]:
from anndata import read_h5ad
import pandas as pd
import numpy as np
from scipy.stats import entropy

import altair as alt
from altair_saver import save as alt_save

alt.data_transformers.disable_max_rows();

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the tab-separated interaction table supplied as the Snakemake
# 'interactions' input, then preview its first rows.
interactions_path = snakemake.input['interactions']
interactions_df = pd.read_csv(interactions_path, sep='\t')
interactions_df.head()

In [5]:
# a_and_b: partner A is expressed in the human cell line, partner B is expressed in Tabula Muris
# b_and_a: partner A is expressed in Tabula Muris, partner B is expressed in the human cell line

In [7]:
def make_interaction_name(a_genes, b_genes):
    """Build a display name of the form ``"<A genes>__<B genes>"``.

    Each side is the ';'-joined sequence of its gene names, after dropping
    entries that ``pd.notna`` reports as missing.
    """
    sides = []
    for genes in (a_genes, b_genes):
        present = filter(pd.notna, genes)
        sides.append(";".join(present))
    return "__".join(sides)

In [8]:
# Build a display name for every (case, interaction_id) pair, both as a
# long-format table (interaction_name_df) and as a lookup dict
# (interaction_name_map) keyed by (case, interaction_id).
interaction_name_records = []
interaction_name_map = {}

# Group by the bare column name rather than a one-element list: with a list,
# recent pandas versions yield 1-tuples as group keys, which would no longer
# match the scalar interaction ids used for lookups in interaction_name_map.
for interaction_id, interaction_df in interactions_df.groupby(by='interaction_id'):
    a_interaction_df = interaction_df.loc[interaction_df['a_or_b'] == 'a']
    b_interaction_df = interaction_df.loc[interaction_df['a_or_b'] == 'b']

    a_human_genes = a_interaction_df['human_gene'].tolist()
    a_mouse_genes = a_interaction_df['mouse_gene'].tolist()
    b_human_genes = b_interaction_df['human_gene'].tolist()
    b_mouse_genes = b_interaction_df['mouse_gene'].tolist()

    # a_and_b pairs partner A's human genes with partner B's mouse genes;
    # b_and_a pairs partner B's human genes with partner A's mouse genes.
    names_by_case = (
        ("a_and_b", make_interaction_name(a_human_genes, b_mouse_genes)),
        ("b_and_a", make_interaction_name(b_human_genes, a_mouse_genes)),
    )
    for case, interaction_name in names_by_case:
        interaction_name_records.append({
            'interaction_id': interaction_id,
            'interaction_name': interaction_name,
            'case': case,
        })
        interaction_name_map[(case, interaction_id)] = interaction_name

# Build the frame once from the collected records: appending row by row is
# quadratic, and DataFrame.append no longer exists in pandas 2.x.
interaction_name_df = pd.DataFrame(
    interaction_name_records,
    columns=['interaction_id', 'interaction_name', 'case'],
)

In [9]:
# Load the AnnData object supplied as the Snakemake 'coexpression' input.
coexpression_path = snakemake.input['coexpression']
tissue_adata = read_h5ad(coexpression_path)

In [10]:
def _lookup_interaction_name(var_row):
    """Return the display name for one .var row from its (case, interaction_id) pair."""
    key = (var_row['case'], var_row['interaction_id'])
    return interaction_name_map[key]


# Annotate every interaction (one row of .var) with its human-readable name.
tissue_adata.var['interaction_name'] = tissue_adata.var.apply(_lookup_interaction_name, axis=1)

In [19]:
# Layout of tissue_adata:
#   rows (described by .obs): cell types and cell lines
#   columns (described by .var): interactions
# Preview the per-interaction annotations.
tissue_adata.var.head()

In [25]:
def gini(arr):
    """Return the Gini coefficient of the values in ``arr``.

    Adapted from https://github.com/oliviaguest/gini/blob/master/gini.py

    The calculation runs on a flattened float64 copy of the input, so the
    caller's array is never modified and integer arrays or plain Python
    sequences are accepted as well.

    Parameters
    ----------
    arr : array-like of numbers
        The values to summarise. Must contain at least one element
        (``np.amin`` raises ``ValueError`` on empty input).

    Returns
    -------
    float
        The Gini coefficient of the values. If any value is negative, all
        values are first shifted so that the minimum becomes zero.
    """
    # flatten() always returns a copy, so the in-place adjustments below
    # cannot leak back into the caller's data.
    values = np.asarray(arr, dtype=np.float64).flatten()
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative:
        values -= lowest
    # Values cannot be 0:
    values += 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return (np.sum((2 * index - n - 1) * values)) / (n * np.sum(values))

In [76]:
# For every cell type and every interaction, summarise how unevenly the
# coexpression values are spread over that cell type's rows of tissue_adata,
# using three statistics: Gini coefficient, entropy and variance.
# Result: long-format frame with one row per (cell type, interaction, statistic).
metric_records = []

# Column i of X is described by row i of .var, so names are looked up by
# position; subsetting rows below does not change the columns.
interaction_names = tissue_adata.var["interaction_name"].tolist()

for cell_ontology_id, cell_type_df in tissue_adata.obs.groupby(by="cell_ontology_id"):
    if cell_type_df.empty:
        # A categorical grouping column can yield groups with no rows;
        # there is nothing to summarise for them.
        continue

    cell_type_adata = tissue_adata[tissue_adata.obs["cell_ontology_id"] == cell_ontology_id]
    # Fetch the matrix once per cell type instead of once per interaction.
    cell_type_values = cell_type_adata.X

    for interaction_i, interaction_name in enumerate(interaction_names):
        arr = np.array(cell_type_values[:, interaction_i], dtype=float).ravel()

        # Give gini() its own copy so that any in-place adjustment it makes
        # cannot leak into the entropy and variance calculations.
        # NOTE(review): entropy of an all-zero column is NaN here.
        metrics = {
            "gini": gini(arr.copy()),
            "entropy": entropy(arr),
            "variance": np.var(arr),
        }
        for variable, value in metrics.items():
            metric_records.append({
                "cell_ontology_id": cell_ontology_id,
                "interaction_name": interaction_name,
                "variable": variable,
                "value": value,
            })

    # Progress indicator: one line per finished cell type.
    print(cell_ontology_id)

# Build the frame once from the collected records: appending row by row is
# quadratic, and DataFrame.append no longer exists in pandas 2.x.
df = pd.DataFrame(
    metric_records,
    columns=["cell_ontology_id", "interaction_name", "variable", "value"],
)

In [139]:
def make_plot(df, variable, clip_upper=100000.0):
    """Plot and save a histogram of one statistic across all rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with at least the columns "variable" and "value".
    variable : str
        Which statistic to plot; rows are selected where
        ``df["variable"] == variable``. Also used as the x-axis title, in the
        chart title, and as the key into ``snakemake.output`` for the file
        the chart is saved to.
    clip_upper : float, optional
        Values above this bound are clamped to it before binning.

    Returns
    -------
    altair.Chart
        The histogram (also written to ``snakemake.output[variable]``).
    """
    # .assign returns a new frame, so the clipped values are never written
    # into a slice of the caller's df (avoids SettingWithCopyWarning).
    filtered_df = df.loc[df["variable"] == variable].assign(
        value=lambda frame: frame["value"].clip(upper=clip_upper)
    )
    plot = alt.Chart(filtered_df).mark_bar().encode(
        alt.X("value:Q", bin=alt.Bin(maxbins=30), axis=alt.Axis(title=variable)),
        y=alt.Y('count()', axis=alt.Axis(title="Count of interactions")),
    ).properties(title=f"Distribution of interaction coexpression variation ({variable})")

    alt_save(plot, snakemake.output[variable])

    return plot

In [140]:
# Histogram of the "gini" rows of df; make_plot also saves it to snakemake.output["gini"].
make_plot(df, "gini")

In [141]:
# Histogram of the "entropy" rows of df; make_plot also saves it to snakemake.output["entropy"].
make_plot(df, "entropy")

In [142]:
# Histogram of the "variance" rows of df; make_plot also saves it to snakemake.output["variance"].
make_plot(df, "variance")