In [37]:
# start coding here
import pandas as pd
import numpy as np

import altair as alt
from altair_saver import save as alt_save

In [38]:
# Load the Snakemake-provided inputs. The ortholog table is tab-separated and
# its first line is skipped; the four CellPhoneDB inputs are plain CSV files.
o_df = pd.read_csv(snakemake.input['orthologs'], sep='\t', skiprows=1)
gi_df, pi_df, ci_df, ii_df = (
    pd.read_csv(snakemake.input[input_key])
    for input_key in (
        'cpdb_gene_input',
        'cpdb_protein_input',
        'cpdb_complex_input',
        'cpdb_interaction_input',
    )
)

# Orthologs: drop the versioned / mouse-protein columns, switch to snake_case
# names, then discard the transcript ID and de-duplicate the remaining rows.
o_df = (
    o_df
    .drop(columns=[
        "Gene stable ID version",
        "Transcript stable ID version",
        "Mouse protein or transcript stable ID",
    ])
    .rename(columns={
        "Gene stable ID": "human_gene_ensembl",
        "Transcript stable ID": "human_transcript_ensembl",
        "Mouse gene stable ID": "mouse_gene_ensembl",
        "Mouse gene name": "mouse_gene",
        "Mouse orthology confidence [0 low, 1 high]": "mouse_orthology_confidence",
    })
    .drop(columns=["human_transcript_ensembl"])
    .drop_duplicates()
)

# Interactions: shorter name for the interaction identifier column.
ii_df = ii_df.rename(columns={"id_cp_interaction": "interaction_id"})

# Genes: prefix the human gene columns so they can be told apart from the mouse ones.
gi_df = gi_df.rename(columns={
    "gene_name": "human_gene",
    "hgnc_symbol": "human_gene_hgnc",
    "ensembl": "human_gene_ensembl",
})

In [39]:
# Preview the first rows of the gene input table after its columns were renamed.
gi_df.head()

In [40]:
def col_append(df, s):
    """Return a renamed copy of ``df`` in which every column name has ``s`` appended.

    Column names are formatted with an f-string, so non-string labels are
    converted to ``str`` before the suffix is added.
    """
    suffixed_names = {name: f"{name}{s}" for name in df.columns.values.tolist()}
    return df.rename(columns=suffixed_names)

In [41]:
# Keep track of the number of genes for each possible filtering step.
# One independent, initially empty tally per category; the categories are
# filled in by later cells as "description -> count" pairs.
counts = {
    category: {}
    for category in ("orthologs", "interactions", "proteins", "complexes")
}

In [42]:
# Tally all ortholog rows, and the subset whose orthology confidence equals 1.
is_high_confidence = o_df["mouse_orthology_confidence"] == 1
counts["orthologs"].update({
    "Number of human-mouse orthologs (all genes)": len(o_df),
    "Number of human-mouse orthologs (all genes, high confidence orthologs)": len(o_df[is_high_confidence]),
})
counts

In [43]:
# Interaction dataframe: preview the first rows
# (its `id_cp_interaction` column was renamed to `interaction_id` when loading).
ii_df.head()

In [44]:
# An interaction is counted as "binary" when both protein names are present,
# and as "complex" when at least one of them is missing.
both_proteins_named = ii_df['protein_name_a'].notna() & ii_df['protein_name_b'].notna()
counts["interactions"].update({
    "Number of interactions": len(ii_df),
    "Number of binary interactions": len(ii_df[both_proteins_named]),
    "Number of complex interactions": len(ii_df[~both_proteins_named]),
})
counts

# COP: complex or protein

In [45]:
# Reshape the interactions to long format: every column that is not an id
# column (expected to be just the renamed partner columns "a" and "b") becomes
# its own row, with the column name in `a_or_b` and the partner in `cop_name`.
molten_ii_df = (
    ii_df
    .drop(columns=["protein_name_a", "protein_name_b"])
    .rename(columns={"partner_a": "a", "partner_b": "b"})
    .melt(
        id_vars=["interaction_id", "annotation_strategy", "source"],
        var_name="a_or_b",
        value_name="cop_name",
    )
)
molten_ii_df.head()

In [46]:
# Ultimately, we want to join the gene IDs to the interaction dataframe.
# To get there, first join with the complex_input, then the protein_input and the gene_input, and finally the orthologous mouse genes.
# Before joining with the gene_input, the complex_input rows must be melted so that each UniProt ID has its own row.

In [47]:
# Columns of the complex table that hold the member proteins of a complex.
uniprot_cols = ['uniprot_1', 'uniprot_2', 'uniprot_3', 'uniprot_4']

# Every other column identifies/annotates the complex. Keep them in their
# original file order: building this list through a set difference yields an
# arbitrary order (string hashing is randomized per process), which made the
# column order of the melted frame -- and of everything joined from it --
# change from run to run.
ci_id_vars = [col for col in ci_df.columns.values.tolist() if col not in uniprot_cols]

# Long format: one row per (complex, member protein); empty member slots are dropped.
molten_ci_df = (
    ci_df
    .melt(id_vars=ci_id_vars, value_name="uniprot")
    .drop(columns=["variable"])
    .dropna(subset=["uniprot"])
)

In [48]:
# One row per complex in the complex input; distinct UniProt IDs among the melted member rows.
counts["complexes"]["Number of unique protein complexes"] = len(ci_df)
counts["proteins"]["Number of unique proteins involved in complex interactions"] = len(molten_ci_df['uniprot'].unique())
counts

In [49]:
# Spot-check: member rows of the complex named "IL12" in the melted complex table.
molten_ci_df.loc[molten_ci_df["complex_name"] == "IL12"]

In [50]:
# Spot-check: member rows of the complex named "IL12 receptor" in the melted complex table.
molten_ci_df.loc[molten_ci_df["complex_name"] == "IL12 receptor"]

In [51]:
# Preview the melted complex table (one row per complex member with a UniProt ID).
molten_ci_df.head()

In [52]:
# Preview the protein input table, which is later joined on its `uniprot` column.
pi_df.head()

In [53]:
# Left-join the complex members onto the long interaction table: a partner
# whose name matches a `complex_name` expands to one row per member protein,
# while unmatched partners keep a single row with empty complex columns.
ii_ci_df = molten_ii_df.merge(
    molten_ci_df,
    left_on="cop_name",
    right_on="complex_name",
    how="left",
)
ii_ci_df.head()

In [54]:
# Spot-check a single interaction (CPI-CC0041E1D30) after joining in the complex members.
ii_ci_df.loc[ii_ci_df["interaction_id"] == "CPI-CC0041E1D30"]

In [55]:
# Re-display the gene input table, which is joined on `uniprot` further below.
gi_df.head()

In [56]:
def col_xy(df, col):
    """Coalesce the ``{col}_x`` / ``{col}_y`` pair left by a merge into one ``col`` column.

    For every row the ``{col}_x`` value is used when it is not missing,
    otherwise the ``{col}_y`` value is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``{col}_x`` and ``{col}_y``. It is not modified.
    col : str
        Base column name, without the ``_x`` / ``_y`` suffix.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two suffixed columns and with ``col`` holding
        the coalesced values (appended last unless ``col`` already existed).

    Raises
    ------
    KeyError
        If either suffixed column is absent from ``df``.
    """
    x_col = f"{col}_x"
    y_col = f"{col}_y"
    x_values = df[x_col]
    # Vectorized "x if present else y": avoids a Python-level call per row and
    # also works on an empty frame, where a row-wise apply does not yield a Series.
    # infer_objects() re-infers the dtype of an object result (e.g. all booleans
    # once the gaps are filled).
    coalesced = x_values.where(x_values.notna(), df[y_col]).infer_objects()
    # Drop first, then assign, so the caller's frame is never mutated.
    return df.drop(columns=[x_col, y_col]).assign(**{col: coalesced})

In [57]:
# A row stems from a complex when the complex join supplied a member UniProt ID.
# Rows without one are single proteins, for which the partner name itself
# (`cop_name`) is used as the UniProt ID.
# Both columns are derived into a new frame instead of overwriting `ii_ci_df`:
# overwriting `uniprot` in place made this cell non-idempotent (on a re-run
# `is_complex` was computed from the already back-filled column).
ii_cop_df = ii_ci_df.assign(
    is_complex=ii_ci_df["uniprot"].notna(),
    uniprot=ii_ci_df["uniprot"].fillna(ii_ci_df["cop_name"]),
)

# Attach the protein annotations.
ii_pi_df = ii_cop_df.merge(pi_df, how="left", on="uniprot")

# Columns that come out of the merge as an `_x` / `_y` pair; coalesce each pair
# back into one column. The order is kept as-is because every coalesced column
# is appended at the end of the frame.
xy_columns = [
    "secreted_highlight",
    "other_desc",
    "integrin",
    "secreted_desc",
    "receptor_desc",
    "receptor",
    "other",
    "transmembrane",
    "secreted",
    "peripheral",
]
for xy_column in xy_columns:
    ii_pi_df = col_xy(ii_pi_df, xy_column)

# Attach the human gene identifiers.
ii_gi_df = ii_pi_df.merge(gi_df, how="left", on="uniprot")
ii_gi_df.tail()

In [58]:
# Spot-check the coalesced `integrin` / `secreted_highlight` columns for one interaction.
ii_gi_df.loc[ii_gi_df["interaction_id"] == "CPI-SS0FF49C823", ["integrin", "secreted_highlight"]]

In [59]:
# Check that all genes in the interaction dataframe have an associated ensembl human gene ID.
# NaN never compares equal to anything (not even itself), so filtering with
# `== np.nan` always selects zero rows; count the missing IDs with isna() instead.
int(ii_gi_df["human_gene_ensembl"].isna().sum())

In [60]:
# Left-join the mouse orthologs on the human Ensembl gene ID; rows without an
# ortholog are kept, with the mouse columns left empty.
ii_o_df = ii_gi_df.merge(
    o_df,
    on="human_gene_ensembl",
    how="left",
)
ii_o_df.head()

In [61]:
# Number of interactions having at least one row flagged with each protein
# annotation. The dict order is the order in which the tallies are reported.
annotation_hits = {
    "transmembrane": 0,
    "peripheral": 0,
    "secreted": 0,
    "secreted_highlight": 0,
    "receptor": 0,
    "integrin": 0,
}

# Number of interactions by mouse ortholog coverage of both sides (a and b):
# "1" = at least one row per side, "all" = every row of each side;
# "any" = any ortholog present, "high" = orthology confidence equal to 1.
num_1_any = num_all_any = num_1_high = num_all_high = 0

for _, members in ii_o_df.groupby(by="interaction_id"):
    for annotation in annotation_hits:
        # Element-wise comparison on purpose: the column may also hold missing values.
        if (members[annotation] == True).any():
            annotation_hits[annotation] += 1

    # Split the rows of this interaction into its two sides.
    sides = dict(list(members.groupby(by="a_or_b")))
    a_df, b_df = sides['a'], sides['b']

    a_has_ortholog = a_df["mouse_gene_ensembl"].notna()
    b_has_ortholog = b_df["mouse_gene_ensembl"].notna()
    a_is_high = a_df["mouse_orthology_confidence"] == 1.0
    b_is_high = b_df["mouse_orthology_confidence"] == 1.0

    if a_has_ortholog.any() and b_has_ortholog.any():
        num_1_any += 1
    if a_has_ortholog.all() and b_has_ortholog.all():
        num_all_any += 1
    if a_is_high.any() and b_is_high.any():
        num_1_high += 1
    if a_is_high.all() and b_is_high.all():
        num_all_high += 1

for annotation, hits in annotation_hits.items():
    counts["interactions"][f"Number of interactions with at least one protein annotated as {annotation}"] = hits

counts["interactions"]["Number of interactions with at least one mouse ortholog for a and b"] = num_1_any
counts["interactions"]["Number of interactions with all mouse orthologs for a and b"] = num_all_any
counts["interactions"]["Number of interactions with at least one high confidence mouse ortholog for a and b"] = num_1_high
counts["interactions"]["Number of interactions with all high confidence mouse orthologs both a and b"] = num_all_high

counts

In [62]:
# Display the ortholog tallies as a two-column (key, value) table.
pd.DataFrame(data=list(counts["orthologs"].items()), columns=["key", "value"])

## Make plots

In [63]:
# Horizontal bar chart of the ortholog tallies: one bar per description.
o_plot_df = pd.DataFrame(list(counts["orthologs"].items()), columns=["key", "value"])

# Long descriptions: raise the label limit so they are not truncated.
o_label_axis = alt.Y("key:N", axis=alt.Axis(labelLimit=1000, title=None))
o_count_axis = alt.X("value:Q", axis=alt.Axis(title="Count"))

o_plot = (
    alt.Chart(o_plot_df)
    .mark_bar()
    .encode(y=o_label_axis, x=o_count_axis)
    .properties(title="Orthologs from Ensembl")
)

o_plot

In [64]:
# Horizontal bar chart of the interaction tallies, in the order they were
# recorded; bars whose description mentions "mouse ortholog" get their own color.
interaction_keys = list(counts["interactions"])
i_plot_df = pd.DataFrame(list(counts["interactions"].items()), columns=["key", "value"])
i_plot_df["ortholog"] = ["mouse ortholog" in key for key in i_plot_df["key"]]

i_label_axis = alt.Y("key:N", axis=alt.Axis(labelLimit=1000, title=None), sort=interaction_keys)
i_count_axis = alt.X("value:Q", axis=alt.Axis(title="Count"))
i_bar_color = alt.Color("ortholog:N", legend=None, sort=[True, False])

i_plot = (
    alt.Chart(i_plot_df)
    .mark_bar()
    .encode(y=i_label_axis, x=i_count_axis, color=i_bar_color)
    .properties(title="Interactions from CellPhoneDB")
)

i_plot

In [65]:
# Write the joined interaction/ortholog table as TSV, then save both charts
# to the output paths provided by Snakemake.
ii_o_df.to_csv(snakemake.output['table'], sep='\t', index=False)
for chart, output_key in ((o_plot, 'ortholog_plot'), (i_plot, 'interaction_plot')):
    alt_save(chart, snakemake.output[output_key])