In [24]:
# start coding here
import pandas as pd
import numpy as np

import altair as alt
from altair_saver import save

In [25]:
# Load every input table named by the workflow rule.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake's notebook integration - confirm before running standalone.

# Ortholog table: tab-separated, and the first line of the file is skipped.
o_df = pd.read_table(snakemake.input["orthologs"], skiprows=1)

# CellPhoneDB tables: comma-separated with a header row (pandas defaults).
gi_df = pd.read_csv(snakemake.input["cpdb_gene_input"])
pi_df = pd.read_csv(snakemake.input["cpdb_protein_input"])
pc_df = pd.read_csv(snakemake.input["cpdb_protein_curated"])
ic_df = pd.read_csv(snakemake.input["cpdb_interaction_curated"])

In [26]:
# Reduce the ortholog table to gene-level rows with short column names.
# The steps run in this order: drop the versioned/protein ID columns, rename
# the remaining columns, drop the transcript ID, then collapse duplicate rows
# (dropping the transcript column is what makes rows collapse to one per
# distinct combination of the remaining columns).
#
# A filter keeping only rows with mouse_orthology_confidence == 1 is currently
# disabled; to re-enable it, add after the rename step:
#   .loc[lambda d: d["mouse_orthology_confidence"] == 1]
o_df = (
    o_df
    .drop(
        columns=[
            "Gene stable ID version",
            "Transcript stable ID version",
            "Mouse protein or transcript stable ID",
        ]
    )
    .rename(
        columns={
            "Gene stable ID": "ensembl",
            "Transcript stable ID": "ensembl_transcript",
            "Mouse gene stable ID": "ensembl_gene_mouse",
            "Mouse gene name": "gene_mouse",
            "Mouse orthology confidence [0 low, 1 high]": "mouse_orthology_confidence",
        }
    )
    .drop(columns=["ensembl_transcript"])
    .drop_duplicates()
)

In [27]:
def col_append(df: pd.DataFrame, s: str) -> pd.DataFrame:
    """Return a new DataFrame whose column labels each have ``s`` appended.

    Used to build "_a" / "_b" variants of a lookup table so the same table can
    be joined twice (once per interaction partner) without column-name clashes.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns should be renamed. It is not modified.
    s : str
        Suffix appended to every column label.

    Returns
    -------
    pd.DataFrame
        A new frame with the same data and suffixed column labels.
    """
    # DataFrame.add_suffix is the built-in equivalent of mapping every column
    # to f"{col}{s}" through rename(); it also returns a new object.
    return df.add_suffix(s)

In [28]:
# Two copies of the gene-input table, one per interaction partner, with every
# column suffixed "_a" / "_b" so both can be joined onto the same frame.
gi_a_df, gi_b_df = (col_append(gi_df, suffix) for suffix in ("_a", "_b"))

In [29]:
# Same treatment for the ortholog table: one "_a" copy and one "_b" copy.
o_a_df, o_b_df = (col_append(o_df, suffix) for suffix in ("_a", "_b"))

In [30]:
# Print every curated interaction whose (partner_a, partner_b) pair occurs more
# than once (keep=False marks all members of a duplicate group).
print(ic_df[ic_df.duplicated(subset=["partner_a", "partner_b"], keep=False)])

# TODO: find ortholog for partner_a ONLY
# TODO: duplicate all interactions and find orthologs for partner_a (aka partner_b from first half) ONLY

# Attach gene-input columns to both partners by matching each partner against
# the suffixed uniprot column. Inner joins drop interactions where either
# partner has no match. The uniprot join keys are dropped afterwards.
ic_gi_df = (
    ic_df
    .merge(gi_a_df, how="inner", left_on="partner_a", right_on="uniprot_a")
    .merge(gi_b_df, how="inner", left_on="partner_b", right_on="uniprot_b")
    .drop(columns=["uniprot_a", "uniprot_b"])
)

In [31]:
# Attach ortholog columns to both partners via their ensembl IDs. Inner joins
# keep only rows where both ensembl_a and ensembl_b appear in the ortholog table.
ic_gi_o_df = (
    ic_gi_df
    .merge(o_a_df, how="inner", on="ensembl_a")
    .merge(o_b_df, how="inner", on="ensembl_b")
)

In [35]:
# Summary bar chart: row counts before and after the ortholog joins.
#   1. all rows of the curated interaction table
#   2. all rows of the joined interaction/ortholog table
#   3. rows of the joined table after collapsing to distinct
#      (partner_a, partner_b) pairs
counts = [
    {"name": label, "count": n_rows}
    for label, n_rows in (
        ("Human-human interactions", len(ic_df)),
        (
            "Human-human interactions with unique mouse-mouse orthologs",
            len(ic_gi_o_df),
        ),
        (
            "Unique human-human interactions with mouse-mouse orthologs",
            len(ic_gi_o_df.drop_duplicates(subset=["partner_a", "partner_b"])),
        ),
    )
]

plot = (
    alt.Chart(pd.DataFrame(counts))
    .mark_bar()
    .encode(
        # Large labelLimit so the long category names are not truncated.
        y=alt.Y("name:N", axis=alt.Axis(labelLimit=1000, title=None)),
        x=alt.X("count:Q"),
    )
    .properties(title="Number of CellPhoneDB interactions")
)

# Bare expression so the notebook renders the chart as the cell output.
plot

In [33]:
# Persist the results to the paths declared by the workflow rule.

# Joined interaction/ortholog table as TSV, without the pandas index column.
ic_gi_o_df.to_csv(
    snakemake.output["table"],
    sep="\t",
    index=False,
)

# Chart export; the output format is presumably inferred from the file extension
# of the target path - confirm against the rule's declared output.
plot.save(
    snakemake.output["plot"],
    scale_factor=2.0,
)