In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the ORF and CRISPR gene cosine-similarity tables from parquet.
orf_similarity_df, crispr_similarity_df = (
    pd.read_parquet('cos_sim/cosine_similarities_genes_orf.parquet'),
    pd.read_parquet('cos_sim/cosine_similarities_genes_crispr.parquet'),
)

In [3]:
def _pair_similarity(similarity_df, gene_1, gene_2):
    """Return ``(present, value)`` for one gene pair in a similarity table.

    ``present`` is True only when ``gene_1`` is a row label and ``gene_2`` is a
    column label of ``similarity_df``; ``value`` is then the looked-up entry,
    otherwise NaN.
    """
    if gene_1 in similarity_df.index and gene_2 in similarity_df.columns:
        return True, similarity_df.loc[gene_1, gene_2]
    return False, np.nan


def check_connections(
    genes, orf_similarity_df, crispr_similarity_df, connections_df, previous
):
    """Append every pairwise combination of ``genes`` to ``connections_df``.

    Parameters
    ----------
    genes : sequence of str
        Gene names (list or 1-D array). The input is not modified; a sorted
        copy is used so that each pair is always stored with
        ``gene_1 <= gene_2``.
    orf_similarity_df, crispr_similarity_df : pandas.DataFrame
        Similarity tables looked up as ``.loc[gene_1, gene_2]``.
    connections_df : pandas.DataFrame
        Rows accumulated so far (may be an empty ``pd.DataFrame()``).
    previous : bool
        Value stored in the ``Previous`` column of the new rows.

    Returns
    -------
    pandas.DataFrame
        ``connections_df`` followed by the new rows, de-duplicated on
        ``(gene_1, gene_2)`` keeping the first occurrence, so rows that were
        already present win over newly generated ones.
    """
    # Sort a copy instead of sorting in place, so the caller's list/array
    # keeps its original order.
    ordered_genes = sorted(genes)

    # Collect plain records and build the frame once, rather than
    # concatenating a one-row DataFrame per pair.
    records = []
    for position, gene_1 in enumerate(ordered_genes):
        for gene_2 in ordered_genes[position + 1:]:
            in_orf, orf_value = _pair_similarity(orf_similarity_df, gene_1, gene_2)
            in_crispr, crispr_value = _pair_similarity(
                crispr_similarity_df, gene_1, gene_2
            )
            records.append(
                {
                    "gene_1": gene_1,
                    "gene_2": gene_2,
                    "Previous": previous,
                    "Present_and_replicable_in_ORF": in_orf,
                    "ORF_cosine_similarity": orf_value,
                    "Present_and_replicable_in_CRISPR": in_crispr,
                    "CRISPR_cosine_similarity": crispr_value,
                }
            )

    # Skip empty frames: an empty accumulator has no gene_1/gene_2 columns to
    # de-duplicate on, and fewer than two genes produce no new rows.
    frames = [
        frame for frame in (connections_df, pd.DataFrame(records)) if not frame.empty
    ]
    if not frames:
        return connections_df

    return pd.concat(frames, ignore_index=True, axis=0).drop_duplicates(
        subset=["gene_1", "gene_2"], keep="first"
    )

In [4]:
# Cutoff applied later to absolute similarity/score values when flagging evidence.
signal_threshold = 0.4
# Accumulator that check_connections extends with one row per gene pair.
connections_df = pd.DataFrame()

Check the previously identified connections of interest

In [5]:
# Each inner list is one group of genes; every pair within a group is checked.
# NOTE(review): "POLRID", "DPAT1" and "TGFB" do not look like standard gene
# symbols (possibly POLR1D, DPAGT1, TGFB1) — confirm before relying on them.
previous_gene_lists = [
    ["ZBTB16", "SLC39A1"],
    ["RAB30", "NAT14"],
    ["MYT1", "RNF41", "INSYN1"],
    [
        "PIK3R3",
        "RAB40A",
        "RAB40B",
        "RAB40C",
        "INSYN1",
        "XLOC_l2_008134",
    ],
    [
        "BICD2",
        "HOOK1",
        "HOOK2",
        "SPDL1",
        "NDEL1",
        "NDE1",
        "PAFAH1B1",
    ],
    ["TRAF2", "STK3", "YAP1", "WWTR1", "STK11"],
    ["GPR176", "TSC22D1", "DPAT1", "CHRM4"],
    ["ECH1", "UQCRFS1", "SARS2"],
    [
        "POLRID",
        "SPATA25",
        "CAMK2A",
        "GJB2",
        "ATG7",
        "MGLL",
        "CCL14",
        "PNPLA4",
        "EML1",
        "PER1",
        "SLC39A1",
    ],
    ["FOXO3", "TGFB"],
]

In [6]:
# Every pair generated from these groups is recorded with Previous=True.
previous = True
for genes in previous_gene_lists:
    connections_df = check_connections(
        genes,
        orf_similarity_df,
        crispr_similarity_df,
        connections_df,
        previous=previous,
    )

Check all the new connections

In [7]:
# Collect the top similar / anti-similar gene pairs written out for each modality.

modalities = ["orf", "crispr"]
directions = ["similar", "anti_similar"]

for modality in modalities:
    for direction in directions:
        # Reshape the table read from CSV into long (gene_1, gene_2, cos_sim) rows.
        df = pd.read_csv(f"output/{modality}_top_{direction}_genes.csv", index_col=0)
        df = df.stack().reset_index()
        df = df.rename(columns={0: "cos_sim", "level_0": "gene_1", "level_1": "gene_2"})

        # Drop self-pairs, then keep the 100 most extreme values for this direction.
        df = df.query("gene_1 != gene_2")
        df = (
            df.nlargest(100, "cos_sim")
            if direction == "similar"
            else df.nsmallest(100, "cos_sim")
        )

        gene_list = df[["gene_1", "gene_2"]].values
        previous = False
        for genes in gene_list:
            connections_df = check_connections(
                genes,
                orf_similarity_df,
                crispr_similarity_df,
                connections_df,
                previous=previous,
            )

Add all knowledge graph information

In [8]:
def _read_symmetric_knowledge_graph(path):
    """Read knowledge-graph scores from ``path`` and append the mirrored pairs.

    The tab-separated file is read with only the gene-pair and score columns,
    which are renamed to ``gene_1``, ``gene_2``, ``gene_mf``, ``gene_bp`` (the
    ``gene_pathway`` column keeps its name). Every row is then duplicated with
    ``gene_1`` and ``gene_2`` swapped so a pair can be matched in either order.
    """
    scores = pd.read_csv(
        path,
        sep="\t",
        usecols=["GENE1", "GENE2", "gene_mf__go", "gene_bp__go", "gene_pathway"],
    ).rename(
        columns={
            "GENE1": "gene_1",
            "GENE2": "gene_2",
            "gene_mf__go": "gene_mf",
            "gene_bp__go": "gene_bp",
        }
    )
    mirrored = scores.rename(columns={"gene_1": "gene_2", "gene_2": "gene_1"})
    return pd.concat([scores, mirrored], ignore_index=True)


knowledge_graph_orf = _read_symmetric_knowledge_graph(
    "~/Downloads/orf_scores_merged.zip"
)
knowledge_graph_crispr = _read_symmetric_knowledge_graph(
    "~/Downloads/crispr_scores_merged.zip"
)

# ORF rows come first, so for a pair present in both files the ORF scores are
# the ones kept by drop_duplicates (default keep="first").
knowledge_graph = pd.concat(
    [knowledge_graph_orf, knowledge_graph_crispr], ignore_index=True
).drop_duplicates(subset=["gene_1", "gene_2"])

# Drop score columns left by an earlier run of this cell before merging;
# otherwise a re-run would produce gene_mf_x / gene_mf_y (etc.) and the plain
# gene_mf / gene_bp / gene_pathway columns used downstream would disappear.
connections_df = connections_df.drop(
    columns=["gene_mf", "gene_bp", "gene_pathway"], errors="ignore"
).merge(knowledge_graph, on=["gene_1", "gene_2"], how="left")

Determine if there is evidence in each data type

In [9]:
# Flag evidence wherever the absolute value exceeds signal_threshold
# (missing values compare as False).
connections_df["Evidence_in_ORF"] = (
    connections_df["ORF_cosine_similarity"].abs() > signal_threshold
)
connections_df["Evidence_in_CRISPR"] = (
    connections_df["CRISPR_cosine_similarity"].abs() > signal_threshold
)
# Any one of the three knowledge-graph scores above the threshold counts.
knowledge_graph_scores = connections_df[["gene_mf", "gene_bp", "gene_pathway"]]
connections_df["Evidence_in_Knowledge_Graph"] = (
    knowledge_graph_scores.abs() > signal_threshold
).any(axis=1)

In [10]:
# Create the workbook with the full table on the "all" sheet.
connections_df.to_excel("output/connections.xlsx", index=False, sheet_name="all")

Filter out rows where the knowledge graph information is not available

In [11]:
# Keep only rows where all three knowledge-graph scores are present.
connections_df_knowledge_graph = connections_df.dropna(
    subset=["gene_mf", "gene_bp", "gene_pathway"]
).reset_index(drop=True)

Previous connections with evidence in ORF, CRISPR, or both

In [12]:
# Previous connections flagged in at least one of ORF / CRISPR.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df.query("Previous==True")
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="Previous_with_evidence")
    )

Previous connections with no evidence in either ORF or CRISPR

In [13]:
# Previous connections flagged in neither ORF nor CRISPR.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df.query("Previous==True")
        .loc[lambda frame: ~frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="Previous_without_evidence")
    )

Connections that have evidence in ORFs or CRISPRs

In [14]:
# All connections flagged in at least one of ORF / CRISPR.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence")
    )

Connections that have evidence in ORFs or CRISPRs with evidence in knowledge graph

In [15]:
# Rows with knowledge-graph scores: flagged in ORF or CRISPR and in the knowledge graph.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df_knowledge_graph.query("Evidence_in_Knowledge_Graph==True")
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence_known")
    )

Connections that have evidence in ORFs or CRISPRs without evidence in knowledge graph

In [16]:
# Rows with knowledge-graph scores: flagged in ORF or CRISPR but not in the knowledge graph.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df_knowledge_graph.query("Evidence_in_Knowledge_Graph==False")
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence_unknown")
    )

Connections with evidence in both ORFs and CRISPRs

In [17]:
# All connections flagged in both ORF and CRISPR.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].all(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence_in_both")
    )

Connections with evidence in both ORFs and CRISPRs with evidence in knowledge graph

In [18]:
# Rows with knowledge-graph scores: flagged in both ORF and CRISPR and in the knowledge graph.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df_knowledge_graph.query("Evidence_in_Knowledge_Graph==True")
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].all(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence_both_known")
    )

Connections with evidence in both ORFs and CRISPRs without evidence in knowledge graph

In [19]:
# Rows with knowledge-graph scores: flagged in both ORF and CRISPR but not in the knowledge graph.
with pd.ExcelWriter("output/connections.xlsx", mode="a") as writer:
    (
        connections_df_knowledge_graph.query("Evidence_in_Knowledge_Graph==False")
        .loc[lambda frame: frame[["Evidence_in_ORF", "Evidence_in_CRISPR"]].all(axis=1)]
        .reset_index(drop=True)
        .to_excel(writer, index=False, sheet_name="all_with_evidence_both_unknown")
    )

YAP1 connections

In [20]:
# Print the YAP1 column entries whose absolute ORF similarity exceeds the
# threshold, as a markdown table.
print(
    orf_similarity_df["YAP1"]
    .loc[lambda scores: scores.abs() > signal_threshold]
    .to_markdown()
)

|        |     YAP1 |
|:-------|---------:|
| WWTR1  | 0.528834 |
| PLS3   | 0.429233 |
| CORO2B | 0.466192 |
| CORO2A | 0.558896 |
| RTKN   | 0.422219 |
| CNN1   | 0.476689 |
| PRKCE  | 0.452393 |
| SYT2   | 0.405234 |
| SYT1   | 0.406036 |
| YAP1   | 1        |
| ZNF704 | 0.544696 |
