In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the gene-by-gene cosine similarity tables for both perturbation
# modalities (ORF first, then CRISPR).
orf_similarity_df, crispr_similarity_df = map(
    pd.read_parquet,
    (
        'cos_sim/cosine_similarities_genes_orf.parquet',
        'cos_sim/cosine_similarities_genes_crispr.parquet',
    ),
)

In [3]:
# Load the pairwise knowledge graph scores and make the table symmetric, so a
# pair can be looked up regardless of which gene is listed first.
# NOTE(review): the path points into a user's home directory — consider a
# configurable data directory so the notebook runs on other machines.
forward_pairs = pd.read_csv(
    "~/Downloads/orf_scores_merged.tsv",
    sep="\t",
    usecols=["GENE1", "GENE2", "gene_mf", "gene_bp", "gene_pathway"],
).rename(columns={"GENE1": "gene_1", "GENE2": "gene_2"})

# Same rows with the two gene columns swapped.
reversed_pairs = forward_pairs.rename(columns={"gene_1": "gene_2", "gene_2": "gene_1"})

knowledge_graph = pd.concat([forward_pairs, reversed_pairs], ignore_index=True)

In [4]:
def _lookup_similarity(similarity_df, gene_1, gene_2):
    """Return ``(present, value)`` for one gene pair in a similarity table.

    ``present`` is True only when ``gene_1`` is a row label and ``gene_2`` is a
    column label of ``similarity_df``; otherwise ``value`` is NaN.
    """
    if gene_1 in similarity_df.index and gene_2 in similarity_df.columns:
        return True, similarity_df.loc[gene_1, gene_2]
    return False, np.nan


def check_connections(
    genes, orf_similarity_df, crispr_similarity_df, connections_df, previous
):
    """Append every unordered pair of ``genes`` to ``connections_df``.

    Genes are considered in sorted order, so within each emitted pair
    ``gene_1`` sorts before (or equal to) ``gene_2``. For each pair the ORF and
    CRISPR similarity tables are consulted; a pair missing from a table gets
    ``False`` in the matching ``Present_and_replicable_in_*`` column and NaN as
    its similarity.

    Parameters
    ----------
    genes : sequence of str
        Gene symbols to pair up. The sequence is not modified.
    orf_similarity_df, crispr_similarity_df : pandas.DataFrame
        Similarity tables indexed by gene on both rows and columns.
    connections_df : pandas.DataFrame
        Previously collected connections (may be an empty ``DataFrame()``).
        It is not modified in place.
    previous : bool
        Value stored in the ``Previous`` column of the new rows.

    Returns
    -------
    pandas.DataFrame
        ``connections_df`` followed by the new rows, with duplicate
        ``(gene_1, gene_2)`` pairs removed (the first occurrence wins).
    """
    # sorted() works on a copy, so the caller's list/array keeps its order.
    ordered_genes = sorted(genes)

    # Collect plain records and build one frame at the end instead of
    # concatenating a one-row frame per pair.
    records = []
    for position, gene_1 in enumerate(ordered_genes):
        for gene_2 in ordered_genes[position + 1 :]:
            in_orf, orf_similarity = _lookup_similarity(
                orf_similarity_df, gene_1, gene_2
            )
            in_crispr, crispr_similarity = _lookup_similarity(
                crispr_similarity_df, gene_1, gene_2
            )
            records.append(
                {
                    "gene_1": gene_1,
                    "gene_2": gene_2,
                    "Previous": previous,
                    "Present_and_replicable_in_ORF": in_orf,
                    "ORF_cosine_similarity": orf_similarity,
                    "Present_and_replicable_in_CRISPR": in_crispr,
                    "CRISPR_cosine_similarity": crispr_similarity,
                }
            )

    if records:
        new_rows = pd.DataFrame(records)
        if connections_df.shape == (0, 0):
            # Nothing collected yet: skip concatenating with a column-less frame.
            connections_df = new_rows
        else:
            connections_df = pd.concat(
                [connections_df, new_rows], ignore_index=True, axis=0
            )

    # Fewer than two genes and no earlier rows: there are no key columns to
    # de-duplicate on, so return the (empty) frame unchanged.
    if connections_df.empty:
        return connections_df

    return connections_df.drop_duplicates(subset=["gene_1", "gene_2"], keep="first")

In [5]:
# Accumulator for the gene-pair connections; later cells grow it by passing it
# through check_connections and rebinding the result.
connections_df = pd.DataFrame()
# Absolute score a cosine similarity or knowledge graph score must exceed to
# count as evidence in the cells below.
signal_threshold = 0.4

Check previous connections that we were interested in

In [6]:
# Previously flagged gene groups of interest. Each inner list is one group; the
# next cell passes every group to check_connections, which pairs up its genes.
previous_gene_lists = [
    ["ZBTB16", "SLC39A1"],
    ["RAB30", "NAT14"],
    ["MYT1", "RNF41"],
    # NOTE(review): this pair is also contained in the three-gene group below.
    ["PIK3R3", "INSYN1"],
    ["PIK3R3", "RAB40B", "INSYN1"],
    ["HOOK2", "NDEL1", "NDE1", "PAFAH1B1"],
    ["TRAF2", "STK3", "YAP1", "WWTR1", "STK11"],
    # NOTE(review): "DPAT1" may be a typo (possibly DPAGT1) — confirm the symbol.
    ["GPR176", "TSC22D1", "DPAT1", "CHRM4"],
    ["ECH1", "UQCRFS1", "SARS2"],
    # NOTE(review): "POLRID" may be a typo (possibly POLR1D) — confirm the symbol.
    ["POLRID", "SPATA25", "CAMK2A", "GJB2", "ATG7", "MGLL", "CCL14", "PNPLA4", "EML1", "PER1", "SLC39A1"],
    # NOTE(review): "TGFB" may be incomplete (TGFB1/2/3?) — confirm the symbol.
    ["FOXO3", "TGFB"]
]

In [7]:
# All pairs coming from the previously flagged groups are marked Previous=True.
previous = True
for genes in previous_gene_lists:
    connections_df = check_connections(
        genes,
        orf_similarity_df,
        crispr_similarity_df,
        connections_df,
        previous=previous,
    )

Check all the new connections

In [8]:
# For each modality and direction, read the saved gene-by-gene table, reshape
# it to one row per gene pair, keep the 100 most extreme pairs and record them
# as new (Previous=False) connections.

modalities = ["orf", "crispr"]
directions = ["similar", "anti_similar"]
previous = False

for modality in modalities:
    for direction in directions:
        pair_scores = (
            pd.read_csv(f"output/{modality}_top_{direction}_genes.csv", index_col=0)
            .stack()
            .reset_index()
            .rename(columns={0: "cos_sim", "level_0": "gene_1", "level_1": "gene_2"})
            .query("gene_1 != gene_2")  # drop self-pairs
        )

        # Highest scores for "similar", lowest for "anti_similar".
        top_pairs = pair_scores
        if direction == "similar":
            top_pairs = pair_scores.nlargest(100, "cos_sim")
        elif direction == "anti_similar":
            top_pairs = pair_scores.nsmallest(100, "cos_sim")

        gene_list = top_pairs[["gene_1", "gene_2"]].values
        for genes in gene_list:
            connections_df = check_connections(
                genes,
                orf_similarity_df,
                crispr_similarity_df,
                connections_df,
                previous=previous,
            )

Add all knowledge graph information

In [9]:
# The symmetric `knowledge_graph` table is already loaded near the top of the
# notebook, so it is reused here instead of being read and symmetrised again.
knowledge_graph_score_columns = ["gene_mf", "gene_bp", "gene_pathway"]

# Dropping any score columns left over from an earlier run keeps this cell
# idempotent: re-running it would otherwise yield `gene_mf_x` / `gene_mf_y`
# style columns and break the cells below.
connections_df = connections_df.drop(
    columns=knowledge_graph_score_columns, errors="ignore"
).merge(knowledge_graph, on=["gene_1", "gene_2"], how="left")

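Determine whether there is evidence in each data type: a connection counts as evidence when the absolute value of its score (ORF or CRISPR cosine similarity, or any of the knowledge graph scores) exceeds `signal_threshold`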

In [10]:
# A score counts as evidence when its absolute value exceeds signal_threshold;
# missing (NaN) scores compare as False.
connections_df = connections_df.assign(
    Evidence_in_ORF=np.abs(connections_df["ORF_cosine_similarity"]) > signal_threshold,
    Evidence_in_CRISPR=np.abs(connections_df["CRISPR_cosine_similarity"]) > signal_threshold,
    # Any one of the three knowledge graph scores is enough.
    Evidence_in_Knowledge_Graph=(
        connections_df[["gene_mf", "gene_bp", "gene_pathway"]].abs() > signal_threshold
    ).any(axis=1),
)

Filter out rows where the knowledge graph information is not available

In [11]:
# Keep only the pairs for which all three knowledge graph scores are present.
connections_df_knowledge_graph = connections_df.dropna(
    subset=["gene_mf", "gene_bp", "gene_pathway"], how="any"
).reset_index(drop=True)
connections_df_knowledge_graph

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,NAT14,RAB30,True,True,0.701613,False,,0.266,0.579,0.359,True,False,True
1,MYT1,RNF41,True,True,-0.555660,True,-0.044279,0.356,0.202,0.381,True,False,False
2,INSYN1,PIK3R3,True,True,0.260941,False,,0.448,0.328,0.182,False,False,True
3,INSYN1,RAB40B,True,True,-0.424922,False,,0.245,0.374,0.229,True,False,False
4,PIK3R3,RAB40B,True,True,-0.337578,True,0.022672,0.252,0.434,0.067,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,PDLIM1,TBPL1,False,True,0.000233,True,-0.914976,0.345,0.159,0.220,False,True,False
157,CCND2,PRKCE,False,True,-0.046213,True,-0.912739,0.648,0.443,0.547,False,True,True
158,BRCA1,TBPL1,False,True,-0.114601,True,-0.912553,0.945,0.744,0.794,False,True,True
159,ACOT4,CNOT3,False,True,0.045712,True,-0.912291,0.005,-0.410,-0.186,False,True,True


Previous connections that have evidence in ORF, CRISPR, or both (restricted to pairs with knowledge graph scores)

How many are known connections?

In [12]:
# Previous pairs with ORF and/or CRISPR evidence that the knowledge graph also supports.
(
    connections_df_knowledge_graph
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==True")
    .query("Evidence_in_Knowledge_Graph==True")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,NAT14,RAB30,True,True,0.701613,False,,0.266,0.579,0.359,True,False,True
1,HOOK2,NDE1,True,True,-0.6563,False,,0.684,0.643,0.341,True,False,True
2,HOOK2,NDEL1,True,True,-0.614541,False,,0.7,0.733,0.421,True,False,True
3,HOOK2,PAFAH1B1,True,True,-0.610536,False,,0.717,0.641,0.378,True,False,True
4,NDE1,NDEL1,True,True,0.905747,False,,0.996,0.999,0.98,True,False,True
5,NDE1,PAFAH1B1,True,True,0.860934,False,,0.996,0.999,0.988,True,False,True
6,NDEL1,PAFAH1B1,True,True,0.911471,False,,0.998,1.0,0.986,True,False,True
7,WWTR1,YAP1,True,True,0.528834,False,,0.949,0.898,0.969,True,False,True
8,CHRM4,GPR176,True,True,0.689368,True,0.486747,0.392,0.65,0.835,True,True,True
9,ECH1,SARS2,True,True,0.376947,True,0.456086,0.201,0.569,0.58,False,True,True


How many are unknown connections?

In [13]:
# Previous pairs with ORF and/or CRISPR evidence that the knowledge graph does not support.
(
    connections_df_knowledge_graph
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==True")
    .query("Evidence_in_Knowledge_Graph==False")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,MYT1,RNF41,True,True,-0.55566,True,-0.044279,0.356,0.202,0.381,True,False,False
1,INSYN1,RAB40B,True,True,-0.424922,False,,0.245,0.374,0.229,True,False,False
2,CHRM4,TSC22D1,True,True,0.478575,True,0.548977,-0.002,0.197,0.048,True,True,False
3,GPR176,TSC22D1,True,True,0.681274,True,0.291056,0.223,0.271,0.055,True,False,False
4,ATG7,MGLL,True,True,0.408002,True,-0.574792,0.306,-0.24,-0.27,True,True,False
5,CCL14,EML1,True,True,0.485789,False,,-0.186,0.322,0.386,True,False,False


New connections that have evidence in either ORFs or CRISPRs or both

How many are known connections?

In [14]:
# New pairs with ORF and/or CRISPR evidence that the knowledge graph also supports.
(
    connections_df_knowledge_graph
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==False")
    .query("Evidence_in_Knowledge_Graph==True")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,OR2S2,SLC22A13,False,True,0.934601,False,,0.110,0.339,0.775,True,False,True
1,DGUOK,MRPS2,False,True,0.923327,False,,0.484,0.457,0.428,True,False,True
2,ALKBH7,COQ5,False,True,0.921665,False,,0.462,0.527,0.637,True,False,True
3,SLC22A13,SLC7A9,False,True,0.910474,False,,0.979,0.930,0.881,True,False,True
4,OR2S2,RHBG,False,True,0.909783,False,,0.243,0.410,0.634,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,BMP2K,TBPL1,False,True,-0.011600,True,-0.921174,0.523,0.041,0.247,False,True,True
78,CCND2,PRKCE,False,True,-0.046213,True,-0.912739,0.648,0.443,0.547,False,True,True
79,BRCA1,TBPL1,False,True,-0.114601,True,-0.912553,0.945,0.744,0.794,False,True,True
80,ACOT4,CNOT3,False,True,0.045712,True,-0.912291,0.005,-0.410,-0.186,False,True,True


How many are unknown connections?

In [15]:
# New pairs with ORF and/or CRISPR evidence that the knowledge graph does not support.
(
    connections_df_knowledge_graph
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==False")
    .query("Evidence_in_Knowledge_Graph==False")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,CYP11B1,MRPS2,False,True,0.884627,False,,0.088,-0.009,-0.066,True,False,False
1,NEMP1,TOR1AIP2,False,True,-0.6551,False,,0.316,0.391,0.344,True,False,False
2,NEMP1,STYK1,False,True,-0.612626,False,,0.359,0.249,0.105,True,False,False
3,NDE1,PAFAH1B2,False,True,-0.585289,False,,0.343,0.051,-0.077,True,False,False
4,SUN2,TMX4,False,True,-0.58511,False,,0.131,0.281,0.029,True,False,False
5,NEMP1,PTPN5,False,True,-0.584235,False,,0.399,0.25,0.167,True,False,False
6,CCL14,CT83,False,True,-0.533877,False,,0.09,-0.015,0.213,True,False,False
7,NDE1,PAFAH1B3,False,True,-0.530362,False,,0.373,0.117,0.017,True,False,False
8,EPHX1,SUN2,False,True,-0.525203,False,,0.163,-0.015,0.213,True,False,False
9,EPHX1,NEMP1,False,True,-0.519084,False,,0.157,0.207,0.321,True,False,False


Ignoring knowledge graph information, how many previous connections have evidence in either ORFs or CRISPRs or both?

In [16]:
# Previous pairs with ORF and/or CRISPR evidence, whether or not knowledge
# graph scores are available for them.
(
    connections_df
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==True")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,SLC39A1,ZBTB16,True,True,-0.294491,True,-0.627275,,,,False,True,False
1,NAT14,RAB30,True,True,0.701613,False,,0.266,0.579,0.359,True,False,True
2,MYT1,RNF41,True,True,-0.55566,True,-0.044279,0.356,0.202,0.381,True,False,False
3,INSYN1,RAB40B,True,True,-0.424922,False,,0.245,0.374,0.229,True,False,False
4,HOOK2,NDE1,True,True,-0.6563,False,,0.684,0.643,0.341,True,False,True
5,HOOK2,NDEL1,True,True,-0.614541,False,,0.7,0.733,0.421,True,False,True
6,HOOK2,PAFAH1B1,True,True,-0.610536,False,,0.717,0.641,0.378,True,False,True
7,NDE1,NDEL1,True,True,0.905747,False,,0.996,0.999,0.98,True,False,True
8,NDE1,PAFAH1B1,True,True,0.860934,False,,0.996,0.999,0.988,True,False,True
9,NDEL1,PAFAH1B1,True,True,0.911471,False,,0.998,1.0,0.986,True,False,True


Ignoring knowledge graph information, how many new connections have evidence in either ORFs or CRISPRs or both?

In [17]:
# New pairs with ORF and/or CRISPR evidence, whether or not knowledge graph
# scores are available for them.
(
    connections_df
    .loc[lambda pairs: pairs[["Evidence_in_ORF", "Evidence_in_CRISPR"]].any(axis=1)]
    .query("Previous==False")
    .reset_index(drop=True)
)

Unnamed: 0,gene_1,gene_2,Previous,Present_and_replicable_in_ORF,ORF_cosine_similarity,Present_and_replicable_in_CRISPR,CRISPR_cosine_similarity,gene_mf,gene_bp,gene_pathway,Evidence_in_ORF,Evidence_in_CRISPR,Evidence_in_Knowledge_Graph
0,OR2S2,SLC22A13,False,True,0.934601,False,,0.110,0.339,0.775,True,False,True
1,DGUOK,MRPS2,False,True,0.923327,False,,0.484,0.457,0.428,True,False,True
2,ALKBH7,COQ5,False,True,0.921665,False,,0.462,0.527,0.637,True,False,True
3,SLC22A13,SLC7A9,False,True,0.910474,False,,0.979,0.930,0.881,True,False,True
4,OR2S2,RHBG,False,True,0.909783,False,,0.243,0.410,0.634,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,FHL5,HTR1B,False,False,,True,-0.906100,,,,False,True,False
189,CDKN2A,CYP11B1,False,False,,True,-0.905980,,,,False,True,False
190,CCRL2,TCFL5,False,False,,True,-0.905511,,,,False,True,False
191,BMP2K,SLC24A5,False,True,0.030368,True,-0.905494,,,,False,True,False


YAP1 connections

In [18]:
# ORF similarities to YAP1 whose magnitude exceeds signal_threshold, printed as
# a markdown table (the YAP1 row itself is included by this filter).
yap1_similarities = orf_similarity_df.query(
    "YAP1 > @signal_threshold or YAP1 < -@signal_threshold"
)["YAP1"]
print(yap1_similarities.to_markdown())

|        |     YAP1 |
|:-------|---------:|
| WWTR1  | 0.528834 |
| PLS3   | 0.429233 |
| CORO2B | 0.466192 |
| CORO2A | 0.558896 |
| RTKN   | 0.422219 |
| CNN1   | 0.476689 |
| PRKCE  | 0.452393 |
| SYT2   | 0.405234 |
| SYT1   | 0.406036 |
| YAP1   | 1        |
| ZNF704 | 0.544696 |
