In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the precomputed cosine-similarity tables for the ORF and CRISPR data.
# Downstream cells look values up as `.loc[gene_1, gene_2]`, i.e. they treat
# both the index and the columns as gene symbols.
orf_similarity_df = pd.read_parquet(
    path='cos_sim/cosine_similarities_genes_orf.parquet'
)
crispr_similarity_df = pd.read_parquet(
    path='cos_sim/cosine_similarities_genes_crispr.parquet'
)

In [3]:
# Empty accumulator; check_connections() returns it with one row appended per
# gene pair, so it is threaded through every call below.
summary_df = pd.DataFrame()

In [4]:
def check_connections(
    genes, orf_similarity_df, crispr_similarity_df, summary_df
):
    """Append one summary row per unordered pair of ``genes`` to ``summary_df``.

    Pairs are generated in the order ``(genes[0], genes[1])``,
    ``(genes[0], genes[2])``, ... so the earlier gene of the list is always
    ``gene_1``.

    Parameters
    ----------
    genes : sequence of str
        Gene symbols whose pairwise connections should be checked.
    orf_similarity_df, crispr_similarity_df : pandas.DataFrame
        Similarity tables that are looked up as ``.loc[gene_1, gene_2]``.
        A pair counts as present only if ``gene_1`` is in the table's index
        and ``gene_2`` is in its columns.
    summary_df : pandas.DataFrame
        Rows accumulated so far (may be a completely empty frame). It is not
        modified in place.

    Returns
    -------
    pandas.DataFrame
        ``summary_df`` followed by the new rows, with a fresh 0..n-1 index and
        the columns ``gene_1``, ``gene_2``, ``Present_in_ORF``,
        ``ORF_cosine_similarity``, ``Present_in_CRISPR`` and
        ``CRISPR_cosine_similarity``. Similarities are NaN for pairs that are
        not present. If ``genes`` yields no pair, ``summary_df`` itself is
        returned unchanged.
    """
    from itertools import combinations

    def _lookup(similarity_df, gene_1, gene_2):
        """Return ``(present, similarity)`` for one pair in one table."""
        if gene_1 in similarity_df.index and gene_2 in similarity_df.columns:
            return True, similarity_df.loc[gene_1, gene_2]
        return False, np.nan

    # Collect plain records and build the frame once; concatenating a one-row
    # frame per pair copies the whole accumulated table on every iteration.
    records = []
    for gene_1, gene_2 in combinations(genes, 2):
        in_orf, orf_similarity = _lookup(orf_similarity_df, gene_1, gene_2)
        in_crispr, crispr_similarity = _lookup(
            crispr_similarity_df, gene_1, gene_2
        )
        records.append(
            {
                "gene_1": gene_1,
                "gene_2": gene_2,
                "Present_in_ORF": in_orf,
                "ORF_cosine_similarity": orf_similarity,
                "Present_in_CRISPR": in_crispr,
                "CRISPR_cosine_similarity": crispr_similarity,
            }
        )

    if not records:
        return summary_df

    new_rows = pd.DataFrame(records)

    # Skip the concat for a completely empty accumulator: concatenating with
    # an empty frame is deprecated in recent pandas and can affect the
    # resulting column dtypes.
    if len(summary_df) == 0 and len(summary_df.columns) == 0:
        return new_rows

    return pd.concat([summary_df, new_rows], ignore_index=True, axis=0)

Check all old connections

In [5]:
# Each inner list is one group of genes; every unordered pair within a group
# is looked up in the ORF and CRISPR similarity tables.
# NOTE(review): the pair PIK3R3/INSYN1 is listed on its own and again inside
# the PIK3R3/RAB40B/INSYN1 group, so it is evaluated twice downstream.
# NOTE(review): "POLRID", "DPAT1" and "TGFB" may be misspelled or incomplete
# gene symbols — confirm against the similarity tables before relying on them.
gene_lists = [
    ["ZBTB16", "SLC39A1"],
    ["RAB30", "NAT14"],
    ["MYT1", "RNF41"],
    ["PIK3R3", "INSYN1"],
    ["PIK3R3", "RAB40B", "INSYN1"],
    ["HOOK2", "NDEL1", "NDE1", "PAFAH1B1"],
    ["TRAF2", "STK3", "YAP1", "WWTR1", "STK11"],
    ["GPR176", "TSC22D1", "DPAT1", "CHRM4"],
    ["ECH1", "UQCRFS1", "SARS2"],
    [
        "POLRID",
        "SPATA25",
        "CAMK2A",
        "GJB2",
        "ATG7",
        "MGLL",
        "CCL14",
        "PNPLA4",
        "EML1",
        "PER1",
        "SLC39A1",
    ],
    ["FOXO3", "TGFB"],
]

In [6]:
# Start from an empty frame so this cell is idempotent: without the reset,
# re-running it appends a second copy of every row to the already populated
# summary_df.
summary_df = pd.DataFrame()
for genes in gene_lists:
    summary_df = check_connections(
        genes, orf_similarity_df, crispr_similarity_df, summary_df
    )

In [7]:
# Render every checked pair as a Markdown table (row index omitted).
summary_markdown = summary_df.to_markdown(index=False)
print(summary_markdown)

| gene_1   | gene_2   | Present_in_ORF   |   ORF_cosine_similarity | Present_in_CRISPR   |   CRISPR_cosine_similarity |
|:---------|:---------|:-----------------|------------------------:|:--------------------|---------------------------:|
| ZBTB16   | SLC39A1  | True             |            -0.294491    | True                |                 -0.627275  |
| RAB30    | NAT14    | True             |             0.701613    | False               |                nan         |
| MYT1     | RNF41    | True             |            -0.55566     | True                |                 -0.0442788 |
| PIK3R3   | INSYN1   | True             |             0.260941    | False               |                nan         |
| PIK3R3   | RAB40B   | True             |            -0.337578    | True                |                  0.0226717 |
| PIK3R3   | INSYN1   | True             |             0.260941    | False               |                nan         |
| RAB40B   | INSYN1   | True            

In [8]:
# Rows of the ORF table whose YAP1 column is above 0.4 or below -0.4.
# YAP1's own row (similarity 1) passes the filter and is listed as well.
print(
    orf_similarity_df
    .query("YAP1 > 0.4 or YAP1 < -0.4")["YAP1"]
    .to_markdown()
)

|        |     YAP1 |
|:-------|---------:|
| WWTR1  | 0.528834 |
| PLS3   | 0.429233 |
| CORO2B | 0.466192 |
| CORO2A | 0.558896 |
| RTKN   | 0.422219 |
| CNN1   | 0.476689 |
| PRKCE  | 0.452393 |
| SYT2   | 0.405234 |
| SYT1   | 0.406036 |
| YAP1   | 1        |
| ZNF704 | 0.544696 |


Connections present in both ORF and CRISPR

In [9]:
# Pairs that were found in both the ORF and the CRISPR similarity tables.
print(
    summary_df
    .query("Present_in_CRISPR == True and Present_in_ORF == True")
    .to_markdown(index=False)
)

| gene_1   | gene_2   | Present_in_ORF   |   ORF_cosine_similarity | Present_in_CRISPR   |   CRISPR_cosine_similarity |
|:---------|:---------|:-----------------|------------------------:|:--------------------|---------------------------:|
| ZBTB16   | SLC39A1  | True             |            -0.294491    | True                |                 -0.627275  |
| MYT1     | RNF41    | True             |            -0.55566     | True                |                 -0.0442788 |
| PIK3R3   | RAB40B   | True             |            -0.337578    | True                |                  0.0226717 |
| TRAF2    | STK11    | True             |            -0.000926614 | True                |                  0.240619  |
| GPR176   | TSC22D1  | True             |             0.681274    | True                |                  0.291056  |
| GPR176   | CHRM4    | True             |             0.689368    | True                |                  0.486747  |
| TSC22D1  | CHRM4    | True            

Connections present only in ORF

In [10]:
# Pairs found in the ORF similarity table but not in the CRISPR one.
print(
    summary_df
    .query("Present_in_CRISPR == False and Present_in_ORF == True")
    .to_markdown(index=False)
)

| gene_1   | gene_2   | Present_in_ORF   |   ORF_cosine_similarity | Present_in_CRISPR   |   CRISPR_cosine_similarity |
|:---------|:---------|:-----------------|------------------------:|:--------------------|---------------------------:|
| RAB30    | NAT14    | True             |              0.701613   | False               |                        nan |
| PIK3R3   | INSYN1   | True             |              0.260941   | False               |                        nan |
| PIK3R3   | INSYN1   | True             |              0.260941   | False               |                        nan |
| RAB40B   | INSYN1   | True             |             -0.424922   | False               |                        nan |
| HOOK2    | NDEL1    | True             |             -0.614541   | False               |                        nan |
| HOOK2    | NDE1     | True             |             -0.6563     | False               |                        nan |
| HOOK2    | PAFAH1B1 | True            

Connections present only in CRISPR

In [11]:
# Pairs found in the CRISPR similarity table but not in the ORF one.
print(
    summary_df
    .query("Present_in_CRISPR == True and Present_in_ORF == False")
    .to_markdown(index=False)
)

| gene_1   | gene_2   | Present_in_ORF   |   ORF_cosine_similarity | Present_in_CRISPR   |   CRISPR_cosine_similarity |
|:---------|:---------|:-----------------|------------------------:|:--------------------|---------------------------:|
| GJB2     | PNPLA4   | False            |                     nan | True                |                   0.231417 |
| ATG7     | PNPLA4   | False            |                     nan | True                |                   0.413008 |
| MGLL     | PNPLA4   | False            |                     nan | True                |                  -0.548349 |
| CCL14    | PNPLA4   | False            |                     nan | True                |                   0.443871 |
| PNPLA4   | SLC39A1  | False            |                     nan | True                |                  -0.241096 |
