In [1]:
import pandas as pd

In [2]:
# Gene sets to inspect in the ORF similarity matrix. Each inner list is used as
# both the row and the column selector when the sub-matrix is printed.
orf_genesets = [
    ["SLC39A1", "ZBTB16"],
    ["SARS2", "ECH1", "DGUOK", "MRPS2", "LDHAL6B", "UQCRFS1"],
    ["MYT1", "LZTS2", "CHRM4", "GPR176", "TSC22D1"],
    ["SPDL1", "PAFAH1B1", "NDEL1", "NDE1", "HOOK2", "HOOK1"],
    ["MYT1", "INSYN1", "RNF41"],
    ["ZFP36L1", "PIK3R3", "NRBP1", "INSYN1", "HOXC8", "RAB40C", "RAB40B"],
    ["YAP1", "WWTR1", "VGLL4", "PRKCE", "STK3", "CEP72", "IL20RB", "MTMR9"],
]

# Gene sets to inspect in the CRISPR similarity matrix. Each inner list is used
# as both the row and the column selector when the sub-matrix is printed.
crispr_genesets = [
    ["SLC39A1", "ZBTB16"],
    ["FOXO3", "TGFB1"],
    ["SARS2", "ECH1", "PVR", "LAIR1", "SLC1A5", "UQCRFS1"],
    ["MYT1", "CHRM4", "GPR176", "TSC22D1"],
    ["PIK3R3", "ZFP36L1", "HOXC8", "NRBP1", "RAB40B"],
]

In [3]:
# Load the gene-level cosine-similarity tables for ORF and CRISPR.
# Paths are relative to the notebook's working directory.
ORF_SIMILARITIES_PATH = "cos_sim/cosine_similarities_genes_orf.parquet"
CRISPR_SIMILARITIES_PATH = "cos_sim/cosine_similarities_genes_crispr.parquet"

orf_similarities_df = pd.read_parquet(ORF_SIMILARITIES_PATH)
crispr_similarities_df = pd.read_parquet(CRISPR_SIMILARITIES_PATH)

In [4]:
# Print the ORF cosine-similarity sub-matrix of every gene set as a markdown
# table, separated by a blank line.
for geneset in orf_genesets:
    geneset_similarities = orf_similarities_df.loc[geneset, geneset]
    print(geneset_similarities.to_markdown(), end="\n\n")

|         |   SLC39A1 |    ZBTB16 |
|:--------|----------:|----------:|
| SLC39A1 |  1        | -0.294491 |
| ZBTB16  | -0.294491 |  1        |

|         |    SARS2 |     ECH1 |    DGUOK |    MRPS2 |   LDHAL6B |   UQCRFS1 |
|:--------|---------:|---------:|---------:|---------:|----------:|----------:|
| SARS2   | 1        | 0.376947 | 0.592026 | 0.598109 |  0.490712 |  0.48721  |
| ECH1    | 0.376947 | 1        | 0.657316 | 0.626516 |  0.703084 |  0.632372 |
| DGUOK   | 0.592026 | 0.657316 | 1        | 0.923327 |  0.815027 |  0.818835 |
| MRPS2   | 0.598109 | 0.626516 | 0.923327 | 1        |  0.771309 |  0.780322 |
| LDHAL6B | 0.490712 | 0.703084 | 0.815027 | 0.771309 |  1        |  0.738865 |
| UQCRFS1 | 0.48721  | 0.632372 | 0.818835 | 0.780322 |  0.738865 |  1        |

|         |      MYT1 |      LZTS2 |      CHRM4 |    GPR176 |   TSC22D1 |
|:--------|----------:|-----------:|-----------:|----------:|----------:|
| MYT1    |  1        |  0.572674  | -0.149658  | -0.246265 | -0.1

In [5]:
# Print the CRISPR cosine-similarity sub-matrix of every gene set as a markdown
# table, separated by a blank line.
for geneset in crispr_genesets:
    geneset_similarities = crispr_similarities_df.loc[geneset, geneset]
    print(geneset_similarities.to_markdown(), end="\n\n")

|         |   SLC39A1 |    ZBTB16 |
|:--------|----------:|----------:|
| SLC39A1 |  1        | -0.304687 |
| ZBTB16  | -0.304687 |  1        |

|       |     FOXO3 |     TGFB1 |
|:------|----------:|----------:|
| FOXO3 |  1        | -0.206471 |
| TGFB1 | -0.206471 |  1        |

|         |    SARS2 |     ECH1 |        PVR |    LAIR1 |   SLC1A5 |    UQCRFS1 |
|:--------|---------:|---------:|-----------:|---------:|---------:|-----------:|
| SARS2   | 1        | 0.194478 |  0.315206  | 0.257602 | 0.232863 |  0.182465  |
| ECH1    | 0.194478 | 1        |  0.114323  | 0.315689 | 0.525064 |  0.10414   |
| PVR     | 0.315206 | 0.114323 |  1         | 0.185011 | 0.143079 | -0.0234959 |
| LAIR1   | 0.257602 | 0.315689 |  0.185011  | 1        | 0.357852 |  0.29154   |
| SLC1A5  | 0.232863 | 0.525064 |  0.143079  | 0.357852 | 1        |  0.203122  |
| UQCRFS1 | 0.182465 | 0.10414  | -0.0234959 | 0.29154  | 0.203122 |  1         |

|         |        MYT1 |      CHRM4 |    GPR176 |   TSC22D1 

In [15]:
# Rank every ordered gene pair of the ORF similarity matrix by cosine similarity.
# Resulting columns: reverse_rank, rank, gene_1, gene_2, cosine_similarity.
#   rank         - 0-based position when pairs are sorted from most to least similar
#   reverse_rank - 0-based position counted from the least similar pair
# Rows end up ordered by reverse_rank (least similar pair first).
# NOTE(review): both (A, B) and (B, A) are kept; the sub-matrices printed above
# look symmetric, so each unordered pair presumably appears twice - confirm
# before interpreting absolute rank values.
orf_similarity_rank = (
    orf_similarities_df.unstack()
    # Name the value column and both index levels explicitly instead of relying
    # on pandas' auto-generated "level_0" / "level_1" / 0 labels, which are only
    # produced when the matrix axes are unnamed.
    .rename("cosine_similarity")
    .rename_axis(index=["gene_1", "gene_2"])
    .reset_index()
    .query("gene_1 != gene_2")  # drop the diagonal (a gene paired with itself)
    .sort_values(by="cosine_similarity", ascending=False)
    .reset_index(drop=True)
    .rename_axis(index="rank")
    .reset_index()
    # rank is already 0..N-1 in ascending order, so reversing the rows gives the
    # same result as sorting by rank descending, without a second full sort.
    .iloc[::-1]
    .reset_index(drop=True)
    .rename_axis(index="reverse_rank")
    .reset_index()
)

In [20]:
# Look up the (MYT1, RNF41) pair in the ORF ranking to read off its rank,
# reverse_rank and cosine similarity.
orf_similarity_rank.query("gene_1=='MYT1' and gene_2=='RNF41'")

Unnamed: 0,reverse_rank,rank,gene_1,gene_2,cosine_similarity
26,26,49427903,MYT1,RNF41,-0.55566


In [23]:
# Cosine similarity of the MYT1 / RNF41 pair in the CRISPR matrix
# (presumably for comparison with the ORF value for the same pair).
crispr_similarities_df.loc["MYT1", "RNF41"]

0.025132442

In [22]:
# Rank every ordered gene pair of the CRISPR similarity matrix by cosine similarity.
# Resulting columns: reverse_rank, rank, gene_1, gene_2, cosine_similarity.
#   rank         - 0-based position when pairs are sorted from most to least similar
#   reverse_rank - 0-based position counted from the least similar pair
# Rows end up ordered by reverse_rank (least similar pair first).
# NOTE(review): both (A, B) and (B, A) are kept; the sub-matrices printed above
# look symmetric, so each unordered pair presumably appears twice - confirm
# before interpreting absolute rank values.
crispr_similarity_rank = (
    crispr_similarities_df.unstack()
    # Name the value column and both index levels explicitly instead of relying
    # on pandas' auto-generated "level_0" / "level_1" / 0 labels, which are only
    # produced when the matrix axes are unnamed.
    .rename("cosine_similarity")
    .rename_axis(index=["gene_1", "gene_2"])
    .reset_index()
    .query("gene_1 != gene_2")  # drop the diagonal (a gene paired with itself)
    .sort_values(by="cosine_similarity", ascending=False)
    .reset_index(drop=True)
    .rename_axis(index="rank")
    .reset_index()
    # rank is already 0..N-1 in ascending order, so reversing the rows gives the
    # same result as sorting by rank descending, without a second full sort.
    .iloc[::-1]
    .reset_index(drop=True)
    .rename_axis(index="reverse_rank")
    .reset_index()
)