In [1]:
import pandas as pd
import numpy as np

In [2]:
# Hand-picked seed gene sets, one mapping per perturbation modality.
# Each key is the query gene that later cells look up as a column of the
# matching similarity table; every key is also a member of its own list.
orf_genesets = dict(
    ECH1=["SARS2", "ECH1", "DGUOK", "MRPS2", "LDHAL6B", "UQCRFS1"],
    MYT1=["MYT1", "LZTS2", "CHRM4", "GPR176", "TSC22D1"],
    RNF41=["MYT1", "INSYN1", "RNF41"],
    RAB40B=[
        "ZFP36L1",
        "PIK3R3",
        "NRBP1",
        "INSYN1",
        "HOXC8",
        "RAB40C",
        "RAB40B",
    ],
    YAP1=[
        "YAP1",
        "WWTR1",
        "VGLL4",
        "PRKCE",
        "STK3",
        "CEP72",
        "IL20RB",
        "MTMR9",
    ],
)

crispr_genesets = dict(
    ECH1=["SARS2", "ECH1", "PVR", "LAIR1", "SLC1A5", "UQCRFS1"],
    MYT1=["MYT1", "CHRM4", "GPR176", "TSC22D1"],
    RAB40B=["PIK3R3", "ZFP36L1", "HOXC8", "NRBP1", "RAB40B"],
)

In [3]:
# Cosine-similarity percentile tables, one per perturbation modality.
# NOTE(review): later cells select a gene as a column and rely on the row
# index being named "gene_1" after reset_index() -- confirm against the
# parquet schema.
orf_similarity_percentile_df = pd.read_parquet(
    "cos_sim/cosine_similarities_percentile_genes_orf.parquet"
)
crispr_similarity_percentile_df = pd.read_parquet(
    "cos_sim/cosine_similarities_percentile_genes_crispr.parquet"
)

In [4]:
def _load_knowledge_graph(path):
    """Load a tab-separated gene-pair score table as a symmetric lookup.

    Reads the pair identifiers and the three score columns
    (``gene_mf__go``, ``gene_bp__go``, ``gene_pathway``) from ``path``,
    then:

    1. renames the identifiers to ``gene_1`` / ``gene_2``;
    2. appends a mirrored copy of every row so each pair can be looked up
       in either orientation;
    3. collapses the three scores into ``gene_kg``, the row-wise maximum
       of their absolute values;
    4. drops self-pairs (``gene_1 == gene_2``) and the individual score
       columns.

    Parameters
    ----------
    path : str or path-like
        Location of the score table; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_1``, ``gene_2`` and ``gene_kg``.

    Notes
    -----
    Mirroring does not de-duplicate: if the input already lists a pair in
    both orientations, that pair appears twice per orientation in the
    result.
    """
    score_columns = ["gene_mf", "gene_bp", "gene_pathway"]

    pairs = pd.read_csv(
        path,
        sep="\t",
        usecols=["GENE1", "GENE2", "gene_mf__go", "gene_bp__go", "gene_pathway"],
    ).rename(
        columns={
            "GENE1": "gene_1",
            "GENE2": "gene_2",
            "gene_mf__go": "gene_mf",
            "gene_bp__go": "gene_bp",
        }
    )

    # Swapping the identifier column names yields the (B, A) row for every
    # (A, B) row; concat aligns on column name, so the layout is unchanged.
    mirrored = pairs.rename(columns={"gene_1": "gene_2", "gene_2": "gene_1"})
    symmetric = pd.concat([pairs, mirrored], ignore_index=True)

    return (
        symmetric.assign(gene_kg=lambda x: x[score_columns].abs().max(axis=1))
        .query("gene_1 != gene_2")
        .drop(columns=score_columns)
    )


knowledge_graph_orf = _load_knowledge_graph("~/Downloads/orf_scores_merged.zip")

knowledge_graph_crispr = _load_knowledge_graph("~/Downloads/crispr_scores_merged.zip")

In [5]:
# Extend each ORF seed gene set with the most similar genes that are also
# strongly linked to the query gene in the ORF knowledge graph.
KG_SCORE_MIN = 0.7  # keep candidates whose gene_kg is strictly above this
N_EXTRA_GENES = 2  # number of new genes appended to each seed set

for gene, seed_genes in orf_genesets.items():
    df = (
        # reset_index() already returns a new frame, so no explicit copy is
        # needed. Assumes the index is named "gene_1" -- TODO confirm.
        orf_similarity_percentile_df[gene]
        .reset_index()
        .rename(columns={gene: "similarity"})
        .assign(gene_2=gene)
        # Rows without a knowledge-graph match get gene_kg = NaN and are
        # removed by the threshold filter below.
        .merge(knowledge_graph_orf, on=["gene_1", "gene_2"], how="left")
        .loc[
            lambda d: (d["gene_kg"] > KG_SCORE_MIN)
            & ~d["gene_1"].isin(seed_genes)
        ]
        .sort_values(by="similarity", ascending=False)
        # The knowledge graph may list the same pair more than once; keep a
        # single row per candidate so one gene cannot be appended twice.
        .drop_duplicates(subset="gene_1")
    )

    geneset = seed_genes + df.head(N_EXTRA_GENES).gene_1.to_list()
    print(gene, geneset)

ECH1 ['SARS2', 'ECH1', 'DGUOK', 'MRPS2', 'LDHAL6B', 'UQCRFS1', 'ACADVL', 'HMGCL']
MYT1 ['MYT1', 'LZTS2', 'CHRM4', 'GPR176', 'TSC22D1', 'NEUROD1', 'MSC']
RNF41 ['MYT1', 'INSYN1', 'RNF41', 'PAK4', 'IKZF1']
RAB40B ['ZFP36L1', 'PIK3R3', 'NRBP1', 'INSYN1', 'HOXC8', 'RAB40C', 'RAB40B', 'HTR1D', 'TUBA3D']
YAP1 ['YAP1', 'WWTR1', 'VGLL4', 'PRKCE', 'STK3', 'CEP72', 'IL20RB', 'MTMR9', 'TXNIP', 'LDLRAD4']


In [6]:
# Extend each CRISPR seed gene set with the most similar genes that are also
# strongly linked to the query gene in the CRISPR knowledge graph.
KG_SCORE_MIN = 0.7  # keep candidates whose gene_kg is strictly above this
N_EXTRA_GENES = 2  # number of new genes appended to each seed set

for gene, seed_genes in crispr_genesets.items():
    df = (
        # reset_index() already returns a new frame, so no explicit copy is
        # needed. Assumes the index is named "gene_1" -- TODO confirm.
        crispr_similarity_percentile_df[gene]
        .reset_index()
        .rename(columns={gene: "similarity"})
        .assign(gene_2=gene)
        # Rows without a knowledge-graph match get gene_kg = NaN and are
        # removed by the threshold filter below.
        .merge(knowledge_graph_crispr, on=["gene_1", "gene_2"], how="left")
        .loc[
            lambda d: (d["gene_kg"] > KG_SCORE_MIN)
            & ~d["gene_1"].isin(seed_genes)
        ]
        .sort_values(by="similarity", ascending=False)
        # The knowledge graph may list the same pair more than once; keep a
        # single row per candidate so one gene cannot be appended twice.
        .drop_duplicates(subset="gene_1")
    )

    geneset = seed_genes + df.head(N_EXTRA_GENES).gene_1.to_list()
    print(gene, geneset)

ECH1 ['SARS2', 'ECH1', 'PVR', 'LAIR1', 'SLC1A5', 'UQCRFS1', 'CYP3A4', 'F2']
MYT1 ['MYT1', 'CHRM4', 'GPR176', 'TSC22D1', 'GHRHR', 'SCX']
RAB40B ['PIK3R3', 'ZFP36L1', 'HOXC8', 'NRBP1', 'RAB40B', 'F10', 'HTR1B']
