In [28]:
import pandas as pd

In [29]:
# Old metadata: keep one row per distinct Metadata_Symbol.
old_orf_df = pd.read_csv("../datasets/metadata/orf.csv.gz").drop_duplicates(
    subset=["Metadata_Symbol"]
)
old_crispr_df = pd.read_csv("../datasets/metadata/crispr.csv.gz").drop_duplicates(
    subset=["Metadata_Symbol"]
)

# New metadata: rows without an approved symbol are removed BEFORE
# de-duplicating. With the opposite order, a symbol whose first row has a
# missing Metadata_Approved_Symbol would be discarded entirely, even when a
# later row for the same symbol carries a valid approved symbol.
#
# NOTE(review): these frames are unique on Metadata_Symbol, but the later
# cells join them on Metadata_Approved_Symbol. If several symbols share one
# approved symbol, those joins can emit more than one row per gene and
# inflate the "New" counts -- confirm whether de-duplicating on
# Metadata_Approved_Symbol is the intended unit of comparison.
new_orf_df = (
    pd.read_csv("output/orf.csv.gz")
    .dropna(subset=["Metadata_Approved_Symbol"])
    .drop_duplicates(subset=["Metadata_Symbol"])
)
new_crispr_df = (
    pd.read_csv("output/crispr.csv.gz")
    .dropna(subset=["Metadata_Approved_Symbol"])
    .drop_duplicates(subset=["Metadata_Symbol"])
)

In [30]:
# Both annotation tables are tab-separated; read_table defaults to sep="\t".
human_protein_atlas_go_annotations = pd.read_table(
    "output/human_protein_atlas_go_annotations.tsv.gz"
)
corum_df = pd.read_table("output/CORUM_proteincomplex_annotations.tsv")

Overlap between ORFs and CRISPRs, and annotation coverage, in the old and new metadata

In [31]:
# Old metadata: symbols present in both the ORF and CRISPR tables.
# Columns that exist on both sides are disambiguated with _orf / _crispr.
old_orf_crispr = pd.merge(
    left=old_orf_df,
    right=old_crispr_df,
    how="inner",
    on="Metadata_Symbol",
    suffixes=("_orf", "_crispr"),
)
old_orf_crispr.shape

(5252, 14)

In [32]:
# New metadata: the same overlap, but matched on the approved symbol
# instead of the original Metadata_Symbol.
new_orf_crispr = pd.merge(
    left=new_orf_df,
    right=new_crispr_df,
    how="inner",
    on="Metadata_Approved_Symbol",
    suffixes=("_orf", "_crispr"),
)
new_orf_crispr.shape

(5342, 16)

In [33]:
# Old ORF rows whose Metadata_Symbol matches a CORUM "Symbol" entry.
old_orf_corum = pd.merge(
    left=old_orf_df,
    right=corum_df,
    how="inner",
    left_on="Metadata_Symbol",
    right_on="Symbol",
)
old_orf_corum.shape

(3003, 14)

In [34]:
# New ORF rows whose approved symbol matches a CORUM "Symbol" entry.
new_orf_corum = pd.merge(
    left=new_orf_df,
    right=corum_df,
    how="inner",
    left_on="Metadata_Approved_Symbol",
    right_on="Symbol",
)
new_orf_corum.shape

(3007, 15)

In [35]:
# Old CRISPR rows whose Metadata_Symbol matches a CORUM "Symbol" entry.
old_crispr_corum = pd.merge(
    left=old_crispr_df,
    right=corum_df,
    how="inner",
    left_on="Metadata_Symbol",
    right_on="Symbol",
)
old_crispr_corum.shape

(2208, 5)

In [36]:
# New CRISPR rows whose approved symbol matches a CORUM "Symbol" entry.
new_crispr_corum = pd.merge(
    left=new_crispr_df,
    right=corum_df,
    how="inner",
    left_on="Metadata_Approved_Symbol",
    right_on="Symbol",
)
new_crispr_corum.shape

(2242, 6)

In [37]:
# Old ORF rows whose Metadata_Symbol matches a "Gene" entry in the
# Human Protein Atlas annotation table.
old_orf_human_protein_atlas = pd.merge(
    left=old_orf_df,
    right=human_protein_atlas_go_annotations,
    how="inner",
    left_on="Metadata_Symbol",
    right_on="Gene",
)
old_orf_human_protein_atlas.shape

(4409, 15)

In [38]:
# New ORF rows whose approved symbol matches a "Gene" entry in the
# Human Protein Atlas annotation table.
new_orf_human_protein_atlas = pd.merge(
    left=new_orf_df,
    right=human_protein_atlas_go_annotations,
    how="inner",
    left_on="Metadata_Approved_Symbol",
    right_on="Gene",
)
new_orf_human_protein_atlas.shape

(4416, 16)

In [39]:
# Old CRISPR rows whose Metadata_Symbol matches a "Gene" entry in the
# Human Protein Atlas annotation table.
old_crispr_human_protein_atlas = pd.merge(
    left=old_crispr_df,
    right=human_protein_atlas_go_annotations,
    how="inner",
    left_on="Metadata_Symbol",
    right_on="Gene",
)
old_crispr_human_protein_atlas.shape

(4337, 6)

In [40]:
# New CRISPR rows whose approved symbol matches a "Gene" entry in the
# Human Protein Atlas annotation table.
new_crispr_human_protein_atlas = pd.merge(
    left=new_crispr_df,
    right=human_protein_atlas_go_annotations,
    how="inner",
    left_on="Metadata_Approved_Symbol",
    right_on="Gene",
)
new_crispr_human_protein_atlas.shape

(4405, 7)

In [41]:
# Side-by-side row counts of every join, old metadata vs. new metadata.
# The three sequences below are positionally aligned: entry i of each one
# describes the same comparison.
comparison_labels = [
    "ORF CRISPR Overlap",
    "ORF CORUM labels",
    "CRISPR CORUM labels",
    "ORF Protein class annotations",
    "CRISPR Protein class annotations",
]
old_joins = (
    old_orf_crispr,
    old_orf_corum,
    old_crispr_corum,
    old_orf_human_protein_atlas,
    old_crispr_human_protein_atlas,
)
new_joins = (
    new_orf_crispr,
    new_orf_corum,
    new_crispr_corum,
    new_orf_human_protein_atlas,
    new_crispr_human_protein_atlas,
)

summary_table = pd.DataFrame(
    {
        "Old": [len(joined) for joined in old_joins],
        "New": [len(joined) for joined in new_joins],
    },
    index=comparison_labels,
)

# Printed (rather than displayed) so the output is raw markdown text.
print(summary_table.to_markdown())

|                                  |   Old |   New |
|:---------------------------------|------:|------:|
| ORF CRISPR Overlap               |  5252 |  5342 |
| ORF CORUM labels                 |  3003 |  3007 |
| CRISPR CORUM labels              |  2208 |  2242 |
| ORF Protein class annotations    |  4409 |  4416 |
| CRISPR Protein class annotations |  4337 |  4405 |
