In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the processed HGNC approved-symbol table (tab-separated); every column
# is read with the pandas "string" dtype.
hgnc_df = pd.read_csv(
    "output/hgnc_approved_symbol_processed.tsv",
    dtype="string",
    sep='\t',
)
# Rename the approved-symbol column to the `Metadata_`-prefixed name that the
# merges further down use as their right-hand key.
hgnc_df = hgnc_df.rename(columns={"Approved_symbol": "Metadata_Approved_Symbol"})
hgnc_df.head()

Unnamed: 0,Metadata_Approved_Symbol,Previous_symbols,Gene_group_ID,NCBI_Gene_ID
0,A1BG,,594.0,1.0
1,A1BG-AS1,NCRNA00181|A1BGAS|A1BG-AS,1987.0,503538.0
2,A1CF,,725.0,29974.0
3,A1S9T,,,
4,A2M,,2148.0,2.0


In [3]:
# Lookup tables derived from the HGNC file

# Unique approved symbols (a Series named "Metadata_Approved_Symbol").
approved_symbol_df = hgnc_df["Metadata_Approved_Symbol"].drop_duplicates()

# One row per (approved symbol, previous symbol) pair: genes without previous
# symbols are dropped, then the "|"-delimited field is split and exploded.
previous_symbols_df = (
    hgnc_df.dropna(subset=["Previous_symbols"])
    .loc[:, ["Metadata_Approved_Symbol", "Previous_symbols"]]
    .assign(Previous_symbols=lambda df: df["Previous_symbols"].str.split("|"))
    .explode("Previous_symbols")
)

In [4]:
# Load the processed NCBI table (tab-separated); every column is read with the
# pandas "string" dtype.
ncbi_df = pd.read_csv(
    "output/ncbi_dataset_processed.tsv",
    dtype="string",
    sep='\t',
)
# Use the same `Metadata_Approved_Symbol` column name as the HGNC table.
ncbi_df = ncbi_df.rename(columns={"Approved_Symbol": "Metadata_Approved_Symbol"})

### Add approved symbols to ORF gene names

In [5]:
# Load the ORF metadata from the gzip-compressed CSV (column dtypes are left
# for pandas to infer).
orf_metadata_df = pd.read_csv("../datasets/metadata/orf.csv.gz")
# Preview the first rows.
orf_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,870.0,trt
1,JCP2022_900003,ccsbBroad304_00002,ORF005388.1_TRC304.1,pLX_304,NM_001088.3,AANAT,15,9606,aralkylamine N-acetyltransferase,100.0,621.0,trt
2,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,1500.0,trt
3,JCP2022_900005,ccsbBroad304_00007,ORF004679.1_TRC304.1,pLX_304,NM_000018.4,ACADVL,37,9606,acyl-CoA dehydrogenase very long chain,100.0,1965.0,trt
4,JCP2022_900006,ccsbBroad304_00008,ORF000425.1_TRC304.1,pLX_304,NM_001095.4,ASIC1,41,9606,acid sensing ion channel subunit 1,100.0,1584.0,trt


In [6]:
# (rows, columns) of the ORF metadata as loaded, before any rows are split off.
orf_metadata_df.shape

(15142, 12)

Separate negcons since they won't have Approved Symbols

In [7]:
# Rows whose perturbation type is anything other than 'trt' are set aside as
# negative controls and given an empty approved symbol.
negcons_df = orf_metadata_df.loc[
    orf_metadata_df["Metadata_pert_type"] != "trt"
].assign(Metadata_Approved_Symbol="")
negcons_df.shape

(5, 13)

In [8]:
# Keep only the treatment ('trt') rows for the symbol-matching steps.
orf_metadata_df = orf_metadata_df.loc[orf_metadata_df["Metadata_pert_type"] == "trt"]
orf_metadata_df.shape

(15137, 12)

Separate the `BAD CONSTRUCT` sample, which is not assigned an approved symbol

In [9]:
# Set aside rows whose broad sample is literally 'BAD CONSTRUCT'; they get an
# empty approved symbol.
bad_construct_df = orf_metadata_df.loc[
    orf_metadata_df["Metadata_broad_sample"] == "BAD CONSTRUCT"
].assign(Metadata_Approved_Symbol="")
bad_construct_df.shape

(1, 13)

In [10]:
# Remove the 'BAD CONSTRUCT' rows from the working table.
orf_metadata_df = orf_metadata_df.loc[
    orf_metadata_df["Metadata_broad_sample"] != "BAD CONSTRUCT"
]
orf_metadata_df.shape

(15136, 12)

Separate samples with `nan` in `Metadata_Symbol`

In [11]:
# Rows with no `Metadata_Symbol` cannot be matched by symbol; set them aside
# with an empty approved symbol.
nan_df = orf_metadata_df.loc[orf_metadata_df["Metadata_Symbol"].isna()].assign(
    Metadata_Approved_Symbol=""
)
nan_df.shape

(34, 13)

In [12]:
# Continue only with rows that have a `Metadata_Symbol`.
orf_metadata_df = orf_metadata_df.loc[orf_metadata_df["Metadata_Symbol"].notna()]
orf_metadata_df.shape

(15102, 12)

Identify those genes whose Metadata_Symbol is the same as Approved Symbol

In [13]:
# Left-join the ORF rows onto the approved symbols by exact symbol match.
# `indicator=True` adds a `_merge` column ('both' / 'left_only') that the next
# cells use to split matched from unmatched rows.
orf_merged_approved_symbol_df = pd.merge(
    orf_metadata_df,
    approved_symbol_df,
    how="left",
    left_on="Metadata_Symbol",
    right_on="Metadata_Approved_Symbol",
    indicator=True,
)

In [14]:
# Rows with no approved-symbol match become the new working table; the merge
# bookkeeping columns are removed so the original columns are restored.
orf_metadata_df = orf_merged_approved_symbol_df.loc[
    orf_merged_approved_symbol_df["_merge"] == "left_only"
].drop(columns=["_merge", "Metadata_Approved_Symbol"])

orf_metadata_df.shape

(464, 12)

In [15]:
# Keep the rows that matched an approved symbol and drop the indicator column.
orf_merged_approved_symbol_df = orf_merged_approved_symbol_df.loc[
    orf_merged_approved_symbol_df["_merge"] == "both"
].drop(columns="_merge")

orf_merged_approved_symbol_df.shape

(14638, 13)

For genes whose `Metadata_Symbol` did not match an `Approved_Symbol`, check whether it matches one of the `Previous_symbols`

In [16]:
# Left-join the still-unmatched ORF rows onto the exploded previous-symbol
# table; `_merge` records whether a previous symbol was found.
orf_merged_previous_symbols_df = pd.merge(
    orf_metadata_df,
    previous_symbols_df,
    how="left",
    left_on="Metadata_Symbol",
    right_on="Previous_symbols",
    indicator=True,
)

In [17]:
# Rows that matched neither an approved nor a previous symbol carry on as the
# working table, stripped of the columns added by the merge.
orf_metadata_df = orf_merged_previous_symbols_df.loc[
    orf_merged_previous_symbols_df["_merge"] == "left_only"
].drop(columns=["_merge", "Previous_symbols", "Metadata_Approved_Symbol"])

orf_metadata_df.shape

(222, 12)

In [18]:
# Keep the rows resolved through a previous symbol; only the approved symbol
# from the lookup table is retained.
orf_merged_previous_symbols_df = orf_merged_previous_symbols_df.loc[
    orf_merged_previous_symbols_df["_merge"] == "both"
].drop(columns=["_merge", "Previous_symbols"])

orf_merged_previous_symbols_df.shape

(242, 13)

For the remaining, check if `NCBI_Gene_ID` can be used to find the `Approved_Symbol`

In [19]:
# Last resort: left-join the remaining rows onto the NCBI table by gene ID.
orf_merged_ncbi_df = pd.merge(
    orf_metadata_df,
    ncbi_df,
    how="left",
    left_on="Metadata_NCBI_Gene_ID",
    right_on="NCBI_Gene_ID",
    indicator=True,
)

In [20]:
# Rows that no lookup could resolve: remove the merge columns, then re-add
# `Metadata_Approved_Symbol` as an empty string so the frame has the same
# columns as the matched frames.
orf_metadata_df = (
    orf_merged_ncbi_df.loc[orf_merged_ncbi_df["_merge"] == "left_only"]
    .drop(columns=["_merge", "NCBI_Gene_ID", "Metadata_Approved_Symbol"])
    .assign(Metadata_Approved_Symbol="")
)

orf_metadata_df.shape

(179, 13)

In [21]:
# Keep the rows resolved through the NCBI gene ID and drop the merge columns.
orf_merged_ncbi_df = orf_merged_ncbi_df.loc[
    orf_merged_ncbi_df["_merge"] == "both"
].drop(columns=["_merge", "NCBI_Gene_ID"])

orf_merged_ncbi_df.shape

(43, 13)

Most genes for which no `Approved_Symbol` could be found (it is left empty) are those with `LOC` in their names. The only ones that don't have `LOC` in their names are the following:

In [22]:
# Unresolved rows whose symbol does not contain "LOC".
orf_metadata_df[~orf_metadata_df["Metadata_Symbol"].str.contains("LOC")]

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Approved_Symbol
5,JCP2022_909414,ccsbBroad304_10113,ORF018302.1_TRC304.1,pLX_304,NM_207422.2,FLJ44635,392490,9606,TPT1-like protein,99.2,420.0,trt,
127,JCP2022_911087,ccsbBroad304_11893,ORF004795.1_TRC304.1,pLX_304,BC035371.1,HSPC047,29060,9606,,,480.0,trt,
129,JCP2022_911256,ccsbBroad304_12072,ORF003338.1_TRC304.1,pLX_304,BC104430.1,DKFZp566H0824,54744,9606,,,360.0,trt,
130,JCP2022_911397,ccsbBroad304_12219,ORF007065.1_TRC304.1,pLX_304,BC019830.1,PRO2012,55478,9606,,,192.0,trt,
133,JCP2022_911981,ccsbBroad304_12852,ORF007953.1_TRC304.1,pLX_304,BC004943.1,MGC10814,84757,9606,,,114.0,trt,
134,JCP2022_911985,ccsbBroad304_12856,ORF015904.1_TRC304.1,pLX_304,BC005072.1,MGC13008,84772,9606,,,171.0,trt,
157,JCP2022_912692,ccsbBroad304_13617,ORF009637.1_TRC304.1,pLX_304,BC032242.1,MGC40069,348035,9606,,,384.0,trt,
167,JCP2022_912757,ccsbBroad304_13688,ORF010332.1_TRC304.1,pLX_304,BC132807.1,FLJ41170,440200,9606,,,573.0,trt,


Combining all the dataframes

In [23]:
# Stack every partition (unresolved, bad construct, negative controls, missing
# symbol, and the three matched groups) back into one table ordered by JCP ID.
orf_df = pd.concat(
    [
        orf_metadata_df,
        bad_construct_df,
        negcons_df,
        nan_df,
        orf_merged_approved_symbol_df,
        orf_merged_previous_symbols_df,
        orf_merged_ncbi_df,
    ],
    ignore_index=True,
)
orf_df = orf_df.sort_values("Metadata_JCP2022").reset_index(drop=True)

orf_df.shape

(15142, 13)

Write to file

In [24]:
# Save the annotated ORF metadata as a gzip-compressed CSV without the index.
orf_df.to_csv("output/orf.csv.gz", index=False, compression="gzip")

### Add approved symbols to CRISPR gene names

In [25]:
# Load the CRISPR metadata and normalise `Metadata_NCBI_Gene_ID` to plain
# strings in a single chain (no attribute-style assignment, no in-place
# mutation of the whole frame):
#   * "Int64" is the nullable integer dtype, so missing IDs survive the cast;
#   * `astype(str)` renders a missing ID as "<NA>";
#   * that placeholder is then blanked, in this column only, so rows without a
#     gene ID can be selected with `Metadata_NCBI_Gene_ID == ''`.
crispr_metadata_df = pd.read_csv("../datasets/metadata/crispr.csv.gz").assign(
    Metadata_NCBI_Gene_ID=lambda df: (
        df["Metadata_NCBI_Gene_ID"]
        .astype("Int64")
        .astype(str)
        .replace("<NA>", "")
    )
)
crispr_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_NCBI_Gene_ID,Metadata_Symbol
0,JCP2022_800001,,no-guide
1,JCP2022_800002,,non-targeting
2,JCP2022_800003,2.0,A2M
3,JCP2022_800004,127550.0,A3GALT2
4,JCP2022_800005,53947.0,A4GALT


In [26]:
# (rows, columns) of the CRISPR metadata before any rows are split off.
crispr_metadata_df.shape

(7977, 3)

Separate negcons since they won't have Approved Symbols

In [27]:
# Rows with an empty NCBI gene ID are set aside as negative controls and given
# an empty approved symbol.
negcons_df = crispr_metadata_df.loc[
    crispr_metadata_df["Metadata_NCBI_Gene_ID"] == ""
].assign(Metadata_Approved_Symbol="")
negcons_df.shape

(2, 4)

In [28]:
# Continue only with rows that carry an NCBI gene ID.
crispr_metadata_df = crispr_metadata_df.loc[
    crispr_metadata_df["Metadata_NCBI_Gene_ID"] != ""
]
crispr_metadata_df.shape

(7975, 3)

Identify those genes whose Metadata_Symbol is the same as Approved Symbol

In [29]:
# Left-join the CRISPR rows onto the approved symbols by exact symbol match;
# `_merge` marks each row as 'both' (matched) or 'left_only' (unmatched).
crispr_merged_approved_symbol_df = pd.merge(
    crispr_metadata_df,
    approved_symbol_df,
    how="left",
    left_on="Metadata_Symbol",
    right_on="Metadata_Approved_Symbol",
    indicator=True,
)

In [30]:
# Rows with no approved-symbol match become the new working table, with the
# merge bookkeeping columns removed.
crispr_metadata_df = crispr_merged_approved_symbol_df.loc[
    crispr_merged_approved_symbol_df["_merge"] == "left_only"
].drop(columns=["_merge", "Metadata_Approved_Symbol"])

crispr_metadata_df.shape

(156, 3)

In [31]:
# Keep the rows that matched an approved symbol and drop the indicator column.
crispr_merged_approved_symbol_df = crispr_merged_approved_symbol_df.loc[
    crispr_merged_approved_symbol_df["_merge"] == "both"
].drop(columns="_merge")

crispr_merged_approved_symbol_df.shape

(7819, 4)

For genes whose `Metadata_Symbol` did not match an `Approved_Symbol`, check whether it matches one of the `Previous_symbols`

In [32]:
# Left-join the still-unmatched CRISPR rows onto the exploded previous-symbol
# table; `_merge` records whether a previous symbol was found.
crispr_merged_previous_symbols_df = pd.merge(
    crispr_metadata_df,
    previous_symbols_df,
    how="left",
    left_on="Metadata_Symbol",
    right_on="Previous_symbols",
    indicator=True,
)

In [33]:
# Rows that matched neither an approved nor a previous symbol carry on as the
# working table, stripped of the columns added by the merge.
crispr_metadata_df = crispr_merged_previous_symbols_df.loc[
    crispr_merged_previous_symbols_df["_merge"] == "left_only"
].drop(columns=["_merge", "Previous_symbols", "Metadata_Approved_Symbol"])

crispr_metadata_df.shape

(5, 3)

In [34]:
# Keep the rows resolved through a previous symbol; only the approved symbol
# from the lookup table is retained.
crispr_merged_previous_symbols_df = crispr_merged_previous_symbols_df.loc[
    crispr_merged_previous_symbols_df["_merge"] == "both"
].drop(columns=["_merge", "Previous_symbols"])

crispr_merged_previous_symbols_df.shape

(151, 4)

For the remaining, check if `NCBI_Gene_ID` can be used to find the `Approved_Symbol`

In [35]:
# Last resort: left-join the remaining rows onto the NCBI table by gene ID.
crispr_merged_ncbi_df = pd.merge(
    crispr_metadata_df,
    ncbi_df,
    how="left",
    left_on="Metadata_NCBI_Gene_ID",
    right_on="NCBI_Gene_ID",
    indicator=True,
)

In [36]:
# Rows that no lookup could resolve: remove the merge columns, then re-add
# `Metadata_Approved_Symbol` as an empty string.
crispr_metadata_df = (
    crispr_merged_ncbi_df.loc[crispr_merged_ncbi_df["_merge"] == "left_only"]
    .drop(columns=["_merge", "NCBI_Gene_ID", "Metadata_Approved_Symbol"])
    .assign(Metadata_Approved_Symbol="")
)

crispr_metadata_df.shape

(0, 4)

In [37]:
# Keep the rows resolved through the NCBI gene ID and drop the merge columns.
crispr_merged_ncbi_df = crispr_merged_ncbi_df.loc[
    crispr_merged_ncbi_df["_merge"] == "both"
].drop(columns=["_merge", "NCBI_Gene_ID"])

crispr_merged_ncbi_df.shape

(5, 4)

Combine all the dataframes

In [38]:
# Stack the negative controls and the three matched groups back into one table
# ordered by JCP ID.
crispr_df = pd.concat(
    [
        negcons_df,
        crispr_merged_approved_symbol_df,
        crispr_merged_previous_symbols_df,
        crispr_merged_ncbi_df,
    ],
    ignore_index=True,
)
crispr_df = crispr_df.sort_values("Metadata_JCP2022").reset_index(drop=True)

crispr_df.shape

(7977, 4)

Write to file

In [39]:
# Save the annotated CRISPR metadata as a gzip-compressed CSV without the index.
crispr_df.to_csv("output/crispr.csv.gz", index=False, compression="gzip")