In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated ORF metadata table and preview its first rows.
orf_metadata_df = pd.read_csv("output/orf_metadata.tsv.gz", delimiter="\t")
orf_metadata_df.head(n=5)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Metadata_disease_involvement,Metadata_subcellular_location,Metadata_gene_group_id,Metadata_homologue,Metadata_biological_process,Metadata_cellular_component,Metadata_molecular_function,Metadata_wikipathway,Metadata_dependency_probability,Metadata_TPM
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,,,1133,True,GOBP_CELLULAR_RESPONSE_TO_XENOBIOTIC_STIMULUS|...,,GOMF_ACETYLTRANSFERASE_ACTIVITY|GOMF_ACYLTRANS...,,0.019718,2.028569
1,JCP2022_900003,ccsbBroad304_00002,ORF005388.1_TRC304.1,pLX_304,NM_001088.3,AANAT,15,9606,aralkylamine N-acetyltransferase,100.0,...,Disease variant,Cytosol,1134,True,GOBP_AMIDE_BIOSYNTHETIC_PROCESS|GOBP_AMIDE_MET...,GOCC_PERINUCLEAR_REGION_OF_CYTOPLASM,GOMF_14_3_3_PROTEIN_BINDING|GOMF_ACETYLTRANSFE...,WP_BIOGENIC_AMINE_SYNTHESIS|WP_BIOSYNTHESIS_AN...,0.022395,0.400538
2,JCP2022_900004,ccsbBroad304_00003,ORF003876.1_TRC304.1,pLX_304,NM_000663.4,ABAT,18,9606,4-aminobutyrate aminotransferase,100.0,...,Disease variant|FDA approved drug targets,Mitochondria,,True,GOBP_ACIDIC_AMINO_ACID_TRANSPORT|GOBP_ACID_SEC...,GOCC_CATALYTIC_COMPLEX|GOCC_INTRACELLULAR_PROT...,GOMF_METAL_CLUSTER_BINDING|GOMF_TRANSFERASE_AC...,WP_ALANINE_AND_ASPARTATE_METABOLISM|WP_FRAGILE...,0.003862,3.109361
3,JCP2022_900005,ccsbBroad304_00007,ORF004679.1_TRC304.1,pLX_304,NM_000018.4,ACADVL,37,9606,acyl-CoA dehydrogenase very long chain,100.0,...,Cardiomyopathy|Disease variant,Mitochondria,2258|974,True,GOBP_ALCOHOL_METABOLIC_PROCESS|GOBP_CELLULAR_K...,GOCC_ENVELOPE|GOCC_EXTRINSIC_COMPONENT_OF_MEMB...,GOMF_ACYL_COA_BINDING|GOMF_ADENYL_NUCLEOTIDE_B...,WP_FATTY_ACID_BETA_OXIDATION|WP_MITOCHONDRIAL_...,0.001691,7.98715
4,JCP2022_900006,ccsbBroad304_00008,ORF000425.1_TRC304.1,pLX_304,NM_001095.4,ASIC1,41,9606,acid sensing ion channel subunit 1,100.0,...,,Golgi apparatus|Plasma membrane,290,True,GOBP_ASSOCIATIVE_LEARNING|GOBP_BEHAVIOR|GOBP_B...,GOCC_CELL_SURFACE|GOCC_GOLGI_APPARATUS|GOCC_PR...,GOMF_GATED_CHANNEL_ACTIVITY|GOMF_INORGANIC_MOL...,,0.215086,4.014355


In [3]:
# Build Metadata_Gene_Symbol: take Metadata_Approved_Symbol where it is
# present and fall back to Metadata_Symbol where it is missing.
orf_metadata_df = orf_metadata_df.assign(
    Metadata_Gene_Symbol=lambda frame: np.where(
        frame["Metadata_Approved_Symbol"].isna(),
        frame["Metadata_Symbol"],
        frame["Metadata_Approved_Symbol"],
    )
)

In [4]:
# Count how many metadata rows carry each gene symbol and keep the symbols
# that occur more than once.
#
# The output columns are named explicitly (rename_axis + reset_index(name=...))
# rather than relying on the default labels of value_counts().reset_index():
# those defaults are ("index", <series name>) on pandas 1.x but
# (<series name>, "count") on pandas 2.x, where filtering on
# "Metadata_Gene_Symbol>1" would compare gene-symbol strings with an integer.
#
# NOTE(review): this counts rows, not distinct Metadata_JCP2022 values, so a
# JCP ID that appears on several rows also pushes its gene over the
# threshold -- confirm whether that is intended.
genes_with_multiple_JCP_IDs = (
    orf_metadata_df["Metadata_Gene_Symbol"]
    .value_counts()
    .rename_axis("Metadata_Gene_Symbol")
    .reset_index(name="Metadata_Count")
    .query("Metadata_Count > 1")
)

genes_with_multiple_JCP_IDs

Unnamed: 0,Metadata_Gene_Symbol,Metadata_Count
0,IGHG1,11
1,RSL24D1,8
2,FTL,6
3,NSUN5,6
4,KYAT1,6
...,...,...
2126,AXIN2,2
2127,ATF4,2
2128,RBMS1,2
2129,GALNT6,2


In [5]:
# Map the repeated gene symbols back to their JCP IDs: keep only metadata rows
# whose gene symbol is listed in genes_with_multiple_JCP_IDs (inner join),
# attach the per-gene count, and retain the columns used to compare constructs.
orf_metadata_multiple_JCP_IDs_df = pd.merge(
    orf_metadata_df,
    genes_with_multiple_JCP_IDs,
    how="inner",
    on="Metadata_Gene_Symbol",
).loc[
    :,
    [
        "Metadata_JCP2022",
        "Metadata_Gene_Symbol",
        "Metadata_NCBI_Gene_ID",
        "Metadata_Count",
        "Metadata_Insert_Length",
        "Metadata_Prot_Match",
    ],
]

orf_metadata_multiple_JCP_IDs_df


Unnamed: 0,Metadata_JCP2022,Metadata_Gene_Symbol,Metadata_NCBI_Gene_ID,Metadata_Count,Metadata_Insert_Length,Metadata_Prot_Match
0,JCP2022_900004,ABAT,18,2,1500.0,100.0
1,JCP2022_905380,ABAT,18,2,1500.0,99.8
2,JCP2022_900016,ACVR1B,91,2,1515.0,100.0
3,JCP2022_913561,ACVR1B,91,2,1515.0,100.0
4,JCP2022_900017,ACVRL1,94,2,1509.0,100.0
...,...,...,...,...,...,...
4624,JCP2022_914535,PKN2,5586,2,2808.0,99.8
4625,JCP2022_914597,AK6,102157402,2,516.0,100.0
4626,JCP2022_914597,AK6,102157402,2,516.0,100.0
4627,JCP2022_914599,TBCD,6904,2,2250.0,90.6


In [6]:
# Example: list every row recorded for the gene symbol RSL24D1.
(
    orf_metadata_multiple_JCP_IDs_df
    .query("Metadata_Gene_Symbol=='RSL24D1'")
)

Unnamed: 0,Metadata_JCP2022,Metadata_Gene_Symbol,Metadata_NCBI_Gene_ID,Metadata_Count,Metadata_Insert_Length,Metadata_Prot_Match
3019,JCP2022_907711,RSL24D1,51187,8,489.0,99.3
3020,JCP2022_913196,RSL24D1,51187,8,489.0,96.9
3021,JCP2022_914804,RSL24D1,51187,8,489.0,99.3
3022,JCP2022_914805,RSL24D1,51187,8,450.0,92.0
3023,JCP2022_914806,RSL24D1,51187,8,489.0,100.0
3024,JCP2022_914807,RSL24D1,51187,8,489.0,100.0
3025,JCP2022_914808,RSL24D1,51187,8,489.0,99.3
3026,JCP2022_914809,RSL24D1,51187,8,489.0,98.7


In [7]:
# Rank the RSL24D1 rows: largest insert length first, then highest protein
# match within equal insert lengths.
(
    orf_metadata_multiple_JCP_IDs_df
    .query("Metadata_Gene_Symbol=='RSL24D1'")
    .sort_values(
        ["Metadata_Insert_Length", "Metadata_Prot_Match"],
        ascending=[False, False],
    )
)

Unnamed: 0,Metadata_JCP2022,Metadata_Gene_Symbol,Metadata_NCBI_Gene_ID,Metadata_Count,Metadata_Insert_Length,Metadata_Prot_Match
3023,JCP2022_914806,RSL24D1,51187,8,489.0,100.0
3024,JCP2022_914807,RSL24D1,51187,8,489.0,100.0
3019,JCP2022_907711,RSL24D1,51187,8,489.0,99.3
3021,JCP2022_914804,RSL24D1,51187,8,489.0,99.3
3025,JCP2022_914808,RSL24D1,51187,8,489.0,99.3
3026,JCP2022_914809,RSL24D1,51187,8,489.0,98.7
3020,JCP2022_913196,RSL24D1,51187,8,489.0,96.9
3022,JCP2022_914805,RSL24D1,51187,8,450.0,92.0


In [8]:
# Keep a single RSL24D1 row: after sorting by insert length and protein match
# (both descending), drop_duplicates retains the first row per gene symbol.
(
    orf_metadata_multiple_JCP_IDs_df
    .query("Metadata_Gene_Symbol=='RSL24D1'")
    .sort_values(
        ["Metadata_Insert_Length", "Metadata_Prot_Match"],
        ascending=[False, False],
    )
    .drop_duplicates(subset=["Metadata_Gene_Symbol"], keep="first")
)

Unnamed: 0,Metadata_JCP2022,Metadata_Gene_Symbol,Metadata_NCBI_Gene_ID,Metadata_Count,Metadata_Insert_Length,Metadata_Prot_Match
3023,JCP2022_914806,RSL24D1,51187,8,489.0,100.0


In [9]:
# Same ranking for HERC3: largest insert length first, then highest protein
# match.
# NOTE(review): in the saved output both HERC3 rows carry the same
# Metadata_JCP2022 value -- check whether the metadata contains repeated rows.
(
    orf_metadata_multiple_JCP_IDs_df
    .query("Metadata_Gene_Symbol=='HERC3'")
    .sort_values(
        ["Metadata_Insert_Length", "Metadata_Prot_Match"],
        ascending=[False, False],
    )
)

Unnamed: 0,Metadata_JCP2022,Metadata_Gene_Symbol,Metadata_NCBI_Gene_ID,Metadata_Count,Metadata_Insert_Length,Metadata_Prot_Match
3878,JCP2022_910556,HERC3,8916,2,1104.0,100.0
3879,JCP2022_910556,HERC3,8916,2,1104.0,100.0
