In [1]:
import pandas as pd


## take the Genes from Task2 gene list and check number of invalid gene symbols

In [4]:
# download the most up-to-date hgnc symbols and aliases
## last download on 06/27/2024
!bash ./utils/download_genenames.sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: www.genenames.org


In [5]:
from collections import defaultdict
import csv
class GeneValidator:
    def __init__(self, file_path):
        self.gene_symbol_set = set()
        self.alias_map = defaultdict(str)

        with open(file_path, 'r') as file:
            reader = csv.reader(file, delimiter='\t')
            next(reader)  # Skip header row
            for cells in reader:
                if cells[0]:
                    gene = cells[0].upper()
                    self.gene_symbol_set.add(gene)

                    if len(cells) > 1 and cells[1]:
                        previous_symbols = cells[1].split(", ")
                        for symbol in previous_symbols:
                            self.alias_map[symbol.upper()] = gene

                    if len(cells) > 2 and cells[2]:
                        alias_symbols = cells[2].split(", ")
                        for alias in alias_symbols:
                            self.alias_map[alias.upper()] = gene

    def validate_human_genes(self, genes):
        official_genes = set()
        invalid_genes = set()
        updated_genes = {}

        for raw_term in genes:
            # print(f'validate Hugo symbol for {raw_term}')
            term = raw_term.upper()
            if term in self.gene_symbol_set:
                official_genes.add(term)
            elif term in self.alias_map:
                official_gene = self.alias_map[term]
                official_genes.add(official_gene)
                updated_genes[term] = official_gene
                invalid_genes.add(term)
            else:
                invalid_genes.add(term)

        return {
            'official_genes': official_genes,
            'invalid': invalid_genes,
            'updated_genes': updated_genes
        }


In [6]:
df = pd.read_csv('data/omics_revamped_LLM_DF.tsv', sep='\t')
def get_genes(row):
    return row['GeneList'].split(' ')

total_genes = df.apply(get_genes, axis=1)
total_genes_list = [item for sublist in total_genes for item in sublist]
total_genes_set = set(total_genes_list)
len(total_genes_set)

6260

In [7]:

# Usage example:
file_path = "./hgnc_genes.tsv"

validator = GeneValidator(file_path)
result = validator.validate_human_genes(total_genes_set)
# print(result)

updated_gene_symbols = list(result['official_genes'])
invalid_gene_symbols = list(result['invalid'])
updated_genes_mapping = result['updated_genes']
# print("Updated gene symbols:", updated_gene_symbols)
print("number of invalid genes: ", len(invalid_gene_symbols))
print("Invalid gene symbols:", invalid_gene_symbols)
# print("Updated genes mapping:", updated_genes_mapping)



number of invalid genes:  384
Invalid gene symbols: ['C14ORF2', 'LOC147004', 'C19ORF52', 'MUM1', 'C2ORF43', 'GNB2L1', 'PQLC3', 'C10ORF10', 'ALS2CR11', 'C9ORF89', 'C8ORF37', 'H2BFS', 'LOC100505650', 'LDOC1L', 'GRAMD3', 'HIST1H2AM', 'LRRC48', 'WDR78', 'C1ORF194', 'LOC400622', 'CASC4', 'LARS', 'TMEM56', 'MAATS1', 'C1ORF213', 'RPS6P6', 'C22ORF24', 'C2ORF70', 'LOC730183', 'LHFP', 'GPR126', 'HIST1H2AH', 'LOC283701', 'HIAT1', 'SEPP1', 'WBP5', 'LOC642852', 'C3ORF58', 'H2AFV', 'KIAA1161', 'KIAA1377', 'TSSC1', 'CENPBD1', 'LOC100288860', 'SLC9A3R1', 'LOC283745', 'C16ORF52', 'PKI55', 'CRAMP1L', 'FAM129C', 'ACRC', 'RFWD2', 'LOC643355', 'WRB', 'C1ORF168', 'PAPD5', 'DNAJC3-AS1', 'ZNF720', 'KIAA0355', 'KIAA1033', 'WARS', 'NUDT16P1', 'ANKRD20A4', 'LOC389831', 'FAM183A', 'ZFYVE20', 'FAM57A', 'LOC349160', 'LOC727916', 'AQPEP', 'FAM63A', 'FAM196B', 'KIAA1244', 'C9ORF16', 'HIST1H2BD', 'DSCR3', 'KIAA1715', 'ZCCHC11', 'C11ORF63', 'FAM71F1', 'CASC10', 'PCDHB18', 'RSG1', 'CCDC94', 'GBA', 'HIST1H4B', 'H3F3C', '

In [8]:
updated_genes_mapping

{'HIAT1': 'MFSD14A',
 'PAPD5': 'TENT4B',
 'CASC10': 'MIR1915HG',
 'H2AFZ': 'H2AZ1',
 'TCEB3': 'ELOA',
 'C20ORF27': 'ADISSP',
 'KIAA0125': 'FAM30A',
 'MARCH3': 'MARCHF3',
 'FAM64A': 'PIMREG',
 'FAM86C1': 'FAM86C1P',
 'KIAA0226L': 'RUBCNL',
 'SEPW1': 'SELENOW',
 'NUPL1': 'NUP58',
 'GRASP': 'TAMALIN',
 'GPR64': 'ADGRG2',
 'HIST1H4C': 'H4C3',
 'VNN3': 'VNN3P',
 'C7ORF55': 'FMC1',
 'C11ORF95': 'ZFTA',
 'C14ORF79': 'CLBA1',
 'GNB2L1': 'RACK1',
 'LARS': 'LARS1',
 'RFWD2': 'COP1',
 'C1ORF194': 'CFAP276',
 'C19ORF45': 'SAXO5',
 'C17ORF97': 'LIAT1',
 'C7ORF49': 'CYREN',
 'C11ORF73': 'HIKESHI',
 'HIST2H2BF': 'H2BC18',
 'FAM198B': 'GASK1B',
 'C19ORF43': 'TRIR',
 'C9ORF53': 'CDKN2A-DT',
 'C11ORF63': 'JHY',
 'KIAA1211L': 'CRACDL',
 'FAM166A': 'CIMIP2A',
 'CCDC189': 'CFAP119',
 'ATP5E': 'ATP5F1E',
 'WDR11-AS1': 'WDR11-DT',
 'C19ORF52': 'TIMM29',
 'HIST1H2AM': 'H2AC17',
 'TSSC1': 'EIPR1',
 'FAM63A': 'MINDY1',
 'HIST1H2AI': 'H2AC13',
 'ATP5J2': 'ATP5MF',
 'PPAP2B': 'PLPP3',
 'WDR61': 'SKIC8',
 'B3GALTL

In [9]:
def update_gene_list(row):
    genes = row['GeneList'].split(' ')
    updated_genes = []
    updated_status = False
    num_updates = 0 
    for gene in genes:
        if gene in updated_genes_mapping:
            updated_genes.append(updated_genes_mapping[gene])
            updated_status = True
            num_updates += 1
        else:
            updated_genes.append(gene)

    assert len(genes) == len(updated_genes), f"Length of genes: {len(genes)} and updated_genes: {len(updated_genes)} are not the same"
    return ' '.join(updated_genes), updated_status, num_updates

In [10]:
df[['updated GeneList', 'update status', 'num genes updated']] = df.apply(lambda row: pd.Series(update_gene_list(row)), axis=1)
df.loc[df['update status'] == True, :].sort_values(by='num genes updated', ascending=False).head(10)

Unnamed: 0,Source,GeneSetID,GeneSetName,GeneList,n_Genes,LLM Name,LLM Analysis,Score,updated GeneList,update status,num genes updated
292,Viral_Infections,SARS-dORF6_84Hour...137,SARS-dORF6 84Hour...137,HOTTIP NEU1 ARFGAP3 MAPKAPK2 SLC35B4 MIS12 HIN...,99,System of unrelated proteins,The provided list of interacting proteins enco...,0.0,HOTTIP NEU1 ARFGAP3 MAPKAPK2 SLC35B4 MIS12 HIN...,True,12
241,L1000,BRD-A41250203_-666_MCF7_6.0_h_10.0_um,BRD-A41250203 -666 MCF7 6.0 h 10.0 um,1060P11.3 ABCC5 AGT AKR1C2 APOC1 APPBP2 ASNS A...,92,Cellular Stress Response and Apoptosis Regulation,1. The proteins in this system are involved in...,0.92,1060P11.3 ABCC5 AGT AKR1C2 APOC1 APPBP2 ASNS A...,True,11
14,NeST,Cluster1-5,Cluster1-5,ANKRD45 BBOF1 C1orf194 C2orf73 C11orf63 C11orf...,93,Ciliogenesis and Ciliary Function,The proteins listed are predominantly associat...,0.92,ANKRD45 BBOF1 C1orf194 C2orf73 C11orf63 C11orf...,True,8
283,Viral_Infections,SARS-BatSRBD_0Hour...202,SARS-BatSRBD 0Hour...202,BRSK1 TFPI CHST4 SCARA3 TMEM177 KIF2A HNRNPK G...,95,System of unrelated proteins,The provided list of proteins encompasses a wi...,0.0,BRSK1 TFPI CHST4 SCARA3 TMEM177 KIF2A HNRNPK G...,True,8
268,Viral_Infections,A-Vietnam-1203_CIP048_RG4-2004(H5N1)NS1trunc_7...,A-Vietnam-1203 CIP048 RG4-2004(H5N1)NS1trunc 7...,LRRN4 C3ORF20 NELL2 FSTL3 SLC4A9 TAC1 PRB1 CAR...,53,System of unrelated proteins,The provided list of interacting proteins enco...,0.0,LRRN4 C3ORF20 NELL2 FSTL3 SLC4A9 TAC1 PRB1 CAR...,True,8
58,NeST,Cluster2-139,Cluster2-139,ACBD6 ACRV1 ACYP1 C4orf48 CAMTA1 CCDC85B CTRL ...,32,Chromatin Organization and Gene Expression Reg...,"1. The histone proteins HIST1H2AH, HIST1H2AI, ...",0.85,ACBD6 ACRV1 ACYP1 C4orf48 CAMTA1 CCDC85B CTRL ...,True,8
221,L1000,BRD-A35033682_-666_MCF7_6.0_h_10.0_um,BRD-A35033682 -666 MCF7 6.0 h 10.0 um,ABCB6 AK4 AMDHD2 BHLHE41 C1S CD58 CFD CHI3L1 C...,74,System of unrelated proteins,The provided list of proteins encompasses a wi...,0.0,ABCB6 AK4 AMDHD2 BHLHE41 C1S CD58 CFD CHI3L1 C...,True,7
155,L1000,BRD-A11678676_-666_MCF7_6.0_h_10.0_um,BRD-A11678676 -666 MCF7 6.0 h 10.0 um,1060P11.3 AQP3 BASP1 BNIP3L BTG1 C6ORF48 CCNG2...,75,Cellular Stress Response and Apoptosis Regulation,"1. Several proteins in this system, such as BN...",0.85,1060P11.3 AQP3 BASP1 BNIP3L BTG1 SNHG32 CCNG2 ...,True,7
277,Viral_Infections,Enterovirus 71_8Hour,Enterovirus 71 8Hour,NELL2 PDYN OR7A5 GREB1 PCP4L1 LOC100128002 KRT...,95,System of unrelated proteins,The provided list of proteins encompasses a wi...,0.0,NELL2 PDYN OR7A5 GREB1 PCP4L1 LOC100128002 KRT...,True,7
271,Viral_Infections,Dhori Virus_24Hour,Dhori Virus 24Hour,SLC39A5 SVEP1 DPYSL5 KIAA1919 GPR89A FAAH2 HEY...,92,System of unrelated proteins,The provided list of proteins encompasses a wi...,0.0,SLC39A5 SVEP1 DPYSL5 MFSD4B GPR89A FAAH2 HEY2 ...,True,7


In [11]:
df.loc[df['update status'] == True,:].shape

(187, 11)

In [12]:
df.columns

Index(['Source', 'GeneSetID', 'GeneSetName', 'GeneList', 'n_Genes', 'LLM Name',
       'LLM Analysis', 'Score', 'updated GeneList', 'update status',
       'num genes updated'],
      dtype='object')

In [13]:
df[['Source', 'GeneSetID', 'GeneSetName', 'GeneList','updated GeneList', 'n_Genes', 'LLM Name',
       'LLM Analysis', 'Score']].to_csv('data/omics_revamped_LLM_updated_genes_DF.tsv', sep='\t', index=False)