In [1]:
import json

import numpy as np
import pandas as pd

In [2]:
def set_term(column1,column2):
    """Return the first non-missing value, preferring ``column1`` over ``column2``.

    Missingness is decided by ``pd.isnull``. ``np.nan`` is returned when both
    values are missing.
    """
    for candidate in (column1, column2):
        if not pd.isnull(candidate):
            return candidate
    return np.nan

In [3]:
def separate_id(column,idx):
    """Remove and return the first entry of ``column`` that contains ``idx``.

    ``column`` is a list and is mutated in place: the matched entry is removed.
    For ``idx`` equal to 'UniProtKB' or 'NCBIGene' only the text after the last
    ':' is returned; for any other ``idx`` the whole entry is returned.
    Returns ``np.nan`` (and leaves the list untouched) when nothing matches.
    """
    match = next((entry for entry in column if idx in entry), None)
    if match is None:
        return np.nan
    column.remove(match)
    if idx in ('UniProtKB', 'NCBIGene'):
        return match.split(':')[-1]
    return match

In [4]:
# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Gene Ontology

In [5]:
# Columns read from the GO export; every other CSV column is skipped.
important_columns_go = [
    'Class ID',
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'database_cross_reference',
    'has_obo_namespace',
]

In [6]:
# Load the selected GO columns, reading every column with dtype=str.
go_df = pd.read_csv(
    'data/Genetic/GO.csv',
    usecols=important_columns_go,
    dtype=str,
)
go_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,database_cross_reference,has_obo_namespace
0,http://purl.obolibrary.org/obo/GO_0015219,obsolete protein-DNA complex transmembrane tra...,DNA-protein complex transmembrane transporter ...,true,,molecular_function
1,http://purl.obolibrary.org/obo/GO_1902084,fumagillin metabolic process,fumagillin metabolism,false,,biological_process
2,http://purl.obolibrary.org/obo/GO_0106291,superoxide-generating NADH oxidase activity.,,false,RHEA:63184,molecular_function
3,http://purl.obolibrary.org/obo/GO_1903896,positive regulation of IRE1-mediated unfolded ...,upregulation of UPR signaling by IRE1 stress s...,false,,biological_process
4,http://purl.obolibrary.org/obo/GO_0051203,peptidyl-aspartic acid reduction to form L-asp...,peptidyl-aspartic acid reduction to form L-asp...,false,RESID:AA0373,biological_process
...,...,...,...,...,...,...
50600,http://purl.obolibrary.org/obo/GO_0019508,"2,5-dihydroxypyridine catabolic process to fum...","2,5-dihydroxypyridine breakdown to fumarate|py...",false,MetaCyc:PWY-722,biological_process
50601,http://purl.obolibrary.org/obo/GO_0034378,chylomicron assembly,,false,,biological_process
50602,http://purl.obolibrary.org/obo/GO_0070362,GO_0070362,,true,,
50603,http://purl.obolibrary.org/obo/GO_0050239,pyrithiamine deaminase activity,1-(4-amino-2-methylpyrimid-5-ylmethyl)-3-(beta...,false,RHEA:14537|MetaCyc:PYRITHIAMIN-DEAMINASE-RXN|E...,molecular_function


In [7]:
# Keep only the GO terms whose 'Obsolete' flag is the string 'false', then drop the flag.
# Chaining .drop() returns a fresh frame rather than mutating a filtered slice in place.
go_df = go_df.loc[go_df['Obsolete'] == 'false'].drop(columns='Obsolete')
print(go_df.shape)

(43987, 5)


In [8]:
def _go_number(class_iri):
    """Return the text after the last '_' of the last '/'-segment; NaN for non-strings."""
    if not isinstance(class_iri, str):
        return np.nan
    last_segment = class_iri.split('/')[-1]
    return last_segment.split('_')[-1]


# e.g. 'http://purl.obolibrary.org/obo/GO_0015219' -> '0015219'
go_df['GO_id'] = go_df['Class ID'].apply(_go_number)
go_df.drop(columns='Class ID', inplace=True)

In [9]:
# Give the GO label and cross-reference columns the names used for the PR frame.
go_column_names = {
    'Preferred Label': 'Term',
    'database_cross_reference': 'cross_reference',
}
go_df.rename(columns=go_column_names, inplace=True)

In [10]:
# Split the '|'-delimited synonym string into a list; non-string values become NaN.
go_df['Synonyms'] = go_df['Synonyms'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else np.nan
)

In [11]:
# Split the '|'-delimited cross-reference string into a list; non-string values become NaN.
go_df['cross_reference'] = go_df['cross_reference'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else np.nan
)

In [12]:
# Human-readable namespace, e.g. 'biological_process' -> 'biological process'.
go_df['type'] = go_df['has_obo_namespace'].apply(
    lambda namespace: namespace.replace('_', ' ') if isinstance(namespace, str) else np.nan
)

In [13]:
# The raw namespace column is no longer needed once 'type' has been derived from it.
go_df.drop(columns=['has_obo_namespace'], inplace=True)

In [14]:
# Display the GO frame after the cleaning cells above.
go_df

Unnamed: 0,Term,Synonyms,cross_reference,GO_id,type
1,fumagillin metabolic process,[fumagillin metabolism],,1902084,biological process
2,superoxide-generating NADH oxidase activity.,,[RHEA:63184],0106291,molecular function
3,positive regulation of IRE1-mediated unfolded ...,[upregulation of UPR signaling by IRE1 stress ...,,1903896,biological process
4,peptidyl-aspartic acid reduction to form L-asp...,[peptidyl-aspartic acid reduction to form L-as...,[RESID:AA0373],0051203,biological process
6,negative regulation of planar cell polarity pa...,,,2000149,biological process
...,...,...,...,...,...
50597,negative regulation of ribosomal large subunit...,[negative regulation of ribosomal large subuni...,,2000204,biological process
50600,"2,5-dihydroxypyridine catabolic process to fum...","[2,5-dihydroxypyridine breakdown to fumarate, ...",[MetaCyc:PWY-722],0019508,biological process
50601,chylomicron assembly,,,0034378,biological process
50603,pyrithiamine deaminase activity,[1-(4-amino-2-methylpyrimid-5-ylmethyl)-3-(bet...,"[RHEA:14537, MetaCyc:PYRITHIAMIN-DEAMINASE-RXN...",0050239,molecular function


# Ontology of Genes and Genomes

In [15]:
# Columns read from the OGG export; every other CSV column is skipped.
important_columns_ogg = [
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'database_cross_reference',
    'full name from nomenclature authority',
    'type of gene',
    'NCBI GeneID',
    'organism NCBITaxon ID',
]

In [16]:
# Load the selected OGG columns, reading every column with dtype=str.
ogg_df = pd.read_csv(
    'data/Genetic/OGG.csv',
    usecols=important_columns_ogg,
    dtype=str,
)
ogg_df

Unnamed: 0,Preferred Label,Synonyms,Obsolete,database_cross_reference,full name from nomenclature authority,NCBI GeneID,organism NCBITaxon ID,type of gene
0,NDUFB3P5,,false,,NADH dehydrogenase (ubiquinone) 1 beta subcomp...,93997,9606,pseudo
1,RPL34P13,RPL34_3_616,false,HGNC:36398,ribosomal protein L34 pseudogene 13,100270997,9606,pseudo
2,EFNB3,EPLG8|EFL6|LERK8,false,HGNC:3228|Ensembl:ENSG00000108947|Vega:OTTHUMG...,ephrin-B3,1949,9606,protein-coding
3,MIR532,MIRN532|hsa-mir-532,false,HGNC:32795|miRBase:MI0003205,microRNA 532,693124,9606,ncRNA
4,GTSF1,FAM112B,false,HPRD:08150|Ensembl:ENSG00000170627|Vega:OTTHUM...,gametocyte specific factor 1,121355,9606,protein-coding
...,...,...,...,...,...,...,...,...
69683,mdlB,mdl|JW5061|ECK0443,false,EcoGene:EG14374,,945088,511145,protein-coding
69684,RVBD_0838,,false,,,13318740,83332,protein-coding
69685,SNORD62A,U62A|RNU62|U62,false,HGNC:10219,"small nucleolar RNA, C/D box 62A",26786,9606,snoRNA
69686,LOC101928500,,false,,,101928500,9606,ncRNA


In [17]:
# Keep only the OGG rows whose 'Obsolete' flag is the string 'false', then drop the flag.
# Chaining .drop() returns a fresh frame rather than mutating a filtered slice in place.
ogg_df = ogg_df.loc[ogg_df['Obsolete'] == 'false'].drop(columns='Obsolete')
print(ogg_df.shape)

(69688, 7)


In [18]:
# Display the OGG frame after the obsolete rows were filtered out and the flag column dropped.
ogg_df

Unnamed: 0,Preferred Label,Synonyms,database_cross_reference,full name from nomenclature authority,NCBI GeneID,organism NCBITaxon ID,type of gene
0,NDUFB3P5,,,NADH dehydrogenase (ubiquinone) 1 beta subcomp...,93997,9606,pseudo
1,RPL34P13,RPL34_3_616,HGNC:36398,ribosomal protein L34 pseudogene 13,100270997,9606,pseudo
2,EFNB3,EPLG8|EFL6|LERK8,HGNC:3228|Ensembl:ENSG00000108947|Vega:OTTHUMG...,ephrin-B3,1949,9606,protein-coding
3,MIR532,MIRN532|hsa-mir-532,HGNC:32795|miRBase:MI0003205,microRNA 532,693124,9606,ncRNA
4,GTSF1,FAM112B,HPRD:08150|Ensembl:ENSG00000170627|Vega:OTTHUM...,gametocyte specific factor 1,121355,9606,protein-coding
...,...,...,...,...,...,...,...
69683,mdlB,mdl|JW5061|ECK0443,EcoGene:EG14374,,945088,511145,protein-coding
69684,RVBD_0838,,,,13318740,83332,protein-coding
69685,SNORD62A,U62A|RNU62|U62,HGNC:10219,"small nucleolar RNA, C/D box 62A",26786,9606,snoRNA
69686,LOC101928500,,,,101928500,9606,ncRNA


# Protein Ontology

In [19]:
# Columns read from the PR export; every other CSV column is skipped.
important_columns_pr = [
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'database_cross_reference',
    'http://www.geneontology.org/formats/oboInOwl#id',
    'only_in_taxon',
    'http://www.w3.org/2000/01/rdf-schema#comment',
]

In [20]:
# Load the selected PR columns, reading every column with dtype=str.
pr_df = pd.read_csv(
    'data/Genetic/PR.csv',
    usecols=important_columns_pr,
    dtype=str,
)
pr_df

Unnamed: 0,Preferred Label,Synonyms,Obsolete,database_cross_reference,http://www.geneontology.org/formats/oboInOwl#id,http://www.w3.org/2000/01/rdf-schema#comment,only_in_taxon
0,solute carrier organic anion transporter famil...,hSLCO3A1/iso:h3,false,UniProtKB:Q9UIG8-3,PR:Q9UIG8-3,Category=organism-sequence.,http://purl.obolibrary.org/obo/NCBITaxon_9606
1,trbG (Escherichia coli K-12),,false,,NCBIGene:1263580,Category=external.,http://purl.obolibrary.org/obo/NCBITaxon_83333
2,interleukin enhancer-binding factor 2 isoform ...,mILF2/iso:1,false,UniProtKB:Q9CXY6-1,PR:Q9CXY6-1,Category=organism-sequence.,http://purl.obolibrary.org/obo/NCBITaxon_10090
3,gag-pol polyprotein (Human immunodeficiency vi...,Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34)|gag-pol (H...,false,UniProtKB:P12499,PR:P12499,Category=organism-gene.,http://purl.obolibrary.org/obo/NCBITaxon_11683
4,serine/threonine-protein kinase MRCK alpha iso...,hCDC42BPA/iso:h6,false,UniProtKB:Q5VT25-6,PR:Q5VT25-6,Category=organism-sequence.,http://purl.obolibrary.org/obo/NCBITaxon_9606
...,...,...,...,...,...,...,...
331975,diamine acetyltransferase 1 (chicken),SSAT-1 (chicken)|putrescine acetyltransferase ...,false,UniProtKB:Q8AXL1,PR:Q8AXL1,Category=organism-gene.,http://purl.obolibrary.org/obo/NCBITaxon_9031
331976,synaptophysin-like protein 1 isoform m2 (mouse),mSYPL1/iso:m2,false,UniProtKB:O09117-2,PR:O09117-2,Category=organism-sequence.,http://purl.obolibrary.org/obo/NCBITaxon_10090
331977,zinc finger protein 3 homolog,ZFP3|zfp-3|ZNF752,false,,PR:000017637,Category=gene.,
331978,RKD2 (Arabidopsis thaliana),,false,,Araport:AT1G74480,Category=external.,http://purl.obolibrary.org/obo/NCBITaxon_3702


In [21]:
# Keep only the PR rows whose 'Obsolete' flag is the string 'false', then drop the flag.
# Chaining .drop() returns a fresh frame rather than mutating a filtered slice in place.
pr_df = pr_df.loc[pr_df['Obsolete'] == 'false'].drop(columns='Obsolete')
print(pr_df.shape)

(326674, 6)


In [22]:
# Short working names for the PR columns (the two IRI-named columns in particular).
renamed_pr = {
    'Preferred Label': 'Term',
    'database_cross_reference': 'cross_reference',
    'http://www.geneontology.org/formats/oboInOwl#id': 'id',
    'only_in_taxon': 'taxon_id',
    'http://www.w3.org/2000/01/rdf-schema#comment': 'type_of_ent',
}
pr_df = pr_df.rename(columns=renamed_pr)

In [23]:
def _taxon_number(taxon_iri):
    """Return the text after the last '_' of the last '/'-segment; NaN for non-strings."""
    if not isinstance(taxon_iri, str):
        return np.nan
    last_segment = taxon_iri.split('/')[-1]
    return last_segment.split('_')[-1]


# e.g. 'http://purl.obolibrary.org/obo/NCBITaxon_9606' -> '9606'
pr_df['taxon_id'] = pr_df['taxon_id'].apply(_taxon_number)

In [24]:
def _text_after_category(comment):
    """Return everything after the first 'Category=' marker in ``comment``.

    Returns NaN for non-strings and for strings without the marker. The marker
    check matters: ``str.find`` returns -1 on a miss, and slicing from
    ``-1 + len('Category=')`` would silently cut off the first 8 characters.
    """
    if not isinstance(comment, str):
        return np.nan
    _, marker, tail = comment.partition('Category=')
    return tail if marker else np.nan


# e.g. 'Category=organism-gene.' -> 'organism-gene.'
pr_df['type_of_ent'] = pr_df['type_of_ent'].apply(_text_after_category)

In [25]:
# Keep only the text before the first '.', e.g. 'organism-gene.' -> 'organism-gene'.
pr_df['type_of_ent'] = pr_df['type_of_ent'].apply(
    lambda text: text.split('.')[0] if isinstance(text, str) else np.nan
)

In [26]:
# Replace hyphens with spaces, e.g. 'organism-gene' -> 'organism gene'.
pr_df['type_of_ent'] = pr_df['type_of_ent'].apply(
    lambda text: text.replace('-', ' ') if isinstance(text, str) else np.nan
)

In [27]:
# Split the '|'-delimited cross-reference string into a list; non-string values become NaN.
pr_df['cross_reference'] = pr_df['cross_reference'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else np.nan
)

In [28]:
# Pull the UniProtKB accession out of each cross-reference list.
# separate_id removes the matched entry from the list in place, so a row whose only
# cross-reference was the UniProtKB one is left with an empty list.
# Applying over the single column avoids building a Series for every row (axis=1).
pr_df['UniprotID'] = pr_df['cross_reference'].apply(
    lambda refs: separate_id(refs, 'UniProtKB') if isinstance(refs, list) else np.nan
)

In [29]:
# Ids of the form 'NCBIGene:<n>' yield '<n>'; every other id yields NaN.
# The isinstance guard stops a missing id (NaN) from raising AttributeError on .startswith.
pr_df['NCBI_id'] = pr_df['id'].apply(
    lambda x: x.split(':')[-1] if isinstance(x, str) and x.startswith('NCBIGene') else np.nan
)

In [30]:
# Non-null count per column, to see how many rows received a UniProt / NCBI id.
pr_df.count()

Term               326674
Synonyms           224830
cross_reference    178542
id                 326674
type_of_ent        324708
taxon_id           287437
UniprotID          170646
NCBI_id              5380
dtype: int64

In [31]:
# Spot-check ten random rows (no random_state is set, so the rows differ on every run).
pr_df.sample(10)

Unnamed: 0,Term,Synonyms,cross_reference,id,type_of_ent,taxon_id,UniprotID,NCBI_id
297340,peroxisome proliferator-activated receptor gam...,rPPARG/iso:r1,[],PR:O88275-2,organism sequence,10116,O88275-2,
33368,protein TFG isoform h1 (human),hTFG/iso:h1,[],PR:Q92734-1,organism sequence,9606,Q92734-1,
34245,60S ribosomal protein L15-1 (Arabidopsis thali...,At-RPL15A|RPL15A|At4g16720|FCAALL.416|dl4385c,[],PR:O23515,organism gene,3702,O23515,
203467,LOC560371 (zebrafish),,,NCBIGene:560371,external,7955,,560371.0
9422,"polyadenylate-binding protein, cytoplasmic and...",Spom972h-pab1(SPAC57A7.04c)|polyadenylate tail...,[],PR:P31209,organism gene,284812,P31209,
74331,serine/threonine-protein kinase PBL13 (Arabido...,At-PBL13|ser/Thr protein kinase ACIK1b (Arabid...,[],PR:F4JZW1,organism gene,3702,F4JZW1,
299480,protein arginine N-methyltransferase SFM1 (yeast),ySFM1|SPOUT family methyltransferase 1 (yeast)...,[],PR:Q12314,organism gene,559292,Q12314,
59233,carbohydrate sulfotransferase 1 (mouse),keratan sulfate Gal-6 sulfotransferase (mouse)...,[],PR:Q9EQC0,organism gene,10090,Q9EQC0,
260124,ogm1 (Schizosaccharomyces pombe),,,PomBase:SPAC22A12.07c,external,4896,,
26541,QWRF motif-containing protein 6 isoform 1 (Ara...,At-QWRF6/iso:1,[],PR:Q5BPM6-1,organism sequence,3702,Q5BPM6-1,


In [32]:
# Use the OGG frame's column name so both gene-id columns line up when the frames are concatenated.
ncbi_column_name = {'NCBI_id': 'NCBI GeneID'}
pr_df.rename(columns=ncbi_column_name, inplace=True)

In [33]:
# Display the PR frame after the cleaning cells above.
pr_df

Unnamed: 0,Term,Synonyms,cross_reference,id,type_of_ent,taxon_id,UniprotID,NCBI GeneID
0,solute carrier organic anion transporter famil...,hSLCO3A1/iso:h3,[],PR:Q9UIG8-3,organism sequence,9606,Q9UIG8-3,
1,trbG (Escherichia coli K-12),,,NCBIGene:1263580,external,83333,,1263580
2,interleukin enhancer-binding factor 2 isoform ...,mILF2/iso:1,[],PR:Q9CXY6-1,organism sequence,10090,Q9CXY6-1,
3,gag-pol polyprotein (Human immunodeficiency vi...,Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34)|gag-pol (H...,[],PR:P12499,organism gene,11683,P12499,
4,serine/threonine-protein kinase MRCK alpha iso...,hCDC42BPA/iso:h6,[],PR:Q5VT25-6,organism sequence,9606,Q5VT25-6,
...,...,...,...,...,...,...,...,...
331975,diamine acetyltransferase 1 (chicken),SSAT-1 (chicken)|putrescine acetyltransferase ...,[],PR:Q8AXL1,organism gene,9031,Q8AXL1,
331976,synaptophysin-like protein 1 isoform m2 (mouse),mSYPL1/iso:m2,[],PR:O09117-2,organism sequence,10090,O09117-2,
331977,zinc finger protein 3 homolog,ZFP3|zfp-3|ZNF752,,PR:000017637,gene,,,
331978,RKD2 (Arabidopsis thaliana),,,Araport:AT1G74480,external,3702,,


# Merge Both Ontologies

In [34]:
# Stack the PR rows above the OGG rows; a column that exists in only one frame is NaN for the other.
# Each frame keeps its own index labels, so labels repeat in the result.
protein_and_gene_frames = [pr_df, ogg_df]
df1 = pd.concat(protein_and_gene_frames)
df1

Unnamed: 0,Term,Synonyms,cross_reference,id,type_of_ent,taxon_id,UniprotID,NCBI GeneID,Preferred Label,database_cross_reference,full name from nomenclature authority,organism NCBITaxon ID,type of gene
0,solute carrier organic anion transporter famil...,hSLCO3A1/iso:h3,[],PR:Q9UIG8-3,organism sequence,9606,Q9UIG8-3,,,,,,
1,trbG (Escherichia coli K-12),,,NCBIGene:1263580,external,83333,,1263580,,,,,
2,interleukin enhancer-binding factor 2 isoform ...,mILF2/iso:1,[],PR:Q9CXY6-1,organism sequence,10090,Q9CXY6-1,,,,,,
3,gag-pol polyprotein (Human immunodeficiency vi...,Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34)|gag-pol (H...,[],PR:P12499,organism gene,11683,P12499,,,,,,
4,serine/threonine-protein kinase MRCK alpha iso...,hCDC42BPA/iso:h6,[],PR:Q5VT25-6,organism sequence,9606,Q5VT25-6,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69683,,mdl|JW5061|ECK0443,,,,,,945088,mdlB,EcoGene:EG14374,,511145,protein-coding
69684,,,,,,,,13318740,RVBD_0838,,,83332,protein-coding
69685,,U62A|RNU62|U62,,,,,,26786,SNORD62A,HGNC:10219,"small nucleolar RNA, C/D box 62A",9606,snoRNA
69686,,,,,,,,101928500,LOC101928500,,,9606,ncRNA


In [35]:
# Drop repeated NCBI gene ids, keeping the last occurrence (OGG rows come after PR rows
# in the concatenation), while retaining every row that has no gene id at all.
# .copy() detaches the result so the column assignments in later cells act on an independent frame.
has_no_gene_id = df1['NCBI GeneID'].isnull()
is_repeated_gene_id = df1.duplicated(subset='NCBI GeneID', keep='last')
df1 = df1[~is_repeated_gene_id | has_no_gene_id].copy()

In [36]:
# Display the merged PR/OGG frame after removing repeated gene ids.
df1

Unnamed: 0,Term,Synonyms,cross_reference,id,type_of_ent,taxon_id,UniprotID,NCBI GeneID,Preferred Label,database_cross_reference,full name from nomenclature authority,organism NCBITaxon ID,type of gene
0,solute carrier organic anion transporter famil...,hSLCO3A1/iso:h3,[],PR:Q9UIG8-3,organism sequence,9606,Q9UIG8-3,,,,,,
1,trbG (Escherichia coli K-12),,,NCBIGene:1263580,external,83333,,1263580,,,,,
2,interleukin enhancer-binding factor 2 isoform ...,mILF2/iso:1,[],PR:Q9CXY6-1,organism sequence,10090,Q9CXY6-1,,,,,,
3,gag-pol polyprotein (Human immunodeficiency vi...,Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34)|gag-pol (H...,[],PR:P12499,organism gene,11683,P12499,,,,,,
4,serine/threonine-protein kinase MRCK alpha iso...,hCDC42BPA/iso:h6,[],PR:Q5VT25-6,organism sequence,9606,Q5VT25-6,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69683,,mdl|JW5061|ECK0443,,,,,,945088,mdlB,EcoGene:EG14374,,511145,protein-coding
69684,,,,,,,,13318740,RVBD_0838,,,83332,protein-coding
69685,,U62A|RNU62|U62,,,,,,26786,SNORD62A,HGNC:10219,"small nucleolar RNA, C/D box 62A",9606,snoRNA
69686,,,,,,,,101928500,LOC101928500,,,9606,ncRNA


In [37]:
# Prefer the OGG 'Preferred Label'; fall back to the PR 'Term' when it is missing.
df1['Term'] = [
    set_term(preferred_label, term)
    for preferred_label, term in zip(df1['Preferred Label'], df1['Term'])
]
df1.drop(columns='Preferred Label', inplace=True)

In [38]:
# Keep the existing synonyms; when missing, fall back to the full name from the nomenclature authority.
df1['Synonyms'] = [
    set_term(synonyms, full_name)
    for synonyms, full_name in zip(df1['Synonyms'], df1['full name from nomenclature authority'])
]
df1.drop(columns='full name from nomenclature authority', inplace=True)

In [39]:
def _empty_list_to_nan(refs):
    """Return NaN when ``refs`` equals the empty list; otherwise return ``refs`` unchanged."""
    return np.nan if refs == [] else refs


# NOTE(review): a later cell turns every non-list value back into [], which appears to
# undo this step - confirm whether this cell is still needed.
df1['cross_reference'] = df1['cross_reference'].apply(_empty_list_to_nan)

In [40]:
# One taxon column for both sources: the PR 'taxon_id' first, else the OGG 'organism NCBITaxon ID'.
df1['NCBItaxon_id'] = [
    set_term(pr_taxon, ogg_taxon)
    for pr_taxon, ogg_taxon in zip(df1['taxon_id'], df1['organism NCBITaxon ID'])
]
df1.drop(columns=['organism NCBITaxon ID', 'taxon_id'], inplace=True)

In [41]:
# Make every cell a list (non-list values become []) so the columns can be concatenated with '+'.
df1['cross_reference'] = df1['cross_reference'].apply(
    lambda refs: refs if isinstance(refs, list) else []
)

In [42]:
# Split the '|'-delimited OGG cross-references into a list; non-string values become [].
df1['database_cross_reference'] = df1['database_cross_reference'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else []
)

In [43]:
# Turn the PR id string into a list (split on '|'); non-string values become [].
df1['id'] = df1['id'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else []
)

In [44]:
# Element-wise list concatenation: PR cross-references, then OGG cross-references, then the PR id.
pr_and_ogg_refs = df1['cross_reference'] + df1['database_cross_reference']
df1['cross_reference'] = pr_and_ogg_refs + df1['id']

In [45]:
# Both columns are now folded into 'cross_reference'.
df1.drop(columns=['database_cross_reference', 'id'], inplace=True)

In [46]:
# Spot-check ten random rows (no random_state is set, so the rows differ on every run).
df1.sample(10)

Unnamed: 0,Term,Synonyms,cross_reference,type_of_ent,UniprotID,NCBI GeneID,type of gene,NCBItaxon_id
100307,centlein isoform m4 (mouse),mCNTLN/iso:m4,[PR:A2AM05-4],organism sequence,A2AM05-4,,,10090.0
46186,inhibitor of growth protein 5 isoform h2 (human),hING5/iso:h2,[PR:Q8WYH8-2],organism sequence,Q8WYH8-2,,,9606.0
235358,rpsI (Shewanella oneidensis MR-1),,[NCBIGene:1171577],external,,1171577.0,,211586.0
189722,ptmaa (zebrafish),,[ZFIN:ZDB-GENE-030131-7647],external,,,,7955.0
13984,SNRPGP2,HsT2742,[HGNC:30999],,,100130003.0,pseudo,9606.0
254312,SLC6A8 (human),,[HGNC:11055],external,,,,9606.0
190079,sperm-egg fusion protein TMEM95,TMEM95|transmembrane protein 95|UNQ9390/PRO34281,[PR:000032608],gene,,,,
75627,flagellar protein FliO,fliO|flaP|flbD,[PR:000033309],gene,,,,
250245,DENN domain-containing protein 4B,DENND4B|brain specific protein 4|brain-specifi...,[PR:000031876],gene,,,,
170106,dystonin isoform m5 (mouse),dystonin isoform BPAG1-e (mouse)|mDST/iso:m5,[PR:Q91ZU6-5],organism sequence,Q91ZU6-5,,,10090.0


In [47]:
# Append ' gene' to the OGG gene type, e.g. 'pseudo' -> 'pseudo gene'; non-string values become NaN.
df1['type of gene'] = df1['type of gene'].apply(
    lambda gene_type: f'{gene_type} gene' if isinstance(gene_type, str) else np.nan
)

In [48]:
# One 'type' column for both sources: the PR entity type first, else the OGG gene type.
df1['type'] = [
    set_term(entity_type, gene_type)
    for entity_type, gene_type in zip(df1['type_of_ent'], df1['type of gene'])
]
df1.drop(columns=['type of gene', 'type_of_ent'], inplace=True)

In [49]:
# Split the '|'-delimited synonym string into a list; non-string values become NaN.
df1['Synonyms'] = df1['Synonyms'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else np.nan
)

In [50]:
# Display the merged PR/OGG frame after the column consolidation above.
df1

Unnamed: 0,Term,Synonyms,cross_reference,UniprotID,NCBI GeneID,NCBItaxon_id,type
0,solute carrier organic anion transporter famil...,[hSLCO3A1/iso:h3],[PR:Q9UIG8-3],Q9UIG8-3,,9606,organism sequence
1,trbG (Escherichia coli K-12),,[NCBIGene:1263580],,1263580,83333,external
2,interleukin enhancer-binding factor 2 isoform ...,[mILF2/iso:1],[PR:Q9CXY6-1],Q9CXY6-1,,10090,organism sequence
3,gag-pol polyprotein (Human immunodeficiency vi...,"[Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34), gag-pol ...",[PR:P12499],P12499,,11683,organism gene
4,serine/threonine-protein kinase MRCK alpha iso...,[hCDC42BPA/iso:h6],[PR:Q5VT25-6],Q5VT25-6,,9606,organism sequence
...,...,...,...,...,...,...,...
69683,mdlB,"[mdl, JW5061, ECK0443]",[EcoGene:EG14374],,945088,511145,protein-coding gene
69684,RVBD_0838,,[],,13318740,83332,protein-coding gene
69685,SNORD62A,"[U62A, RNU62, U62]",[HGNC:10219],,26786,9606,snoRNA gene
69686,LOC101928500,,[],,101928500,9606,ncRNA gene


In [51]:
# Non-null count per column of the merged PR/OGG frame.
df1.count()

Term               396027
Synonyms           265007
cross_reference    396028
UniprotID          170646
NCBI GeneID         74587
NCBItaxon_id       356713
type               393915
dtype: int64

## Append Gene Ontology terms

In [52]:
# Add the GO rows below the merged PR/OGG rows; 'GO_id' is NaN for the non-GO rows.
frames_with_go = [df1, go_df]
df1 = pd.concat(frames_with_go)

In [53]:
# Display the frame now that the GO terms have been appended.
df1

Unnamed: 0,Term,Synonyms,cross_reference,UniprotID,NCBI GeneID,NCBItaxon_id,type,GO_id
0,solute carrier organic anion transporter famil...,[hSLCO3A1/iso:h3],[PR:Q9UIG8-3],Q9UIG8-3,,9606,organism sequence,
1,trbG (Escherichia coli K-12),,[NCBIGene:1263580],,1263580,83333,external,
2,interleukin enhancer-binding factor 2 isoform ...,[mILF2/iso:1],[PR:Q9CXY6-1],Q9CXY6-1,,10090,organism sequence,
3,gag-pol polyprotein (Human immunodeficiency vi...,"[Pr160Gag-Pol (HIV-1 M:D_Z2/CDC-Z34), gag-pol ...",[PR:P12499],P12499,,11683,organism gene,
4,serine/threonine-protein kinase MRCK alpha iso...,[hCDC42BPA/iso:h6],[PR:Q5VT25-6],Q5VT25-6,,9606,organism sequence,
...,...,...,...,...,...,...,...,...
50597,negative regulation of ribosomal large subunit...,[negative regulation of ribosomal large subuni...,,,,,biological process,2000204
50600,"2,5-dihydroxypyridine catabolic process to fum...","[2,5-dihydroxypyridine breakdown to fumarate, ...",[MetaCyc:PWY-722],,,,biological process,0019508
50601,chylomicron assembly,,,,,,biological process,0034378
50603,pyrithiamine deaminase activity,[1-(4-amino-2-methylpyrimid-5-ylmethyl)-3-(bet...,"[RHEA:14537, MetaCyc:PYRITHIAMIN-DEAMINASE-RXN...",,,,molecular function,0050239


# CTD Genes

In [54]:
# Load every column of the CTD gene export, reading each with dtype=str.
ctd_df = pd.read_csv(
    'data/Genetic/CTD_genes.csv',
    dtype=str,
)
ctd_df

Unnamed: 0,GeneSymbol,GeneName,GeneID,AltGeneIDs,Synonyms,BioGRIDIDs,PharmGKBIDs,UniProtIDs
0,03B03F,"DNA segment, 03B03F (Research Genetics)",27777,,,,,
1,03B03R,"DNA segment, 03B03R (Research Genetics)",27778,,,,,
2,03.MMHAP34FRA.SEQ,"DNA segment, 03.MMHAP34FRA.seq",53288,,,,,
3,064YA,,5658107,,,,,
4,102G4T7,"DNA segment, 102g4T7",56573,,,,,
...,...,...,...,...,...,...,...,...
544904,ZZEF1.S,"zinc finger, ZZ-type with EF-hand domain 1 S h...",108709661,,XELAEV_18015398mg|zinc finger ZZ-type and EF-h...,,,A0A1L8H8K3
544905,ZZZ-1,G_PROTEIN_RECEP_F1_2 domain-containing protein,185208,,CELE_F32D8.10,,,Q19959
544906,ZZZ3,zinc finger ZZ-type containing 3,26009,100053222|100074855|100227013|100343576|100379...,A306_07098|Anapl_04670|AS27_00889|ATAC1|ATAC c...,117482|224485,PA134873184,A0A096MN54|A0A0D9S6Y1|A0A1A7ZGG4|A0A1D5QUG5|A0...
544907,ZZZ3.L,"zinc finger, ZZ-type containing 3 L homeolog",108714197,,XELAEV_18022920mg|ZZ-type zinc finger-containi...,,,A0A1L8GLP2


In [55]:
# The alternative gene ids are not carried into the final documents.
ctd_df.drop(columns='AltGeneIDs', inplace=True)

In [56]:
# Split the '|'-delimited synonym string into a list; non-string values become [].
ctd_df['Synonyms'] = ctd_df['Synonyms'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else []
)

In [57]:
# Turn the gene name into a list (split on '|') so it can be appended to the synonyms; non-strings become [].
ctd_df['GeneName'] = ctd_df['GeneName'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else []
)

In [58]:
# Treat the gene name as an additional synonym, placed after the existing ones.
ctd_df['Synonyms'] = [
    synonyms + gene_names
    for synonyms, gene_names in zip(ctd_df['Synonyms'], ctd_df['GeneName'])
]

In [59]:
# 'GeneName' has been folded into 'Synonyms'.
ctd_df.drop(columns='GeneName', inplace=True)

In [60]:
def _prefixed_biogrid_ids(raw):
    """Split on '|' and prefix each part with 'BioGRIDIDs:'; [] for non-strings."""
    if not isinstance(raw, str):
        return []
    return [f'BioGRIDIDs:{part}' for part in raw.split('|')]


ctd_df['BioGRIDIDs'] = ctd_df['BioGRIDIDs'].apply(_prefixed_biogrid_ids)

In [61]:
def _prefixed_pharmgkb_ids(raw):
    """Split on '|' and prefix each part with 'PharmGKBIDs:'; [] for non-strings."""
    if not isinstance(raw, str):
        return []
    return [f'PharmGKBIDs:{part}' for part in raw.split('|')]


ctd_df['PharmGKBIDs'] = ctd_df['PharmGKBIDs'].apply(_prefixed_pharmgkb_ids)

In [62]:
# Build one cross-reference list per gene: BioGRID entries first, then PharmGKB entries.
ctd_df['cross_reference'] = [
    biogrid_refs + pharmgkb_refs
    for biogrid_refs, pharmgkb_refs in zip(ctd_df['BioGRIDIDs'], ctd_df['PharmGKBIDs'])
]
ctd_df.drop(columns=['PharmGKBIDs', 'BioGRIDIDs'], inplace=True)

In [63]:
# Use the column names of the merged ontology frame so the CTD rows line up on concatenation.
ctd_column_names = {
    'GeneID': 'NCBI GeneID',
    'UniProtIDs': 'UniprotID',
    'GeneSymbol': 'Term',
}
ctd_df.rename(columns=ctd_column_names, inplace=True)

In [64]:
# Spot-check ten random rows (no random_state is set, so the rows differ on every run).
ctd_df.sample(10)

Unnamed: 0,Term,NCBI GeneID,Synonyms,UniprotID,cross_reference
519191,TSP_08207,10906771,[nuclease domain-containing protein 1],,[]
68191,BM1_36045,6102096,[bZIP transcription factor family protein],A0A0K0JF08|A0A4E9FS26,[]
89348,C5S72_MGT16,35987226,[tRNA],,[]
185698,E2990_MGT21,39412053,[tRNA],,[]
401298,PATZ1,23598,"[A306_02535, and zinc finger-containing protei...",A0A024R1F8|A0A024R1H7|A0A024R1M5|A0A087YJF3|A0...,"[BioGRIDIDs:117133, BioGRIDIDs:207852, PharmGK..."
473982,T23B12.5,188777,"[CELE_T23B12.5, Uncharacterized protein]",O17000,[]
77096,BVC38_GT20,30685962,[tRNA],,[]
527394,TSPAN31.S,495004,"[sarcoma-amplified sequence homolog A, sas, te...",A0A1L8H9X1|Q5XHG6,[]
458394,SRAE_1000184200,36375947,[Basic-leucine zipper domain-containing protein],,[]
519197,TSP_08213,10906777,[absent in melanoma 1 protein],,[]


# Merge all DataFrames

In [65]:
# CTD rows go first, followed by the PR/OGG/GO rows; the order matters for the keep='first' de-duplication in the next cell.
all_gene_frames = [ctd_df, df1]
df_final = pd.concat(all_gene_frames)
df_final

Unnamed: 0,Term,NCBI GeneID,Synonyms,UniprotID,cross_reference,NCBItaxon_id,type,GO_id
0,03B03F,27777,"[DNA segment, 03B03F (Research Genetics)]",,[],,,
1,03B03R,27778,"[DNA segment, 03B03R (Research Genetics)]",,[],,,
2,03.MMHAP34FRA.SEQ,53288,"[DNA segment, 03.MMHAP34FRA.seq]",,[],,,
3,064YA,5658107,[],,[],,,
4,102G4T7,56573,"[DNA segment, 102g4T7]",,[],,,
...,...,...,...,...,...,...,...,...
50597,negative regulation of ribosomal large subunit...,,[negative regulation of ribosomal large subuni...,,,,biological process,2000204
50600,"2,5-dihydroxypyridine catabolic process to fum...",,"[2,5-dihydroxypyridine breakdown to fumarate, ...",,[MetaCyc:PWY-722],,biological process,0019508
50601,chylomicron assembly,,,,,,biological process,0034378
50603,pyrithiamine deaminase activity,,[1-(4-amino-2-methylpyrimid-5-ylmethyl)-3-(bet...,,"[RHEA:14537, MetaCyc:PYRITHIAMIN-DEAMINASE-RXN...",,molecular function,0050239


In [66]:
# A gene id that occurs more than once is kept once: keep='first' retains the CTD row,
# because ctd_df was concatenated first. Rows without a gene id are all kept.
# .copy() detaches the result so the column assignments in later cells act on an independent frame.
has_no_gene_id = df_final['NCBI GeneID'].isnull()
is_repeated_gene_id = df_final.duplicated(subset='NCBI GeneID', keep='first')
df_final = df_final[~is_repeated_gene_id | has_no_gene_id].copy()
print(df_final.shape)

(946584, 8)


In [69]:
# Split the '|'-delimited UniProt accessions into a list; non-string values become NaN.
df_final['UniprotID'] = df_final['UniprotID'].apply(
    lambda raw: raw.split('|') if isinstance(raw, str) else np.nan
)

In [70]:
# Non-null count per column of the final frame.
df_final.count()

Term               946582
NCBI GeneID        581156
Synonyms           803547
UniprotID          430954
cross_reference    911592
NCBItaxon_id       318373
type               399562
GO_id               43987
dtype: int64

In [71]:
# Spot-check ten random rows (no random_state is set, so the rows differ on every run).
df_final.sample(10)

Unnamed: 0,Term,NCBI GeneID,Synonyms,UniprotID,cross_reference,NCBItaxon_id,type,GO_id
423127,RDH8,50700.0,"[AS28_00214, Cadr_000025383, CK820_G0026559, C...","[A0A091DE59, A0A1S2ZXC9, A0A1S3GN20, A0A1U7QT8...","[BioGRIDIDs:119119, PharmGKBIDs:PA34309]",,,
37440,pronephros development,,[pronephric kidney development],,,,biological process,48793.0
405448,PHUM_PHUM131500,8234162.0,[lipopolysaccharide-induced transcription fact...,[E0VEE2],[],,,
234274,H2O20_MGT11,56138382.0,[tRNA],,[],,,
392727,OR12D26,100058631.0,"[olfactory receptor 12D3-like, olfactory recep...",[A0A3Q2HRK2],[],,,
239047,HELRODRAFT_113366,20195420.0,[hypothetical protein],[T1EFR8],[],,,
225075,Vps29 (rat),,,,[RGD:1308332],10116.0,external,
262279,WD repeat-containing protein 35 (rat),,"[rWDR35, naofen (rat), Wdr35]",[A6N6J5],[PR:A6N6J5],10116.0,organism gene,
33009,Fpr-rs7 (mouse),,,,[MGI:2448177],10090.0,external,
182564,submaxillary gland androgen-regulated protein ...,,[mSmr2/iso:alpha],[O09133-1],[PR:O09133-1],10090.0,organism sequence,


# To JSON Genetic document

In [72]:
# Replace every missing value with an empty string before building the documents.
df_final.fillna(value='', inplace=True)

In [73]:
# Represent "no cross-references" as '' instead of an empty list.
df_final['cross_reference'] = df_final['cross_reference'].apply(
    lambda refs: refs if len(refs) != 0 else ''
)

In [74]:
# Spot-check five random rows after the empty-value normalisation (unseeded sample).
df_final.sample(5)

Unnamed: 0,Term,NCBI GeneID,Synonyms,UniprotID,cross_reference,NCBItaxon_id,type,GO_id
226417,G7W86_MGT20,44151287.0,[tRNA],,,,,
360124,MS3_0020292,24588721.0,"[MS3_00934, uncharacterized protein]",[A0A094ZG88],,,,
382233,NECAME_18047,25358072.0,[hypothetical protein],[W2TGD5],,,,
42596,"diablo homolog, mitochondrial isoform 1 (human)",,[hDIABLO/iso:1],[Q9NR28-1],[PR:Q9NR28-1],9606.0,organism sequence,
306439,LOAG_12746,9950212.0,[hypothetical protein],[A0A1I7VC07],,,,


In [75]:
# Output JSON key -> source column, listed in the key order each document should have.
# Columns are looked up by name rather than by tuple position, so the mapping stays
# correct even if the column order of df_final changes.
document_fields = {
    'term': 'Term',
    'synonyms': 'Synonyms',
    'uniprot_id': 'UniprotID',
    'ncbi_gene_id': 'NCBI GeneID',
    'ncbi_taxon_id': 'NCBItaxon_id',
    'type': 'type',
    'cross_reference': 'cross_reference',
    'GO_id': 'GO_id',
}

documents = []
num = 0
total_rows = len(df_final)  # computed once; used only for the progress percentage
# name=None yields plain tuples whose order follows document_fields.
rows = df_final[list(document_fields.values())].itertuples(index=False, name=None)
for num, row in enumerate(rows, start=1):
    documents.append(dict(zip(document_fields, row)))
    if num % 50000 == 0:
        print(num/total_rows*100,'% genes processed')

5.2821513991362625 % genes processed
10.564302798272525 % genes processed
15.846454197408788 % genes processed
21.12860559654505 % genes processed
26.41075699568131 % genes processed
31.692908394817575 % genes processed
36.97505979395384 % genes processed
42.2572111930901 % genes processed
47.53936259222636 % genes processed
52.82151399136262 % genes processed
58.10366539049889 % genes processed
63.38581678963515 % genes processed
68.66796818877141 % genes processed
73.95011958790768 % genes processed
79.23227098704395 % genes processed
84.5144223861802 % genes processed
89.79657378531647 % genes processed
95.07872518445272 % genes processed


In [76]:
# Number of documents built by the loop above.
len(documents)

946584

In [77]:
# Inspect one document; the hard-coded position is only valid while the list has more than 654363 entries.
documents[654363]

{'term': 'putative ATP-dependent RNA helicase DHX57 isoform h3 (human)',
 'synonyms': ['hDHX57/iso:h3'],
 'uniprot_id': ['Q6P158-3'],
 'ncbi_gene_id': '',
 'ncbi_taxon_id': '9606',
 'type': 'organism sequence',
 'cross_reference': ['PR:Q6P158-3'],
 'GO_id': ''}

## Save and load resultant document

In [78]:
# Write all gene documents to disk as a single JSON array.
# Uses the `json` module from the notebook's import cell; the explicit encoding keeps
# the file independent of the platform's default text encoding.
with open('data/Genetic/genetic.json', 'w', encoding='utf-8') as fout:
    json.dump(documents, fout)

In [79]:
# Read the saved file back and print how many documents it holds, as a round-trip check.
with open('data/Genetic/genetic.json', 'r') as fin:
    genetic = json.load(fin)
    print(len(genetic))

946584
