# Iterate through file in order to reclassify GO terms.
Using this tutorial: https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb
    We will iterate through out file

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import isnan
import goatools

# Import data

In [25]:
data = pd.read_excel(r'/Users/hellpark/Desktop/Bioinformatics/model_objects/GO-SLIM/Copy of CurrentGOterms(1).xlsx',
                  skiprows = 2)
df = pd.DataFrame(data, columns=['GO Access','Function',
                                'Gene Num.','Category', 'Genes'])

In [26]:
df[0:3]

Unnamed: 0,GO Access,Function,Gene Num.,Category,Genes
0,GO:0055085,transmembrane transport,125,metabolism,TD01GL000101|TD01GL000142|TD01GL000261|TD01GL0...
1,GO:0006865,amino acid transport,61,metabolism,TD01GL000165|TD01GL000166|TD01GL000168|TD01GL0...
2,GO:0005975,carbohydrate metabolic process,149,metabolism,TD01GL000005|TD01GL000009|TD01GL000010|TD01GL0...


In [147]:
data2 = pd.read_excel(r'/Users/hellpark/Desktop/Bioinformatics/model_objects/GO-SLIM/Copy of Go to Gene(1).xlsx',
                     sheet_name='Sheet3')
newdf = pd.DataFrame(data2)

In [148]:
newdf[0:3]

Unnamed: 0,header
0,GO:0071973
1,GO:0055085
2,GO:0051920


# Import the OBO parser from GOATools



In [33]:
from goatools import obo_parser
import wget
import os

Now, we can download the OBO file into the './data' folder using the following. We are going to download the go-basic.obo version of the ontology, which is guaranteed to be acyclic, which means that annotations can be propagated up the graph.

In [63]:
go_obo_url = 'http://www.geneontology.org/ontology/subsets/goslim_generic.obo'
data_folder = os.getcwd() + '/data'

In [64]:
# Check if we have the ./data directory already
if(not os.path.isfile(data_folder)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

# Check if the file exists already



In [65]:
if(not os.path.isfile(data_folder+'/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder+'/go-basic.obo')
else:
    go_obo = data_folder+'/go-basic.obo'

The path to the GO OBO file is now stored in the variable go_obo.



In [66]:
print(go_obo)

/Users/hellpark/Bioinformatics/data/go-basic.obo


Now we can create a dictionary of the GO terms, using the obo_parser from GOATools.


In [67]:
go = obo_parser.GODag(go_obo)

/Users/hellpark/Bioinformatics/data/go-basic.obo: fmt(1.2) rel(2021-08-18) 47,217 GO Terms


What is the name of our first missing gene, GO:0004177![image.png](attachment:image.png)

In [90]:
go_id = 'GO:0004177'
go_term = go[go_id]
print(go_term)

GO:0004177	level-05	depth-05	aminopeptidase activity [molecular_function]


Now recursively print all of GO:0004177 parent

In [73]:
def transitive_closure(go_term, go):
    go_term_set = set()
    find_parents(go_term, go, go_term_set)
    find_children(go_term, go, go_term_set)
    return go_term_set
    
def find_parents(term1, go, go_term_set={}, ret=False):
    for term2 in term1.parents:
        go_term_set.update({term2})
        
        # Recurse on term to find all parents
        find_parents(term2, go, go_term_set)          
    if(ret):
        return go_term_set

def find_children(term1, go, go_term_set={}, ret=False):
    for term2 in term1.children:
        go_term_set.update({term2})
        
        # Recurse on term to find all children
        find_children(term2, go, go_term_set)
    if(ret):
        return go_term_set

In [149]:
#output file
dfnew = pd.DataFrame(columns=['new gene'])

In [150]:
for i in newdf.index:
    go_id = newdf.at[i,'header']
    go_term = go[go_id]
    go_term_set = transitive_closure(go_term, go)
    dfadd=pd.DataFrame(go_term_set)
    dfadd['new gene'] = go_id
    dfnew=dfnew.append(dfadd)

In [155]:
dfnew.to_csv('/Users/hellpark/Desktop/Bioinformatics/model_objects/GO-SLIM/GO_output.csv')

In [128]:
dfnew=dfnew.append(dfadd)

In [129]:
print(dfnew)

     new gene                                                  0
0  GO:0004177  GO:0045148\tlevel-06\tdepth-06\ttripeptide ami...
1  GO:0004177  GO:0003674\tlevel-00\tdepth-00\tmolecular_func...
2  GO:0004177  GO:0070005\tlevel-06\tdepth-06\tcysteine-type ...
3  GO:0004177  GO:0070006\tlevel-06\tdepth-06\tmetalloaminope...
4  GO:0004177  GO:0008233\tlevel-03\tdepth-03\tpeptidase acti...
5  GO:0004177  GO:0003824\tlevel-01\tdepth-01\tcatalytic acti...
6  GO:0004177  GO:0070009\tlevel-06\tdepth-06\tserine-type am...
7  GO:0004177  GO:0140096\tlevel-02\tdepth-02\tcatalytic acti...
8  GO:0004177  GO:0008238\tlevel-04\tdepth-04\texopeptidase a...
9  GO:0004177  GO:0016787\tlevel-02\tdepth-02\thydrolase acti...


In [89]:
go_term_set

{GOTerm('GO:0003674'):
   id:GO:0003674
   item_id:GO:0003674
   name:molecular_function
   namespace:molecular_function
   _parents: 0 items
   parents: 0 items
   children: 24 items
   level:0
   depth:0
   is_obsolete:False
   alt_ids: 1 items
     GO:0005554,
 GOTerm('GO:0003735'):
   id:GO:0003735
   item_id:GO:0003735
   name:structural constituent of ribosome
   namespace:molecular_function
   _parents: 1 items
     GO:0005198
   parents: 1 items
     GO:0005198	level-01	depth-01	structural molecule activity [molecular_function]
   children: 0 items
   level:2
   depth:2
   is_obsolete:False
   alt_ids: 7 items
     GO:0003738
     GO:0003736
     GO:0003739
     GO:0003737
     GO:0003740
     GO:0003741
     GO:0003742,
 GOTerm('GO:0005199'):
   id:GO:0005199
   item_id:GO:0005199
   name:structural constituent of cell wall
   namespace:molecular_function
   _parents: 1 items
     GO:0005198
   parents: 1 items
     GO:0005198	level-01	depth-01	structural molecule activity [mo

# For GO-team reclassify paper, make all go terms in pretty format

In [163]:
data = pd.read_excel(r'/Users/hellpark/Desktop/Bioinformatics September 2021/model_objects/GO-SLIM/MolecularFunction_GO.xlsx',
                     sheet_name = 'MF')
df = pd.DataFrame(data)

In [164]:
data

Unnamed: 0,FastaNum,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,TD01GL000439,"{GO:0000334;3-hydroxyanthranilate 3,4-dioxygen...",{GO:0005488;binding;MolecularFunction,"{GO:0016702;oxidoreductase activity, acting on...",{GO:0005506;iron ion binding;MolecularFunction,"{GO:0016701;oxidoreductase activity, acting on...",{GO:0046872;metal ion binding;MolecularFunction,{GO:0043169;cation binding;MolecularFunction,{GO:0016491;oxidoreductase activity;MolecularF...,{GO:0051213;dioxygenase activity;MolecularFunc...,...,,,,,,,,,,
1,TD01GL001535,"{GO:0000721;(R,R)-butanediol dehydrogenase act...",{GO:0005488;binding;MolecularFunction,{GO:0046872;metal ion binding;MolecularFunction,"{GO:0016616;oxidoreductase activity, acting on...",{GO:0043169;cation binding;MolecularFunction,{GO:0008270;zinc ion binding;MolecularFunction,{GO:0016491;oxidoreductase activity;MolecularF...,"{GO:0016614;oxidoreductase activity, acting on...",{GO:0046914;transition metal ion binding;Molec...,...,,,,,,,,,,
2,TD01GL002572,"{GO:0000906;6,7-dimethyl-8-ribityllumazine syn...",{GO:0016867;intramolecular transferase activit...,{GO:0016866;intramolecular transferase activit...,{GO:0016853;isomerase activity;MolecularFunction,{GO:0003824;catalytic activity;MolecularFunction,{GO:0003674;molecular_function;MolecularFunction,,,,...,,,,,,,,,,
3,TD01GL003352,"{GO:0000906;6,7-dimethyl-8-ribityllumazine syn...",{GO:0016867;intramolecular transferase activit...,{GO:0016866;intramolecular transferase activit...,{GO:0016853;isomerase activity;MolecularFunction,{GO:0003824;catalytic activity;MolecularFunction,{GO:0003674;molecular_function;MolecularFunction,,,,...,,,,,,,,,,
4,TD01GL002234,{GO:0000988;protein binding transcription fact...,{GO:0000989;transcription factor binding trans...,{GO:0003674;molecular_function;MolecularFunction,{GO:0016989;sigma factor antagonist activity;M...,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,TD01GL000927,{GO:1901505;carbohydrate derivative transporte...,{GO:0015932;nucleobase-containing compound tra...,{GO:0005337;nucleoside transmembrane transport...,{GO:0022857;transmembrane transporter activity...,{GO:0022891;substrate-specific transmembrane t...,{GO:0005215;transporter activity;MolecularFunc...,{GO:0022892;substrate-specific transporter act...,{GO:0003674;molecular_function;MolecularFunction,,...,,,,,,,,,,
2252,TD01GL001072,{GO:1901505;carbohydrate derivative transporte...,"{GO:0016772;transferase activity, transferring...",{GO:0005488;binding;MolecularFunction,{GO:0008982;protein-N(PI)-phosphohistidine-sug...,{GO:0016301;kinase activity;MolecularFunction,{GO:0022804;active transmembrane transporter a...,{GO:0022857;transmembrane transporter activity...,{GO:0022891;substrate-specific transmembrane t...,{GO:0005215;transporter activity;MolecularFunc...,...,,,,,,,,,,
2253,TD01GL003029,{GO:1901618;organic hydroxy compound transmemb...,{GO:0008514;organic anion transmembrane transp...,{GO:0008028;monocarboxylic acid transmembrane ...,{GO:0005342;organic acid transmembrane transpo...,{GO:0046943;carboxylic acid transmembrane tran...,{GO:0022857;transmembrane transporter activity...,{GO:0022891;substrate-specific transmembrane t...,{GO:0005215;transporter activity;MolecularFunc...,{GO:0022892;substrate-specific transporter act...,...,,,,,,,,,,
2254,TD01GL002890,{GO:1901618;organic hydroxy compound transmemb...,{GO:0015199;amino-acid betaine transmembrane t...,{GO:0015651;quaternary ammonium group transmem...,{GO:0015101;organic cation transmembrane trans...,{GO:0008514;organic anion transmembrane transp...,{GO:0072349;modified amino acid transmembrane ...,{GO:0008028;monocarboxylic acid transmembrane ...,{GO:0005342;organic acid transmembrane transpo...,{GO:0046943;carboxylic acid transmembrane tran...,...,,,,,,,,,,


In [161]:
df

Unnamed: 0,FastaNum,GO
0,TD01GL000001,
1,TD01GL000002,
2,TD01GL000003,
3,TD01GL000004,
4,TD01GL000005,
...,...,...
2251,TD01GL003813,
2252,TD01GL003814,
2253,TD01GL003815,
2254,TD01GL003816,
