In [1]:
import os
import time
import urllib.parse
import urllib.request
from io import StringIO

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# Configure how wide pandas renders each column when displaying frames.
# NOTE(review): a width of 1 looks unusual -- confirm it is intended
# (pandas documents None as "no truncation").
pd.set_option('display.max_colwidth', 1)

# Leer 14.379 genes

In [2]:
# Read the header-less file and keep its first column as a NumPy array of
# gene names to look up.
genes_npy = pd.read_csv("genesUnique.txt", header=None)[0].values
# Display how many gene names were loaded.
len(genes_npy)

14379

In [3]:
#genes_run2 = genes_npy[5800:]
#genes_run2

# Aux funcs

In [4]:
def get_sets_from_pandas_col(col):
    """Collect the distinct "; "-separated items found in a pandas column.

    Missing values are ignored. Each remaining cell is split on "; " and
    every resulting piece is added to the returned set. A column with no
    non-null cells yields an empty set.
    """
    present = col.dropna()
    if present.empty:
        return set()

    unique_items = set()
    for pieces in present.str.split("; "):
        unique_items.update(pieces)
    return unique_items
    
def go_set_to_file(go_set):
    """Turn a set of "description [GO:xxxxxxx]" strings into a sorted table.

    Parameters
    ----------
    go_set : iterable of str
        Strings made of a description followed by " [GO:...]".

    Returns
    -------
    pandas.DataFrame
        Columns 'GO Id' and 'GO Description', sorted by 'GO Description'.

    The identifier is located from the *last* " [GO:" marker, so descriptions
    that themselves contain square brackets (for example a name starting with
    "[heparan sulfate]-...") are split correctly. A string without the marker
    is kept whole as the description with an empty identifier.
    """
    marker = " [GO:"
    go_ids = []
    go_descrs = []

    for go in go_set:
        start = go.rfind(marker)
        if start == -1:
            # Not in the expected format: keep the text rather than mangling it.
            go_ids.append("")
            go_descrs.append(go)
            continue

        # Identifier runs from just after " [" up to the closing bracket
        # (or to the end of the string if the bracket is missing).
        end = go.find("]", start)
        if end == -1:
            end = len(go)
        go_ids.append(go[start + 2:end])
        go_descrs.append(go[:start])

    return pd.DataFrame({'GO Id': go_ids, 'GO Description': go_descrs}).sort_values(by=['GO Description'])

# Main loop

In [5]:
# Queries the UniProt tab-separated endpoint once per gene, e.g.
#   http://www.uniprot.org/uniprot/?query=Q9ZUA2&format=tab&columns=id%2Cgo
# Column names are listed at https://www.uniprot.org/help/uniprotkb_column_names
url = "https://www.uniprot.org/uniprot/"

UNIPROT_COLUMNS = "genes,id,reviewed,go-id,go(biological process),go(molecular function),go(cellular component)"
GENE_OUTPUT_COLUMNS = ["Position gen", "Searched gen", "Gene names", "Entry", "Status", "Gene ontology IDs"]
CHECKPOINT_EVERY = 100   # write partial results to disk every N genes
RETRY_WAIT_SECS = 10     # pause between failed request attempts
MAX_RETRIES = 30         # give up (and re-raise) after this many failed attempts

list_df_gen = []
all_biological_process_GOs = set()
all_molecular_function_GOs = set()
all_cellular_component_GOs = set()


def fetch_uniprot_tab(gen):
    """Return the raw tab-separated UniProt response body for one gene name.

    The request is POSTed to ``url`` with a query restricted to organism 9606.
    A failed attempt is reported and retried after ``RETRY_WAIT_SECS`` seconds;
    after ``MAX_RETRIES`` failed attempts the last exception is re-raised so
    the loop cannot spin forever on a permanent error.

    Returns an empty string when the server answers with an empty body.
    """
    params = {
        # Plain spaces on purpose: urlencode() converts them to '+'. Literal
        # '+' characters would be escaped as %2B and reach the server as part
        # of the search text instead of acting as separators.
        "query": f"gene:{gen} AND organism:9606",  # optionally: AND reviewed:yes
        "columns": UNIPROT_COLUMNS,
        "format": "tab",
    }
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(req) as f:
                return f.read().decode('utf-8')
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the run.
            if attempt == MAX_RETRIES:
                raise
            print(f"Exception ({err!r})! Waiting {RETRY_WAIT_SECS} secs")
            time.sleep(RETRY_WAIT_SECS)


def save_results():
    """Build the four result tables from the accumulated state and write them to CSV.

    Returns the tuple
    ``(df_genes, df_biological_process, df_cellular_component, df_molecular_function)``.
    """
    os.makedirs("results", exist_ok=True)

    if list_df_gen:
        genes_table = pd.concat(list_df_gen, ignore_index=True)
    else:
        # pd.concat raises on an empty list; write a header-only file instead.
        genes_table = pd.DataFrame(columns=GENE_OUTPUT_COLUMNS)
    biological_process_table = go_set_to_file(all_biological_process_GOs)
    cellular_component_table = go_set_to_file(all_cellular_component_GOs)
    molecular_function_table = go_set_to_file(all_molecular_function_GOs)

    genes_table.to_csv("results/genes_with_GO.csv", index=False)
    biological_process_table.to_csv("results/GO_biological_process.csv", index=False)
    cellular_component_table.to_csv("results/GO_cellular_component.csv", index=False)
    molecular_function_table.to_csv("results/GO_molecular_function.csv", index=False)

    return genes_table, biological_process_table, cellular_component_table, molecular_function_table


for i, gen in enumerate(tqdm(genes_npy)):

    response = fetch_uniprot_tab(gen)

    if response == "":
        # Nothing came back for this gene: log its position and name.
        print(i, "--->", gen)
    else:
        # https://stackoverflow.com/questions/54102980/convert-a-tab-and-newline-delimited-string-to-pandas-dataframe
        df_gen = pd.read_csv(StringIO(response), sep='\t')
        df_gen["Position gen"] = i
        df_gen["Searched gen"] = gen

        all_biological_process_GOs |= get_sets_from_pandas_col(df_gen["Gene ontology (biological process)"])
        all_molecular_function_GOs |= get_sets_from_pandas_col(df_gen["Gene ontology (molecular function)"])
        all_cellular_component_GOs |= get_sets_from_pandas_col(df_gen["Gene ontology (cellular component)"])

        list_df_gen.append(df_gen[GENE_OUTPUT_COLUMNS])

    # Checkpoint on every CHECKPOINT_EVERY-th gene, including genes that
    # returned nothing, so a long run never goes unsaved for more than that.
    if (i + 1) % CHECKPOINT_EVERY == 0:
        df_genes, df_biological_process, df_cellular_component, df_molecular_function = save_results()

# Final save once every gene has been processed.
df_genes, df_biological_process, df_cellular_component, df_molecular_function = save_results()

#df_genes.head()

HBox(children=(FloatProgress(value=0.0, max=14379.0), HTML(value='')))

35 ---> hsa-mir-4763
95 ---> AC009237.1
332 ---> AC008394.1
383 ---> RP4-539M6.19
393 ---> AC015804.1
423 ---> CTD-2021H9.3
474 ---> PRAMEF3
520 ---> WI2-3658N16.1
521 ---> AC013269.5
615 ---> RP11-111K18.1
619 ---> FAM75B
732 ---> AC107021.1
781 ---> AC138969.4
833 ---> RP11-671M22.1
843 ---> RP11-351M8.1
940 ---> AC005035.1
971 ---> RP11-111M22.2
989 ---> RP1-32I10.10
1002 ---> AC009113.1
1059 ---> AC090616.2
1068 ---> AC092329.1
1085 ---> AC000111.6
1095 ---> RP11-268F1.2.1
1102 ---> RP11-766F14.2
1141 ---> AL953854.2
1147 ---> U82695.9
1293 ---> AC021218.2
1315 ---> NAMPTL
1393 ---> AL136219.1
1407 ---> RP11-82O18.1
1441 ---> AC092171.1
1629 ---> AC024563.1
1693 ---> RP11-433C9.2
1729 ---> RP11-113D6.10
1888 ---> AC006465.3
1968 ---> AC012215.1
2049 ---> RP11-108O10.8
2065 ---> RP4-734P14.4
2093 ---> AC011450.1
2270 ---> RP11-51F16.8
2370 ---> RP11-204N11.1
2377 ---> RP11-17M16.1
2457 ---> AC005537.2
2494 ---> AC005329.1
2528 ---> DLEU2L
2574 ---> AL355390.1
2580 ---> RAD21-AS1
258

10584 ---> AC007952.5
10624 ---> CASP16
10630 ---> RP4-559A3.7
10635 ---> AC021860.1
10636 ---> AC106017.1
10643 ---> AL589765.1
10665 ---> RP1-170O19.20
10682 ---> AC005609.1
10684 ---> AC005003.1
10696 ---> C8orf44-SGK3
10706 ---> KB-1507C5.2
10710 ---> AP000708.1
10712 ---> AP006621.5
10715 ---> RP11-770J1.4
10719 ---> AC016586.1
10726 ---> AC079602.1
10736 ---> AL590822.1
10743 ---> CTD-2368P22.1
10751 ---> RP11-47I22.4
10756 ---> RP11-315D16.2
10775 ---> AC027307.3
10782 ---> AC112693.2
10784 ---> AC079210.1
10785 ---> AC138655.1
10786 ---> AL359091.2
10790 ---> RP11-156E8.1
10906 ---> FRG1B
11024 ---> RP11-863K10.7
11078 ---> C9orf38
11234 ---> LINC00998
11254 ---> AC007390.5
11265 ---> TMEM257
11344 ---> AC144568.2
11365 ---> AL020996.1
11366 ---> RP11-817J15.3
11393 ---> RP11-295D22.1
11400 ---> AC117395.1
11401 ---> AC008964.1
11404 ---> AC012360.2
11405 ---> AC004899.1
11406 ---> AC119673.1
11407 ---> AL354808.2
11417 ---> AC068987.1
11418 ---> AC135178.1
11429 ---> C7orf71
1

# Write files

In [6]:
# Rebuild the molecular-function table from the accumulated GO set and
# preview its first rows.
df_molecular_function = go_set_to_file(all_molecular_function_GOs)
df_molecular_function.head()

Unnamed: 0,GO Id,GO Description
76,GO:0005524,ATP binding
55,GO:0016887,ATPase activity
48,GO:0015433,ATPase-coupled peptide antigen transmembrane transporter activity
20,GO:0015440,ATPase-coupled peptide transmembrane transporter activity
81,GO:0042626,ATPase-coupled transmembrane transporter activity


In [7]:
# Write the per-gene table to CSV, leaving out the DataFrame index.
df_genes.to_csv("results/genes_with_GO.csv", index=False)

# Write one CSV per GO table (biological process, cellular component,
# molecular function), also without the index.
df_biological_process.to_csv("results/GO_biological_process.csv", index=False)
df_cellular_component.to_csv("results/GO_cellular_component.csv", index=False)
df_molecular_function.to_csv("results/GO_molecular_function.csv", index=False)

In [8]:
# Show the first 10 lines of the written molecular-function CSV.
! head -n 10 results/GO_molecular_function.csv

GO Id,GO Description
GO:0005524,ATP binding
GO:0016887,ATPase activity
GO:0015433,ATPase-coupled peptide antigen transmembrane transporter activity
GO:0015440,ATPase-coupled peptide transmembrane transporter activity
GO:0042626,ATPase-coupled transmembrane transporter activity
GO:0031730,CCR5 chemokine receptor binding
GO:0003677,DNA binding
GO:0001228,"DNA-binding transcription activator activity, RNA polymerase II-specific"
GO:0003700,DNA-binding transcription factor activity


In [9]:
# Show the first 3 lines of the written per-gene CSV.
! head -n 3 results/genes_with_GO.csv

Searched gen,Gene names,Entry,Status,Gene ontology IDs
RGS7BP,RGS7BP R7BP,Q6MZT1,reviewed,GO:0005634; GO:0005737; GO:0007186; GO:0009968; GO:0060078; GO:0098978; GO:0099026; GO:0099031
VPS33B,VPS33B,Q9H267,reviewed,GO:0005737; GO:0005764; GO:0005765; GO:0005770; GO:0005794; GO:0006904; GO:0007032; GO:0015031; GO:0016192; GO:0017185; GO:0030136; GO:0030897; GO:0031091; GO:0031901; GO:0031902; GO:0032400; GO:0032418; GO:0032963; GO:0044877; GO:0048471; GO:0055037; GO:0061025; GO:0070889
