In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from Bio import Entrez
from tqdm import tqdm
import pickle

In [76]:
data = pd.read_csv (r'data/SFARI-Genes.csv')
cols = data.columns
N, D = data.shape

In [79]:
gene2go = open_pickle_file("gene2go.pickle")
eid = parse_eid()
gene_symbols = data['gene-symbol'].unique()

In [91]:
def save_to_txt_file(contents, filename):
    DIR = os.getcwd() + "/results/" + filename
    
    with open(DIR, "w") as f:
        f.write(contents)
        
def save_to_pickle_file(contents, filename):
    DIR = os.getcwd() + "/data/" + filename
    
    with open(DIR, "wb") as handle:
        pickle.dump(contents, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def open_pickle_file(filename):
    DIR = os.getcwd() + "/data/" + filename
    
    with open(DIR, "rb") as handle:
        data = pickle.load(handle)
        
    return data

def save_to_npy_file(contents, filename):
    np.save(os.getcwd() + "/data/" + filename, contents)
    
def open_npy_file(filename):
    np.load(os.getcwd() + "/data/" + filename, allow_pickle=True)
    
def parse_eid():
    DIR = os.getcwd() + "/results/task2-1-eid.txt"
    eid = {}
    
    with open(DIR) as f:
        lines = f.readlines()
        
    for l in lines:
        t1 = l.replace("\n", "").split(": ")
        gene = t1[0]

        if len(t1) > 1:
            eid[gene] = t1[1].split(", ")
        else:
            eid[gene] = []
               
    return eid

def get_entrezs(symbols):
    entrezs = {}
    
    for s in symbols:
        entrezs[s] = eid[g]
        
    return entrezs

def get_GO_indexes(symbols):
    indexes = {}
    entrezs = get_entrezs(symbols)
    errors = 0
    
    tax_ids = gene2go["#tax_id"]
    human_gene_indexes = tax_ids[tax_ids == "9606"].index
    human_genes = gene2go.filter(items=human_gene_indexes, axis = 0)
    human_gene_ids = human_genes["GeneID"]

    for g in tqdm(gene_symbols):
        gene_entrez = eid[g]

        if len(gene_entrez) == 0:
            print(g)
            continue
        else:
            gene_entrez = gene_entrez[0]

        gene_index = human_gene_ids[human_gene_ids == gene_entrez].index.unique()

        indexes[g] = gene_index
    
    if errors > 0:
        print("Warning {} errors exist".format(errors))
        
    return indexes, errors

In [74]:
genes_no_ids = ["MSNP1AS", "RP11-1407O15.2", "RPS10P2-AS1"]
symbol_data = data["gene-symbol"]
indexes = []

for g in genes_no_ids:
    index = symbol_data[symbol_data == g].index[0]
    indexes.append(index)
    
new_data = data.filter(items=indexes, axis=0)
print(indexes)
display(new_data)

[576, 771, 790]


Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,number-of-reports
576,9,MSNP1AS,"Moesinpseudogene 1, antisense",,5,"Genetic Association, Functional",2.0,0,13
771,9,RP11-1407O15.2,,,17,Rare Single Gene Mutation,3.0,0,1
790,9,RPS10P2-AS1,ribosomal protein S10 pseudogene 2 anti-sense 1,,20,"Genetic Association, Functional",2.0,0,4


In [20]:
symbol_data[771]

'RP11-1407O15.2'

# Task 1

In [4]:
gene_symbols = data['gene-symbol'].unique()
print(gene_symbols.size)

1023


In [5]:
# Save Entrez scores

In [46]:
Entrez.email = "s1803764@ed.ac.uk"
output = ""
eid = {}

for i in tqdm(range(gene_symbols.size)):
    g = gene_symbols[i]
    handle = Entrez.esearch(db="gene", retmax=10, term=g + "[sym] homo sapiens[Organism]")
    record = Entrez.read(handle)
    #output += g + ": " + ", ".join(str(x) for x in record['IdList'])
    #output += "\n"
    eid[g] = record['IdList']
    
    if len(record["IdList"]) == 0:
        print(g)
    
#save_to_txt_file(output, "task2-1-eid.txt")

 56%|██████████████████████▌                 | 577/1023 [06:05<04:08,  1.79it/s]

MSNP1AS


 75%|██████████████████████████████▏         | 772/1023 [08:04<02:21,  1.78it/s]

RP11-1407O15.2


 77%|██████████████████████████████▉         | 791/1023 [08:15<02:05,  1.85it/s]

RPS10P2-AS1


100%|███████████████████████████████████████| 1023/1023 [10:37<00:00,  1.60it/s]


# Task 2

In [6]:
gene2go = pd.read_csv(os.getcwd() + "/data/gene2go", dtype=str, delimiter="\t")
save_to_pickle_file(gene2go, "gene2go.pickle")

In [20]:
gene2go

Unnamed: 0,#tax_id,GeneID,GO_ID,Evidence,Qualifier,GO_term,PubMed,Category
0,3702,814629,GO:0005634,ISM,located_in,nucleus,-,Component
1,3702,814629,GO:0008150,ND,involved_in,biological_process,-,Process
2,3702,814630,GO:0003700,ISS,enables,DNA-binding transcription factor activity,11118137,Function
3,3702,814630,GO:0005634,ISM,located_in,nucleus,-,Component
4,3702,814630,GO:0006355,TAS,acts_upstream_of_or_within,"regulation of transcription, DNA-templated",11118137,Process
...,...,...,...,...,...,...,...,...
2084173,559292,65052912,GO:0005575,ND,is_active_in,cellular_component,-,Component
2084174,559292,65052912,GO:0008150,ND,involved_in,biological_process,-,Process
2084175,559292,65052913,GO:0003674,ND,enables,molecular_function,-,Function
2084176,559292,65052913,GO:0005575,ND,is_active_in,cellular_component,-,Component


In [90]:
gene2go = open_pickle_file("gene2go.pickle")
eid = parse_eid()

In [None]:
go_terms = {}

errors = 0

tax_ids = gene2go["#tax_id"]
human_gene_indexes = tax_ids[tax_ids == "9606"].index
human_genes = gene2go.filter(items=human_gene_indexes, axis = 0)
human_gene_ids = human_genes["GeneID"]
print(human_gene_indexes.size)

for g in tqdm(gene_symbols):
    gene_entrez = eid[g]
    
    if len(gene_entrez) == 0:
        break
    else:
        gene_entrez = gene_entrez[0]

    gene_index = human_gene_ids[human_gene_ids == gene_entrez].index.unique()

    print(g)
    print(gene_entrez)
    print(gene_index)
    
    if gene_index.size >= 1:
        if not gene_index[0] in human_gene_indexes:
            errors += 1
            continue
        else:
            gene_index = int(gene_index[0])

    print(gene_index)
    go_term = gene2go.iloc[gene_index]["GO_term"]
    go_terms[g] = go_term

    print(go_term)
    print()

save_to_pickle_file(go_terms, "go-terms.pickle")

331423


  0%|▏                                         | 5/1023 [00:00<00:24, 41.65it/s]

ABAT
18
Int64Index([643651, 643652, 643653, 643654, 643655, 643656, 643657, 643658,
            643659, 643660, 643661, 643662, 643663, 643664, 643665, 643666,
            643667, 643668, 643669, 643670, 643671, 643672, 643673, 643674,
            643675, 643676, 643677, 643678, 643679, 643680, 643681, 643682,
            643683, 643684, 643685, 643686, 643687, 643688, 643689, 643690,
            643691],
           dtype='int64')
643651
response to hypoxia

ABCA10
10349
Int64Index([825355, 825356, 825357, 825358, 825359, 825360, 825361, 825362], dtype='int64')
825355
lipid transporter activity

ABCA13
154664
Int64Index([944280, 944281, 944282, 944283, 944284, 944285, 944286, 944287,
            944288, 944289, 944290, 944291, 944292, 944293, 944294],
           dtype='int64')
944280
lipid transporter activity

ABCA7
10347
Int64Index([825298, 825299, 825300, 825301, 825302, 825303, 825304, 825305,
            825306, 825307, 825308, 825309, 825310, 825311, 825312, 825313,
            8

  1%|▌                                        | 15/1023 [00:00<00:23, 42.67it/s]

ACTN4
81
Int64Index([644993, 644994, 644995, 644996, 644997, 644998, 644999, 645000,
            645001, 645002, 645003, 645004, 645005, 645006, 645007, 645008,
            645009, 645010, 645011, 645012, 645013, 645014, 645015, 645016,
            645017, 645018, 645019, 645020, 645021, 645022, 645023, 645024,
            645025, 645026, 645027, 645028, 645029, 645030, 645031, 645032,
            645033, 645034, 645035, 645036, 645037, 645038, 645039, 645040,
            645041, 645042, 645043, 645044, 645045, 645046],
           dtype='int64')
644993
RNA polymerase II transcription regulatory region sequence-specific DNA binding

ACY1
95
Int64Index([645537, 645538, 645539, 645540, 645541, 645542, 645543, 645544,
            645545],
           dtype='int64')
645537
aminoacylase activity

ADA
100
Int64Index([645552, 645553, 645554, 645555, 645556, 645557, 645558, 645559,
            645560, 645561, 645562, 645563, 645564, 645565, 645566, 645567,
            645568, 645569, 645570, 645

  2%|█                                        | 25/1023 [00:00<00:24, 40.41it/s]

AFF2
2334
Int64Index([692698, 692699, 692700, 692701, 692702, 692703, 692704, 692705,
            692706, 692707, 692708, 692709, 692710, 692711, 692712],
           dtype='int64')
692698
G-quadruplex RNA binding

AGAP1
116987
Int64Index([932851, 932852, 932853, 932854, 932855, 932856, 932857, 932858,
            932859, 932860],
           dtype='int64')
932851
GTPase activity

AGAP2
116986
Int64Index([932821, 932822, 932823, 932824, 932825, 932826, 932827, 932828,
            932829, 932830, 932831, 932832, 932833, 932834, 932835, 932836,
            932837, 932838, 932839, 932840, 932841, 932842, 932843, 932844,
            932845, 932846, 932847, 932848, 932849, 932850],
           dtype='int64')
932821
GTPase activity

ADSS2
159
Int64Index([647196, 647197, 647198, 647199, 647200, 647201, 647202, 647203,
            647204, 647205, 647206, 647207, 647208, 647209, 647210, 647211,
            647212, 647213, 647214, 647215, 647216],
           dtype='int64')
647196
magnesium ion bind

  4%|█▍                                       | 36/1023 [00:00<00:21, 45.28it/s]

AHDC1
27245
Int64Index([860606, 860607], dtype='int64')
860606
DNA binding

AHI1
54806
Int64Index([880169, 880170, 880171, 880172, 880173, 880174, 880175, 880176,
            880177, 880178, 880179, 880180, 880181, 880182, 880183, 880184,
            880185, 880186, 880187, 880188, 880189, 880190, 880191, 880192,
            880193, 880194, 880195, 880196, 880197, 880198, 880199, 880200,
            880201, 880202],
           dtype='int64')
880169
morphogenesis of a polarized epithelium

AKAP9
10142
Int64Index([822011, 822012, 822013, 822014, 822015, 822016, 822017, 822018,
            822019, 822020, 822021, 822022, 822023, 822024, 822025, 822026,
            822027, 822028, 822029, 822030, 822031, 822032, 822033, 822034,
            822035, 822036, 822037, 822038, 822039, 822040, 822041, 822042,
            822043, 822044, 822045, 822046, 822047, 822048, 822049, 822050,
            822051, 822052, 822053, 822054, 822055, 822056, 822057, 822058,
            822059, 822060, 822061, 82

  5%|█▉                                       | 47/1023 [00:01<00:20, 47.87it/s]

AGO2
27161
Int64Index([859931, 859932, 859933, 859934, 859935, 859936, 859937, 859938,
            859939, 859940, 859941, 859942, 859943, 859944, 859945, 859946,
            859947, 859948, 859949, 859950, 859951, 859952, 859953, 859954,
            859955, 859956, 859957, 859958, 859959, 859960, 859961, 859962,
            859963, 859964, 859965, 859966, 859967, 859968, 859969, 859970,
            859971, 859972, 859973, 859974, 859975, 859976, 859977, 859978,
            859979, 859980, 859981, 859982, 859983, 859984, 859985, 859986,
            859987, 859988, 859989, 859990, 859991, 859992, 859993, 859994,
            859995, 859996, 859997, 859998, 859999],
           dtype='int64')
859931
RNA 7-methylguanosine cap binding

AHNAK
79026
Int64Index([907047, 907048, 907049, 907050, 907051, 907052, 907053, 907054,
            907055, 907056, 907057, 907058, 907059, 907060, 907061, 907062,
            907063, 907064, 907065, 907066, 907067, 907068, 907069, 907070,
            907071, 

  6%|██▎                                      | 57/1023 [00:01<00:20, 46.82it/s]

ARHGEF10
9639
Int64Index([814627, 814628, 814629, 814630, 814631, 814632, 814633, 814634,
            814635, 814636, 814637, 814638, 814639, 814640, 814641, 814642,
            814643, 814644],
           dtype='int64')
814627
guanyl-nucleotide exchange factor activity

ARHGEF9
23229
Int64Index([845570, 845571, 845572, 845573, 845574, 845575, 845576], dtype='int64')
845570
guanyl-nucleotide exchange factor activity

ARID1B
57492
Int64Index([896201, 896202, 896203, 896204, 896205, 896206, 896207, 896208,
            896209, 896210, 896211, 896212, 896213, 896214, 896215, 896216],
           dtype='int64')
896201
DNA binding

ARID2
196528
Int64Index([947687, 947688, 947689, 947690, 947691, 947692, 947693, 947694,
            947695, 947696, 947697, 947698, 947699, 947700, 947701],
           dtype='int64')
947687
heart morphogenesis

ARNT2
9915
Int64Index([818273, 818274, 818275, 818276, 818277, 818278, 818279, 818280,
            818281, 818282, 818283, 818284, 818285, 818286, 818287, 

  7%|██▋                                      | 67/1023 [00:01<00:20, 46.16it/s]

ASTN2
23245
Int64Index([845848, 845849, 845850, 845851, 845852, 845853, 845854, 845855,
            845856, 845857, 845858, 845859, 845860, 845861, 845862, 845863],
           dtype='int64')
845848
neuron migration

ASXL3
80816
Int64Index([914625, 914626, 914627, 914628, 914629, 914630, 914631, 914632], dtype='int64')
914625
DNA binding

ATP10A
57194
Int64Index([895040, 895041, 895042, 895043, 895044, 895045, 895046, 895047,
            895048, 895049, 895050, 895051, 895052, 895053, 895054, 895055,
            895056, 895057, 895058, 895059, 895060],
           dtype='int64')
895040
magnesium ion binding

ATP1A1
476
Int64Index([654721, 654722, 654723, 654724, 654725, 654726, 654727, 654728,
            654729, 654730, 654731, 654732, 654733, 654734, 654735, 654736,
            654737, 654738, 654739, 654740, 654741, 654742, 654743, 654744,
            654745, 654746, 654747, 654748, 654749, 654750, 654751, 654752,
            654753, 654754, 654755, 654756, 654757, 654758, 654759, 654

  8%|███                                      | 77/1023 [00:01<00:20, 45.24it/s]

ADORA2A
135
Int64Index([646565, 646566, 646567, 646568, 646569, 646570, 646571, 646572,
            646573, 646574, 646575, 646576, 646577, 646578, 646579, 646580,
            646581, 646582, 646583, 646584, 646585, 646586, 646587, 646588,
            646589, 646590, 646591, 646592, 646593, 646594, 646595, 646596,
            646597, 646598, 646599, 646600, 646601, 646602, 646603, 646604,
            646605, 646606, 646607, 646608, 646609, 646610, 646611, 646612,
            646613, 646614, 646615, 646616, 646617, 646618, 646619, 646620,
            646621, 646622, 646623, 646624, 646625, 646626, 646627, 646628,
            646629, 646630, 646631, 646632, 646633, 646634, 646635, 646636],
           dtype='int64')
646565
G protein-coupled adenosine receptor activity

AP2S1
1175
Int64Index([669900, 669901, 669902, 669903, 669904, 669905, 669906, 669907,
            669908, 669909, 669910, 669911, 669912, 669913, 669914],
           dtype='int64')
669900
protein binding

AVPR1A
552
Int64I

  9%|███▍                                     | 87/1023 [00:01<00:20, 45.59it/s]

866056
dolichyl-phosphate-glucose-glycolipid alpha-glucosyltransferase activity

AR
367
Int64Index([652609, 652610, 652611, 652612, 652613, 652614, 652615, 652616,
            652617, 652618, 652619, 652620, 652621, 652622, 652623, 652624,
            652625, 652626, 652627, 652628, 652629, 652630, 652631, 652632,
            652633, 652634, 652635, 652636, 652637, 652638, 652639, 652640,
            652641, 652642, 652643, 652644, 652645, 652646, 652647, 652648,
            652649, 652650, 652651, 652652, 652653, 652654, 652655, 652656,
            652657, 652658, 652659, 652660, 652661, 652662, 652663, 652664,
            652665, 652666, 652667, 652668, 652669, 652670, 652671, 652672,
            652673, 652674, 652675, 652676, 652677, 652678, 652679, 652680,
            652681, 652682, 652683, 652684, 652685, 652686, 652687, 652688,
            652689, 652690, 652691, 652692, 652693, 652694, 652695, 652696,
            652697],
           dtype='int64')
652609
negative regulation of

  9%|███▉                                     | 97/1023 [00:02<00:21, 43.99it/s]

BTAF1
9044
Int64Index([805863, 805864, 805865, 805866, 805867, 805868, 805869, 805870,
            805871, 805872, 805873, 805874],
           dtype='int64')
805863
DNA binding

BTRC
8945
Int64Index([804740, 804741, 804742, 804743, 804744, 804745, 804746, 804747,
            804748, 804749, 804750, 804751, 804752, 804753, 804754, 804755,
            804756, 804757, 804758, 804759, 804760, 804761, 804762, 804763,
            804764, 804765, 804766, 804767, 804768, 804769, 804770, 804771,
            804772, 804773, 804774, 804775, 804776, 804777, 804778, 804779,
            804780, 804781],
           dtype='int64')
804740
protein polyubiquitination

C12orf57
113246
Int64Index([930400, 930401, 930402, 930403, 930404, 930405, 930406, 930407,
            930408, 930409, 930410, 930411],
           dtype='int64')
930400
molecular_function

C15orf62
643338
Int64Index([969403, 969404, 969405, 969406, 969407, 969408, 969409, 969410], dtype='int64')
969403
cytoplasm

C4B
721
Int64Index([660573

 10%|████▏                                   | 107/1023 [00:02<00:20, 45.40it/s]

CACNA1D
776
Int64Index([661190, 661191, 661192, 661193, 661194, 661195, 661196, 661197,
            661198, 661199, 661200, 661201, 661202, 661203, 661204, 661205,
            661206, 661207, 661208, 661209, 661210, 661211, 661212, 661213,
            661214, 661215, 661216, 661217, 661218, 661219, 661220],
           dtype='int64')
661190
voltage-gated calcium channel activity

CACNA1E
777
Int64Index([661221, 661222, 661223, 661224, 661225, 661226, 661227, 661228,
            661229, 661230, 661231, 661232],
           dtype='int64')
661221
voltage-gated calcium channel activity

CACNA1F
778
Int64Index([661233, 661234, 661235, 661236, 661237, 661238, 661239, 661240,
            661241, 661242, 661243, 661244, 661245, 661246],
           dtype='int64')
661233
photoreceptor outer segment

CACNA1G
8913
Int64Index([804191, 804192, 804193, 804194, 804195, 804196, 804197, 804198,
            804199, 804200, 804201, 804202, 804203, 804204, 804205, 804206,
            804207, 804208, 804209, 

 11%|████▌                                   | 117/1023 [00:02<00:20, 45.10it/s]

CADPS
8618
Int64Index([798693, 798694, 798695, 798696, 798697, 798698, 798699, 798700,
            798701, 798702, 798703, 798704, 798705],
           dtype='int64')
798693
protein binding

CADPS2
93664
Int64Index([929305, 929306, 929307, 929308, 929309, 929310, 929311, 929312,
            929313, 929314, 929315, 929316],
           dtype='int64')
929305
nucleoplasm

CAMK2A
815
Int64Index([662007, 662008, 662009, 662010, 662011, 662012, 662013, 662014,
            662015, 662016, 662017, 662018, 662019, 662020, 662021, 662022,
            662023, 662024, 662025, 662026, 662027, 662028, 662029, 662030,
            662031, 662032, 662033, 662034, 662035, 662036, 662037, 662038,
            662039, 662040, 662041, 662042, 662043, 662044, 662045, 662046,
            662047, 662048, 662049, 662050, 662051],
           dtype='int64')
662007
G1/S transition of mitotic cell cycle

CAMK2B
816
Int64Index([662052, 662053, 662054, 662055, 662056, 662057, 662058, 662059,
            662060, 662061,

 12%|████▉                                   | 127/1023 [00:02<00:20, 42.96it/s]

CASZ1
54897
Int64Index([880988, 880989, 880990, 880991, 880992, 880993, 880994, 880995,
            880996, 880997, 880998, 880999, 881000, 881001],
           dtype='int64')
880988
chromatin

CC2D1A
54862
Int64Index([880640, 880641, 880642, 880643, 880644, 880645, 880646, 880647,
            880648, 880649, 880650, 880651, 880652, 880653, 880654, 880655],
           dtype='int64')
880640
RNA polymerase II cis-regulatory region sequence-specific DNA binding

CCDC88C
440193
Int64Index([967056, 967057, 967058, 967059, 967060, 967061, 967062, 967063,
            967064, 967065, 967066, 967067, 967068, 967069, 967070, 967071,
            967072, 967073, 967074, 967075, 967076, 967077, 967078],
           dtype='int64')
967056
regulation of protein phosphorylation

CCDC91
55297
Int64Index([884975, 884976, 884977, 884978, 884979, 884980, 884981, 884982,
            884983],
           dtype='int64')
884975
nucleoplasm

CCIN
881
Int64Index([663820, 663821, 663822, 663823, 663824, 663825, 6638

 13%|█████▍                                  | 138/1023 [00:03<00:19, 46.16it/s]

CD38
952
Int64Index([665318, 665319, 665320, 665321, 665322, 665323, 665324, 665325,
            665326, 665327, 665328, 665329, 665330, 665331, 665332, 665333,
            665334, 665335, 665336, 665337, 665338, 665339, 665340, 665341,
            665342, 665343, 665344, 665345, 665346, 665347, 665348, 665349,
            665350, 665351, 665352, 665353, 665354, 665355, 665356, 665357,
            665358, 665359],
           dtype='int64')
665318
response to hypoxia

CDC42BPB
9578
Int64Index([813775, 813776, 813777, 813778, 813779, 813780, 813781, 813782,
            813783, 813784, 813785, 813786, 813787, 813788, 813789, 813790,
            813791, 813792, 813793, 813794, 813795, 813796, 813797, 813798,
            813799, 813800, 813801, 813802, 813803, 813804],
           dtype='int64')
813775
magnesium ion binding

BRINP3
339479
Int64Index([957385, 957386, 957387, 957388, 957389, 957390, 957391, 957392,
            957393, 957394, 957395, 957396, 957397],
           dtype='int64')


 14%|█████▊                                  | 148/1023 [00:03<00:19, 45.54it/s]

CDK13
8621
Int64Index([798718, 798719, 798720, 798721, 798722, 798723, 798724, 798725,
            798726, 798727, 798728, 798729, 798730, 798731, 798732, 798733,
            798734, 798735, 798736, 798737, 798738, 798739, 798740, 798741,
            798742, 798743, 798744, 798745, 798746, 798747, 798748, 798749,
            798750, 798751, 798752, 798753],
           dtype='int64')
798718
cyclin-dependent protein kinase holoenzyme complex

CDK19
23097
Int64Index([843845, 843846, 843847, 843848, 843849, 843850, 843851, 843852,
            843853, 843854, 843855, 843856, 843857, 843858],
           dtype='int64')
843845
protein serine/threonine kinase activity

CDK8
1024
Int64Index([667314, 667315, 667316, 667317, 667318, 667319, 667320, 667321,
            667322, 667323, 667324, 667325, 667326, 667327, 667328, 667329,
            667330],
           dtype='int64')
667314
protein kinase activity

CDKL5
6792
Int64Index([774052, 774053, 774054, 774055, 774056, 774057, 774058, 774059,
   

 15%|██████▏                                 | 158/1023 [00:03<00:18, 46.54it/s]

In [95]:
human_gene_indexes

Int64Index([643502, 643503, 643504, 643505, 643506, 643507, 643508, 643509,
            643510, 643511,
            ...
            974915, 974916, 974917, 974918, 974919, 974920, 974921, 974922,
            974923, 974924],
           dtype='int64', length=331423)

In [103]:
go_terms

{'ABAT': 'response to hypoxia',
 'ABCA10': 'lipid transporter activity',
 'ABCA13': 'lipid transporter activity',
 'ABCA7': 'Golgi membrane',
 'ACE': 'kidney development',
 'ACHE': 'acetylcholine catabolic process in synaptic cleft',
 'ACTB': 'regulation of cyclin-dependent protein serine/threonine kinase activity',
 'ACTL6B': 'chromatin binding',
 'ACTN4': 'RNA polymerase II transcription regulatory region sequence-specific DNA binding',
 'ACY1': 'aminoacylase activity',
 'ADA': 'allantoin metabolic process',
 'ADCY3': 'adenylate cyclase activity',
 'ADCY5': 'G protein-coupled adenosine receptor signaling pathway',
 'ADK': 'RNA binding',
 'ADNP': 'chromatin',
 'ADORA3': 'G protein-coupled adenosine receptor activity',
 'ADSL': 'response to hypoxia',
 'AFF2': 'G-quadruplex RNA binding',
 'AGAP1': 'GTPase activity',
 'AGAP2': 'GTPase activity',
 'ADSS2': 'magnesium ion binding',
 'AGBL4': 'metallocarboxypeptidase activity',
 'AGMO': 'iron ion binding',
 'AGO1': 'P-body',
 'AGO3': 'conde

# Task 3

In [38]:
eid = parse_eid()
go_terms = open_pickle_file("go-terms.pickle")

In [66]:
gene_scores = data["gene-score"]
unique_gs = gene_scores.unique()
task3 = {}


for g in unique_gs:
    score_indexes =  gene_scores[gene_scores == g].index
    symbols = data["gene-symbol"].filter(items=score_indexes, axis=0)  
    task3[g] = symbols

In [67]:
task3

{3.0: 0         ABAT
 1       ABCA10
 2       ABCA13
 3        ABCA7
 4          ACE
          ...  
 1014      WWP1
 1015     XRCC6
 1016     YWHAG
 1020    ZNF827
 1022    ZWILCH
 Name: gene-symbol, Length: 515, dtype: object,
 2.0: 5          ACHE
 10          ADA
 11        ADCY3
 19        AGAP2
 23         AGO1
          ...   
 1000      ZC3H4
 1002    ZMYND11
 1013    ZNF804A
 1017      ZMIZ1
 1018      ZMYM2
 Name: gene-symbol, Length: 221, dtype: object,
 1.0: 6         ACTB
 14        ADNP
 16        ADSL
 17        AFF2
 27       AHDC1
          ...  
 984       UBR1
 998     ZBTB20
 1004    ZNF292
 1006    ZNF462
 1019    ZMYND8
 Name: gene-symbol, Length: 206, dtype: object,
 nan: Series([], Name: gene-symbol, dtype: object)}

# Task 4

In [81]:
gene2go = open_pickle_file("gene2go.pickle")
eid = parse_eid()

In [92]:
cols = ["GO_ID", "GO_term", "GO_count"]

indexes = get_GO_indexes(symbol_data)

100%|███████████████████████████████████████| 1023/1023 [00:25<00:00, 40.51it/s]


In [93]:
indexes

({'ABAT': Int64Index([643651, 643652, 643653, 643654, 643655, 643656, 643657, 643658,
              643659, 643660, 643661, 643662, 643663, 643664, 643665, 643666,
              643667, 643668, 643669, 643670, 643671, 643672, 643673, 643674,
              643675, 643676, 643677, 643678, 643679, 643680, 643681, 643682,
              643683, 643684, 643685, 643686, 643687, 643688, 643689, 643690,
              643691],
             dtype='int64'),
  'ABCA10': Int64Index([825355, 825356, 825357, 825358, 825359, 825360, 825361, 825362], dtype='int64'),
  'ABCA13': Int64Index([944280, 944281, 944282, 944283, 944284, 944285, 944286, 944287,
              944288, 944289, 944290, 944291, 944292, 944293, 944294],
             dtype='int64'),
  'ABCA7': Int64Index([825298, 825299, 825300, 825301, 825302, 825303, 825304, 825305,
              825306, 825307, 825308, 825309, 825310, 825311, 825312, 825313,
              825314, 825315, 825316, 825317, 825318, 825319, 825320, 825321,
              