In [None]:
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt

import pandas as pd
import os
import sys
sys.path.append('../')
from deepfold.data.aminoacids import MAXLEN
from deepfold.data.ontology import Ontology
from deepfold.data.utils.data_utils import is_exp_code, is_cafa_target

###  Input file

In [5]:
data_base_path = u"/Users/robin/xbiome/datasets/protein"

### INPUT FILES ###
# Gene Ontology file in OBO Format
go_file = os.path.join(data_base_path, u"go.obo")

# UniProt/SwissProt knowledgebase file in text format (archived)
swissprot_file = os.path.join(data_base_path, u"uniprot_sprot.dat.gz")

### OUTPUT FILES ###
# Result file with a list of proteins, sequences and annotations from swissprot_file in pandas.DataFrame format
out_swissprot_pd_file = os.path.join(data_base_path, u"swissprot.pkl")

# Result file with a list of terms for prediction task
out_terms_file = os.path.join(data_base_path, u"terms.pkl")

# Result file with a DataFrame for training
train_data_file = os.path.join(data_base_path, u"train_data.pkl")

# Result file with a DataFrame for testing
test_data_file = os.path.join(data_base_path, u"test_data.pkl")

# Minimum number of annotated proteins in each GO annotation
min_count = 50

data_path_dict = {}

### INPUT FILES ###
data_path_dict['go'] = go_file
data_path_dict['uniprot_sprot'] = swissprot_file
### OUTPUT FILES ###
data_path_dict['swissprot'] = out_swissprot_pd_file
data_path_dict['train_data'] = train_data_file
data_path_dict['test_data'] = test_data_file
data_path_dict['terms'] = out_terms_file

data_path_dict

{'go': '/Users/robin/xbiome/datasets/protein/go.obo',
 'uniprot_sprot': '/Users/robin/xbiome/datasets/protein/uniprot_sprot.dat.gz',
 'swissprot': '/Users/robin/xbiome/datasets/protein/swissprot.pkl',
 'train_data': '/Users/robin/xbiome/datasets/protein/train_data.pkl',
 'test_data': '/Users/robin/xbiome/datasets/protein/test_data.pkl',
 'terms': '/Users/robin/xbiome/datasets/protein/terms.pkl'}

# Gene Ontology from .obo File

In [8]:
go = Ontology(data_path_dict['go'], with_rels=True)

# UniProt/SWISS-PROT

## Parsing uniprot_sprot.dat.gz by load_data(swissprot_file) in uni2pandas.py

*download text format from https://www.uniprot.org/downloads#uniprotkblink*

https://web.expasy.org/docs/userman.html#OX_line

**The ID (IDentification) line is always the first line of an entry.**
    
    ID   EntryName Status; SequenceLength.

**The AC (ACcession number) line lists the accession number(s) associated with an entry.**

    AC   AC_number_1;[ AC_number_2;]...[ AC_number_N;]

**The OX (Organism taxonomy cross-reference) line is used to indicate the identifier of a specific organism in a taxonomic database.**

    OX   Taxonomy_database_Qualifier=Taxonomic code;

**The DR (Database cross-Reference) lines are used as pointers to information in external data resources that is related to UniProtKB entries.**

    DR   RESOURCE_ABBREVIATION; RESOURCE_IDENTIFIER; OPTIONAL_INFORMATION_1[; OPTIONAL_INFORMATION_2][; OPTIONAL_INFORMATION_3].

***\<GO: Gene Ontology (GO) database\>***

*For GO, the resource identifier is the accession number (also called the Unique Identifier in some databases) of the referenced entry.*

*For GO, this field OPTIONAL_INFORMATION_1 is a 1-letter abbreviation for one of the 3 ontology aspects, separated from the GO term by a column. If the term is longer than 46 characters, the first 43 characters are indicated followed by 3 dots ('...'). The abbreviations for the 3 distinct aspects of the ontology are P (biological Process), F (molecular Function), and C (cellular Component).*

*For GO, this field OPTIONAL_INFORMATION_2 is a 3-character GO evidence code. The GO evidence code is followed by the source database from which the cross-reference was obtained, separated by a colon. The definitions of the evidence codes are: IDA=inferred from direct assay, IMP=inferred from mutant phenotype, IGI=inferred from genetic interaction, IPI=inferred from physical interaction, IEP=inferred from expression pattern, TAS=traceable author statement, NAS=non-traceable author statement, IC=inferred by curator, ISS=inferred from sequence or structural similarity.*

*Example*

DR   GO; GO:0005576; C:extracellular region; IEA:UniProtKB-SubCell.

DR   GO; GO:0001772; C:immunological synapse; TAS:UniProtKB.

DR   GO; GO:0005634; C:nucleus; TAS:UniProtKB.  

***\<InterPro: Integrated resource of protein families, domains and functional sites (InterPro)\>***

*For InterPro, the resource identifier is the accession number (also called the Unique Identifier in some databases) of the referenced entry.*

*For InterPro, this field OPTIONAL_INFORMATION_1 is the entry name.*

*Example*

DR   InterPro; IPR009003; Peptidase_S1_PA.

DR   InterPro; IPR001314; Peptidase_S1A.

DR   InterPro; IPR001254; Trypsin_dom.

DR   InterPro; IPR018114; TRYPSIN_HIS.  

**The SQ (SeQuence header) line marks the beginning of the sequence data and gives a quick summary of its content.**

    SQ   SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;
    
*The sequence data line has a line code consisting of two blanks rather than the two-letter codes used until now. The sequence counts 60 amino acids per line, in groups of 10 amino acids, beginning in position 6 of the line.*

*Example*

SQ   SEQUENCE   97 AA;  9110 MW;  E3C20C259858B830 CRC64;

     MTILASICKL GNTKSTSSSI GSSYSSAVSF GSNSVSCGEC GGDGPSFPNA SPRTGVKAGV

     NVDGLLGAIG KTVNGMLISP NGGGGGMGMG GGSCGCI

In [11]:
from deepfold.data.utils.load_swissport import load_swissport

In [12]:
proteins, accessions, sequences, annotations, interpros, orgs = load_swissport(
    data_path_dict['uniprot_sprot'])


## Converting into pandas.DataFrame

In [13]:
df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

df

Unnamed: 0,proteins,accessions,sequences,annotations,interpros,orgs
0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924
1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924
2,002R_IIV3,Q197F8;,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,[],[],345201
3,003L_IIV3,Q197F7;,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,[],[],345201
4,003R_FRG3G,Q6GZX2;,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,[],[],654924
...,...,...,...,...,...,...
565249,Z_SABVB,Q6UY62;,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",2169992
565250,Z_SHEEP,P08105;,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,[],[],9940
565251,Z_TACVF,Q88470;,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",928313
565252,Z_TAMVU,A9JR22;,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",45223


## Filtering proteins with experimental annotations

In [14]:
index = []
annotations = []
for i, row in enumerate(df.itertuples()):
    annots = []
    for annot in row.annotations:
        go_id, code = annot.split('|')
        if is_exp_code(code):
            annots.append(go_id)
    # Ignore proteins without experimental annotations
    if len(annots) == 0:
        continue
    index.append(i)
    annotations.append(annots)
df = df.iloc[index]
df = df.reset_index()
df['exp_annotations'] = annotations

df

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations
0,225,11K_PAVHV,P0DJZ0;,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,"[GO:0030430|IDA, GO:0039526|IEA]",[],648237,[GO:0030430]
1,226,11S1_CARIL,B5KVH4;,MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,"[GO:0019863|IEA, GO:0045735|IC, GO:0048316|IEP...","[IPR022379, IPR006044, IPR006045, IPR014710, I...",32201,"[GO:0045735, GO:0048316, GO:0010431]"
2,230,11S2_SESIN,Q9XHP0;,MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...,"[GO:0042735|NAS, GO:0045735|NAS, GO:0010431|IEP]","[IPR022379, IPR006044, IPR006045, IPR014710, I...",4182,[GO:0010431]
3,244,128UP_DROME,P32234; Q9V648;,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,"[GO:0005737|IBA, GO:0005525|IDA, GO:0002181|IBA]","[IPR012675, IPR031167, IPR006073, IPR031662, I...",7227,[GO:0005525]
4,256,13KDA_SCYCA,P83011;,MIFTAXDRSAIEXV,"[GO:0005783|IEA, GO:0043231|IDA]",[],7830,[GO:0043231]
...,...,...,...,...,...,...,...,...
77461,565225,ZYX_XENLA,A5H447;,MDPAAPATRMTSSFTINISTPSFYNPPKKFAPVVPPKPKINPFKAP...,"[GO:0005737|IEA, GO:0005925|ISS, GO:0001725|IS...",[IPR001781],8355,"[GO:0008134, GO:0006357]"
77462,565229,ZZZ3_HUMAN,Q8IYH5; B7WPC6; Q6N004; Q6N070; Q8IYP0; Q8IYR1...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,"[GO:0005671|IBA, GO:0005730|IDA, GO:0005654|ID...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",9606,"[GO:0005730, GO:0005654]"
77463,565230,ZZZ3_MOUSE,Q6KAQ7; Q3TMK6; Q3V189;,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,"[GO:0005671|IDA, GO:0005730|ISO, GO:0005654|IS...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",10090,[GO:0005671]
77464,565239,Z_LASSJ,O73557;,MGNKQAKAPESKDSPRASLIPDATHLGPQFCKSCWFENKGLVECNN...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",11622,[GO:0046761]


## Propagate annotations

In [15]:
prop_annotations = []
for i, row in df.iterrows():
    # Propagate annotations
    annot_set = set()
    annots = row['exp_annotations']
    for go_id in annots:
        annot_set |= go.get_anchestors(go_id)
    annots = list(annot_set)
    prop_annotations.append(annots)
df['prop_annotations'] = prop_annotations

df

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,prop_annotations
0,225,11K_PAVHV,P0DJZ0;,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,"[GO:0030430|IDA, GO:0039526|IEA]",[],648237,[GO:0030430],"[GO:0043656, GO:0033643, GO:0030430, GO:000557..."
1,226,11S1_CARIL,B5KVH4;,MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,"[GO:0019863|IEA, GO:0045735|IC, GO:0048316|IEP...","[IPR022379, IPR006044, IPR006045, IPR014710, I...",32201,"[GO:0045735, GO:0048316, GO:0010431]","[GO:0032502, GO:0045735, GO:0010431, GO:007169..."
2,230,11S2_SESIN,Q9XHP0;,MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...,"[GO:0042735|NAS, GO:0045735|NAS, GO:0010431|IEP]","[IPR022379, IPR006044, IPR006045, IPR014710, I...",4182,[GO:0010431],"[GO:0032502, GO:0010431, GO:0071695, GO:000815..."
3,244,128UP_DROME,P32234; Q9V648;,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,"[GO:0005737|IBA, GO:0005525|IDA, GO:0002181|IBA]","[IPR012675, IPR031167, IPR006073, IPR031662, I...",7227,[GO:0005525],"[GO:0035639, GO:0000166, GO:0043167, GO:003255..."
4,256,13KDA_SCYCA,P83011;,MIFTAXDRSAIEXV,"[GO:0005783|IEA, GO:0043231|IDA]",[],7830,[GO:0043231],"[GO:0043231, GO:0016020, GO:0005622, GO:000557..."
...,...,...,...,...,...,...,...,...,...
77461,565225,ZYX_XENLA,A5H447;,MDPAAPATRMTSSFTINISTPSFYNPPKKFAPVVPPKPKINPFKAP...,"[GO:0005737|IEA, GO:0005925|ISS, GO:0001725|IS...",[IPR001781],8355,"[GO:0008134, GO:0006357]","[GO:0010556, GO:0006725, GO:0006355, GO:004423..."
77462,565229,ZZZ3_HUMAN,Q8IYH5; B7WPC6; Q6N004; Q6N070; Q8IYP0; Q8IYR1...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,"[GO:0005671|IBA, GO:0005730|IDA, GO:0005654|ID...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",9606,"[GO:0005730, GO:0005654]","[GO:0043231, GO:0070013, GO:0005730, GO:000563..."
77463,565230,ZZZ3_MOUSE,Q6KAQ7; Q3TMK6; Q3V189;,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,"[GO:0005671|IDA, GO:0005730|ISO, GO:0005654|IS...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",10090,[GO:0005671],"[GO:0043231, GO:0070013, GO:0032991, GO:000563..."
77464,565239,Z_LASSJ,O73557;,MGNKQAKAPESKDSPRASLIPDATHLGPQFCKSCWFENKGLVECNN...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",11622,[GO:0046761],"[GO:0046761, GO:0046753, GO:0044403, GO:001905..."


## adding CAFA information

In [16]:
cafa_target = []
for i, row in enumerate(df.itertuples()):
    if is_cafa_target(row.orgs):
        cafa_target.append(True)
    else:
        cafa_target.append(False)
df['cafa_target'] = cafa_target

df

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,prop_annotations,cafa_target
0,225,11K_PAVHV,P0DJZ0;,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,"[GO:0030430|IDA, GO:0039526|IEA]",[],648237,[GO:0030430],"[GO:0043656, GO:0033643, GO:0030430, GO:000557...",False
1,226,11S1_CARIL,B5KVH4;,MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,"[GO:0019863|IEA, GO:0045735|IC, GO:0048316|IEP...","[IPR022379, IPR006044, IPR006045, IPR014710, I...",32201,"[GO:0045735, GO:0048316, GO:0010431]","[GO:0032502, GO:0045735, GO:0010431, GO:007169...",False
2,230,11S2_SESIN,Q9XHP0;,MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...,"[GO:0042735|NAS, GO:0045735|NAS, GO:0010431|IEP]","[IPR022379, IPR006044, IPR006045, IPR014710, I...",4182,[GO:0010431],"[GO:0032502, GO:0010431, GO:0071695, GO:000815...",False
3,244,128UP_DROME,P32234; Q9V648;,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,"[GO:0005737|IBA, GO:0005525|IDA, GO:0002181|IBA]","[IPR012675, IPR031167, IPR006073, IPR031662, I...",7227,[GO:0005525],"[GO:0035639, GO:0000166, GO:0043167, GO:003255...",True
4,256,13KDA_SCYCA,P83011;,MIFTAXDRSAIEXV,"[GO:0005783|IEA, GO:0043231|IDA]",[],7830,[GO:0043231],"[GO:0043231, GO:0016020, GO:0005622, GO:000557...",False
...,...,...,...,...,...,...,...,...,...,...
77461,565225,ZYX_XENLA,A5H447;,MDPAAPATRMTSSFTINISTPSFYNPPKKFAPVVPPKPKINPFKAP...,"[GO:0005737|IEA, GO:0005925|ISS, GO:0001725|IS...",[IPR001781],8355,"[GO:0008134, GO:0006357]","[GO:0010556, GO:0006725, GO:0006355, GO:004423...",False
77462,565229,ZZZ3_HUMAN,Q8IYH5; B7WPC6; Q6N004; Q6N070; Q8IYP0; Q8IYR1...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,"[GO:0005671|IBA, GO:0005730|IDA, GO:0005654|ID...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",9606,"[GO:0005730, GO:0005654]","[GO:0043231, GO:0070013, GO:0005730, GO:000563...",True
77463,565230,ZZZ3_MOUSE,Q6KAQ7; Q3TMK6; Q3V189;,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,"[GO:0005671|IDA, GO:0005730|ISO, GO:0005654|IS...","[IPR009057, IPR017930, IPR001005, IPR000433, I...",10090,[GO:0005671],"[GO:0043231, GO:0070013, GO:0032991, GO:000563...",True
77464,565239,Z_LASSJ,O73557;,MGNKQAKAPESKDSPRASLIPDATHLGPQFCKSCWFENKGLVECNN...,"[GO:0044220|IEA, GO:0020002|IEA, GO:0016020|IE...","[IPR024183, IPR038485, IPR003224]",11622,[GO:0046761],"[GO:0046761, GO:0046753, GO:0044403, GO:001905...",False


## Pickling pandas.DataFrame to swissprot.pkl

In [17]:
df.to_pickle(data_path_dict['swissprot'])

print("DATA FILE" ,len(df))

DATA FILE 77466


## Splitting into train/test   

In [19]:
# load DataFrame file and split it into two sets
def load_data_to_df_sets(data_file, split, random_split=True):
    # load DataFrame file
    df = pd.read_pickle(data_file)
    # total number
    n = len(df)
    # index to split
    split_idx = int(n * split)
    # shuffle index
    index = np.arange(n)
    if random_split:
        np.random.seed(seed=0)
        np.random.shuffle(index)
    # Split into set1 and set2
    set1_df = df.iloc[index[:split_idx]]
    set2_df = df.iloc[index[split_idx:]]

    return set1_df, set2_df


train_df, test_df = load_data_to_df_sets(data_path_dict['swissprot'], 0.95)

print('Number of train proteins:', len(train_df))
train_df.to_pickle(data_path_dict['train_data'])
print("Train dataset is saved to %s\n"%(data_path_dict['train_data']))

print('Number of test proteins:', len(test_df))
test_df.to_pickle(data_path_dict['test_data'])
print("Test dataset is saved to %s\n"%(data_path_dict['test_data']))

Number of train proteins: 73592
Train dataset is saved to /Users/robin/xbiome/datasets/protein/train_data.pkl

Number of test proteins: 3874


PermissionError: [Errno 13] Permission denied: '/Users/robin/xbiome/datasets/protein/test_data.pkl'

## Filtering terms with annotations more than min_count

In [None]:
# get frequency of each term in 'prop_annotations'
cnt = Counter()
for i, row in df.iterrows():
    for term in row['prop_annotations']:
        cnt[term] += 1

print("Number of prop_annotations:", len(cnt))

Number of prop_annotations: 31145


In [None]:
# Filter terms with annotations more than min_count
res = {}
for key, val in cnt.items():
    if val >= min_count:
        ont = key.split(':')[0]
        if ont not in res:
            res[ont] = []
        res[ont].append(key)
        
terms = []
for key, val in res.items():
    print(key, len(val))
    terms += val

print("Number of prop_annotations with proteins more than %d: %d (%.2f%%)"
      %(min_count, len(terms), 100*len(terms)/len(cnt)))

GO 5874
Number of prop_annotations with proteins more than 50: 5874 (18.86%)


In [None]:
# Pickle the list of terms to terms.pkl
terms_df = pd.DataFrame({'terms': terms})
terms_df.to_pickle(data_path_dict['terms'])

terms_df

Unnamed: 0,terms
0,GO:0030430
1,GO:0110165
2,GO:0033643
3,GO:0043657
4,GO:0033646
...,...
5869,GO:0004984
5870,GO:0050911
5871,GO:0005665
5872,GO:0000315


In [None]:
terms = terms_df['terms'].values.flatten()

terms

array(['GO:0030430', 'GO:0110165', 'GO:0033643', ..., 'GO:0005665',
       'GO:0000315', 'GO:0005762'], dtype=object)

In [None]:
terms_dict = {v: i for i, v in enumerate(terms)}

terms_dict

{'GO:0030430': 0,
 'GO:0110165': 1,
 'GO:0033643': 2,
 'GO:0043657': 3,
 'GO:0033646': 4,
 'GO:0043656': 5,
 'GO:0005575': 6,
 'GO:0018995': 7,
 'GO:0048731': 8,
 'GO:0048316': 9,
 'GO:0003674': 10,
 'GO:0010431': 11,
 'GO:0048856': 12,
 'GO:0032502': 13,
 'GO:0032504': 14,
 'GO:0000003': 15,
 'GO:0009791': 16,
 'GO:0048608': 17,
 'GO:0071695': 18,
 'GO:0061458': 19,
 'GO:0048609': 20,
 'GO:0003006': 21,
 'GO:0010154': 22,
 'GO:0021700': 23,
 'GO:0007275': 24,
 'GO:0032501': 25,
 'GO:0008150': 26,
 'GO:0022414': 27,
 'GO:1901363': 28,
 'GO:0017076': 29,
 'GO:0097159': 30,
 'GO:0032553': 31,
 'GO:0036094': 32,
 'GO:0005525': 33,
 'GO:0035639': 34,
 'GO:0043167': 35,
 'GO:0043168': 36,
 'GO:0032561': 37,
 'GO:0005488': 38,
 'GO:0019001': 39,
 'GO:0097367': 40,
 'GO:0032555': 41,
 'GO:0000166': 42,
 'GO:1901265': 43,
 'GO:0043229': 44,
 'GO:0016020': 45,
 'GO:0043226': 46,
 'GO:0043231': 47,
 'GO:0043227': 48,
 'GO:0005622': 49,
 'GO:0071383': 50,
 'GO:0009755': 51,
 'GO:0097305': 52,
 'G

In [None]:
nb_classes = len(terms)

nb_classes

5874