In [2]:
import pandas as pd
import os

In [3]:
data_path = '/data/xbiome/protein_classification/cafa3'
train_data_file = os.path.join(data_path, 'train_data.pkl')
test_data_file = os.path.join(data_path, 'test_data.pkl')
terms_file = os.path.join(data_path, 'terms.pkl')

In [15]:
go_file = os.path.join(data_path, 'go_cafa3.obo')

In [6]:
train_data = pd.read_pickle(train_data_file)

In [4]:
train_data.head()

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"{GO:0004511, GO:0016491, GO:0097458, GO:004429..."
1,A0A068FIK2,MEVGGGSEECCVKVAVHVRPLIGDEKVQGCKDCVTVIPGKPQVQIG...,"{GO:0043232, GO:0030863, GO:0044422, GO:000557..."
2,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,"{GO:0023051, GO:0032940, GO:0032504, GO:190353..."
3,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,"{GO:0044428, GO:0019538, GO:0046777, GO:006500..."
4,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,"{GO:0055085, GO:0005216, GO:0051179, GO:002284..."


In [10]:
len(train_data.iloc[0,2])

18

In [11]:
terms = pd.read_pickle(terms_file)
len(terms)

29015

In [4]:
test_data = pd.read_pickle(test_data_file)
len(test_data)

3328

In [14]:
class Ontology(object):
    """
    [Term]
    id: GO:0000003
    name: reproduction
    namespace: biological_process
    alt_id: GO:0019952
    alt_id: GO:0050876
    def: "The production of new individuals that contain some portion of genetic material \
        inherited from one or more parent organisms." [GOC:go_curators, GOC:isa_complete,\
        GOC:jl, ISBN:0198506732]
    subset: goslim_agr
    subset: goslim_chembl
    subset: goslim_flybase_ribbon
    subset: goslim_pir
    subset: goslim_plant
    synonym: "reproductive physiological process" EXACT []
    xref: Wikipedia:Reproduction
    is_a: GO:0008150 ! biological_process
    disjoint_from: GO:0044848 ! biological phase
    """
    def __init__(self, filename='data/go.obo', with_rels=False):
        self.ontology = self.load_obo(filename, with_rels=with_rels)
        self.ic = None

    def load_obo(self, filename, with_rels=False):
        ontlogy = dict()
        goobj = None
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    if goobj is not None:
                        ontlogy[goobj['id']] = goobj

                    goobj = dict()
                    goobj['is_a'] = list()
                    goobj['part_of'] = list()
                    goobj['regulates'] = list()
                    goobj['alt_ids'] = list()
                    goobj['is_obsolete'] = False
                    continue

                elif line == '[Typedef]':
                    if goobj is not None:
                        ontlogy[goobj['id']] = goobj
                    goobj = None

                else:
                    if goobj is None:
                        continue

                    subline = line.split(': ')
                    if subline[0] == 'id':
                        goobj['id'] = subline[1]
                    elif subline[0] == 'alt_id':
                        goobj['alt_ids'].append(subline[1])
                    elif subline[0] == 'namespace':
                        goobj['namespace'] = subline[1]
                    elif subline[0] == 'is_a':
                        goobj['is_a'].append(subline[1].split(' ! ')[0])
                    elif with_rels and subline[0] == 'relationship':
                        it = subline[1].split()
                        # add all types of relationships
                        goobj['is_a'].append(it[1])
                    elif subline[0] == 'name':
                        goobj['name'] = subline[1]
                    elif subline[0] == 'is_obsolete' and subline[1] == 'true':
                        goobj['is_obsolete'] = True
            if goobj is not None:
                ontlogy[goobj['id']] = goobj
            for term_id in list(ontlogy.keys()):
                for alt_id in ontlogy[term_id]['alt_ids']:
                    ontlogy[alt_id] = ontlogy[term_id]
                if ontlogy[term_id]['is_obsolete']:
                    del ontlogy[term_id]

            for term_id, val in ontlogy.items():
                if 'children' not in val:
                    val['children'] = set()
                for p_id in val['is_a']:
                    if p_id in ontlogy:
                        if 'children' not in ontlogy[p_id]:
                            ontlogy[p_id]['children'] = set()
                        ontlogy[p_id]['children'].add(term_id)
        return ontlogy

    def has_term(self, term_id):
        return term_id in self.ontology

    def get_term(self, term_id):
        if self.has_term(term_id):
            return self.ontology[term_id]
        return None

    def calculate_ic(self, annots):
        cnt = Counter()
        for x in annots:
            cnt.update(x)
        self.ic = {}
        for go_id, n in cnt.items():
            parents = self.get_parents(go_id)
            if len(parents) == 0:
                min_n = n
            else:
                min_n = min([cnt[x] for x in parents])

            self.ic[go_id] = math.log(min_n / n, 2)

    def get_ic(self, go_id):
        if self.ic is None:
            raise Exception('Not yet calculated')
        if go_id not in self.ic:
            return 0.0
        return self.ic[go_id]

    def get_anchestors(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while (len(q) > 0):
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for parent_id in self.ontology[t_id]['is_a']:
                    if parent_id in self.ontology:
                        q.append(parent_id)
        return term_set

    def get_parents(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        for parent_id in self.ontology[term_id]['is_a']:
            if parent_id in self.ontology:
                term_set.add(parent_id)
        return term_set

    def get_namespace_terms(self, namespace):
        terms = set()
        for go_id, goobj in self.ontology.items():
            if goobj['namespace'] == namespace:
                terms.add(go_id)
        return terms

    def get_namespace(self, term_id):
        return self.ontology[term_id]['namespace']

    def get_term_set(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while len(q) > 0:
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for ch_id in self.ontology[t_id]['children']:
                    q.append(ch_id)
        return term_set

In [33]:
def seperate(data_file, go_file):
    df = pd.read_pickle(data_file)
    ont = Ontology(go_file, with_rels=True)
    bpo_proteins = []
    bpo_sequences = []
    bpo_annotations = []
    mfo_proteins = []
    mfo_sequences = []
    mfo_annotations = []
    cco_proteins = []
    cco_sequences = []
    cco_annotations = []
    for item in df.iterrows():
        protien = item[1]['proteins']
        seq = item[1]['sequences']
        annotation = item[1]['annotations']
        bpo_annotation = []
        mfo_annotation = []
        cco_annotation = []
        for term in annotation:
            if ont.get_namespace(term) == 'biological_process':
                bpo_annotation.append(term)
            elif ont.get_namespace(term) == 'molecular_function':
                mfo_annotation.append(term)
            elif ont.get_namespace(term) == 'cellular_component':
                cco_annotation.append(term)
        if len(bpo_annotation) > 0:
            bpo_proteins.append(protien)
            bpo_sequences.append(seq)
            bpo_annotations.append(bpo_annotation)
        if len(mfo_annotation) > 0:
            mfo_proteins.append(protien)
            mfo_sequences.append(seq)
            mfo_annotations.append(mfo_annotation)
        if len(cco_annotation) > 0:
            cco_proteins.append(protien)
            cco_sequences.append(seq)
            cco_annotations.append(cco_annotation)
        break
    bpo_df = pd.DataFrame({
        'proteins': bpo_proteins,
        'sequences': bpo_sequences,
        'annotations': bpo_annotations,
    })
    mfo_df = pd.DataFrame({
        'proteins': mfo_proteins,
        'sequences': mfo_sequences,
        'annotations': mfo_annotations,
    })
    cco_df = pd.DataFrame({
        'proteins': cco_proteins,
        'sequences': cco_sequences,
        'annotations': cco_annotations,
    })
    return bpo_df, mfo_df, cco_df


In [34]:
bpo_df, mfo_df, cco_df = seperate(train_data_file, go_file)

In [35]:
bpo_df.to_pickle()

Unnamed: 0,proteins,sequences,annotations


In [36]:
mfo_df

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"[GO:0004497, GO:0016714, GO:0016491, GO:000451..."


In [37]:
cco_df

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"[GO:0043204, GO:0005575, GO:0070852, GO:004302..."


In [32]:
test_data

Unnamed: 0,proteins,sequences,annotations
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"{GO:0044085, GO:0005911, GO:0031529, GO:005134..."
1,T100900000046,MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,"{GO:0042802, GO:0005488, GO:0005515, GO:0003674}"
2,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,"{GO:0033365, GO:0010008, GO:0098794, GO:007194..."
3,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,"{GO:0035601, GO:0033365, GO:0060341, GO:000208..."
4,T100900000141,MAVPGPTARAGARPRLDLQLVQRFVRIQKVFFPSWSSQNVLMFMTL...,"{GO:0071310, GO:0034097, GO:0051716, GO:007088..."
...,...,...,...
3323,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,"{GO:0010629, GO:0097159, GO:1901362, GO:000635..."
3324,T992870001259,MKQGLQLRLSQQLAMTPQLQQAIRLLQLSTLELQQELQQALENNPL...,"{GO:0000975, GO:0006355, GO:0090304, GO:001060..."
3325,T992870001336,MDYQNNVSEERVAEMIWDAVSEGATLKDVHGIPQDMMDGLYAHAYE...,"{GO:0042802, GO:0005488, GO:0005515, GO:0003674}"
3326,T992870001601,MTVDSNTSSGRGNDPEQIDLIELLLQLWRGKMTIIVAVIIAILLAV...,"{GO:0042802, GO:0005488, GO:0005515, GO:0003674}"


In [38]:
data_path = '/data/xbiome/protein_classification/cafa3'

save_bpo = os.path.join(data_path, 'bpo')
save_mfo = os.path.join(data_path, 'mfo')
save_cco = os.path.join(data_path, 'cco')
bpo_train = os.path.join(save_bpo, 'bpo_train_data.pkl')
df = pd.read_pickle(bpo_train)
df




Unnamed: 0,proteins,sequences,annotations
0,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,"[GO:0044765, GO:0050794, GO:1903530, GO:002305..."
1,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,"[GO:0071704, GO:0050794, GO:0044267, GO:001953..."
2,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,"[GO:0044765, GO:0015672, GO:0071805, GO:005123..."
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0071466, GO:0009410, GO:0042493, GO:000815..."
4,A0A0A1GKA2,MSDLSGEDVTVVGGGIGGLSAACYLADAGADVSLLEKNEQLGGRAS...,"[GO:0071704, GO:0016114, GO:0044255, GO:001611..."
...,...,...,...
50808,W5EP13,MLLFAPTPPPSPATAHRRPGGSAASCIRCSSVRELDRSPSRPPLPP...,"[GO:0008150, GO:0044237, GO:0006793, GO:001631..."
50809,W8DXL4,MWLSACLCLVLSFLGGVNGTCPSQCSCEYHGRHDGSGSRLVLCNDL...,"[GO:0050953, GO:0032501, GO:0050877, GO:000760..."
50810,W8E7I1,MSSEEGKLFVGGLNFNTDERALEDHFSSFGPISEVVVVKDRETQRS...,"[GO:0051716, GO:0006950, GO:0009631, GO:000926..."
50811,X1WGX5,MEGKPRKKSFTPRDGKKPSFKSKGKPGGKPQGKRPFKPHNNDKGKG...,"[GO:0006928, GO:0007276, GO:0022412, GO:000835..."


In [40]:
bpo_test = os.path.join(save_bpo, 'bpo_test_data.pkl')
df = pd.read_pickle(bpo_test)
df

Unnamed: 0,proteins,sequences,annotations
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"[GO:0065009, GO:0008150, GO:0016043, GO:004308..."
1,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,"[GO:0050794, GO:0044380, GO:0097061, GO:004215..."
2,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,"[GO:0071704, GO:0048519, GO:0050794, GO:007072..."
3,T100900000141,MAVPGPTARAGARPRLDLQLVQRFVRIQKVFFPSWSSQNVLMFMTL...,"[GO:0051716, GO:1990830, GO:0070887, GO:003409..."
4,T100900000161,MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,"[GO:0071704, GO:0044106, GO:0070291, GO:004428..."
...,...,...,...
2128,T992870000466,MIPEKRIIRRIQSGGCAIHCQDCSISQLCIPFTLNEHELDQLDNII...,"[GO:1902680, GO:0050794, GO:0006139, GO:001607..."
2129,T992870000685,MPEVKTEKPHLLDMGKPQLRMVDLNLLTVFDAVMQEQNITRAAHTL...,"[GO:1902680, GO:0050794, GO:0006139, GO:001607..."
2130,T992870001023,MMRVLVVEDNALLRHHLKVQLQDSGHQVDAAEDAREADYYLNEHLP...,"[GO:1902680, GO:0050794, GO:0006139, GO:001607..."
2131,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,"[GO:0046483, GO:0071704, GO:0050794, GO:005117..."


In [42]:
mfo_train = os.path.join(save_mfo, 'mfo_train_data.pkl')
df = pd.read_pickle(mfo_train)
df

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"[GO:0016491, GO:0003674, GO:0004511, GO:001670..."
1,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,"[GO:0016773, GO:0016301, GO:0003824, GO:000367..."
2,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,"[GO:0015267, GO:0022842, GO:0022890, GO:000521..."
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0016712, GO:0016491, GO:0016705, GO:000382..."
4,A0A096SRM5,MAANGGDHTSARPHVVLLPSAGMGHLVPFARLAVALSEGHGCNVSV...,"[GO:0035251, GO:0008194, GO:0016740, GO:000367..."
...,...,...,...
35081,V5JFY4,MGPWTLLLLHLPLVVSMLPAPTNVSIVSFNLEHTLTWLPGPETPDN...,"[GO:0004888, GO:0004904, GO:0004896, GO:000367..."
35082,V5YM14,MRPNLLAAAIAVPLSLLAAQIAQAGEGMWVPQQLPEIAGPLKKAGL...,"[GO:0016787, GO:0004177, GO:0003824, GO:004280..."
35083,V5YMB3,MRHPAFRLTLLASTVAFALAPQAAQAAPSAADRIAGTELIARDALF...,"[GO:0016787, GO:0004177, GO:0003824, GO:000417..."
35084,V9GXG1,MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,"[GO:0016787, GO:0043021, GO:0043022, GO:004487..."


In [43]:
mfo_test = os.path.join(save_mfo, 'mfo_test_data.pkl')
df = pd.read_pickle(mfo_test)
df

Unnamed: 0,proteins,sequences,annotations
0,T100900000046,MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,"[GO:0003674, GO:0005488, GO:0042802, GO:0005515]"
1,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,"[GO:0003824, GO:0098599, GO:0016790, GO:000367..."
2,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,"[GO:0016787, GO:0098599, GO:0016788, GO:001679..."
3,T100900000161,MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,"[GO:0016787, GO:0016788, GO:0016298, GO:000382..."
4,T100900000167,MEKSWMLWSFIERWLLALASWSWALCRISLLPLIVTFHLYGGIVLL...,"[GO:0016787, GO:0098599, GO:0016788, GO:001679..."
...,...,...,...
1083,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,"[GO:0003700, GO:0001071, GO:0043167, GO:003017..."
1084,T992870001259,MKQGLQLRLSQQLAMTPQLQQAIRLLQLSTLELQQELQQALENNPL...,"[GO:0001130, GO:1990837, GO:0003690, GO:000097..."
1085,T992870001336,MDYQNNVSEERVAEMIWDAVSEGATLKDVHGIPQDMMDGLYAHAYE...,"[GO:0003674, GO:0005488, GO:0042802, GO:0005515]"
1086,T992870001601,MTVDSNTSSGRGNDPEQIDLIELLLQLWRGKMTIIVAVIIAILLAV...,"[GO:0003674, GO:0005488, GO:0042802, GO:0005515]"


In [44]:
cco_train = os.path.join(save_cco, 'cco_train_data.pkl')
df = pd.read_pickle(cco_train)
df

Unnamed: 0,proteins,sequences,annotations
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,"[GO:0070852, GO:0044297, GO:0043005, GO:000557..."
1,A0A068FIK2,MEVGGGSEECCVKVAVHVRPLIGDEKVQGCKDCVTVIPGKPQVQIG...,"[GO:0030863, GO:0005881, GO:0044444, GO:004442..."
2,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,"[GO:0031974, GO:0044424, GO:0043231, GO:004322..."
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0044444, GO:0044424, GO:0043231, GO:004322..."
4,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0005615, GO:0005576, GO:0005575, GO:0044421]"
...,...,...,...
49323,U3H0P2,MKNNYTSLKSPLDEEDELKTDHEIDLEKGPLPEYDSEEEGALPPYS...,"[GO:0043231, GO:0005634, GO:0043227, GO:000563..."
49324,U4PR86,MSKYEVLQGFYAVHDELGSGGFGKVRLATHLLTNQKVAIKIIDKKQ...,"[GO:0044464, GO:0005622, GO:0015630, GO:004322..."
49325,V5JFY4,MGPWTLLLLHLPLVVSMLPAPTNVSIVSFNLEHTLTWLPGPETPDN...,"[GO:0016020, GO:0005575]"
49326,V9GXG1,MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,"[GO:0044424, GO:0043231, GO:0043227, GO:000562..."


In [45]:
cco_test = os.path.join(save_cco, 'cco_test_data.pkl')
df = pd.read_pickle(cco_test)
df

Unnamed: 0,proteins,sequences,annotations
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,"[GO:0005911, GO:0030054, GO:0005575, GO:000592..."
1,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,"[GO:0044464, GO:0055037, GO:0098805, GO:000562..."
2,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,"[GO:0044424, GO:0005623, GO:0044464, GO:004520..."
3,T100900000167,MEKSWMLWSFIERWLLALASWSWALCRISLLPLIVTFHLYGGIVLL...,"[GO:0044444, GO:0042995, GO:0044424, GO:003042..."
4,T100900000453,MRIGLLWLVPLFTLTEGTDGFLQQKNDGRRTKEIVSMVEERHPVHE...,"[GO:0044464, GO:0005623, GO:0005886, GO:000557..."
...,...,...,...
1089,T992870000225,MVLGKPQTDPTLEWFLSHCHIHKYPSKSTLIHQGEKAETLYYIVKG...,"[GO:0032993, GO:0005575, GO:0032991]"
1090,T992870000466,MIPEKRIIRRIQSGGCAIHCQDCSISQLCIPFTLNEHELDQLDNII...,"[GO:0032993, GO:0005575, GO:0032991]"
1091,T992870000685,MPEVKTEKPHLLDMGKPQLRMVDLNLLTVFDAVMQEQNITRAAHTL...,"[GO:0032993, GO:0005575, GO:0032991]"
1092,T992870001023,MMRVLVVEDNALLRHHLKVQLQDSGHQVDAAEDAREADYYLNEHLP...,"[GO:0032993, GO:0005575, GO:0032991]"
