# GOA_cnt

In [5]:
import os
import pandas as pd
from collections import Counter

def statistic_terms(train_data_path):
    "get frequency dict from train file'"
    train_data = pd.read_pickle(train_data_path)
    cnt = Counter()
    for i, row in train_data.iterrows():
        for term in row['annotations']:
            cnt[term] += 1
    print("Number of prop_annotations:", len(cnt))
    sorted_by_freq_tuples = sorted(cnt.items(), key=lambda x: x[0])
    sorted_by_freq_tuples.sort(key=lambda x: x[1], reverse=True)
    freq_dict = {go:count for go,count in sorted_by_freq_tuples}
    return freq_dict

data_path = '/data/xbiome/protein_classification/cafa3'
bpo_path = os.path.join(data_path, 'bpo')
freq_dict = statistic_terms(os.path.join(bpo_path, 'bpo_train_data.pkl'))


Number of prop_annotations: 20178


In [6]:
len(freq_dict)

20178

# make edges

In [9]:

class Ontology(object):
    """
    [Term]
    id: GO:0000003
    name: reproduction
    namespace: biological_process
    alt_id: GO:0019952
    alt_id: GO:0050876
    def: "The production of new individuals that contain some portion of genetic material \
        inherited from one or more parent organisms." [GOC:go_curators, GOC:isa_complete,\
        GOC:jl, ISBN:0198506732]
    subset: goslim_agr
    subset: goslim_chembl
    subset: goslim_flybase_ribbon
    subset: goslim_pir
    subset: goslim_plant
    synonym: "reproductive physiological process" EXACT []
    xref: Wikipedia:Reproduction
    is_a: GO:0008150 ! biological_process
    disjoint_from: GO:0044848 ! biological phase
    """
    def __init__(self, filename='data/go.obo', with_rels=False):
        self.ontology = self.load_obo(filename, with_rels=with_rels)
        self.ic = None

    def load_obo(self, filename, with_rels=False):
        ontlogy = dict()
        goobj = None
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    if goobj is not None:
                        ontlogy[goobj['id']] = goobj

                    goobj = dict()
                    goobj['is_a'] = list()
                    goobj['part_of'] = list()
                    goobj['regulates'] = list()
                    goobj['alt_ids'] = list()
                    goobj['is_obsolete'] = False
                    continue

                elif line == '[Typedef]':
                    if goobj is not None:
                        ontlogy[goobj['id']] = goobj
                    goobj = None

                else:
                    if goobj is None:
                        continue

                    subline = line.split(': ')
                    if subline[0] == 'id':
                        goobj['id'] = subline[1]
                    elif subline[0] == 'alt_id':
                        goobj['alt_ids'].append(subline[1])
                    elif subline[0] == 'namespace':
                        goobj['namespace'] = subline[1]
                    elif subline[0] == 'is_a':
                        goobj['is_a'].append(subline[1].split(' ! ')[0])
                    elif with_rels and subline[0] == 'relationship':
                        it = subline[1].split()
                        # add all types of relationships
                        goobj['is_a'].append(it[1])
                    elif subline[0] == 'name':
                        goobj['name'] = subline[1]
                    elif subline[0] == 'is_obsolete' and subline[1] == 'true':
                        goobj['is_obsolete'] = True
            if goobj is not None:
                ontlogy[goobj['id']] = goobj
            for term_id in list(ontlogy.keys()):
                for alt_id in ontlogy[term_id]['alt_ids']:
                    ontlogy[alt_id] = ontlogy[term_id]
                if ontlogy[term_id]['is_obsolete']:
                    del ontlogy[term_id]

            for term_id, val in ontlogy.items():
                if 'children' not in val:
                    val['children'] = set()
                for p_id in val['is_a']:
                    if p_id in ontlogy:
                        if 'children' not in ontlogy[p_id]:
                            ontlogy[p_id]['children'] = set()
                        ontlogy[p_id]['children'].add(term_id)
        return ontlogy

    def has_term(self, term_id):
        return term_id in self.ontology

    def get_term(self, term_id):
        if self.has_term(term_id):
            return self.ontology[term_id]
        return None

    def calculate_ic(self, annots):
        cnt = Counter()
        for x in annots:
            cnt.update(x)
        self.ic = {}
        for go_id, n in cnt.items():
            parents = self.get_parents(go_id)
            if len(parents) == 0:
                min_n = n
            else:
                min_n = min([cnt[x] for x in parents])

            self.ic[go_id] = math.log(min_n / n, 2)

    def get_ic(self, go_id):
        if self.ic is None:
            raise Exception('Not yet calculated')
        if go_id not in self.ic:
            return 0.0
        return self.ic[go_id]

    def get_anchestors(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while (len(q) > 0):
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for parent_id in self.ontology[t_id]['is_a']:
                    if parent_id in self.ontology:
                        q.append(parent_id)
        return term_set

    def get_parents(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        for parent_id in self.ontology[term_id]['is_a']:
            if parent_id in self.ontology:
                term_set.add(parent_id)
        return term_set

    def get_namespace_terms(self, namespace):
        terms = set()
        for go_id, goobj in self.ontology.items():
            if goobj['namespace'] == namespace:
                terms.add(go_id)
        return terms

    def get_namespace(self, term_id):
        return self.ontology[term_id]['namespace']

    def get_term_set(self, term_id):
        if term_id not in self.ontology:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while len(q) > 0:
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for ch_id in self.ontology[t_id]['children']:
                    q.append(ch_id)
        return term_set

In [28]:
go_ont = Ontology(go_file, with_rels=True)


In [32]:
go_ont.get_parents('GO:0014059')

{'GO:0014046', 'GO:0044070', 'GO:0050433'}

In [34]:
data_path = '/data/xbiome/protein_classification/cafa3'
go_file = os.path.join(data_path, 'go_cafa3.obo')

def make_edges(go_file, freq_dict, with_rels=True):
    go_ont = Ontology(go_file, with_rels=with_rels)
    all_terms = freq_dict.keys()
    edges = dict()
    for subj in all_terms:
        parents = []
        objs = go_ont.get_parents(subj)
        if len(objs) > 0:
            for obj in objs:
                if obj in  all_terms:
                    parents.append(obj)
            edges.update({subj:parents})
    return edges
edges = make_edges(go_file,freq_dict)

# make IC file

In [39]:
from collections import defaultdict
import math
import os

In [40]:
def read_go_children(input_go_obo_file):
    children = defaultdict(list)
    alt_id = defaultdict(list)
    term = False
    go_id = ''
    alt_ids = set()
    with open(input_go_obo_file) as read_in:
        for line in read_in:
            splitted_line = line.strip().split(':')
            if '[Term]' in line:
                term = True
                go_id = ''
                alt_ids = set()
            elif term and 'id: GO:' in line and 'alt_id' not in line:
                go_id = "GO:{}".format(splitted_line[2].strip())
            elif term and 'alt_id: GO' in line:
                alt_id_id = "GO:{}".format(splitted_line[2].strip())
                alt_ids.add(alt_id_id)
                alt_id[go_id].append(alt_id_id)
            elif term and 'is_a:' in line:
                splitted_term = splitted_line[2].split("!")
                go_term = "GO:{}".format(splitted_term[0].strip())
                children[go_term].append(go_id)
                for a in alt_ids:
                    children[go_term].append(a)
            elif '[Typedef]' in line:
                term = False
    return children, alt_id

In [46]:
def find_all_descendants(input_go_term, children):
    children_set = set()
    queue = []
    queue.append(input_go_term)
    while queue:
        node = queue.pop(0)
        if node in children and node not in children_set:
            node_children = children[node]
            queue.extend(node_children)
        children_set.add(node)
    return children_set

In [47]:
freq_dict

{'GO:0008150': 50813,
 'GO:0009987': 41610,
 'GO:0044699': 38350,
 'GO:0044763': 31633,
 'GO:0008152': 28138,
 'GO:0071704': 26685,
 'GO:0044237': 25689,
 'GO:0065007': 24157,
 'GO:0044238': 23974,
 'GO:0050789': 22206,
 'GO:0050896': 20533,
 'GO:0050794': 20189,
 'GO:0043170': 19502,
 'GO:0044260': 17775,
 'GO:0006807': 16053,
 'GO:0032501': 15377,
 'GO:0034641': 14674,
 'GO:0071840': 14553,
 'GO:0009058': 14528,
 'GO:0044710': 14214,
 'GO:0032502': 14212,
 'GO:1901576': 14162,
 'GO:1901360': 13994,
 'GO:0044249': 13951,
 'GO:0044767': 13944,
 'GO:0016043': 13882,
 'GO:0044707': 13819,
 'GO:0051716': 13612,
 'GO:0048856': 13355,
 'GO:0006725': 13320,
 'GO:0046483': 13128,
 'GO:0051179': 12189,
 'GO:0006139': 12158,
 'GO:0019222': 11556,
 'GO:0007275': 11530,
 'GO:0048518': 10688,
 'GO:0031323': 10610,
 'GO:0010467': 10598,
 'GO:0090304': 10578,
 'GO:0007154': 10537,
 'GO:0060255': 10480,
 'GO:0080090': 10380,
 'GO:0006950': 10353,
 'GO:0009059': 10019,
 'GO:0048731': 9881,
 'GO:001953

In [52]:
def store_counts_for_GO_terms(freq_dict, alt_id):
    go_cnt = defaultdict()
    for term in freq_dict:
        cnt = int(freq_dict[term])
        if term in alt_id.keys():
            for x in alt_id[term]:
                term = x
                if term not in go_cnt:
                    go_cnt[term] = cnt
                else:
                    go_cnt[term] = go_cnt[term] + cnt
        else:
            if term not in go_cnt:
                go_cnt[term] = cnt
            else:
                go_cnt[term] = go_cnt[term] + cnt
    return go_cnt

In [43]:
def calculate_freq(term, children_set, go_cnt):
    freq = 0
    if term in go_cnt.keys():
        freq = freq + go_cnt[term]
    for children in children_set:
        if children in go_cnt.keys():
            freq = freq + go_cnt[children]
    return freq

In [45]:
def calculate_information_contents_of_GO_terms(input_go_cnt_file, children, alt_id):
    ic_dict = defaultdict()
    go_cnt = store_counts_for_GO_terms(input_go_cnt_file, alt_id)
    for x in range(0, 3):
        if x == 0:
            root = 'GO:0005575'  # cellular component
        elif x == 1:
            root = 'GO:0008150'  # biological process
        elif x == 2:
            root = 'GO:0003674'  # molecular function
        root_descendants = find_all_descendants(root, children)
        root_freq = calculate_freq(root, root_descendants, go_cnt)
        for term in root_descendants:
            term_descendants = find_all_descendants(term, children)
            term_freq = calculate_freq(term, term_descendants, go_cnt)
            term_prob = (term_freq + 1) / (root_freq + 1)
            term_ic = -math.log(term_prob)
            assert (term not in ic_dict)
            ic_dict[term] = term_ic
    return ic_dict

In [53]:

# input_go_obo_file = "dataset/go_cafa3.obo"
# input_go_cnt_file = "dataset/BPO/GOA_bpo_cnt.txt"
# output_file = "dataset/BPO/GOA_bpo_IC.txt"

children, alt_id = read_go_children(go_file)
ic_dict = calculate_information_contents_of_GO_terms(freq_dict, children, alt_id)


In [57]:
len(ic_dict.values())

44632

In [59]:
ic_dict

defaultdict(None,
            {'GO:0010287': -0.0,
             'GO:0070082': -0.0,
             'GO:0005780': -0.0,
             'GO:0071017': -0.0,
             'GO:0070719': -0.0,
             'GO:0060053': -0.0,
             'GO:0016611': -0.0,
             'GO:0000123': -0.0,
             'GO:0034358': -0.0,
             'GO:0000344': -0.0,
             'GO:0070419': -0.0,
             'GO:1902911': -0.0,
             'GO:1990586': -0.0,
             'GO:0043675': -0.0,
             'GO:0097454': -0.0,
             'GO:0098723': -0.0,
             'GO:0034519': -0.0,
             'GO:0039713': -0.0,
             'GO:0033095': -0.0,
             'GO:0032160': -0.0,
             'GO:0071087': -0.0,
             'GO:0030478': -0.0,
             'GO:0016940': -0.0,
             'GO:0097585': -0.0,
             'GO:0031519': -0.0,
             'GO:0005731': -0.0,
             'GO:0071561': -0.0,
             'GO:0000778': -0.0,
             'GO:0035618': -0.0,
             'GO:0044665'

# make final edge file

In [None]:

all_children, alt_id = read_go_children(go_file)


go_cnt = 

with open(os.path.join(data_path, "all_GOA_cnt.txt")) as lines:
    for line in lines:
        data = line.split('\t')
        cnt = float(data[1].replace('\n', ''))
        go_cnt[data[0]] = cnt

f1 = open(os.path.join(data_path, "all_go_cnt.tsv"), "w")

with open(os.path.join(data_path, "all_go_edge.txt")) as read_in:
    for line in read_in:
        splitted_line = line.split("\t")
        children = splitted_line[0]
        parent = splitted_line[1].replace("\n", "")
        cnt_every = 0.0
        cnt_chidren = 0.0
        cnt_freq_children = 0.0
        cnt_freq_parent = 0.0
        cnt_freq = 0.0

        if children in go_cnt.keys():
            cnt_freq_children = go_cnt[children]
        if parent in go_cnt.keys():
            cnt_freq_parent = go_cnt[parent]
        if cnt_freq_parent == 0.0 or cnt_freq_children == 0.0:
            cnt_freq = 1.0
        else:
            cnt_freq = cnt_freq_children / cnt_freq_parent

        if parent in all_children:
            for x in all_children[parent]:
                if x in go_ic.keys():
                    cnt_every += go_ic[x]

        if parent in go_ic.keys():
            cnt_every += go_ic[parent]
        if children in go_ic.keys():
            cnt_chidren += go_ic[children]

        cnt_every += cnt_chidren
        final_cnt = (cnt_chidren / cnt_every) + cnt_freq
        f1.write('{}\t{}\t{}\n'.format(children, parent, cnt_freq))