In [1]:
from collections import namedtuple
from grepc.models import *
from mcgocr.pattern_regex import regex_out

Unit = namedtuple('Unit', 'source, goid, start, end, text')

class Annotation(set):
    """A set of annotation records with helpers for ordered export.

    Members must be indexable records; this notebook stores ``Unit``
    namedtuples (source, goid, start, end, text). Positions 0 and 2 are
    used as the sort keys for every export.
    """

    def __repr__(self):
        return 'Annotation<{} items>'.format(len(self))

    def sorted_results(self):
        """Return the members as a list ordered by item[0], then item[2].

        Records that tie on both keys keep the set's iteration order.
        """
        # One composite key yields the same ordering as sorting by [2] and
        # then stably re-sorting by [0].
        return sorted(self, key=lambda u: (u[0], u[2]))

    def to_csv(self, fp):
        """Write the sorted records as CSV rows to the file path ``fp``."""
        import csv
        # The csv module requires newline='' on the file object; without it,
        # platforms that translate '\n' emit '\r\r\n' row endings.
        with open(fp, 'w', newline='') as f:
            csv.writer(f).writerows(self.sorted_results())

    def to_json(self, fp):
        """Write the sorted records as one JSON array to the file path ``fp``.

        Tuple records are serialised by ``json`` as JSON arrays.
        """
        import json
        with open(fp, 'w') as f:
            json.dump(self.sorted_results(), f)


class MCGOCR(object):
    """Annotation pipeline: candidate extraction, feature hashing, classification.

    Parameters
    ----------
    Ie, Ie_boost : indexes passed to ``SolidExtractor`` when the extractor
        is built.
    Im : index passed to ``join_baskets`` during candidate generation.
    feature_hasher : object whose ``transform(measures)`` produces the
        classifier input.
    classifier : object whose ``predict(X)`` yields one label per candidate.
    """

    def __init__(self, Ie, Im, Ie_boost, feature_hasher, classifier):
        self.Ie = Ie
        self.Im = Im
        self.Ie_boost = Ie_boost
        self.feature_hasher = feature_hasher
        self.classifier = classifier
        # Populated by build_extractor(); None means "not built yet".
        self.extractor = None

    def build_extractor(self):
        """Build ``self.extractor`` from this instance's own indexes."""
        logging.info('Building Aho-Corasick machine...')
        # Read the indexes from self, not from same-named notebook globals,
        # so the instance honours the arguments it was constructed with.
        self.extractor = JoinExtractor([
            SolidExtractor(self.Ie),
            SolidExtractor(self.Ie_boost),
            SoftExtractor(regex_out)
        ])

    def _candidate_generate(self, corpus):
        """Return the flat list of candidates from every basket of ``corpus``."""
        # Build the extractor on first use so scan() does not fail with an
        # AttributeError when build_extractor() was never called.
        if getattr(self, 'extractor', None) is None:
            self.build_extractor()
        candidates = []
        for basket in join_baskets(corpus, self.extractor, self.Im):
            candidates.extend(basket.candidates)
        return candidates

    def _feature_extract(self, candidates):
        """Map each candidate through ``candidate2features`` and flatten."""
        measures = [candidate2features(c) for c in candidates]
        # flaten returns a pair; only its first element is used here.
        measures, _ = flaten([measures], [])
        return measures

    def _annotate(self, candidates, pred_y):
        """Collect a ``Unit`` for every candidate whose predicted label is 1."""
        annotation = Annotation()
        for candidate, label in zip(candidates, pred_y):
            if label == 1:
                (source, goid, start, end, text) = candidate2result(candidate)
                annotation.add(Unit(source, goid, start, end, text))
        return annotation

    def scan(self, corpus):
        """
        Input: Corpus object
        Output: Annotation object

        Runs candidate generation, feature extraction, hashing and
        classification, then keeps the candidates predicted as label 1.
        """
        candidates = self._candidate_generate(corpus)
        measures = self._feature_extract(candidates)
        X = self.feature_hasher.transform(measures)
        pred_y = self.classifier.predict(X)
        return self._annotate(candidates, pred_y)

In [2]:
import logging
import pickle
logging.basicConfig(level=logging.INFO)
if True:
    def load_components():
        """
        Load the pipeline components from pickle files.

        For each component, 'data/<name>.pickle' is tried first; when that
        file does not exist, 'data/default/<name>.pickle' is loaded instead.
        Any other failure (for example a corrupt pickle) is raised rather
        than being hidden behind the default file.

        return tuple (Ie, Im, Ie_boost, feature_hasher, classifier)

        NOTE: pickle.load can execute arbitrary code stored in the file;
        only load pickles that come from a trusted source.
        """
        def _load_pickle(item):
            logging.info('Loading the {}...'.format(item))
            path = 'data/{}.pickle'.format(item)
            try:
                with open(path, 'rb') as f:
                    return pickle.load(f)
            except FileNotFoundError:
                # Only a missing override triggers the fallback; a bare
                # except here would also swallow unpickling errors and
                # KeyboardInterrupt.
                fallback = 'data/default/{}.pickle'.format(item)
                logging.info('{} not found, using {}'.format(path, fallback))
                with open(fallback, 'rb') as f:
                    return pickle.load(f)

        Ie = _load_pickle('index_Ie')
        Im = _load_pickle('index_Im')
        Ie_boost = _load_pickle('index_Ie_boost')

        feature_hasher = _load_pickle('feature_hasher')
        classifier = _load_pickle('classifier')
        return Ie, Im, Ie_boost, feature_hasher, classifier
        
    Ie, Im, Ie_boost, feature_hasher, classifier = load_components()

INFO:root:Loading the index_Ie...
INFO:root:Loading the index_Im...
INFO:root:Loading the index_Ie_boost...
INFO:root:Loading the feature_hasher...
INFO:root:Loading the classifier...


In [3]:
%%time
# Wrap the loaded components in the pipeline object, then build its
# extractor; the %%time on this cell measures both statements together.
mcgocr = MCGOCR(Ie, Im, Ie_boost, feature_hasher, classifier)
mcgocr.build_extractor()

INFO:root:Building Aho-Corasick machine...


CPU times: user 51.2 s, sys: 1.68 s, total: 52.9 s
Wall time: 54 s


In [4]:
%%time
# Build the corpus to annotate through Corpus.from_dir('input', 'testing').
# NOTE(review): this import sits mid-notebook under the terse alias `c`;
# consider moving it to the first import cell.
from experiment import corpus as c
corpus = c.Corpus.from_dir('input', 'testing')

CPU times: user 3.15 s, sys: 48.1 ms, total: 3.19 s
Wall time: 3.35 s


In [5]:
%%time
# Run the whole pipeline on the corpus; scan() returns an Annotation set.
annotation = mcgocr.scan(corpus)

CPU times: user 1min 26s, sys: 2.98 s, total: 1min 29s
Wall time: 1min 33s


In [6]:
# Persist the annotation set to disk as a binary pickle.
with open('data/annotation.pickle', 'wb') as f:
    pickle.dump(annotation, f)

In [6]:
# `os` was previously available only implicitly (nothing visible in this
# notebook imports it by name); import it explicitly so the cell does not
# depend on a star import.
import os


def load_goldstandards(xmldir):
    """Return one list with the results of goldstandard_from_xml for every
    file found directly inside ``xmldir``."""
    records = []
    # sorted() makes the order independent of the filesystem's listing order.
    for name in sorted(os.listdir(xmldir)):
        records.extend(goldstandard_from_xml(os.path.join(xmldir, name)))
    return records


gold_bpmf = load_goldstandards('data/craft-1.0/knowtator-xml/go_bpmf/')
print('Answers:', len(gold_bpmf))

gold_cc = load_goldstandards('data/craft-1.0/knowtator-xml/go_cc/')
print('Answers:', len(gold_cc))

gold_union = gold_bpmf + gold_cc

Answers: 24062
Answers: 8354


In [7]:
# Keep only the gold records whose element [1] starts with 'GO:'.
# NOTE(review): presumably element [1] is the concept id and this drops
# non-GO entries — confirm against goldstandard_from_xml.
gold = [u for u in gold_union if u[1].startswith('GO:')]

In [8]:
# Score the predicted annotation against the filtered gold standard.
# The third argument is echoed in the Report repr shown below — presumably
# a free-form label; confirm against grepc.models.evaluate.
evaluate(annotation, gold, 'hello')

Report<R64.64% P80.65% F71.77% 'hello'>

In [19]:
# Recompute the report explicitly instead of reading Out[8]: the Out cache
# depends on execution counts, so it breaks on Restart & Run All.
report = evaluate(annotation, gold, 'hello')
# Distinct values of element [1] per outcome bucket (presumably the GO id,
# which is the second field of Unit — confirm for gold records).
fn = {u[1] for u in report.fn}
tp = {u[1] for u in report.tp}
fp = {u[1] for u in report.fp}
# The last figure counts ids that are missed at least once and never hit.
len(fn), len(tp), len(fp), len(fn - tp)

(995, 609, 612, 626)

In [8]:
# Inspect the labels recorded for GO:0007126.
# NOTE(review): `gd` is not defined in any cell visible here — presumably it
# comes from `from grepc.models import *` or a deleted cell; confirm so the
# notebook survives Restart & Run All.
gd['GO:0007126' ].labels

['meiosis']

In [10]:
# Look up the Ie index entry stored under the key 'meiosis'.
Ie['meiosis']

{Constraint(lemma='meiosis', ref='GO:0051257'),
 Constraint(lemma='meiotic chromosome segregation', ref='GO:0051316'),
 Constraint(lemma='mitosis', ref='GO:0051256')}

In [18]:
# Resolve through `gd` every id that is in the false-negative set but not in
# the true-positive set.
# NOTE(review): depends on `fn`/`tp` from the report-breakdown cell and on
# `gd`, which no visible cell defines; execution counts are out of order.
[gd[g] for g in (fn -tp)]

[Concept<GO:0005008 MF hepatocyte growth factor receptor activity>,
 Concept<GO:0046549 BP retinal cone cell development>,
 Concept<GO:0042632 BP cholesterol homeostasis>,
 Concept<GO:0007126 BP meiosis>,
 Concept<GO:0006954 BP inflammatory response>,
 Concept<GO:0031643 BP positive regulation of myelination>,
 Concept<GO:0008595 BP determination of anterior/posterior axis, embryo>,
 Concept<GO:0042671 BP retinal cone cell fate determination>,
 Concept<GO:0030054 CC cell junction>,
 Concept<GO:0042981 BP regulation of apoptosis>,
 Concept<GO:0017144 BP drug metabolic process>,
 Concept<GO:0019199 MF transmembrane receptor protein kinase activity>,
 Concept<GO:0001816 BP cytokine production>,
 Concept<GO:0050877 BP neurological system process>,
 Concept<GO:0007160 BP cell-matrix adhesion>,
 Concept<GO:0008237 MF metallopeptidase activity>,
 Concept<GO:0004402 MF histone acetyltransferase activity>,
 Concept<GO:0008039 BP synaptic target recognition>,
 Concept<GO:0016572 BP histone phosp

In [9]:
# Annotation subclasses set, so both lengths are equal by construction.
len(annotation), len(set(annotation))

(23603, 23603)

In [18]:
# NOTE(review): identical to the earlier pickle-dump cell; it rewrites the
# same file with the same object — consider removing one of the two.
with open('data/annotation.pickle', 'wb') as f:
    pickle.dump(annotation, f)

In [17]:
# Leftover inspection: a bare name only displays the function's repr.
evaluate

<function grepc.models.evaluate>