In [2]:
from grepc.models import *

In [3]:
# Build the pattern manager from the on-disk pattern definition file.
definition_fp = 'data/pattern_definition.txt'
pm = gopattern.PatternManager.from_definition(definition_fp)

# Regex output of the pattern manager; passed to SoftExtractor in a later cell.
out = pm.regex_out()

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at data files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


clusterbook_lib = _load_pickle('data/clusterbook_lib.pickle')
goid2statement_lib = _load_pickle('data/goid2statement_lib.pickle')
ref2statements = _load_pickle('data/ref2statements.pickle')

In [5]:
# Index every term lemma of clusterbook entry 1 to the primary term(s) of the
# clusters that contain it.
cb = clusterbook_lib[1]
term_index = TermIndex(set)
lemma_primary_pairs = (
    (member.lemma, cluster.primary_term)
    for cluster in cb.clusters
    for member in cluster.terms
)
for lemma, primary_term in lemma_primary_pairs:
    term_index[lemma].add(primary_term)
len(term_index)

47678

In [6]:
# Two term -> {statements} indexes:
#   * statements whose terms are all Pattern instances are indexed under
#     every one of their terms;
#   * every other statement is indexed under its Entity terms only.
pattern_index = defaultdict(set)
entity_index = defaultdict(set)

for statement_group in ref2statements.values():
    for statement in statement_group:
        pattern_only = all(isinstance(term, Pattern) for term in statement.terms())
        if pattern_only:
            wanted_type, index = Pattern, pattern_index
        else:
            wanted_type, index = Entity, entity_index
        for term in statement.terms():
            if isinstance(term, wanted_type):
                index[term].add(statement)

# Freeze into plain dicts so that later lookups cannot create empty entries.
pattern2statements = dict(pattern_index)
entity2statements = dict(entity_index)

# Entity entries are consulted first, Pattern entries second.
term2statements = ChainMap(entity2statements, pattern2statements)

In [7]:
from txttk.nlptools import sent_tokenize, count_start
import os
txtdir = 'data/craft-1.0/articles/txt/'
# The wrapped tokenizer is consumed below as (text, offset) pairs.
sent_tokenize = count_start(sent_tokenize)

sentences = []

# os.listdir returns names in arbitrary order; sorting makes the order of
# `sentences` identical on every run and platform.
# NOTE(review): files are opened with the platform default encoding - confirm
# that this matches the encoding of the corpus text files.
for fn in sorted(name for name in os.listdir(txtdir) if name.endswith('.txt')):
    # The part of the file name before the first dot is used as the document id.
    pmid = fn.partition('.')[0]
    with open(os.path.join(txtdir, fn)) as f:
        context = f.read()

    for text, offset in sent_tokenize(context, 0):
        sentences.append(Sentence(text, offset, pmid))

len(sentences)

21646

In [8]:
# One extractor built from the lemma index, one from the pattern regex output.
solid_extractor = SolidExtractor(term_index)
soft_extractor = SoftExtractor(out)

In [9]:
%%time
# Gather the gold-standard annotations from every file in the go_bpmf folder.
xmldir = 'data/craft-1.0/knowtator-xml/go_bpmf/'
gold_bpmf = []
for xml_name in os.listdir(xmldir):
    gold_bpmf += goldstandard_from_xml(os.path.join(xmldir, xml_name))

print('Answers:', len(gold_bpmf))

Answers: 24062
CPU times: user 9.87 s, sys: 156 ms, total: 10 s
Wall time: 10.3 s


In [10]:
%%time
# Gather the gold-standard annotations from every file in the go_cc folder.
xmldir = 'data/craft-1.0/knowtator-xml/go_cc/'
gold_cc = []
for xml_name in os.listdir(xmldir):
    gold_cc += goldstandard_from_xml(os.path.join(xmldir, xml_name))

print('Answers:', len(gold_cc))

Answers: 8354
CPU times: user 3.49 s, sys: 49.1 ms, total: 3.54 s
Wall time: 3.66 s


In [11]:
# Concatenation of both gold-standard lists (go_bpmf first, then go_cc).
gold_union = gold_bpmf + gold_cc

In [12]:
# One forest per gold-standard set, built with make_forest.
forest_cc = make_forest(gold_cc)
forest_bpmf = make_forest(gold_bpmf)


In [13]:
def experiment(title, training, testing, gold, extractor, term2statements, seed):
    """Train a LinearSVC on hashed basket features and evaluate it.

    Baskets are built from ``training`` and ``testing`` with ``join_baskets``,
    turned into features/labels with ``baskets2X``/``baskets2y`` (labels come
    from a forest built from ``gold``), hashed into 1024 features and fed to
    a linear SVM. Predictions are made one testing basket at a time and
    compared with the recovered gold annotations.

    Args:
        title: label handed to ``evaluate`` for the report.
        training: input passed to ``join_baskets`` to build training baskets.
        testing: input passed to ``join_baskets`` to build testing baskets.
        gold: gold-standard annotations passed to ``make_forest``.
        extractor: extractor passed through to ``join_baskets``.
        term2statements: term lookup passed through to ``join_baskets``.
        seed: ``random_state`` of the ``LinearSVC``.

    Returns:
        The report returned by ``evaluate``; it is also printed.
    """
    hasher = FeatureHasher(n_features=1024)
    classifier = svm.LinearSVC(max_iter=1000, random_state=seed)

    forest = make_forest(gold)
    training_baskets = join_baskets(training, extractor, term2statements)
    testing_baskets = join_baskets(testing, extractor, term2statements)
    training_X, testing_X = baskets2X(training_baskets), baskets2X(testing_baskets)
    training_y = baskets2y(training_baskets, forest)
    testing_y = baskets2y(testing_baskets, forest)

    # training
    flat_X, flat_y = flaten(training_X, training_y)
    classifier.fit(hasher.fit_transform(flat_X).toarray(), flat_y)

    # testing: predict basket by basket so that each prediction vector can be
    # mapped back onto the basket it belongs to.
    system = []
    for basket, features_list, labels_list in zip(testing_baskets, testing_X, testing_y):
        chunk_X, _ = flaten([features_list], [labels_list])
        if len(chunk_X) == 0:
            continue
        # Test data is only transformed, never fitted. FeatureHasher is
        # stateless, so the resulting matrix is the same as before.
        pred_y = classifier.predict(hasher.transform(chunk_X).toarray())
        system.extend(recover([basket], [pred_y]))

    # Kept under its own name so the `gold` argument is not rebound.
    gold_annotations = recover(testing_baskets, testing_y)
    report = evaluate(system, gold_annotations, title)
    print(report)
    return report

In [14]:
from mcgocr import Craft, GoData, CandidateFinder, SolidExtractor, CandidateFinder, Index, LabelMarker

In [15]:
# Load the corpus and the gold standard through the Craft helper rooted at 'data'.
craft = Craft('data')
corpus = craft.get_corpus()
goldstandard = craft.get_goldstandard()

Reading the corpus...
Reading the goldstandard of GO...


In [16]:
# Second term index, derived from the gold standard and goid2statement_lib,
# with its own SolidExtractor.
# NOTE(review): SolidExtractor was re-imported from mcgocr in an earlier cell,
# so this may not be the same class that built `solid_extractor` - confirm.
auxillary_term_index = term_index_from(goldstandard, goid2statement_lib)
auxillary_extractor = SolidExtractor(auxillary_term_index)

In [17]:
# Combine the three extractors into the single extractor used by the experiments.
extractor = JoinExtractor([solid_extractor, soft_extractor, auxillary_extractor])

In [18]:
import time

In [19]:
from sklearn.feature_extraction import FeatureHasher
from sklearn import svm

In [24]:
#experiment('quient', corpus, corpus, goldstandard, extractor, term2statements, 1)

Report<R72.32% P82.49% F77.07% 'quient'>


Report<R72.32% P82.49% F77.07% 'quient'>

In [20]:
# Top-level copy of the body of experiment(), so that every intermediate
# (baskets, features, classifier, ...) stays inspectable from later cells.
# Training and testing both use the full corpus here.
title, training, testing, gold, seed = 'quient', corpus, corpus, goldstandard, 1

h = FeatureHasher(n_features=1024)
hclf = svm.LinearSVC(max_iter=1000, random_state=seed)

forest = make_forest(gold)
training_baskets = join_baskets(training, extractor, term2statements)
testing_baskets = join_baskets(testing, extractor, term2statements)
training_X, testing_X = baskets2X(training_baskets), baskets2X(testing_baskets)
training_y, testing_y = baskets2y(training_baskets, forest), baskets2y(testing_baskets, forest)

training_X, training_y = flaten(training_X, training_y)

training_array = h.fit_transform(training_X).toarray()

# training
hclf.fit(training_array, training_y)

# testing: predict basket by basket so that each prediction vector can be
# mapped back onto the basket it belongs to.
system = []
for basket, features_list, labels_list in zip(testing_baskets, testing_X, testing_y):
    chunk_X, _ = flaten([features_list], [labels_list])
    if len(chunk_X) == 0:
        continue
    # Test data is only transformed, never fitted. FeatureHasher is
    # stateless, so the resulting matrix is the same as before.
    chunk_array = h.transform(chunk_X).toarray()
    pred_y = hclf.predict(chunk_array)
    system.extend(recover([basket], [pred_y]))

# From here on `gold` holds the annotations recovered from the testing baskets.
gold = recover(testing_baskets, testing_y)
report = evaluate(system, gold, title)
print(report)

Report<R80.91% P82.51% F81.71% 'quient'>


In [49]:
# Debug: inspect the baskets built from the training input.
join_baskets(training, extractor, term2statements)

[]

In [52]:
# Debug: look at the first ten training items.
training[:10]

[Sentence(text='Intraocular pressure in genetically distinct mice: an update and strain survey\n', offset=0, docid='11532192'),
 Sentence(text='\nAbstract\n', offset=79, docid='11532192'),
 Sentence(text='\nBackground\n', offset=89, docid='11532192'),
 Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192'),
 Sentence(text='The purpose of this study was to determine the IOPs of genetically distinct mouse strains, assess the effects of factors such as age, sex and time of day on IOP in specific strain backgrounds, and to assess the effects of specific candidate gene mutations on IOP.\n', offset=204, docid='11532192'),
 Sentence(text='\nResults\n', offset=468, docid='11532192'),
 Sentence(text='\nBased on over 30 studied mouse strains, average IOP ranges from approximately 10 to 20 mmHg. ', offset=477, docid='11532192'),
 Sentence(text='Gender does not typically affect IOP and aging results in

In [55]:
# Debug: print what the extractor finds in the single item training[4].
for sentence in training[4:5]:
    print(extractor.findall(sentence))


[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='age', start=333, end=336), Evidence(term=Entity(lemma='sex', ref='GO:0007548'), text='sex', start=338, end=341)]


In [57]:
# Build the Grounds for training[4] from explicit inputs. The sentence is
# looked up directly instead of relying on a loop variable left behind by
# another cell, so this cell gives the same result on a fresh kernel.
target_sentence = training[4]
grounds = Grounds(extractor.findall(target_sentence), target_sentence)

In [62]:
# Debug: display the Grounds object (its evidences and its sentence).
grounds

Grounds(evidences=[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='age', start=333, end=336), Evidence(term=Entity(lemma='sex', ref='GO:0007548'), text='sex', start=338, end=341)], sentence=Sentence(text='The purpose of this study was to determine the IOPs of genetically distinct mouse strains, assess the effects of factors such as age, sex and time of day on IOP in specific strain backgrounds, and to assess the effects of specific candidate gene mutations on IOP.\n', offset=204, docid='11532192'))

In [60]:
# Debug: check that the 'aging' entity has statements registered in the lookup.
term2statements[Entity(lemma='aging', ref='GO:0007568')]

{Statement(statid='GO:0007568%000', evidences=[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='aging', start=0, end=5)])}

In [59]:
# Debug: gather baskets for `grounds`, passing an empty dict as second argument.
gather_baskets(grounds, {}, term2statements)

[]

In [69]:
# Top-level walk-through of basket gathering for `grounds`, using an empty
# more_info dict, so that each intermediate can be inspected.
more_info = {}
baskets = []
term2index_evidence = defaultdict(list)
queue = []
evidences = grounds.evidences
for i, evidence in enumerate(evidences):
    term = evidence.term
    term2index_evidence[term].append((i, evidence))
    # Only Entity and Pattern terms are queued for candidate lookup.
    if isinstance(term, (Entity, Pattern)):
        queue.append((i, term))
        print(term)
term2index_evidence = dict(term2index_evidence)
for i, ep in queue:
    candidates = []
    for statement in term2statements.get(ep, set()):
        # The statement id is '<goid>%<suffix>'; density is looked up by goid.
        goid = statement.statid.partition('%')[0]
        density = gd.goid2density[goid]
        # Gathered once and reused; the result no longer overwrites `evidences`.
        # assumes gather_evidences has no side effects - TODO confirm
        statement_evidences = gather_evidences(statement, i, term2index_evidence)
        candidates.append(Candidate(statement, density, statement_evidences, grounds))
    # Highest density first.
    candidates.sort(key=lambda c: c.density, reverse=True)
    baskets.append(Basket(candidates, more_info))

baskets

[]

In [72]:
# Debug: which class is the current `term` actually an instance of?
term.__class__

grepc.models.Entity

In [73]:
# Debug: which class does the name `Entity` resolve to in this namespace?
Entity

__main__.Entity

In [37]:
# Debug: number of distinct terms available in the lookup.
len(term2statements)

20549

In [None]:
# Show only a small slice; displaying the whole corpus would flood the output.
corpus[:10]

In [48]:
# Debug: spot-check one corpus item and what the extractor finds in it.
i = 182
print(corpus[i])
print(extractor.findall(corpus[i]))

Sentence(text='\nTo determine if albinism alters IOP, we analyzed B6 mice that were either pigmented or albino. ', offset=20320, docid='11532192')
[Evidence(term=Entity(lemma='pigment', ref='GO:0050931'), text='pigmented', start=20395, end=20404)]


In [21]:
# `is` compares object identity, not equality: two separately built tuples
# with equal contents.
a = ('a', 1, object)
b = ('a', 1, object)
a is b

False

In [22]:
# Same identity check with two equal integer literals. Whether they end up
# as one object is an interpreter implementation detail.
a = 102002
b = 102002
a is b

False

In [23]:
# Same identity check with the small integer 1.
# NOTE(review): presumably the interpreter reuses one object for small ints -
# an implementation detail that should not be relied on.
a = 1
b = 1
a is b

True