In [1]:
from grepc.models import *
from mcgocr import pattern_regex

In [2]:
definition_fp = 'data/pattern_definition.txt'
pm = gopattern.PatternManager.from_definition(definition_fp)

out = pm.regex_out()

In [3]:
with open('data/clusterbook_lib.pickle', 'rb') as f:
    clusterbook_lib = pickle.load(f)
with open('data/goid2statement_lib.pickle', 'rb') as f:
    goid2statement_lib = pickle.load(f)
with open('data/ref2statements.pickle', 'rb') as f:
    ref2statements = pickle.load(f)

In [4]:
cb = clusterbook_lib[1]
term_index = TermIndex(set)
for c in cb.clusters:
    for term in c.terms:
        term_index[term.lemma].add(c.primary_term)
len(term_index)

47678

In [5]:
with open('data/default/index_Ie.pickle', 'wb') as f:
    pickle.dump(term_index, f)

In [5]:
pattern2statements = defaultdict(set)
entity2statements = defaultdict(set)

for statements in ref2statements.values():
    for statement in statements:
        if all([isinstance(term, Pattern) for term in statement.terms()]):
            for term in statement.terms():
                pattern2statements[term].add(statement)
        else:
            for term in statement.terms():
                if isinstance(term, Entity):
                    entity2statements[term].add(statement)

pattern2statements = dict(pattern2statements)
entity2statements = dict(entity2statements)


term2statements = ChainMap(entity2statements, pattern2statements)

In [6]:
index_Im = dict(term2statements)

In [8]:
len(index_Im)

20549

In [7]:
with open('data/default/index_Im.pickle', 'wb') as f:
    pickle.dump(index_Im, f)

In [8]:
from txttk.nlptools import sent_tokenize, count_start
import os
txtdir = 'data/craft-1.0/articles/txt/'
sent_tokenize = count_start(sent_tokenize)

sentences = []

for fn in filter(lambda name:name.endswith('.txt'), os.listdir(txtdir)):
    pmid = fn.partition('.')[0]
    with open(os.path.join(txtdir, fn)) as f:
        context = f.read()
    sentence_list = list(sent_tokenize(context, 0))
    
    for text, offset in sentence_list:
        sentences.append(Sentence(text, offset, pmid))
        
len(sentences)

21646

In [9]:
solid_extractor = SolidExtractor(term_index)
soft_extractor = SoftExtractor(out)

In [10]:
%%time
xmldir = 'data/craft-1.0/knowtator-xml/go_bpmf/'
gold_bpmf = []
for fn in os.listdir(xmldir):
    fpath = os.path.join(xmldir, fn)
    goldstandards = goldstandard_from_xml(fpath)
    gold_bpmf.extend(goldstandards)

print('Answers:', len(gold_bpmf))

Answers: 24062
CPU times: user 9.5 s, sys: 133 ms, total: 9.63 s
Wall time: 10.7 s


In [11]:
%%time
xmldir = 'data/craft-1.0/knowtator-xml/go_cc/'
gold_cc = []
for fn in os.listdir(xmldir):
    fpath = os.path.join(xmldir, fn)
    goldstandards = goldstandard_from_xml(fpath)
    gold_cc.extend(goldstandards)

print('Answers:', len(gold_cc))

Answers: 8354
CPU times: user 3.47 s, sys: 51.2 ms, total: 3.53 s
Wall time: 3.74 s


In [12]:
gold_union = gold_bpmf + gold_cc

In [13]:
forest_cc = make_forest(gold_cc)
forest_bpmf = make_forest(gold_bpmf)


In [14]:
from sklearn.feature_extraction import FeatureHasher
from sklearn import svm

In [17]:
def experiment(title, training, testing, gold, extractor, term2statements, seed):

    h = FeatureHasher(n_features=1024)
    hclf = svm.LinearSVC(max_iter=1000, random_state=seed) 
    
    forest = make_forest(gold)
    training_baskets, testing_baskets = (join_baskets(training, extractor, term2statements), 
                                        join_baskets(testing, extractor, term2statements))
    training_X, testing_X = baskets2X(training_baskets), baskets2X(testing_baskets)
    training_y, testing_y = baskets2y(training_baskets, forest), baskets2y(testing_baskets, forest)

    training_X, training_y = flaten(training_X, training_y)
    
    training_array = h.fit_transform(training_X).toarray()

    #training

    hclf.fit(training_array, training_y)

    #testing
    system = []
    for basket, features_list, labels_list in zip(testing_baskets, testing_X, testing_y):
        baskets, X, y = [basket], [features_list], [labels_list]
        chunk_X, chunk_y = flaten(X, y)
        if len(chunk_X) == 0:
            continue
        chunk_array = h.fit_transform(chunk_X).toarray()
        #chunk_array = scaler.transform(chunk_array)
        pred_y = hclf.predict(chunk_array)
        system.extend(recover(baskets, [pred_y]))
    gold = recover(testing_baskets, testing_y)
    report = evaluate(system, gold, title)
    print(report)
    return(report)

In [15]:
from mcgocr import Craft, GoData, CandidateFinder, SolidExtractor, CandidateFinder, Index, LabelMarker

In [16]:
craft = Craft('data')
corpus = craft.get_corpus()
goldstandard = craft.get_goldstandard()

Reading the corpus...
Reading the goldstandard of GO...


In [17]:
auxillary_term_index = term_index_from(goldstandard, goid2statement_lib)
auxillary_extractor = SolidExtractor(auxillary_term_index)

In [18]:
with open('data/default/index_Ie_boost.pickle', 'wb') as f:
    pickle.dump(auxillary_term_index, f)

In [19]:
len(auxillary_term_index)

2497

In [20]:
extractor = JoinExtractor([solid_extractor, soft_extractor, auxillary_extractor])

In [21]:
import time

In [22]:
from sklearn.feature_extraction import FeatureHasher
from sklearn import svm

In [23]:
(title, training, testing, gold, extractor, term2statements, seed) = ('quient', corpus, corpus, goldstandard, extractor, term2statements, 1)

h = FeatureHasher(n_features=1024)
hclf = svm.LinearSVC(max_iter=1000, random_state=seed) 

forest = make_forest(gold)
training_baskets, testing_baskets = (join_baskets(training, extractor, term2statements), 
                                    join_baskets(testing, extractor, term2statements))
training_X, testing_X = baskets2X(training_baskets), baskets2X(testing_baskets)
training_y, testing_y = baskets2y(training_baskets, forest), baskets2y(testing_baskets, forest)

training_X, training_y = flaten(training_X, training_y)

training_array = h.fit_transform(training_X).toarray()

#training

hclf.fit(training_array, training_y)

#testing
system = []
for basket, features_list, labels_list in zip(testing_baskets, testing_X, testing_y):
    baskets, X, y = [basket], [features_list], [labels_list]
    chunk_X, chunk_y = flaten(X, y)
    if len(chunk_X) == 0:
        continue
    chunk_array = h.fit_transform(chunk_X).toarray()
    #chunk_array = scaler.transform(chunk_array)
    pred_y = hclf.predict(chunk_array)
    system.extend(recover(baskets, [pred_y]))
gold = recover(testing_baskets, testing_y)
report = evaluate(system, gold, title)
print(report)

Report<R80.94% P84.98% F82.91% 'quient'>


In [24]:
new_gold = [u for u in gold_union if u[1].startswith('GO:')]
evaluate(system, new_gold, 'Jesus, help me!')

Report<R65.30% P78.81% F71.42% 'Jesus, help me!'>

In [25]:
new_gold = [u for u in gold_union if u[1].startswith('GO:')]
evaluate(system, new_gold, 'Jesus, help me!')

Report<R65.30% P78.81% F71.42% 'Jesus, help me!'>

In [30]:
diff = evaluate(gold, new_gold, 'lets go')

In [31]:
random.sample(diff.fn, 10)

[('15550985', 'GO:0007600', 1724, 1731, 'sensory'),
 ('16110338', 'GO:0033592', 41501, 41514, 'hybridization'),
 ('15005800', 'GO:0047598', 24176, 24181, 'Dhcr7'),
 ('15588329', 'GO:0005739', 19303, 19316, 'mitochondrial'),
 ('17194222', 'GO:0001944', 30731, 30746, 'vascularization'),
 ('16103912', 'GO:0003700', 22740, 22761, 'transcription factors'),
 ('16110338', 'GO:0000739', 53888, 53901, 'Hybridization'),
 ('15588329', 'GO:0010008', 2230322323, 2231222331, 'endosomal ... membrane'),
 ('11897010', 'GO:0005216', 1128, 1136, 'ion pore'),
 ('16109169', 'GO:0009880', 96120, 109126, 'patterning of ... embryo')]

In [33]:
examples = []
for unit in diff.fn:
    examples.append([u for u in gold if u[:3] == unit[:3]])

In [36]:
len(diff.fp)

2172

In [37]:
examples[:100]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('15345036',
   'GO:0001775',
   45408,
   45448,
   'stimulation with LPS and apoptotic cells')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('16098226',
   'GO:0004872',
   20467,
   20584,
   'receptor activation causes dissociation of HOMER proteins [35], PAX6 may be released as a result of synaptic activity')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('15005800',
   'GO:0021700',
   39426,
   39513,
   'maturation of the gas-exchange region of the lung in the saccular period of development')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('15588329', 'GO:0004192', 37213, 37233, 'cathepsin D activity')],
 [],
 [],
 [],
 [],
 [],
 [],
 [('15938754', 'GO:0001775', 4472, 4502, 'stimulate taste receptor cells')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [

In [34]:
with open('data/default/feature_hasher.pickle', 'wb') as f:
    pickle.dump(h, f)

In [35]:
with open('data/default/classifier.pickle', 'wb') as f:
    pickle.dump(hclf, f)

In [23]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
X = v.fit_transform(training_X)

In [28]:
len(training_X)

459726

In [29]:
len(X[0])

23801

In [20]:
experiment('quient', corpus, corpus, goldstandard, extractor, term2statements, 1)

Report<R79.14% P84.83% F81.88% 'quient'>


Report<R79.14% P84.83% F81.88% 'quient'>

In [None]:
"""
2 Report<R73.11% P89.30% F80.40% 'quient'>
3 Report<R78.32% P85.78% F81.88% 'quient'>
4 Report<R79.14% P84.83% F81.88% 'quient'
"""

# Some observations

In [22]:
report = Out[19]
tp = report.tp

In [24]:
len(gd)

24337

In [25]:
from collections import Counter
Counter([c.namespace for c in gd.values()])

Counter({'biological_process': 14306,
         'cellular_component': 2047,
         'molecular_function': 7984})

end observation

In [20]:
title, training, testing, gold, extractor, term2statements, seed = ('quient', corpus, corpus, goldstandard, extractor, term2statements, 1)

h = FeatureHasher(n_features=1024)
hclf = svm.LinearSVC(max_iter=1000, random_state=seed) 

forest = make_forest(gold)
training_baskets, testing_baskets = (join_baskets(training, extractor, term2statements), 
                                    join_baskets(testing, extractor, term2statements))
training_X, testing_X = baskets2X(training_baskets), baskets2X(testing_baskets)
training_y, testing_y = baskets2y(training_baskets, forest), baskets2y(testing_baskets, forest)

training_X, training_y = flaten(training_X, training_y)

training_array = h.fit_transform(training_X).toarray()

#training

hclf.fit(training_array, training_y)

#testing
system = []
for basket, features_list, labels_list in zip(testing_baskets, testing_X, testing_y):
    baskets, X, y = [basket], [features_list], [labels_list]
    chunk_X, chunk_y = flaten(X, y)
    if len(chunk_X) == 0:
        continue
    chunk_array = h.fit_transform(chunk_X).toarray()
    #chunk_array = scaler.transform(chunk_array)
    pred_y = hclf.predict(chunk_array)
    system.extend(recover(baskets, [pred_y]))
gold = recover(testing_baskets, testing_y)
report = evaluate(system, gold, title)
print(report)

Report<R80.91% P82.51% F81.71% 'quient'>


In [49]:
join_baskets(training, extractor, term2statements)

[]

In [52]:
training[:10]

[Sentence(text='Intraocular pressure in genetically distinct mice: an update and strain survey\n', offset=0, docid='11532192'),
 Sentence(text='\nAbstract\n', offset=79, docid='11532192'),
 Sentence(text='\nBackground\n', offset=89, docid='11532192'),
 Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192'),
 Sentence(text='The purpose of this study was to determine the IOPs of genetically distinct mouse strains, assess the effects of factors such as age, sex and time of day on IOP in specific strain backgrounds, and to assess the effects of specific candidate gene mutations on IOP.\n', offset=204, docid='11532192'),
 Sentence(text='\nResults\n', offset=468, docid='11532192'),
 Sentence(text='\nBased on over 30 studied mouse strains, average IOP ranges from approximately 10 to 20 mmHg. ', offset=477, docid='11532192'),
 Sentence(text='Gender does not typically affect IOP and aging results in

In [55]:
for sentence in training[4:5]:
    print(extractor.findall(sentence))


[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='age', start=333, end=336), Evidence(term=Entity(lemma='sex', ref='GO:0007548'), text='sex', start=338, end=341)]


In [57]:
grounds = Grounds(extractor.findall(training[4]), sentence)

In [62]:
grounds

Grounds(evidences=[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='age', start=333, end=336), Evidence(term=Entity(lemma='sex', ref='GO:0007548'), text='sex', start=338, end=341)], sentence=Sentence(text='The purpose of this study was to determine the IOPs of genetically distinct mouse strains, assess the effects of factors such as age, sex and time of day on IOP in specific strain backgrounds, and to assess the effects of specific candidate gene mutations on IOP.\n', offset=204, docid='11532192'))

In [60]:
term2statements[Entity(lemma='aging', ref='GO:0007568')]

{Statement(statid='GO:0007568%000', evidences=[Evidence(term=Entity(lemma='aging', ref='GO:0007568'), text='aging', start=0, end=5)])}

In [59]:
gather_baskets(grounds, {}, term2statements)

[]

In [69]:
grounds, more_info, term2statements = grounds, {}, term2statements
baskets = []
term2index_evidence = defaultdict(list)
queue = []
evidences = grounds.evidences
for i, evidence in enumerate(evidences):
    term = evidence.term
    term2index_evidence[term].append((i, evidence))
    if isinstance(term, Entity) or isinstance(term, Pattern):
        queue.append((i, term))
        print(term)
term2index_evidence = dict(term2index_evidence)
for i, ep in queue:
    candidates = []
    statements = term2statements.get(ep, set())

    for statement in statements:
        statid = statement.statid
        goid = statid.partition('%')[0]
        density = gd.goid2density[goid]
        evidences = gather_evidences(statement, i, term2index_evidence)
        candidate = Candidate(statement, 
                              density, 
                              gather_evidences(statement, i, term2index_evidence),
                              grounds)
        candidates.append(candidate)
    candidates.sort(key=lambda c: c.density, reverse=True)
    baskets.append(Basket(candidates, more_info))

baskets

[]

In [72]:
term.__class__

grepc.models.Entity

In [73]:
Entity

__main__.Entity

In [37]:
len(term2statements)

20549

In [None]:
corpus

In [48]:
i = 182
print(corpus[i])
print(extractor.findall(corpus[i]))

Sentence(text='\nTo determine if albinism alters IOP, we analyzed B6 mice that were either pigmented or albino. ', offset=20320, docid='11532192')
[Evidence(term=Entity(lemma='pigment', ref='GO:0050931'), text='pigmented', start=20395, end=20404)]


In [21]:
a = ('a', 1, object)
b = ('a', 1, object)
a is b

False

In [22]:
a = 102002
b = 102002
a is b

False

In [23]:
a = 1
b = 1
a is b

True

In [14]:
type(ref2statements)

dict

In [15]:
list(ref2statements.keys())[0]

'GO:0018154'

In [38]:
x = []
import re
for concept in gd.values():
    labels = concept.labels
    for label in labels:
        if re.match('[A-Z][a-z]+ ', label):
            x.append(concept)
            break
len(x)

311

In [39]:
x

[Concept<GO:0030576 BP Cajal body organization and biogenesis>,
 Concept<GO:0033327 BP Leydig cell differentiation>,
 Concept<GO:0005796 CC Golgi lumen>,
 Concept<GO:0032862 BP activation of Rho GTPase>,
 Concept<GO:0008462 MF endopeptidase Clp activity>,
 Concept<GO:0014037 BP Schwann cell differentiation>,
 Concept<GO:0004712 MF protein serine/threonine/tyrosine kinase activity>,
 Concept<GO:0048193 BP Golgi vesicle transport>,
 Concept<GO:0021880 BP Notch signaling pathway involved in forebrain neuron fate commitment>,
 Concept<GO:0030676 MF Rac guanyl-nucleotide exchange factor activity>,
 Concept<GO:0060008 BP Sertoli cell differentiation>,
 Concept<GO:0048226 CC Casparian strip>,
 Concept<GO:0000815 CC ESCRT III complex>,
 Concept<GO:0000310 MF xanthine phosphoribosyltransferase activity>,
 Concept<GO:0032080 BP negative regulation of Type I restriction endodeoxyribonuclease activity>,
 Concept<GO:0032084 BP regulation of Type I site-specific deoxyribonuclease activity>,
 Concept

In [21]:
y = []
for ref, statements in ref2statements.items():
    if len(statements) == 2:
        y.append(ref)

len(y)

3607

In [24]:
z = [gd[goid] for goid in y]


In [25]:
for concept in z:
    print(concept.statements)

['aerobic respiration, using hydrogen as electron donor', 'hydrogen oxidation']


In [58]:
import random
k = 80
for concept in z:
    if len(concept.statements) == 3 and k <= len(''.join(concept.labels)) < k+10:
        print(concept.goid)
        print(concept.statements)
        print()

GO:0005047
['signal recognition particle binding', 'docking protein', 'signal recognition particle receptor']

GO:0004565
['beta-galactosidase activity', 'exo-(1->4)-beta-D-galactanase activity', 'lactose hydrolysis']

GO:0007636
['chemosensory jump behavior', 'chemosensory jump behaviour', 'jump response to chemical stimulus']

GO:0031737
['CX3C chemokine receptor binding', 'CX3C chemokine receptor ligand', 'fractalkine receptor binding']

GO:0004030
['aldehyde dehydrogenase [NAD(P)+] activity', 'aldehyde:NAD(P)+ oxidoreductase activity', 'ALDH']

GO:0031748
['D1 dopamine receptor binding', 'D1 dopamine receptor ligand', 'D1A dopamine receptor binding']

GO:0004000
['adenosine deaminase activity', 'adenosine aminohydrolase activity', 'adenosine deaminase reaction']

GO:0006612
['protein targeting to membrane', 'protein membrane targeting', 'protein-membrane targeting']

GO:0047143
['chlorate reductase activity', 'chlorate reductase C', 'chlorite:acceptor oxidoreductase activity']

GO:

In [64]:
print('\n'.join(gd['GO:0019853'].labels))

L-ascorbic acid biosynthetic process
ascorbate biosynthesis
ascorbate biosynthetic process
L-ascorbic acid anabolism
L-ascorbic acid biosynthesis
L-ascorbic acid formation
L-ascorbic acid synthesis
vitamin C biosynthesis
vitamin C biosynthetic process


In [67]:
gd['GO:0019853']

Concept<GO:0019853 BP L-ascorbic acid biosynthetic process>

In [36]:
for concept in gd.values():
    for label in concept.labels:
        if 'vitamin C' in label:
            print(concept)
            break

Concept<GO:0015882 BP L-ascorbic acid transport>
Concept<GO:0019854 BP L-ascorbic acid catabolic process>
Concept<GO:0031418 MF L-ascorbic acid binding>
Concept<GO:0015229 MF L-ascorbic acid transporter activity>
Concept<GO:0019853 BP L-ascorbic acid biosynthetic process>
Concept<GO:0019852 BP L-ascorbic acid metabolic process>
Concept<GO:0033591 BP response to L-ascorbic acid>
Concept<GO:0000044 BP ascorbate stabilization>


In [62]:
goid2statement_lib[0]['GO:0019853']

[Statement(statid='GO:0019853%000', evidences=[Evidence(term=Entity(lemma='L-ascorbic acid', ref='GO:0019853'), text='L-ascorbic acid', start=0, end=15), Evidence(term=Pattern(lemma='biosyntheprocess', ref='annotator'), text='biosynthetic process', start=16, end=36)]),
 Statement(statid='GO:0019853%005', evidences=[Evidence(term=Entity(lemma='L-ascorbic acid', ref='GO:0019853'), text='L-ascorbic acid', start=0, end=15), Evidence(term=Pattern(lemma='form', ref='annotator'), text='formation', start=16, end=25)])]

In [71]:
for goid, concept in gd.items():
    if 'microRNA' in concept.name:
        print(concept)

Concept<GO:0035281 BP pre-microRNA export from nucleus>
Concept<GO:0031053 BP primary microRNA processing>
Concept<GO:0031054 BP pre-microRNA processing>


In [72]:
len(gold_bpmf)

24062

In [26]:
goid2size = dict()
for goid, statements in goid2statement_lib[1].items():
    size = min([len(s.evidences) for s in statements])
    goid2size[goid] = size

In [27]:
gold_size = []
for pmid, goid, start, end, text in tp:
    if goid != 'independent_continuant':
        gold_size.append((pmid, goid, start, end, text, goid2size[goid]))

In [32]:
sorted([i for i in gold_size if i[-1]==2 ], key=lambda x:len(x[4]), reverse=True)

[('15492776',
  'GO:0030509',
  32510,
  32560,
  'pathway provide strong evidence that BMP signaling',
  2),
 ('15588329',
  'GO:0004192',
  14794,
  14838,
  'cathepsin D processing on enzymatic activity',
  2),
 ('15630473',
  'GO:0001775',
  1081,
  1116,
  'cells within a sheet are stimulated',
  2),
 ('15588329', 'GO:0004192', 23174, 23204, 'cathepsin D enzymatic activity', 2),
 ('16109169', 'GO:0048339', 13324, 13353, 'paraxial mesoderm development', 2),
 ('15836427', 'GO:0045289', 33010, 33038, 'luciferase and LacZ activity', 2),
 ('16109169', 'GO:0048339', 12223, 12251, 'paraxial mesoderm production', 2),
 ('16109169', 'GO:0048339', 6274, 6302, 'paraxial mesoderm production', 2),
 ('15005800', 'GO:0021700', 5012, 5038, 'development and maturation', 2),
 ('16579849', 'GO:0043754', 3576, 3602, 'dihydrolipoyl transacylase', 2),
 ('16121255', 'GO:0004576', 23633, 23659, 'oligosaccharyl transferase', 2),
 ('16628246', 'GO:0043065', 10522, 10548, 'up-regulation of apoptosis', 2),
 (

In [35]:
gd['GO:0030509'].labels

['BMP signaling pathway',
 'BMP receptor signaling pathway',
 'BMP signalling pathway',
 'bone morphogenetic protein signaling pathway',
 'bone morphogenetic protein signalling pathway']

In [36]:
goid2statement_lib[0]['GO:0030509']

[Statement(statid='GO:0030509%000', evidences=[Evidence(term=Entity(lemma='BMP signaling', ref='GO:0030509'), text='BMP signaling', start=0, end=13), Evidence(term=Pattern(lemma='pathway', ref='annotator'), text='pathway', start=14, end=21)])]

In [21]:
sum([len(v) for v in goid2statement_lib[0].items()])

48674

In [24]:
sum([len(v) for v in goid2statement_lib[1].items()])

48674

In [37]:
gs0, gs1 = goid2statement_lib[0], goid2statement_lib[1]

In [42]:
d0, d1 = dict(), dict()
for statements in gs0.values():
    for s in statements:
        d0[s.statid] = s.evidences

for statements in gs1.values():
    for s in statements:
        d1[s.statid] = s.evidences



In [44]:
ck = []
for statid, mcs in d0.items():
    if mcs == d1[statid]:
        ck.append(1)
    else:
        ck.append(0)
Counter(ck)

Counter({0: 14233, 1: 15936})

In [45]:
14233/len(ck)

0.4717756637608141

In [46]:
len(ck)

30169

In [43]:
len(d0), len(d1)

(30169, 30169)

In [38]:

    if gs0[goid] == gs1[goid]:
        m.append(1)
    else:
        m.append(0)
        
from collections import Counter
Counter(m)

Counter({0: 682, 1: 23655})

In [39]:
len(gs0)

24337

In [33]:
cb0, cb1

(ClusterBook <30933 clusters, 61054 terms>,
 ClusterBook <21738 clusters, 61054 terms>)

In [34]:
for c in cb0.clusters:
    print(c)
    break

Cluster(Entity(lemma='purine nucleotide', ref='GO:0017076'))<1 terms>


In [30]:
'{:.3f}'.format(1/3)

'0.333'

In [3]:
from experiment import corpus as c
corpus = c.Corpus.from_dir('data/craft-1.0/articles/txt', 'testing')

In [4]:
corpus

Corpus<67 documents, 21646 sentences, 'testing'>