In [9]:
from tcre import integration

In [5]:
nlp = integration.get_scispacy_pipeline()

In [6]:
doc = nlp('IFNγ-producing CD4+ T cells')
for t in doc:
    print(t)

IFNγ-producing
CD4
+
T
cells


In [32]:
from tcre import supervision
from snorkel import SnorkelSession
from snorkel.models import Candidate, GoldLabel
classes = supervision.get_candidate_classes()
session = SnorkelSession()

In [117]:
#c = session.query(Candidate).limit(1).one()
g = session.query(GoldLabel).filter(GoldLabel.value == 1).limit(30).all()[12]

In [118]:
c = g.candidate

In [119]:
sent = c.get_parent()

In [120]:
sent.text

'In mice, in vivo differentiation of Tr1 cells was dependent on the presence of the aryl hydrocarbon receptor, c-Maf and IL-27.'

In [121]:
from snorkel.viewer import Viewer, SentenceNgramViewer

In [122]:
SentenceNgramViewer([c], session)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[0]]], html='<head>\n<style>\nspan.candidate {\n    background-color: rgba(255,255,…

In [126]:
doc = nlp(sent.text)
#doc = nlp('IL-4 is an inducer of Th2 cells.')

In [127]:
from spacy import displacy

In [128]:
displacy.render([doc], jupyter=True)

In [98]:
def get_tkn(w, tokens=None):
    """Return the first token whose text equals ``w``, or None if nothing matches.

    Args:
        w: Exact text to compare against each token's ``text`` attribute.
        tokens: Iterable of objects exposing a ``text`` attribute. When omitted,
            the notebook-level ``doc`` is searched so existing ``get_tkn(w)``
            calls keep working.

    Returns:
        The first matching token in iteration order, or None.
    """
    if tokens is None:
        # `doc` is rebound by several cells in this notebook; pass `tokens`
        # explicitly to avoid depending on which cell ran last
        tokens = doc
    return next((t for t in tokens if t.text == w), None)
vrb = get_tkn('inhibit')
vrb

inhibit

In [102]:
vrb.pos_, vrb.tag_

('VERB', 'VBP')

In [99]:
obj = get_tkn('IL-6')
obj

IL-6

In [138]:
list(vrb.subtree)

[For,
 example,
 ,,
 inflammatory,
 cytokines,
 IFN-γ,
 and,
 IL-4,
 inhibit,
 TGF-β-induced,
 iTreg,
 cells,
 ,,
 while,
 IL-6,
 directs,
 Th17,
 cell,
 differentiation,
 in,
 the,
 presence,
 of,
 TGF-β,
 .]

In [139]:
[t.i for t in vrb.subtree]

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24]

## Verb Forms

In [106]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fa65e739f98>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fa65e63e2e8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fa6572d4168>)]

In [108]:
from tcre import supervision
from snorkel import SnorkelSession
from snorkel.models import Candidate, GoldLabel
import pandas as pd
classes = supervision.get_candidate_classes()
session = SnorkelSession()
gls = session.query(GoldLabel).filter(GoldLabel.value == 1).limit(1000).all()

In [113]:
# Collect text, fine-grained tag and lemma for every token tagged as a verb in the
# sentences behind the gold labels. The loop variables live inside the comprehension,
# so the notebook-level `c` and `doc` that other cells inspect are not overwritten.
vrbs = [
    dict(text=tkn.text, tag=tkn.tag_, lemma=tkn.lemma_)
    for gl in gls
    for tkn in nlp(gl.candidate.get_parent().text)
    if tkn.pos_ == 'VERB'
]
df = pd.DataFrame(vrbs)
df.head()

Unnamed: 0,lemma,tag,text
0,have,VBP,have
1,show,VBN,shown
2,drive,VBZ,drives
3,drive,VBZ,drives
4,have,VBP,have


In [112]:
pd.set_option('display.max_colwidth', 250)
df.groupby('tag')['text'].unique()

tag
VB     [be, suppress, protect, produce, prevent, activate, develop, induce, regulate, open, drive, initiate, promote, modulate, differentiate, polarize, act, form, control, enforce, have, generate, enhance, augment, Note, reduce, abolish, secrete, co-ex...
VBD    [were, promoted, inhibited, enhanced, was, induced, reversed, focused, investigated, played, revealed, repressed, displayed, added, tested, examined, demonstrated, showed, augmented, suppressed, failed, had, became, used, concluded, enlarged, fou...
VBG    [maintaining, occurring, transforming, including, producing, signaling, Transforming, conferring, expressing, inducing, allowing, acting, indicating, residing, upregulating, inhibiting, polarizing, Using, neutralizing, mediating, implying, tailor...
VBN    [shown, generated, induced, decreased, divided, TGF-β-induced, been, identified, presumed, differentiated, required, driven, considered, directed, challenged, involved, protected, exposed, illustrated, adapted, blocked

In [114]:
pd.set_option('display.max_colwidth', 250)
df.groupby('tag')['lemma'].unique()

tag
VB     [be, suppress, protect, produce, prevent, activate, develop, induce, regulate, open, drive, initiate, promote, modulate, differentiate, polarize, act, form, control, enforce, have, generate, enhance, augment, note, reduce, abolish, secrete, co-ex...
VBD    [be, promote, inhibit, enhance, induce, reverse, focus, investigate, play, reveal, repress, display, add, test, examine, demonstrate, show, augment, suppress, fail, have, become, use, conclude, enlarge, find, drive, do, upregulate, report, specia...
VBG    [maintain, occur, transform, include, produce, signal, confer, express, induce, allow, act, indicate, reside, upregulate, inhibit, polarize, use, neutralize, mediate, imply, tailor, facilitate, skew, regulate, control, examine, respond, suggest, ...
VBN    [show, generate, induce, decrease, divide, tgf-β-induc, be, identify, presume, differentiate, require, drive, consider, direct, challenge, involve, protect, expose, illustrate, adapt, block, know, report, demonstrate, 

spaCy fine-grained POS tags for verbs (source: https://spacy.io/api/annotation#pos-tagging):

```
VB	VERB	VerbForm=inf	verb, base form
VBD	VERB	VerbForm=fin Tense=past	verb, past tense
VBG	VERB	VerbForm=part Tense=pres Aspect=prog	verb, gerund or present participle
VBN	VERB	VerbForm=part Tense=past Aspect=perf	verb, past participle
VBP	VERB	VerbForm=fin Tense=pres	verb, non-3rd person singular present
VBZ	VERB	VerbForm=fin Tense=pres Number=sing Person=3	verb, 3rd person singular present
```

In [135]:
import os.path as osp
from tcre.env import *
# Read the verb table stored under supervision/immunexpresso in the repo data directory
dfv = pd.read_csv(
    osp.join(REPO_DATA_DIR, 'supervision', 'immunexpresso', 'verbs.csv')
)
# Distinct verbs whose polarity column is 'negative' (single .loc lookup instead of chained indexing)
dfv.loc[dfv['polarity'] == 'negative', 'verb'].unique()

array(['abolish', 'abrogate', 'aggravate', 'alleviate', 'antagonize',
       'arise', 'arrested', 'attenuate', 'augment', 'block', 'cleave',
       'confined', 'counteract', 'damage', 'deactivates', 'decline',
       'decrease', 'degrade', 'delay', 'delete', 'deplete', 'depress',
       'deprive', 'desensitize', 'destabilize', 'destroy', 'detached',
       'diminish', 'disable', 'disappeared', 'disrupt', 'dissect',
       'down-modulate', 'downregulate', 'down-regulate', 'draining',
       'dysregulate', 'eliminate', 'exacerbate', 'excise', 'fail',
       'hinders', 'impair', 'inactivate', 'inhibit', 'interfere',
       'interfering', 'kill', 'lack', 'limit', 'lose', 'lost', 'lower',
       'lyse', 'minimize', 'mitigate', 'obstruct', 'oppose', 'overridden',
       'predominate', 'prevent', 'reduce', 'reject', 'remove', 'repress',
       'restraining', 'restrict', 'reverse', 'spikes', 'stop', 'stress',
       'suppress', 'sustain', 'truncated', 'understand', 'underwent',
       'undiffe

In [140]:
# Verb lists keyed by category name.
# NOTE(review): the 'negative' list appears to mirror the negative-polarity verbs
# printed from verbs.csv in the preceding cell, including entries such as 'augment',
# 'sustain' and 'understand' that do not read as negative -- confirm before relying
# on them.
VERB_MAP = dict(
    induction=[
        'induce', 'drive', 'direct', 'regulate', 'control',
        'promote', 'rise', 'mediate', 'cause', 'depend',
        'create', 'generate', 'need', 'require', 'rely',
        'polarize', 'differentiate', 'develop', 'form',
    ],
    secretion=[
        'secrete', 'express', 'release', 'produce',
        'exhibit', 'display', 'show',
    ],
    negative=[
        'abolish', 'abrogate', 'aggravate', 'alleviate', 'antagonize', 'arise',
        'arrested', 'attenuate', 'augment', 'block', 'cleave', 'confined',
        'counteract', 'damage', 'deactivates', 'decline', 'decrease', 'degrade',
        'delay', 'delete', 'deplete', 'depress', 'deprive', 'desensitize',
        'destabilize', 'destroy', 'detached', 'diminish', 'disable', 'disappeared',
        'disrupt', 'dissect', 'down-modulate', 'downregulate', 'down-regulate', 'draining',
        'dysregulate', 'eliminate', 'exacerbate', 'excise', 'fail', 'hinders',
        'impair', 'inactivate', 'inhibit', 'interfere', 'interfering', 'kill',
        'lack', 'limit', 'lose', 'lost', 'lower', 'lyse',
        'minimize', 'mitigate', 'obstruct', 'oppose', 'overridden', 'predominate',
        'prevent', 'reduce', 'reject', 'remove', 'repress', 'restraining',
        'restrict', 'reverse', 'spikes', 'stop', 'stress', 'suppress',
        'sustain', 'truncated', 'understand', 'underwent', 'undifferentiate', 'unstimulate',
    ],
)

# Used in tcre.supervision.DepParse

### Dependency Parse Performance

In [170]:
from tcre import supervision
from snorkel import SnorkelSession
from snorkel.models import Candidate, GoldLabel
classes = supervision.get_candidate_classes()
session = SnorkelSession()

# All gold labels with value -1, followed by all gold labels with value 1
gls = [
    gl
    for value in (-1, 1)
    for gl in session.query(GoldLabel).filter(GoldLabel.value == value).all()
]

In [177]:
dep_nlp = supervision.get_dep_parse_nlp()

In [270]:
# Re-import tcre.supervision so edits to the module are picked up without restarting
# the kernel. importlib.reload replaces imp.reload: the `imp` module is deprecated
# and no longer exists in Python 3.12+.
import importlib
importlib.reload(supervision)

<module 'tcre.supervision' from '/lab/repos/t-cell-relation-extraction/src/tcre/supervision.py'>

In [271]:
#dep_parse = supervision.DepParse(dep_nlp)
dep_parse = supervision.DependencyParseTree()

In [272]:
dep_parse.is_candidate_relation(gls[103].candidate)

False

In [278]:
# One row per gold label, pairing the gold value with the dependency-parse prediction
df = pd.DataFrame([
    {
        # max(..., 0) maps a gold value of -1 to 0 and leaves 1 unchanged
        'y_true': max(gl.value, 0),
        # Boolean prediction cast to 0/1
        'y_pred': int(dep_parse.is_candidate_relation(gl.candidate)),
        'gl': gl,
        'type': gl.candidate.type,
        'cand_id': gl.candidate.id,
    }
    for gl in gls
])
df.head()

Unnamed: 0,cand_id,gl,type,y_pred,y_true
0,40065,GoldLabel (doccano-inducing_cytokine = -1),inducing_cytokine,0,0
1,41843,GoldLabel (doccano-inducing_cytokine = -1),inducing_cytokine,1,0
2,84968,GoldLabel (doccano-inducing_cytokine = -1),inducing_cytokine,0,0
3,40056,GoldLabel (doccano-inducing_cytokine = -1),inducing_cytokine,0,0
4,73758,GoldLabel (doccano-inducing_cytokine = -1),inducing_cytokine,1,0


In [279]:
df.groupby(['y_pred', 'y_true']).size().unstack()

y_true,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1523,346
1,230,230


In [280]:
df.groupby(['type', 'y_pred', 'y_true']).size().unstack()

Unnamed: 0_level_0,y_true,0,1
type,y_pred,Unnamed: 2_level_1,Unnamed: 3_level_1
inducing_cytokine,0,543,85
inducing_cytokine,1,81,91
inducing_transcription_factor,0,464,151
inducing_transcription_factor,1,97,51
secreted_cytokine,0,516,110
secreted_cytokine,1,52,88


In [283]:
from sklearn.metrics import f1_score
df.groupby('type').apply(lambda g: f1_score(g['y_true'], g['y_pred']))

type
inducing_cytokine                0.522989
inducing_transcription_factor    0.291429
secreted_cytokine                0.520710
dtype: float64

In [277]:
from snorkel.viewer import SentenceNgramViewer
# False negatives for one relation type: the gold label is positive but the prediction differs
dfs = df[(df['type'] == 'inducing_transcription_factor') & (df['y_pred'] != df['y_true']) & (df['y_true'] == 1)]
# Cap the sample size at the number of matching rows (sample() raises when asked for more
# rows than exist) and fix the seed so the same candidates are shown on every run
dfs = dfs.sample(min(10, len(dfs)), random_state=0)
cands = [gl.candidate for gl in dfs['gl'].values]
SentenceNgramViewer(cands, session)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[5], [1], [8]], [[6], [9], [3]], [[0], [7], [2]], [[4]]], html='<head>\n<style>\nsp…

### TreeLib

In [212]:
c = gls[0].candidate
c

InducingCytokine(Span("b'IL-12'", sentence=510800, chars=[234,238], words=[36,36]), Span("b'Th2'", sentence=510800, chars=[206,208], words=[32,32]))

In [247]:
sent = c.get_parent()
sent

Sentence(Document PMC4224555,24,b'Recently, many studies reported that miR-21 is involved in the development of inflammatory diseases and T-cell differentiation \xe2\x80\x93. miR-21 is increased in allergic diseases in both mouse and human , promotes Th2 responses by inhibiting IL-12 in myeloid cells , and is expressed at higher levels in Tregs compared to conventional CD4+CD25\xe2\x88\x92 T cells ; but the functional significance of Treg-specific expression of miR-21 has not been ascertained.')

In [248]:
doc = nlp(sent.text)
displacy.render(doc, jupyter=True)

In [223]:
len(doc) == len(sent.words)

True

In [243]:
#[(doc[i].head.i, sent.dep_labels[i], sent.dep_parents[i]) for i in range(len(doc))]

In [232]:
# Imported in this cell as well: on a top-to-bottom run this cell executes before the
# cell defining get_parse_tree, which is the only other place Tree is imported
from treelib import Tree

# Minimal two-node tree: 'Y' attached under the root 'X'
tree = Tree()
tree.create_node(identifier='X')
tree.create_node(identifier='Y', parent='X')

Node(tag=Y, identifier=Y, data=None)

In [269]:
# sent.pos_tags

In [260]:
from treelib import Tree

def get_parse_tree(candidate):
    """Build a treelib ``Tree`` of the dependency parse for a candidate's sentence.

    The sentence is taken from ``candidate.get_parent()`` and is read through its
    ``words``, ``dep_labels`` and ``dep_parents`` attributes, all indexed by token
    position. Each tree node uses the token index as identifier and the token text
    as tag, and carries a dict with ``token``, ``dep_label``, ``dep_parent``
    (zero-based), ``index`` and ``node_id`` as data.

    Args:
        candidate: Object whose ``get_parent()`` returns the parsed sentence.

    Returns:
        Tree holding exactly one node per token of the sentence.

    Raises:
        ValueError: If the parent links form a cycle that never reaches a token
            labelled ``ROOT``.
        AssertionError: If the finished tree does not hold one node per token.
    """
    sent = candidate.get_parent()
    words = sent.words
    tree = Tree()

    # Map token index to token data
    nodes = {
        i: dict(
            token=words[i], dep_label=sent.dep_labels[i],
            # Dep parents are stored with one-based index
            dep_parent=sent.dep_parents[i] - 1,
            index=i, node_id=i
        )
        for i in range(len(words))
    }

    # Token indexes whose ancestors are currently being resolved (cycle detection)
    in_progress = set()

    # Recursive method for building tree that ensures parent is always added first
    def add_node(n):
        node_id = n['node_id']
        # A node is only ever inserted after its parent, so once it is in the tree
        # all of its ancestors are too and there is no need to walk up again
        if tree.contains(node_id):
            return tree.get_node(node_id)
        if node_id in in_progress:
            raise ValueError('Cycle in dependency parents at token index {}'.format(node_id))
        in_progress.add(node_id)
        parent_id = None
        if n['dep_label'] != 'ROOT':
            parent_id = add_node(nodes[n['dep_parent']]).identifier
        in_progress.discard(node_id)
        return tree.create_node(tag=n['token'], identifier=node_id, parent=parent_id, data=n)

    # Build tree up from every possible starting token
    for i in range(len(words)):
        add_node(nodes[i])
    assert len(tree) == len(words)
    return tree

In [261]:
tree = get_parse_tree(c)

In [262]:
len(tree), len(c.get_parent().words)

(70, 70)

In [263]:
tree.subtree(3).show()

studies
└── many



In [265]:
for n in tree.subtree(3).all_nodes():
    print(n)

Node(tag=studies, identifier=3, data={'token': 'studies', 'dep_label': 'nsubj', 'dep_parent': 4, 'index': 3, 'node_id': 3})
Node(tag=many, identifier=2, data={'token': 'many', 'dep_label': 'amod', 'dep_parent': 3, 'index': 2, 'node_id': 2})


In [255]:
tree.show()

reported
├── ,
├── .
├── Recently
├── increased
│   ├── ,
│   ├── ,
│   ├── ;
│   ├── and
│   ├── ascertained
│   │   ├── been
│   │   ├── has
│   │   ├── not
│   │   └── significance
│   │       ├── expression
│   │       │   ├── Treg-specific
│   │       │   ├── miR-21
│   │       │   │   └── of
│   │       │   └── of
│   │       ├── functional
│   │       └── the
│   ├── but
│   ├── diseases
│   │   ├── allergic
│   │   ├── in
│   │   └── mouse
│   │       ├── and
│   │       ├── both
│   │       ├── human
│   │       └── in
│   ├── expressed
│   │   ├── cells
│   │   │   ├── CD4+CD25−
│   │   │   ├── T
│   │   │   ├── compared
│   │   │   ├── conventional
│   │   │   └── to
│   │   ├── is
│   │   └── levels
│   │       ├── Tregs
│   │       │   └── in
│   │       ├── at
│   │       └── higher
│   ├── is
│   └── promotes
│       ├── inhibiting
│       │   ├── IL-12
│       │   ├── by
│       │   └── cells
│       │       ├── in
│       │       └── myeloid
│       └── responses
│    