## Load Gold Candidate Labels

In [63]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import collections
from tcre.env import *
from tcre.supervision import *

In [64]:
def load_labels_01():
    """Load positive relation labels exported from BRAT (dev split).

    Reads ``annotation/brat_export.csv`` under ``REPO_DATA_DIR``, converts the
    exclusive entity end offsets to inclusive ones and maps the BRAT relation
    names onto the relation field constants.

    Returns:
        DataFrame with columns doc_id, type, e1_text, e1_start_chr, e1_end_chr,
        e2_text, e2_start_chr, e2_end_chr, value (always 1) and annotator
        (always 'brat').

    Raises:
        AssertionError: if an entity span ends before it starts, or if a
            relation name in the export has no mapping.
    """
    # Note: Labels are exclusive on ends of char range
    path = osp.join(REPO_DATA_DIR, 'annotation', 'brat_export.csv')
    df = pd.read_csv(path)
    for i in [1, 2]:
        start_col = 'e{}_start_chr'.format(i)
        end_col = 'e{}_end_chr'.format(i)
        # Convert to inclusive end range
        df[end_col] = df[end_col] - 1
        # With an inclusive end a one-character span has end == start, so only
        # end < start indicates a malformed span
        assert (df[end_col] >= df[start_col]).all(), \
            'Found e{} spans whose end precedes their start'.format(i)
    df = df.rename(columns={'id': 'doc_id'})
    df['type'] = df['rel_typ'].map({
        'Induction': REL_FIELD_INDUCING_CYTOKINE,
        'Secretion': REL_FIELD_SECRETED_CYTOKINE,
        'Differentiation': REL_FIELD_INDUCING_TRANSCRIPTION_FACTOR
    })
    # Series.map yields NaN for relation names missing from the mapping above
    assert df['type'].notnull().all(), 'Found relation types with no field mapping'
    # All of these annotations are only positive
    df['value'] = 1
    df['annotator'] = 'brat'
    df = df[[
        'doc_id', 'type', 'e1_text', 'e1_start_chr', 'e1_end_chr',
        'e2_text', 'e2_start_chr', 'e2_end_chr', 'value', 'annotator'
    ]]
    return df
df1 = load_labels_01()
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 10 columns):
doc_id          392 non-null object
type            392 non-null object
e1_text         392 non-null object
e1_start_chr    392 non-null int64
e1_end_chr      392 non-null int64
e2_text         392 non-null object
e2_start_chr    392 non-null int64
e2_end_chr      392 non-null int64
value           392 non-null int64
annotator       392 non-null object
dtypes: int64(5), object(5)
memory usage: 30.7+ KB


In [65]:
df1.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator
0,PMC4451961,secreted_cytokine,IL-17,1338,1342,Vγ4 T,1291,1295,1,brat
1,PMC4451961,secreted_cytokine,IL-17,1460,1464,Vγ4 T,1291,1295,1,brat
2,PMC4451961,secreted_cytokine,IL-17,1903,1907,Vγ4 T,1850,1854,1,brat
3,PMC4451961,secreted_cytokine,(IFN)-γ,3730,3736,γδ T,3825,3828,1,brat
4,PMC4451961,secreted_cytokine,(IL)-17,3754,3760,γδ T,3825,3828,1,brat


In [66]:
def load_labels_02():
    """Load relation labels exported from the SnorkelNgramViewer annotation tool.

    Only rows with ``split == 3`` are kept. The ``c1_*``/``c2_*`` columns are
    renamed to the ``e1_*``/``e2_*`` naming used for the BRAT labels and an
    ``annotator`` column is added.

    Returns:
        DataFrame with columns doc_id, type, e1_text, e1_start_chr, e1_end_chr,
        e2_text, e2_start_chr, e2_end_chr, value and annotator.
    """
    # Note: Labels are already inclusive on ends of char range
    csv_path = osp.join(REPO_DATA_DIR, 'annotation', 'ngramviewer_export.csv')
    source_columns = [
        'doc_id', 'field',
        'c1_text', 'c1_char_start', 'c1_char_end',
        'c2_text', 'c2_char_start', 'c2_char_end',
        'value',
    ]
    renamed_columns = {
        'field': 'type',
        'c1_text': 'e1_text',
        'c1_char_start': 'e1_start_chr',
        'c1_char_end': 'e1_end_chr',
        'c2_text': 'e2_text',
        'c2_char_start': 'e2_start_chr',
        'c2_char_end': 'e2_end_chr',
    }
    exported = pd.read_csv(csv_path)
    # NOTE(review): 3 is the raw split id used by the export — confirm which split it denotes
    selected = exported.loc[exported['split'] == 3, source_columns]
    return selected.rename(columns=renamed_columns).assign(annotator='snorkelngramviewer')
df2 = load_labels_02()
df2.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator
300,PMC5799164,inducing_cytokine,IL-12,217,221,cytotoxic T,14,24,-1,snorkelngramviewer
301,PMC3378591,inducing_cytokine,IFN-γ,201,205,Th1,72,74,-1,snorkelngramviewer
302,PMC6279938,inducing_cytokine,IFN-γ,167,171,Th1,190,192,-1,snorkelngramviewer
303,PMC2646562,inducing_cytokine,IL-6,36,39,Th17,102,105,-1,snorkelngramviewer
304,PMC3486158,inducing_cytokine,IFN-γ,184,188,NKT,127,129,-1,snorkelngramviewer


## Merge

In [67]:
def get_stable_id(r):
    """Build the stable id of a single span: ``<doc_id>::span:<start_chr>:<end_chr>``."""
    return '{doc}::span:{start}:{end}'.format(
        doc=r['doc_id'], start=r['start_chr'], end=r['end_chr'])

def get_context_stable_id(r):
    """Build the stable id of an entity pair by joining both span ids with ``~~``.

    ``r`` is a label row holding ``doc_id`` plus ``e1_``/``e2_`` prefixed
    ``start_chr`` and ``end_chr`` values; the e1 span id comes first.
    """
    span_ids = []
    for prefix in ('e1_', 'e2_'):
        span = {
            'doc_id': r['doc_id'],
            'start_chr': r[prefix + 'start_chr'],
            'end_chr': r[prefix + 'end_chr'],
        }
        span_ids.append(get_stable_id(span))
    return "~~".join(span_ids)

# Stack the BRAT and SnorkelNgramViewer labels into one frame. The row index of
# each input is kept as-is (no ignore_index), so index values can repeat.
df = pd.concat([df1, df2])
# Per row: "<e1 span stable id>~~<e2 span stable id>"
df['context_stable_ids'] = df.apply(get_context_stable_id, axis=1)
df.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator,context_stable_ids
0,PMC4451961,secreted_cytokine,IL-17,1338,1342,Vγ4 T,1291,1295,1,brat,PMC4451961::span:1338:1342~~PMC4451961::span:1...
1,PMC4451961,secreted_cytokine,IL-17,1460,1464,Vγ4 T,1291,1295,1,brat,PMC4451961::span:1460:1464~~PMC4451961::span:1...
2,PMC4451961,secreted_cytokine,IL-17,1903,1907,Vγ4 T,1850,1854,1,brat,PMC4451961::span:1903:1907~~PMC4451961::span:1...
3,PMC4451961,secreted_cytokine,(IFN)-γ,3730,3736,γδ T,3825,3828,1,brat,PMC4451961::span:3730:3736~~PMC4451961::span:3...
4,PMC4451961,secreted_cytokine,(IL)-17,3754,3760,γδ T,3825,3828,1,brat,PMC4451961::span:3754:3760~~PMC4451961::span:3...


In [None]:
# Plan for moving labeled candidates out of the INFER split.
# Fields per label: doc_id, eN_[start_chr|end_chr] (inclusive), annotator, relation type, value, split
# - Take a DB backup
# - Add stable labels on the INFER split
# - Move every labeled candidate to its target split, matching on the candidate's context stable id
# - Check that nothing left on the INFER split has labels

### Load Labels

In [68]:
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence, Candidate
from snorkel.models import StableLabel
from snorkel.db_helpers import reload_annotator_labels
# DB session used by the label-loading cells below
session = SnorkelSession()
# Candidate classes; indexed by key below, and each entry exposes a `.field` relation name.
# get_candidate_classes is not imported by name here — presumably it comes from one of the
# tcre star imports (TODO confirm)
classes = get_candidate_classes()

In [69]:
def load_labels(session, relations, annotator, candidate_class, split):
    """Store StableLabels for one relation class / annotator and reload its labels.

    Args:
        session: DB session used to query and add StableLabel rows.
        relations: DataFrame of labels; every row must have ``type`` equal to
            ``candidate_class.field`` and ``annotator`` equal to ``annotator``.
            Also read per row: ``value`` (-1 or 1) and ``context_stable_ids``.
        annotator: annotator name shared by all rows.
        candidate_class: object exposing ``field`` and ``subclass``.
        split: split passed to ``reload_annotator_labels`` to find candidates.

    Returns:
        Whatever ``reload_annotator_labels`` returns.

    Raises:
        AssertionError: on mixed relation types/annotators or a value outside {-1, 1}.
    """
    assert (relations['type'] == candidate_class.field).all(), 'Relations data frame has conflicting relation classes'
    assert (relations['annotator'] == annotator).all(), 'Relations data frame has conflicting annotator'
    # The annotator name is made specific to the candidate class (see the note below)
    annotator_name = '-'.join([annotator, candidate_class.field])

    for _, row in relations.iterrows():
        value = int(row['value'])
        assert value in [-1, 1], 'Row has invalid relation value: {}'.format(row)
        stable_ids = row['context_stable_ids']

        n_existing = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_ids)
            .filter(StableLabel.annotator_name == annotator_name)
            .count()
        )
        # Only the first value seen for an annotator + context_stable_ids pair is stored
        if n_existing != 0:
            continue
        session.add(StableLabel(
            context_stable_ids=stable_ids,
            annotator_name=annotator_name,
            value=value
        ))

    session.commit()
    # The call below creates GoldLabel records for each StableLabel above after
    # selecting them by annotator_name. The annotator name should differ per
    # candidate class if classes might share context_stable_ids, since otherwise
    # only the first value for the same annotator + context_stable_ids is saved.
    # The split is used to find the candidates needed to create GoldLabels; it is
    # not used for StableLabel filtering (filter_label_split=False) because the
    # labels above were created without a split.
    return reload_annotator_labels(
        session, candidate_class.subclass, annotator_name, split=split,
        filter_label_split=False, create_missing_cands=False)

In [70]:
# Results keyed by (relation type, annotator), in group iteration order
labels = collections.OrderedDict()
# Relation field name -> candidate class
clmap = {classes[name].field: classes[name] for name in classes}
grps = df.groupby(['type', 'annotator'])

for key, group in grps:
    rel_type, annotator = key
    print('Loading labels for class={}, annotator={}'.format(rel_type, annotator))
    # Each result is a tuple with:
    # 0: List of GoldLabel instances added
    # 1: List of StableLabel instances for which no candidate could be found
    labels[key] = load_labels(session, group, annotator, clmap[rel_type], SPLIT_INFER)

Loading labels for class=inducing_cytokine, annotator=brat
AnnotatorLabels created: 1, missed: 144
Loading labels for class=inducing_cytokine, annotator=snorkelngramviewer
AnnotatorLabels created: 0, missed: 197
Loading labels for class=inducing_transcription_factor, annotator=brat
AnnotatorLabels created: 3, missed: 92
Loading labels for class=inducing_transcription_factor, annotator=snorkelngramviewer
AnnotatorLabels created: 1, missed: 216
Loading labels for class=secreted_cytokine, annotator=brat
AnnotatorLabels created: 0, missed: 152
Loading labels for class=secreted_cytokine, annotator=snorkelngramviewer
AnnotatorLabels created: 0, missed: 198


In [71]:
# Show missing candidates
from snorkel.models import Context 

# Show which entity types and relations were unable to be matched with spans
# extracted and inserted into snorkel db
def summarize_missing_candidates(labels):
    """Tabulate, per unmatched StableLabel, which entity spans are absent from the DB.

    Args:
        labels: mapping of (relation, annotator) -> result tuple whose element 1
            is the list of StableLabels for which no candidate was found.

    Returns:
        DataFrame with columns relation, doc_id and missing, where ``missing``
        is a comma-joined list of the entity types whose span stable id has no
        matching Context row.
    """
    records = []
    for key, result in labels.items():
        relation = key[0]
        entity_types = clmap[relation].entity_types
        for stable_label in result[1]:
            stable_ids = stable_label.context_stable_ids
            # The document id is the prefix before the first '::'
            doc_id = stable_ids.split('::')[0]
            # Span ids are positionally aligned with the class entity types
            missing_types = [
                entity_types[position]
                for position, span_id in enumerate(stable_ids.split('~~'))
                if not session.query(Context).filter(Context.stable_id == span_id).all()
            ]
            records.append((relation, doc_id, ','.join(missing_types)))
    return pd.DataFrame(records, columns=['relation', 'doc_id', 'missing'])
df_miss = summarize_missing_candidates(labels)
df_miss.groupby(['relation', 'missing']).size()

relation                       missing                              
inducing_cytokine              cytokine                                   2
                               cytokine,immune_cell_type                338
                               immune_cell_type                           1
inducing_transcription_factor  transcription_factor,immune_cell_type    308
secreted_cytokine              cytokine                                   2
                               cytokine,immune_cell_type                347
                               immune_cell_type                           1
dtype: int64

In [77]:
# First label group -> element 0 of its result tuple (the GoldLabels added) -> first GoldLabel
gl = list(labels.values())[0][0][0]

In [79]:
# Two parent hops up from the candidate; per the reprs displayed in the cells
# below this goes candidate -> Sentence -> Document
doc = gl.candidate.get_parent().get_parent()

In [83]:
gl.candidate

InducingCytokine(Span("b'IL-27'", sentence=101221, chars=[120,124], words=[22,22]), Span("b'Tr1'", sentence=101221, chars=[36,38], words=[7,7]))

In [88]:
# Direct parent of the candidate (displayed as a Sentence in the next cell)
sent = gl.candidate.get_parent()

In [89]:
sent

Sentence(Document PMC4168117,5,b'In mice, in vivo differentiation of Tr1 cells was dependent on the presence of the aryl hydrocarbon receptor, c-Maf and IL-27.')

In [85]:
# BRAT rows for the document of the matched candidate, to compare their char
# offsets with the offsets shown on the candidate's spans
df1[df1['doc_id'] == 'PMC4168117']

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator
70,PMC4168117,inducing_transcription_factor,c-Maf,952,956,Tr1,878,880,1,brat
71,PMC4168117,inducing_cytokine,IL-27,962,966,Tr1,878,880,1,brat
72,PMC4168117,inducing_cytokine,transforming growth factor-β,1940,1967,iTreg,1864,1868,1,brat
73,PMC4168117,inducing_cytokine,TGF-β,1970,1974,iTreg,1864,1868,1,brat
74,PMC4168117,secreted_cytokine,interleukin (IL)-10,2027,2045,iTreg,1864,1868,1,brat
75,PMC4168117,secreted_cytokine,TGF-β,2051,2055,iTreg,1864,1868,1,brat
76,PMC4168117,inducing_transcription_factor,c-Maf,2287,2291,Tr1,2183,2185,1,brat
77,PMC4168117,inducing_cytokine,IL-27,2296,2300,Tr1,2183,2185,1,brat
78,PMC4168117,secreted_cytokine,IL-10,2455,2459,Tr1,2363,2365,1,brat
79,PMC4168117,secreted_cytokine,interferon gamma,2473,2488,Tr1,2363,2365,1,brat


In [81]:
doc

Document PMC4168117

### Assign Splits

In [72]:
# Create data frame containing newly added candidates
# One row per GoldLabel that was actually created (element 0 of each result tuple).
# Each label is a GoldLabel(candidate=candidate, key=ak, value=sl.value) whose key
# is a GoldLabelKey(name=annotator_name).
dfc = pd.DataFrame([
    {
        'type': key[0],
        'annotator': key[1],
        'cand_id': gold_label.candidate.id,
        'annotator_name': gold_label.key.name,
        'value': gold_label.value,
    }
    for key, result in labels.items()
    for gold_label in result[0]
])
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
annotator         5 non-null object
annotator_name    5 non-null object
cand_id           5 non-null int64
type              5 non-null object
value             5 non-null int64
dtypes: int64(2), object(3)
memory usage: 280.0+ bytes


In [73]:
dfc.groupby(['type', 'annotator', 'annotator_name']).size()

type                           annotator           annotator_name                                  
inducing_cytokine              brat                brat-inducing_cytokine                              1
inducing_transcription_factor  brat                brat-inducing_transcription_factor                  3
                               snorkelngramviewer  snorkelngramviewer-inducing_transcription_factor    1
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

def assign_split(g):
    """Add an integer ``split`` column to the candidates of one annotator.

    Args:
        g: DataFrame of candidates from a single annotator; must contain the
            ``annotator`` and ``value`` columns.

    Returns:
        Copy of ``g`` with a ``split`` column: SPLIT_DEV for every 'brat' row,
        or a stratified 50/50 assignment to SPLIT_VAL / SPLIT_TEST for
        'snorkelngramviewer' rows.

    Raises:
        AssertionError: if ``g`` mixes several annotators.
        ValueError: if the annotator is not supported.
    """
    assert g['annotator'].nunique() == 1
    annotator = g['annotator'].iloc[0]

    # Assign all BRAT annotations to dev
    if annotator == 'brat':
        return g.assign(split=SPLIT_DEV)

    # Do stratified assignment of ngramviewer annotations to val and test
    if annotator == 'snorkelngramviewer':
        idx = np.arange(len(g))
        # NOTE(review): a stratified split presumably needs enough rows per label
        # value; very small groups may make train_test_split raise — confirm
        val_idx, test_idx = train_test_split(
            idx, stratify=g['value'].values, random_state=TCRE_SEED, test_size=.5)
        # Allocate as int64: np.zeros defaults to float64, which would turn the
        # split column into floats and fail integer dtype checks on the result
        splits = np.zeros(len(g), dtype=np.int64)
        splits[val_idx] = SPLIT_VAL
        splits[test_idx] = SPLIT_TEST
        return g.assign(split=splits)
    raise ValueError('Annotator {} not supported'.format(annotator))
    
# Assign splits separately for each (annotator, type) group; group_keys=False
# asks pandas not to add the group keys to the result index
dfcs = dfc.groupby(['annotator', 'type'], group_keys=False).apply(assign_split)
# Every row must have received a split id greater than zero
assert (dfcs['split'] > 0).all()
# Split ids must be stored as 64-bit integers
assert dfcs['split'].dtype == np.int64
dfcs.head()

### Move Candidates

In [None]:
for i, r in dfcs.iterrows():
    # dfcs stores the candidate primary key in 'cand_id' (there is no 'candidate'
    # column); cast to a plain int so the DB driver gets a native Python value
    cand_id = int(r['cand_id'])
    # .one() raises unless exactly one candidate has this id
    cand = session.query(Candidate).filter(Candidate.id == cand_id).one()
    # The split assignment is still disabled, so this loop currently only
    # verifies that every candidate exists
    #cand.split = r['split']
session.commit()

In [3]:
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence, Candidate
session = SnorkelSession()

In [30]:
# Names of all documents returned by the DB query (the first element of each result row)
doc_ids = [v[0] for v in session.query(Document.name).all()]
len(doc_ids)

20600

In [13]:
# Keep only the labels whose document name is present in the DB
# NOTE(review): the saved output of the following cell lists columns (text,
# sent_char_offset, c1_cid, ...) that the merged `df` built earlier in this
# notebook does not have, so this cell appears to have been executed against a
# different `df` — re-check after Restart & Run All
dff = df[df['doc_id'].isin(doc_ids)]
len(dff)

684

In [14]:
dff.iloc[0]

doc_id                                                     PMC2646571
text                Down-regulation of Gfi-1 expression by TGF-β i...
sent_char_offset                                                    0
type                                                 InducingCytokine
field                                               inducing_cytokine
split                                                               1
value                                                               1
c1_cid                          CKC59F8990F767EBD4:CKFD4CA0B2B4BC3AE4
c2_cid                          CTBFBDE5121B6748D1:CTBFBDE5121B6748D1
c1_ctxt_id                                                     126965
c1_stable_id                                   PMC2646571::span:39:43
c1_word_start                                                       7
c1_word_end                                                         7
c1_char_start                                                      39
c1_char_end         

#### Debugging

In [43]:
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence, Candidate
from snorkel.models import StableLabel
from snorkel.db_helpers import reload_annotator_labels
# New session for the debugging cells below
session = SnorkelSession()
# First document named 'PMC2646571', or None if there is no such document
doc = session.query(Document).filter(Document.name == 'PMC2646571').first()

In [46]:
# Small sample of documents for inspection (at most 10)
docs = session.query(Document).limit(10).all()

In [47]:
# Pick the second sampled document; this overwrites the `doc` looked up by name above
doc = docs[1]

In [48]:
# Sentences attached to the selected document
sents = doc.sentences

In [59]:
# Inspect one sentence (index 5) and the per-token entity tags shown in the next cells
sent = sents[5]
sent

Sentence(Document PMC3691816,5,b'Activation of purified naive CD4+ T cells from NOX2-deficient mice led to augmented IFN-\xce\xb3 and diminished IL-4 production and an increased ratio of expression of the TH1-specific transcription factor T-bet versus the TH2-specfic transcription factor GATA-3, consistent with a TH1 skewing of na\xc3\xafve T cells.')

In [60]:
sent.entity_types

['O',
 'O',
 'O',
 'immune_cell_type',
 'immune_cell_type',
 'immune_cell_type',
 'immune_cell_type',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'cytokine',
 'O',
 'O',
 'cytokine',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'transcription_factor',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'immune_cell_type',
 'O',
 'O',
 'immune_cell_type',
 'immune_cell_type',
 'O',
 'O']

In [61]:
sent.entity_cids

['O',
 'O',
 'O',
 'CTF2A8D03E20D25B7F:CTC1A9DDE1514D6425',
 'CTF2A8D03E20D25B7F:CTC1A9DDE1514D6425',
 'CTF2A8D03E20D25B7F:CTC1A9DDE1514D6425',
 'CTF2A8D03E20D25B7F:CTC1A9DDE1514D6425',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'CK1B71668FDDECE3CF:CKEF3883BE9F74024F',
 'O',
 'O',
 'CKF0F9E60D7241467D:CKF0F9E60D7241467D',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'TFC92DA64402D18F05:TFC99791BEF24108BC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'CTC3A8C3CBC245616A:CT6621AF17554FFD22',
 'O',
 'O',
 'CT9E267C696796470F:CT591AB86BE24D7A57',
 'CT9E267C696796470F:CT591AB86BE24D7A57',
 'O',
 'O']

In [20]:
from tcre import tagging
from tcre import parsing
# Tagging pipeline built with the JNLPBA component switched off (include_jnlpba=False)
nlp = tagging.get_pipeline(include_jnlpba=False)

In [34]:
# `imp` is deprecated and no longer available on recent Python versions;
# importlib.reload is the supported way to reload a module
import importlib
# Pick up local edits to tcre.parsing without restarting the kernel
importlib.reload(parsing)
parser = parsing.SpaCyParser(nlp, ent_fn=tagging.snorkel_ent_fn)

In [35]:
# Sample text for exercising the parser. The beta is written as a real character:
# the previous '\xce\xb2' escapes were the UTF-8 bytes of 'β' pasted from a bytes
# repr, which in a str literal produce the two characters 'Î²' and shift every
# following character offset by one.
text = 'Down-regulation of Gfi-1 expression by TGF-β is important for differentiation of Th17 and CD103+ inducible regulatory T cells.\n\nInterleukin (IL) 4 further induces Gfi-1, resulting in optimal Th2 cell expansion. '
# Materialize everything the parser yields for this text
parts = list(parser.parse(None, text))