## Load Gold Candidate Labels

In [19]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from tcre.env import *
from tcre.supervision import *

In [27]:
def load_labels_01():
    """Load positive relation labels exported from BRAT (dev split).

    Reads ``annotation/brat_export.csv`` under ``REPO_DATA_DIR``, converts the
    exclusive entity end offsets to inclusive ones, maps the BRAT relation
    names onto the project's relation field constants and marks every row as
    a positive label from the ``brat`` annotator.

    Returns:
        DataFrame with columns ``doc_id``, ``type``, ``e1_text``,
        ``e1_start_chr``, ``e1_end_chr``, ``e2_text``, ``e2_start_chr``,
        ``e2_end_chr`` (end offsets inclusive), ``value`` (always 1) and
        ``annotator`` (always ``'brat'``).

    Raises:
        AssertionError: if an entity span ends before it starts, or if a
            relation name has no mapping (or is missing).
    """
    # Note: Labels are exclusive on ends of char range
    path = osp.join(REPO_DATA_DIR, 'annotation', 'brat_export.csv')
    df = pd.read_csv(path)
    for i in [1, 2]:
        start_col = 'e{}_start_chr'.format(i)
        end_col = 'e{}_end_chr'.format(i)
        # Convert to inclusive end range
        df[end_col] = df[end_col] - 1
        # With an inclusive end, end == start is a valid one-character span,
        # so only spans that end before they start are rejected.
        assert (df[end_col] >= df[start_col]).all(), \
            'Entity {} has spans that end before they start'.format(i)
    df = df.rename(columns={'id': 'doc_id'})
    df['type'] = df['rel_typ'].map({
        'Induction': REL_FIELD_INDUCING_CYTOKINE,
        'Secretion': REL_FIELD_SECRETED_CYTOKINE,
        'Differentiation': REL_FIELD_INDUCING_TRANSCRIPTION_FACTOR
    })
    # Unmapped relation names come back from ``map`` as nulls
    assert df['type'].notnull().all(), 'Found relation types with no field mapping'
    # All of these annotations are only positive
    df['value'] = 1
    df['annotator'] = 'brat'
    df = df[[
        'doc_id', 'type', 'e1_text', 'e1_start_chr', 'e1_end_chr',
        'e2_text', 'e2_start_chr', 'e2_end_chr', 'value', 'annotator'
    ]]
    return df
# Load the BRAT-derived labels and print a column / dtype summary
df1 = load_labels_01()
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 10 columns):
doc_id          392 non-null object
type            392 non-null object
e1_text         392 non-null object
e1_start_chr    392 non-null int64
e1_end_chr      392 non-null int64
e2_text         392 non-null object
e2_start_chr    392 non-null int64
e2_end_chr      392 non-null int64
value           392 non-null int64
annotator       392 non-null object
dtypes: int64(5), object(5)
memory usage: 30.7+ KB


In [28]:
# Preview the first BRAT-derived label rows
df1.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator
0,PMC4451961,secreted_cytokine,IL-17,1338,1342,Vγ4 T,1291,1295,1,brat
1,PMC4451961,secreted_cytokine,IL-17,1460,1464,Vγ4 T,1291,1295,1,brat
2,PMC4451961,secreted_cytokine,IL-17,1903,1907,Vγ4 T,1850,1854,1,brat
3,PMC4451961,secreted_cytokine,(IFN)-γ,3730,3736,γδ T,3825,3828,1,brat
4,PMC4451961,secreted_cytokine,(IL)-17,3754,3760,γδ T,3825,3828,1,brat


In [29]:
def load_labels_02():
    """Load relation labels exported from the SnorkelNgramViewer annotation tool.

    Reads ``annotation/ngramviewer_export.csv`` under ``REPO_DATA_DIR``, keeps
    only the rows whose ``split`` equals 3, and renames the candidate columns
    to the ``e1_*`` / ``e2_*`` naming used for the BRAT labels.

    Returns:
        DataFrame with columns ``doc_id``, ``type``, ``e1_text``,
        ``e1_start_chr``, ``e1_end_chr``, ``e2_text``, ``e2_start_chr``,
        ``e2_end_chr``, ``value`` and ``annotator``
        (always ``'snorkelngramviewer'``).
    """
    # Note: Labels are already inclusive on ends of char range
    export_path = osp.join(REPO_DATA_DIR, 'annotation', 'ngramviewer_export.csv')
    raw = pd.read_csv(export_path)

    # (source column, output column) pairs, in output order
    column_pairs = [
        ('doc_id', 'doc_id'),
        ('field', 'type'),
        ('c1_text', 'e1_text'),
        ('c1_char_start', 'e1_start_chr'),
        ('c1_char_end', 'e1_end_chr'),
        ('c2_text', 'e2_text'),
        ('c2_char_start', 'e2_start_chr'),
        ('c2_char_end', 'e2_end_chr'),
        ('value', 'value'),
    ]
    source_columns = [src for src, _ in column_pairs]

    in_split = raw[raw['split'] == 3]
    renamed = in_split[source_columns].rename(columns=dict(column_pairs))
    return renamed.assign(annotator='snorkelngramviewer')
# Load the SnorkelNgramViewer labels and preview the first rows
df2 = load_labels_02()
df2.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator
300,PMC5799164,inducing_cytokine,IL-12,217,221,cytotoxic T,14,24,-1,snorkelngramviewer
301,PMC3378591,inducing_cytokine,IFN-γ,201,205,Th1,72,74,-1,snorkelngramviewer
302,PMC6279938,inducing_cytokine,IFN-γ,167,171,Th1,190,192,-1,snorkelngramviewer
303,PMC2646562,inducing_cytokine,IL-6,36,39,Th17,102,105,-1,snorkelngramviewer
304,PMC3486158,inducing_cytokine,IFN-γ,184,188,NKT,127,129,-1,snorkelngramviewer


## Merge

In [35]:
def get_stable_id(r):
    """Build the stable id string for a single span.

    Args:
        r: Mapping-like row (e.g. a pandas Series or dict) providing
            ``doc_id``, ``start_chr`` and ``end_chr``.

    Returns:
        String of the form ``<doc_id>::span:<start_chr>:<end_chr>``.
    """
    doc_id, start, end = r['doc_id'], r['start_chr'], r['end_chr']
    return '{}::span:{}:{}'.format(doc_id, start, end)

def get_context_stable_id(r):
    """Build the combined stable id for the two entity spans of a relation row.

    Args:
        r: pandas Series holding ``doc_id`` plus ``e1_*`` and ``e2_*`` fields
            (at least the ``start_chr`` / ``end_chr`` ones).

    Returns:
        The stable ids of entity 1 and entity 2, joined by ``~~``.
    """
    def entity_stable_id(prefix, pattern):
        # Keep doc_id plus this entity's fields, then strip the entity prefix
        # so the row has the field names ``get_stable_id`` expects.
        fields = r.filter(regex=pattern)
        return get_stable_id(fields.rename(lambda name: name.replace(prefix, '')))

    parts = [
        entity_stable_id('e1_', '^e1_|^doc_id$'),
        entity_stable_id('e2_', '^e2_|^doc_id$'),
    ]
    return "~~".join(parts)

# Stack both annotation sources. df1 is indexed 0..391 while df2 keeps the row
# labels of its source file (starting at 300), so without ignore_index the merged
# frame would carry duplicate index labels.
df = pd.concat([df1, df2], ignore_index=True)
# One combined stable id per relation row (entity 1 span ~~ entity 2 span)
df['context_stable_ids'] = df.apply(get_context_stable_id, axis=1)
df.head()

Unnamed: 0,doc_id,type,e1_text,e1_start_chr,e1_end_chr,e2_text,e2_start_chr,e2_end_chr,value,annotator,context_stable_ids
0,PMC4451961,secreted_cytokine,IL-17,1338,1342,Vγ4 T,1291,1295,1,brat,PMC4451961::span:1338:1342~~PMC4451961::span:1...
1,PMC4451961,secreted_cytokine,IL-17,1460,1464,Vγ4 T,1291,1295,1,brat,PMC4451961::span:1460:1464~~PMC4451961::span:1...
2,PMC4451961,secreted_cytokine,IL-17,1903,1907,Vγ4 T,1850,1854,1,brat,PMC4451961::span:1903:1907~~PMC4451961::span:1...
3,PMC4451961,secreted_cytokine,(IFN)-γ,3730,3736,γδ T,3825,3828,1,brat,PMC4451961::span:3730:3736~~PMC4451961::span:3...
4,PMC4451961,secreted_cytokine,(IL)-17,3754,3760,γδ T,3825,3828,1,brat,PMC4451961::span:3754:3760~~PMC4451961::span:3...


In [None]:
# doc_id, eN_[start_chr|end_chr] (inclusive), annotator, relation type, value, split
# - Take DB backup
# - Add stable label on INFER split
# - Move everything to target split based on context stable id of candidates
# - Check that nothing on INFER split has labels

### Load Labels

In [None]:
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence, Candidate
from snorkel.models import StableLabel
from snorkel.db_helpers import reload_annotator_labels
# Session used for the StableLabel queries and the label reload in the cells below
session = SnorkelSession()
# Candidate classes, iterated by key and read via `.field` when `clmap` is built below
# (get_candidate_classes is presumably provided by the tcre star-imports — confirm)
classes = get_candidate_classes()

In [7]:
def load_labels(session, relations, annotator, candidate_class, split):
    """Store stable labels for one relation class / annotator and reload gold labels.

    Args:
        session: Database session used to query and add ``StableLabel`` rows.
        relations: DataFrame with ``type``, ``annotator``, ``value`` and
            ``context_stable_ids`` columns; every row must belong to
            ``candidate_class`` and ``annotator``.
        annotator: Annotator name shared by all rows in ``relations``.
        candidate_class: Object exposing ``field`` (the relation type) and
            ``subclass`` (passed on to ``reload_annotator_labels``).
        split: Split passed on to ``reload_annotator_labels``.

    Returns:
        Whatever ``reload_annotator_labels`` returns.

    Raises:
        AssertionError: if the frame mixes relation types or annotators, or a
            row's value is not -1 or 1.
    """
    assert (relations['type'] == candidate_class.field).all(), 'Relations data frame has conflicting relation classes'
    assert (relations['annotator'] == annotator).all(), 'Relations data frame has conflicting annotator'
    # The stored annotator name is specific to the relation class
    annotator_name = annotator + '-' + candidate_class.field

    for _, row in relations.iterrows():
        label_value = int(row['value'])
        assert label_value in [-1, 1], 'Row has invalid relation value: {}'.format(row)
        stable_ids = row['context_stable_ids']

        existing = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_ids)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        # Only the first value seen for an annotator + context_stable_ids pair is kept
        if existing.count() > 0:
            continue
        session.add(StableLabel(
            context_stable_ids=stable_ids,
            annotator_name=annotator_name,
            value=label_value
        ))

    session.commit()
    # reload_annotator_labels creates GoldLabel records for the StableLabels above,
    # selecting them by annotator_name. Because only the first value per
    # annotator + context_stable_ids is saved, the annotator name has to differ
    # between candidate classes that could share context_stable_ids.
    # The split is used to find the candidates needed for the GoldLabels; it is not
    # used to filter StableLabels (filter_label_split=False) since the labels above
    # were created without a split.
    return reload_annotator_labels(
        session,
        candidate_class.subclass,
        annotator_name,
        split=split,
        filter_label_split=False,
        create_missing_cands=False,
    )

In [None]:
# Load gold labels separately for every (relation type, annotator) group
labels = {}
# Relation field name -> candidate class
clmap = {classes[name].field: classes[name] for name in classes}
grps = df.groupby(['type', 'annotator'])

for k, g in grps:
    rel_type, annotator = k
    candidate_class = clmap[rel_type]
    print('Loading labels for class={}, annotator={}'.format(rel_type, annotator))
    labels[k] = load_labels(session, g, annotator, candidate_class, SPLIT_INFER)

In [3]:
# NOTE(review): repeats the imports and session creation already present in the
# "Load Labels" cell above
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence, Candidate
session = SnorkelSession()

In [30]:
# Names of all Document rows; each query result is a one-element row
doc_ids = [name for (name,) in session.query(Document.name).all()]
len(doc_ids)

20600

In [13]:
# Keep only the label rows whose document name exists in the database
dff = df.loc[df['doc_id'].isin(doc_ids)]
len(dff)

684

In [14]:
# Inspect the first row of the filtered frame
dff.iloc[0]

doc_id                                                     PMC2646571
text                Down-regulation of Gfi-1 expression by TGF-β i...
sent_char_offset                                                    0
type                                                 InducingCytokine
field                                               inducing_cytokine
split                                                               1
value                                                               1
c1_cid                          CKC59F8990F767EBD4:CKFD4CA0B2B4BC3AE4
c2_cid                          CTBFBDE5121B6748D1:CTBFBDE5121B6748D1
c1_ctxt_id                                                     126965
c1_stable_id                                   PMC2646571::span:39:43
c1_word_start                                                       7
c1_word_end                                                         7
c1_char_start                                                      39
c1_char_end         

In [15]:
# Fetch the first Document named '2646571'.
# NOTE(review): the name here has no 'PMC' prefix, while the doc_id values shown
# in the label frames do — confirm which id format the database uses.
doc = session.query(Document).filter(Document.name == '2646571').first()

In [16]:
# Display the fetched document
doc

Document 2646571

In [17]:
# Sentences attached to the fetched document
sents = doc.sentences

In [18]:
# Show the first sentence of the document
sents[0]

Sentence(Document 2646571,0,b'Down-regulation of Gfi-1 expression by TGF-\xce\xb2 is important for differentiation of Th17 and CD103+ inducible regulatory T cells.\n\n')

In [20]:
# Keep the first sentence for closer inspection
sent = sents[0]

In [23]:
# Inspect the sentence's entity_cids attribute (every entry shown in the output is 'O')
sent.entity_cids

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']