In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import snorkel
from snorkel.parser import TextDocPreprocessor
from snorkel.parser import CorpusParser
from snorkel.models import Candidate, Document, Sentence
from snorkel import SnorkelSession
import dotenv
dotenv.load_dotenv('../env.sh')
%run ../src/supervision.py

In [2]:
# Database session shared by every query in this notebook.
session = SnorkelSession()

In [3]:
from snorkel.models import Candidate, Document, Span

# Choose the relation type to work on; move the active assignment to switch types.
classes = get_candidate_classes()
#candidate_class = classes.inducing_cytokine
#candidate_class = classes.inducing_transcription_factor
candidate_class = classes.secreted_cytokine

# Load every candidate of the chosen type that sits in the inference split.
query = (
    session.query(candidate_class.subclass)
    .filter(Candidate.split == SPLIT_INFER)
)
cands = query.all()
len(cands)

162226

In [4]:
if candidate_class == classes.inducing_transcription_factor:
    # Drop candidates whose transcription factor tag is too short (under 4 characters)
    from snorkel.models import Span, Sentence
    spans = (
        session.query(Span.id, Span.char_start, Span.char_end, Sentence.text)
        .join(Sentence, Sentence.id == Span.sentence_id)
        .all()
    )
    # Span ID -> Span text (char_end is inclusive, hence the +1 on the slice end)
    spans = {
        span_id: text[start:end + 1]
        for span_id, start, end, text in spans
    }
    cands = [c for c in cands if len(spans[c.transcription_factor_id]) >= 4]
    print(len(cands))

In [15]:
from snorkel.models import StableLabel
# Gather the context stable-id key of every stored stable label so that the
# candidates matching them can be removed afterwards.
# * Needed so that a single span pair, e.g.
#   'PMC2646571::span:39:43~~PMC2646571::span:81:84',
#   does not end up with multiple annotations (secreted + induced).
labels = [
    stable_label.context_stable_ids
    for stable_label in session.query(StableLabel).all()
]
def to_sl(c, rev=False):
    """Return the '~~'-joined stable ids of a candidate's contexts.

    Args:
        c: Object exposing ``get_contexts()``; each returned context must
            have a ``stable_id`` attribute.
        rev: When true, join the contexts in reverse order.

    Returns:
        A single string with the context stable ids separated by '~~'.
    """
    contexts = c.get_contexts()
    ordered = contexts[::-1] if rev else contexts
    return '~~'.join(context.stable_id for context in ordered)
# Keep only the candidates whose context pair, in either order, has no stable
# label yet. The label keys are hashed into a set once: testing each of the
# ~160k candidates (twice) against the plain `labels` list is quadratic.
labeled_ids = set(labels)
cands = [
    c for c in cands
    if to_sl(c) not in labeled_ids and to_sl(c, rev=True) not in labeled_ids
]
len(cands)

162029

In [16]:
# Draw a seeded (reproducible) sample of 200 positions into `cands`, then
# replace the sampled positions with the candidates found at them.
annot_cands = pd.Series(np.arange(len(cands))).sample(n=200, random_state=1)
annot_cands = [cands[position] for position in annot_cands.values]
len(annot_cands)

200

In [17]:
from snorkel.viewer import SentenceNgramViewer
# Show the sampled candidates in the interactive viewer widget.
# NOTE(review): presumably the labels entered in this widget are what the
# GoldLabel query further down reads back -- confirm.
SentenceNgramViewer(annot_cands, session, height=500)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[125], [84], [44]], [[54], [189], [5]], [[79], [155], [62]], [[43], [7], [101]], [[…

In [22]:
from snorkel.models import Candidate, GoldLabel
# Pair every inference-split candidate that has a gold label with that label's value.
res = (
    session.query(Candidate, GoldLabel.value)
    .join(GoldLabel)
    .filter(Candidate.split == SPLIT_INFER)
    .all()
)
# Tabulate label values per candidate type: one row per type, one column per value.
(
    pd.DataFrame(
        [(candidate.type, value) for candidate, value in res],
        columns=['type', 'value'],
    )
    .groupby('type')['value']
    .value_counts()
    .unstack()
)

value,-1,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
inducing_cytokine,170,27
inducing_transcription_factor,177,40
secreted_cytokine,159,39


## Export

In [133]:
def to_row(r):
    """Flatten one (Candidate, GoldLabel) query result into an export row.

    Args:
        r: Query result exposing ``Candidate`` and ``GoldLabel`` attributes.
            The candidate must have exactly two contexts.

    Returns:
        A ``pd.Series`` holding the sentence-level fields (``doc_id``,
        ``text``, ``sent_char_offset``), the candidate fields (``type``,
        ``field``, ``split``, ``value``, ``c1_cid``, ``c2_cid``) and, for
        each context, the ``c1_``/``c2_`` prefixed fields ``ctxt_id``,
        ``stable_id``, ``word_start``, ``word_end``, ``char_start``,
        ``char_end`` and ``text``.

    Raises:
        AssertionError: If the gold label does not belong to the candidate
            or the candidate does not have exactly two contexts.

    Note:
        Reads the notebook-level ``classes`` mapping to translate the
        candidate class name into its field name.
    """
    cand = r.Candidate
    gold = r.GoldLabel
    assert gold.candidate.id == cand.id
    sent = cand.get_parent()

    ctxts = cand.get_contexts()
    assert len(ctxts) == 2
    cids = cand.get_cids()

    typ = type(cand).__name__  # e.g. InducingTranscriptionFactor
    field = classes[typ].field  # inducing_transcription_factor

    def get_ctxt_props(c, prefix):
        # Per-context fields, with every key prefixed (e.g. 'c1_', 'c2_').
        props = dict(
            ctxt_id=c.id,
            stable_id=c.stable_id,
            word_start=c.get_word_start(),
            word_end=c.get_word_end(),
            char_start=c.char_start,
            char_end=c.char_end,  # This is an inclusive end point (i.e. must add 1 to exclusive slice)
            text=str(c.get_span())
        )
        return {prefix + key: value for key, value in props.items()}

    # Collect everything in one ordered dict and build a single Series from it.
    # The previous chain of Series.append calls no longer works on pandas >= 2.0
    # (Series.append was removed) and copied the row on every call.
    row = dict(
        doc_id=sent.document.name,
        text=sent.text,
        # This first index of char for sentence within doc
        sent_char_offset=sent.abs_char_offsets[0],
        type=typ,
        field=field,
        split=cand.split,
        value=gold.value,
        c1_cid=cids[0],
        c2_cid=cids[1]
    )
    row.update(get_ctxt_props(ctxts[0], 'c1_'))
    row.update(get_ctxt_props(ctxts[1], 'c2_'))
    return pd.Series(row)

# Fetch every candidate together with its gold label (all splits) and
# flatten each pair into one row of the export frame.
res = session.query(Candidate, GoldLabel).join(GoldLabel).all()
df = pd.DataFrame(list(map(to_row, res)))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 23 columns):
doc_id              912 non-null object
text                912 non-null object
sent_char_offset    912 non-null int64
type                912 non-null object
field               912 non-null object
split               912 non-null int64
value               912 non-null int64
c1_cid              912 non-null object
c2_cid              912 non-null object
c1_ctxt_id          912 non-null int64
c1_stable_id        912 non-null object
c1_word_start       912 non-null int64
c1_word_end         912 non-null int64
c1_char_start       912 non-null int64
c1_char_end         912 non-null int64
c1_text             912 non-null object
c2_ctxt_id          912 non-null int64
c2_stable_id        912 non-null object
c2_word_start       912 non-null int64
c2_word_end         912 non-null int64
c2_char_start       912 non-null int64
c2_char_end         912 non-null int64
c2_text             912 non-

In [134]:
# Preview the first three rows of the export frame.
df.head(3)

Unnamed: 0,doc_id,text,sent_char_offset,type,field,split,value,c1_cid,c2_cid,c1_ctxt_id,...,c1_char_start,c1_char_end,c1_text,c2_ctxt_id,c2_stable_id,c2_word_start,c2_word_end,c2_char_start,c2_char_end,c2_text
0,PMC2646571,Down-regulation of Gfi-1 expression by TGF-β i...,0,InducingCytokine,inducing_cytokine,1,1,CKC59F8990F767EBD4:CKFD4CA0B2B4BC3AE4,CTBFBDE5121B6748D1:CTBFBDE5121B6748D1,126965,...,39,43,TGF-β,126966,PMC2646571::span:81:84,13,13,81,84,Th17
1,PMC2646571,Down-regulation of Gfi-1 expression by TGF-β i...,0,InducingCytokine,inducing_cytokine,1,1,CKC59F8990F767EBD4:CKFD4CA0B2B4BC3AE4,CT561AB33D0CC11964:CT19D812F7CB34BCF6,126965,...,39,43,TGF-β,126967,PMC2646571::span:97:118,17,19,97,118,inducible regulatory T
2,PMC2646571,"Furthermore, a key inducer of both Th17 and iT...",900,InducingCytokine,inducing_cytokine,1,1,CK1B7B0E7A89192169:CKFD4CA0B2B4BC3AE4,CTBFBDE5121B6748D1:CTBFBDE5121B6748D1,126279,...,72,99,transforming growth factor β,126280,PMC2646571::span:935:938,7,7,35,38,Th17


In [135]:
# Row counts per candidate class name and the field name it was mapped to.
df.groupby(['type', 'field']).size()

type                         field                        
InducingCytokine             inducing_cytokine                330
InducingTranscriptionFactor  inducing_transcription_factor    303
SecretedCytokine             secreted_cytokine                279
dtype: int64

In [136]:
# Row counts per candidate type and split.
df.groupby(['type', 'split']).size()

type                         split
InducingCytokine             1        133
                             3        197
InducingTranscriptionFactor  1         86
                             3        217
SecretedCytokine             1         81
                             3        198
dtype: int64

In [137]:
# Absolute counts of each label value per candidate type.
df.groupby(['type'])['value'].value_counts(normalize=False)

type                         value
InducingCytokine             -1       170
                              1       160
InducingTranscriptionFactor  -1       177
                              1       126
SecretedCytokine             -1       159
                              1       120
Name: value, dtype: int64

In [138]:
# Absolute counts of each label value per candidate type and split.
df.groupby(['type', 'split'])['value'].value_counts(normalize=False)

type                         split  value
InducingCytokine             1       1       133
                             3      -1       170
                                     1        27
InducingTranscriptionFactor  1       1        86
                             3      -1       177
                                     1        40
SecretedCytokine             1       1        81
                             3      -1       159
                                     1        39
Name: value, dtype: int64

In [139]:
# Same breakdown as relative frequencies within each (type, split) group.
df.groupby(['type', 'split'])['value'].value_counts(normalize=True)

type                         split  value
InducingCytokine             1       1       1.000000
                             3      -1       0.862944
                                     1       0.137056
InducingTranscriptionFactor  1       1       1.000000
                             3      -1       0.815668
                                     1       0.184332
SecretedCytokine             1       1       1.000000
                             3      -1       0.803030
                                     1       0.196970
Name: value, dtype: float64

In [140]:
# Write the export frame to <REPO_DATA_DIR>/annotation/label_export.csv
# (without the index column) and echo the destination path.
path = osp.join(
    os.environ['REPO_DATA_DIR'],
    'annotation',
    'label_export.csv',
)
df.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/annotation/label_export.csv'