### Linking Candidates to Documents Efficiently

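- Every Span is a Context and therefore has a `stable_id`; the document name is embedded in it (e.g. `PMC6170619::span:17:35`)
- Each Candidate subclass table stores span ids for its arguments (e.g. `immune_cell_type_id`), which can be joined to `span` and from there to the sentence, the document, and the stable id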

In [50]:
from snorkel import SnorkelSession
from tcre import supervision
import pandas as pd
# Snorkel session used for all ORM queries and raw SQL in this notebook.
session = SnorkelSession()
# Candidate class definitions from the project's supervision module. Later cells access
# them by attribute (classes.inducing_cytokine) and by iteration/key (classes[c]), and
# read their .subclass, .entity_types and .field attributes.
classes = supervision.get_candidate_classes()

In [14]:
from snorkel.models import Document, Span, Context, Candidate, SnorkelBase

In [4]:
# Candidate subclass for the inducing_cytokine relation (the same object is queried
# through the session further down).
sc = classes.inducing_cytokine.subclass

In [6]:
from snorkel.models import candidate_subclass

In [9]:
# Argument names declared on the candidate subclass.
sc.__argnames__

['cytokine', 'immune_cell_type']

In [17]:
#!cat /opt/conda/envs/nlp/lib/python3.6/site-packages/sqlalchemy/ext/declarative/api.py

In [18]:
# Fetch a single Span row (whichever the query returns first) to inspect its attributes.
span = session.query(Span).limit(1).one()

In [19]:
# Display the span's repr: text plus sentence, chars and words fields.
span

Span("b'T Follicular Helper'", sentence=503, chars=[17,35], words=[2,4])

In [21]:
# Integer id of the span; the same id is looked up in the Context table below.
span.id

794041

In [24]:
# For this span the stable_id reads '<document name>::span:<char start>:<char end>',
# i.e. the document name can be recovered from it.
span.stable_id

'PMC6170619::span:17:35'

In [23]:
# A Span is also a Context: looking up the span's id in the Context table returns the
# very same Span. The id is taken from `span` (fetched in an earlier cell) instead of a
# literal copied from a previous output, so the cell still works after a re-run against
# a database where the first span has a different id.
ctx = session.query(Context).filter(Context.id == span.id).one()
ctx

Span("b'T Follicular Helper'", sentence=503, chars=[17,35], words=[2,4])

In [41]:
# Pick one example candidate: the last of the first 100 inducing_cytokine rows.
# NOTE(review): the query has no explicit ordering, so which row comes back presumably
# depends on the database's default order — confirm if a specific candidate is needed.
cand = session.query(classes.inducing_cytokine.subclass).limit(100).all()[-1]

In [42]:
# Display the candidate's repr: the candidate class and its two argument spans.
cand

InducingCytokine(Span("b'IFN\xce\xb3'", sentence=985, chars=[51,54], words=[8,8]), Span("b'Th2'", sentence=985, chars=[119,121], words=[17,17]))

In [43]:
# Id of the candidate row.
cand.id

100

In [60]:
# `type` holds the candidate class name as a string; the same column (Candidate.type)
# is used for grouping in the cross-check near the end of the notebook.
cand.type

'inducing_cytokine'

In [28]:
# Spans that make up the candidate.
# NOTE(review): the recorded output was produced at execution count 28, before the
# `cand` assignment shown above was executed (count 41), so it shows a different
# candidate than the repr above. Re-run the notebook top to bottom to refresh it.
cand.get_contexts()

(Span("b'chemokine receptor CXCR5'", sentence=514, chars=[141,164], words=[27,29]),
 Span("b'Tfh'", sentence=514, chars=[0,2], words=[0,0]))

### Develop Query for Single Class

In [51]:
# Connection obtained from the session; used below to execute hand-written SQL.
con = session.connection()

In [56]:
# Entity types of the class; the SQL below joins through the immune_cell_type span.
classes.inducing_cytokine.entity_types

['cytokine', 'immune_cell_type']

In [66]:
# Link every inducing_cytokine candidate to its sentence and document by following the
# candidate's immune_cell_type span: candidate -> span -> sentence -> document.
single_class_sql = (
    'SELECT A.id AS cand_id, C.id AS sentence_id, D.id AS doc_id  '
    'FROM inducing_cytokine A '
    'INNER JOIN span B ON A.immune_cell_type_id = B.id '
    'INNER JOIN sentence C ON B.sentence_id = C.id '
    'INNER JOIN document D ON C.document_id = D.id'
)
rs = con.execute(single_class_sql)
# Column names are taken from the result so they follow the SQL aliases above.
cols = rs.keys()
df = pd.DataFrame(list(rs), columns=cols)
df.head()

Unnamed: 0,cand_id,sentence_id,doc_id
0,1,514,1
1,2,515,1
2,3,528,1
3,4,661,1
4,5,665,1


In [68]:
# Row count and dtypes of the single-class result; the row count is compared with the
# ORM candidate count in the following cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91839 entries, 0 to 91838
Data columns (total 3 columns):
cand_id        91839 non-null int64
sentence_id    91839 non-null int64
doc_id         91839 non-null int64
dtypes: int64(3)
memory usage: 2.1 MB


In [67]:
# Sanity check: the ORM count for the class should equal the number of rows returned by
# the SQL join above (i.e. the joins neither drop nor duplicate candidates).
session.query(classes.inducing_cytokine.subclass).count()

91839

### Generalize to All Classes

In [70]:
# SQL template that maps every candidate in one candidate table to its sentence and
# document. The `{}` placeholder receives the candidate table name. The join goes through
# the candidate's immune_cell_type span, so the table needs an immune_cell_type_id column.
QUERY_TEMPLATE = ''.join([
    'SELECT D.id AS doc_id, C.id AS sentence_id, A.id AS cand_id ',
    'FROM {} A ',
    'INNER JOIN span B ON A.immune_cell_type_id = B.id ',
    'INNER JOIN sentence C ON B.sentence_id = C.id ',
    'INNER JOIN document D ON C.document_id = D.id',
])
QUERY_TEMPLATE

'SELECT D.id AS doc_id, C.id AS sentence_id, A.id AS cand_id FROM {} A INNER JOIN span B ON A.immune_cell_type_id = B.id INNER JOIN sentence C ON B.sentence_id = C.id INNER JOIN document D ON C.document_id = D.id'

In [76]:
def _get_query(cand_class):
    if 'immune_cell_type' not in cand_class.entity_types:
        raise ValueError('Candidate class {} does not have required entity type "immune_cell_type"'.format(cand_class.field))
    return QUERY_TEMPLATE.format(cand_class.field)

def _run_query(con, query):
    rs = con.execute(query)
    return pd.DataFrame([r for r in rs], columns=rs.keys())

def get_cand_docs(session, classes):
    """Collect the candidate -> sentence -> document mapping for all candidate classes.

    Args:
        session: Session providing `connection()` for raw SQL execution.
        classes: Container of candidate class definitions; iterating yields keys and
            `classes[key]` yields the class definition.

    Returns:
        pandas.DataFrame concatenating the per-class query results, with an extra
        `cand_type` column set to each class's `field`. Each per-class frame keeps its
        own index, so index values repeat across classes.

    Raises:
        ValueError: From `_get_query` if a class lacks the "immune_cell_type" entity
            type, or from `pd.concat` if `classes` is empty.
    """
    con = session.connection()
    frames = []
    for key in classes:
        cand_class = classes[key]
        frame = _run_query(con, _get_query(cand_class))
        frames.append(frame.assign(cand_type=cand_class.field))
    return pd.concat(frames)

In [77]:
# Build the candidate -> sentence -> document table for every candidate class.
# Note: this rebinds `df`, replacing the single-class frame built earlier.
df = get_cand_docs(session, classes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220213 entries, 0 to 36534
Data columns (total 4 columns):
doc_id         220213 non-null int64
sentence_id    220213 non-null int64
cand_id        220213 non-null int64
cand_type      220213 non-null object
dtypes: int64(3), object(1)
memory usage: 8.4+ MB


In [79]:
# Distinct candidate ids per class in the SQL-derived table; compared against the ORM
# counts in the following cell.
df.groupby('cand_type')['cand_id'].nunique()

cand_type
inducing_cytokine                91839
inducing_transcription_factor    36535
secreted_cytokine                91839
Name: cand_id, dtype: int64

In [78]:
# Cross-check against the ORM: count distinct candidate ids per type straight from the
# Candidate table. The numbers should match the per-class counts of the SQL-derived table.
cand_rows = session.query(Candidate.type, Candidate.id).all()
cand_frame = pd.DataFrame(cand_rows)
cand_frame.groupby('type')['id'].nunique()

type
inducing_cytokine                91839
inducing_transcription_factor    36535
secreted_cytokine                91839
Name: id, dtype: int64

In [85]:
import importlib
from tcre import query as tcre_query
# Reload so that edits made to tcre/query.py while the kernel is running are picked up.
# importlib.reload replaces imp.reload: the imp module is deprecated and was removed in
# Python 3.12, while importlib.reload is available on the Python versions that had imp.
importlib.reload(tcre_query)

<module 'tcre.query' from '/lab/repos/t-cell-relation-extraction/src/tcre/query.py'>

In [86]:
# Same table via the packaged helper in tcre.query; its summary should match the one
# produced by get_cand_docs above (same row count and columns).
tcre_query.DocToCand.all(session, classes).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220213 entries, 0 to 36534
Data columns (total 4 columns):
doc_id         220213 non-null int64
sentence_id    220213 non-null int64
cand_id        220213 non-null int64
cand_type      220213 non-null object
dtypes: int64(3), object(1)
memory usage: 8.4+ MB
