In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from snorkel import SnorkelSession
from tcre.env import *
from tcre.supervision import *
# Session object used for every `session.query(...)` call in the cells below
session = SnorkelSession()
# Container of candidate (relation) classes, accessed below as attributes such as
# `classes.inducing_cytokine`; `get_candidate_classes` comes from one of the star imports above
classes = get_candidate_classes()

In [82]:
from snorkel.models import Candidate
# Fetch one candidate from split 1 to use as the running example in the cells that follow
c = session.query(Candidate).filter(Candidate.split == 1).first()
# Tuple of entity ids for the candidate, followed by the same ids read from the per-argument attributes
c.get_cids(), c.cytokine_cid, c.immune_cell_type_cid

(('CK04FD0805168B608B:CKB4EB2D2CC8BBB93D',
  'CTC3A8C3CBC245616A:CTC3A8C3CBC245616A'),
 'CK04FD0805168B608B:CKB4EB2D2CC8BBB93D',
 'CTC3A8C3CBC245616A:CTC3A8C3CBC245616A')

In [108]:
# Sanity check: for every candidate in split 1, look up the sentence-level entity type at the
# first word of each of its two spans, then count the (relation type, span 0 type, span 1 type)
# combinations -- each relation type should appear with exactly one ordering of span types
pd.DataFrame([
    {
        'id': cand.id,
        'type': cand.type,
        'span_0_type': cand.get_parent().entity_types[cand.get_contexts()[0].get_word_range()[0]],
        'span_1_type': cand.get_parent().entity_types[cand.get_contexts()[1].get_word_range()[0]],
    }
    for cand in session.query(Candidate).filter(Candidate.split == 1).all()
]).groupby(['type', 'span_0_type', 'span_1_type']).size()

type                           span_0_type           span_1_type     
inducing_cytokine              cytokine              immune_cell_type    673
inducing_transcription_factor  transcription_factor  immune_cell_type    410
secreted_cytokine              cytokine              immune_cell_type    673
dtype: int64

In [83]:
# Display the example candidate: a relation class instance wrapping two spans from one sentence
c

InducingCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))

In [84]:
# The candidate's parent context is the sentence that both spans belong to
c.get_parent()

Sentence(Document PMC3304099,21,b'IL-12 induces not only Ifng expression1 but also T-bet,  which promotes the survival and proliferation of differentiating TH1 cells.   ')

In [85]:
# Integer id of the sentence's document (distinct from the document name shown in the sentence repr)
c.get_parent().document_id

122

In [109]:
# The spans that make up the candidate: cytokine span first, immune cell type span second
c.get_contexts()

(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]),
 Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))

In [104]:
# The same spans are also available as attributes named after the candidate's arguments
c.cytokine, c.immune_cell_type

(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]),
 Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))

In [86]:
# Take the first span of the example candidate (the cytokine mention) to explore its backref fields
ctx = c.get_contexts()[0]
ctx

Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0])

In [87]:
# Every candidate of the other relation class (SecretedCytokine) whose cytokine argument is this same span (ctx)
ctx.secreted_cytokine_cytokines

[SecretedCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))]

In [88]:
# Every candidate of the other relation class (SecretedCytokine) whose immune cell type argument is this
# same span (ctx) -- expected to be empty here because ctx is itself a cytokine span
ctx.secreted_cytokine_immune_cell_types

[]

In [89]:
# (candidate id, span id) pairs for each sibling candidate reached through the cytokine backref;
# loop names are chosen so they do not reuse the name of the example candidate `c`
[(sib.id, span.id) for sib in ctx.secreted_cytokine_cytokines for span in sib.get_contexts()]

[(30840, 126243), (30840, 126245)]

In [90]:
# Ids of the sibling candidates reached through the cytokine backref
[sib.id for sib in ctx.secreted_cytokine_cytokines]

[30840]

In [91]:
# Keep the first candidate from the cytokine backref as the "sibling" of the example candidate
sibling_cand = ctx.secreted_cytokine_cytokines[0]
sibling_cand

SecretedCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))

In [92]:
# The sibling and the example candidate have different candidate ids even though they print the same spans
sibling_cand.id, c.id

(30840, 30167)

In [93]:
# Rebind ctx to the second span (the immune cell type mention) and repeat the backref checks
ctx = c.get_contexts()[1]
ctx

Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21])

In [94]:
# SecretedCytokine candidates using this span as their cytokine argument (ctx is now the cell type span)
ctx.secreted_cytokine_cytokines

[]

In [95]:
# SecretedCytokine candidates using this span as their immune cell type argument; this can include
# candidates paired with other cytokine spans from the same sentence
ctx.secreted_cytokine_immune_cell_types

[SecretedCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21])),
 SecretedCytokine(Span("b'Ifng'", sentence=13515, chars=[23,26], words=[4,4]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))]

In [96]:
# (candidate id, span id) pairs for each sibling candidate reached through the cell type backref;
# loop names are chosen so they do not reuse the name of the example candidate `c`
[(sib.id, span.id) for sib in ctx.secreted_cytokine_immune_cell_types for span in sib.get_contexts()]

[(30840, 126243), (30840, 126245), (30841, 126244), (30841, 126245)]

In [97]:
# Ids of the sibling candidates reached through the cell type backref
[sib.id for sib in ctx.secreted_cytokine_immune_cell_types]

[30840, 30841]

In [102]:
# Check that the first sibling found via the cytokine span's backref compares equal to the first one
# found via the cell type span's backref
c.get_contexts()[0].secreted_cytokine_cytokines[0] == c.get_contexts()[1].secreted_cytokine_immune_cell_types[0]

True

In [98]:
# Entity types configured for the inducing cytokine relation class
classes.inducing_cytokine.entity_types

['cytokine', 'immune_cell_type']

In [105]:
# Argument names of the candidate's class; these are the names used for the span attributes and backrefs
c.__class__.__argnames__

['cytokine', 'immune_cell_type']

### Function development: looking up a candidate's sibling

In [115]:
# Maps each relation type to its complementary ("sibling") relation type; the mapping is
# symmetric, and relation types without an entry here have no sibling
COMP_TYPES = dict([
    (classes.inducing_cytokine.field, classes.secreted_cytokine.field),
    (classes.secreted_cytokine.field, classes.inducing_cytokine.field),
])

def get_span_type(c, span):
    """Look up the entity type recorded on the candidate's sentence for a span.

    Args:
        c: Candidate whose parent (``c.get_parent()``) exposes ``entity_types``,
            indexable by word position.
        span: Span providing ``get_word_range()``; only its first element
            (the position of the span's first word) is used.

    Returns:
        The ``entity_types`` entry at the position of the span's first word.
    """
    word_entity_types = c.get_parent().entity_types
    first_word = span.get_word_range()[0]
    return word_entity_types[first_word]

def get_sibling_cand(c, strict=True):
    """Find the candidate of the complementary relation class built on the same spans as ``c``.

    The complementary relation type is looked up in ``COMP_TYPES``. Sibling candidates are
    reached through the backref list on the candidate's first-argument span (e.g.
    ``span.secreted_cytokine_cytokines``) and then filtered to those made of exactly the
    same spans as ``c``.

    Args:
        c: Candidate to find the sibling for.
        strict: If True (default), require exactly one matching sibling.

    Returns:
        The sibling candidate; ``None`` if ``c.type`` has no entry in ``COMP_TYPES``, or if
        ``strict`` is False and no sibling matches. With ``strict=False`` and several
        matches, the first one is returned.

    Raises:
        ValueError: If ``strict`` is True and the number of matching siblings is not one.
    """
    if c.type not in COMP_TYPES:
        return None
    arg_names = c.__class__.__argnames__  # e.g. ['cytokine', 'immune_cell_type']
    sibl_type = COMP_TYPES[c.type]  # e.g. 'secreted_cytokine'

    def span_id(span):
        return span.id

    # Sort by id so span sets can be compared independent of argument order
    cand_spans = sorted(c.get_contexts(), key=span_id)
    # The candidate exposes each span under its argument name (e.g. c.cytokine), so read the
    # lookup span directly instead of inferring it from the sentence entity tag at the span's
    # first word, which fails when that tag is missing or shared by both spans.
    # * only the first argument is necessary for lookup since the sibling is attached to both spans
    anchor_span = getattr(c, arg_names[0])
    # Backref fields are named like "secreted_cytokine_cytokines"
    sibl_cands = getattr(anchor_span, f'{sibl_type}_{arg_names[0]}s')
    # Filter to candidates with the same spans
    sibl_cands = [s for s in sibl_cands if sorted(s.get_contexts(), key=span_id) == cand_spans]
    if strict and len(sibl_cands) != 1:
        raise ValueError(f'Failed to find exactly one sibling candidate for candidate {c} (siblings found = {sibl_cands})')
    return sibl_cands[0] if sibl_cands else None

In [117]:
# Show the example candidate followed by the sibling found for it (strict lookup, so this raises
# ValueError unless exactly one sibling exists)
print(c)
print(get_sibling_cand(c))

InducingCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))
SecretedCytokine(Span("b'IL-12'", sentence=13515, chars=[0,4], words=[0,0]), Span("b'TH1'", sentence=13515, chars=[122,124], words=[21,21]))


In [122]:
# Pair every candidate in split 0 with its sibling. The lookup is strict, so this cell raises
# ValueError if any supported candidate lacks exactly one sibling; candidates whose relation
# type has no entry in COMP_TYPES are paired with None.
# (Filter on `Candidate.split == 1` instead to run the same check on split 1.)
pairs = [
    (cand, get_sibling_cand(cand))
    for cand in session.query(Candidate).filter(Candidate.split == 0).all()
]

In [123]:
# Number of (candidate, sibling) pairs built, i.e. the number of candidates in the queried split
len(pairs)

120166

In [124]:
# Spot-check one pair: both candidates should reference the same two spans
pairs[1]

(InducingCytokine(Span("b'IFN\xce\xb3'", sentence=41532, chars=[74,77], words=[13,13]), Span("b'Th1'", sentence=41532, chars=[54,56], words=[10,10])),
 SecretedCytokine(Span("b'IFN\xce\xb3'", sentence=41532, chars=[74,77], words=[13,13]), Span("b'Th1'", sentence=41532, chars=[54,56], words=[10,10])))