In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import collections
from snorkel import SnorkelSession
from tcre import supervision
from tcre.env import *
from tcre.supervision import *
# Database session shared by every query and labeling call in this notebook
session = SnorkelSession()
# Candidate class registry (get_candidate_classes comes from one of the tcre star-imports above);
# later cells use it via attribute access, key iteration and .values()
classes = get_candidate_classes()

In [2]:
from snorkel.models import Candidate
# Sanity check: fetch one inducing-cytokine candidate and show its entity ids.
# NOTE: this leaves `c` bound in the notebook's global namespace.
c = session.query(classes.inducing_cytokine.subclass).first()
c.get_cids(), c.cytokine_cid, c.immune_cell_type_cid

(('CK692CB9564F9067B0:CK6004F9DA69B8AB01',
  'CT954E3C2D0B390922:CTE9F6070561C95355'),
 'CK692CB9564F9067B0:CK6004F9DA69B8AB01',
 'CT954E3C2D0B390922:CTE9F6070561C95355')

In [3]:
import re
from snorkel.lf_helpers import (
    get_tagged_text,
    get_left_tokens,
    get_right_tokens,
    get_between_tokens,
    get_text_splits,
    rule_regex_search_tagged_text,
    rule_regex_search_btw_AB,
    rule_regex_search_btw_BA,
    rule_regex_search_before_A,
    rule_regex_search_before_B,
)

### Text Pattern Functions

In [4]:
# Placeholder name -> regex fragment; substituted for '{{name}}' tokens by subst_pattern below
TERMS = get_terms_map()

def subst_pattern(pattern, terms=None):
    """Expand every '{{name}}' placeholder in `pattern`.

    Replacements come from the global TERMS map, optionally overridden or
    extended by `terms`. Placeholders with no matching key are left as-is.
    """
    replacements = dict(TERMS)
    replacements.update(terms or {})
    for name, fragment in replacements.items():
        placeholder = '{{' + name + '}}'
        pattern = pattern.replace(placeholder, fragment)
    return pattern

def regex_pattern(pattern, terms=None):
    """Return the fully substituted regex for `pattern` without matching any candidate."""
    # No candidate is needed when only the expanded pattern is requested
    return regex_search(None, pattern, pattern_only=True, terms=terms)
    
def regex_search(c, pattern, terms=None, pattern_only=False):
    """Expand a pattern template and search the candidate's tagged text with it.

    Every '{{B}}' reference is first widened to tolerate an optional trailing
    " cell" / " type cell" / "-type cell" token, then the generic placeholder
    substitution is applied. With `pattern_only=True` the expanded pattern is
    returned and `c` is never used; otherwise the result of
    rule_regex_search_tagged_text(c, pattern, 1) is returned (presumably 1 on a
    match and 0 otherwise -- confirm against snorkel.lf_helpers).
    """
    widened = pattern.replace('{{B}}', '{{B}}( cell| type cell|-type cell)?')
    expanded = subst_pattern(widened, terms=terms)
    return expanded if pattern_only else rule_regex_search_tagged_text(c, expanded, 1)

# Display the resolved term map for reference
TERMS

{'r_diff_n': '(differentiation|formation|generation|polarization|development|induction)',
 'r_diff_v': '(differentiate|form|generate|polarize|develop|differentiates|forms|generates|polarizes|develops)',
 'r_diff_p': '(differentiated|formed|generated|polarized|developed)',
 'r_diff_g': '(differentiating|forming|generating|polarizing|developing)',
 'r_push_n': '(inducer|driver|director|regulator|controller|promoter|mediator|mediater)',
 'r_push_v': '(induce|drive|direct|regulate|control|promote|mediate|induces|drives|directs|regulates|controls|promotes|mediates)',
 'r_push_p': '(induced|drove|driven|directed|regulated|controlled|promoted|mediated)',
 'r_push_g': '(inducing|driving|directing|regulating|controlling|promoting|mediating)',
 'r_prod_n': '(producer|production|generator|generation|creator|creation)',
 'r_prod_v': '(produce|generate|create|produces|generates|creates)',
 'r_prod_p': '(produced|generated|created)',
 'r_prod_g': '(producing|generating|creating)',
 'r_secr_n': '(sec

In [5]:
def add_lf(fn, name=None):
    """Register `fn` as a labeling function in this module's global namespace.

    When `name` is given the function is renamed first (its __name__ is
    overwritten), and it is then stored in globals() under its __name__.
    """
    if name is not None:
        fn.__name__ = name
    # Publish under the (possibly updated) function name
    globals().update({fn.__name__: fn})
        
def get_fn(ptn, sign):
    """Build a labeling function for one text pattern.

    Args:
        ptn: sequence whose first item is the pattern template and whose
            optional second item is a dict of extra placeholder terms.
        sign: multiplier applied to the search result (1 or -1 in this notebook).

    Returns:
        (fn, sub_ptn): `fn(c)` returns `sign * regex_search(c, ...)`;
        `sub_ptn` is the fully expanded regex, useful for logging.
    """
    pattern = ptn[0]
    terms = ptn[1] if len(ptn) > 1 else None
    # pattern_only=True never reads the candidate, so pass None rather than
    # depending on whatever `c` happens to be bound in the notebook globals
    sub_ptn = regex_search(None, pattern, terms=terms, pattern_only=True)

    def fn(c):
        return sign * regex_search(c, pattern, terms=terms)

    return fn, sub_ptn

def get_agg_fn(ptns, sign, agg):
    """Combine several pattern LFs into one.

    A labeling function is built for each entry of `ptns` via get_fn; the
    returned function evaluates all of them on a candidate and reduces the
    list of results with `agg` (e.g. max or min).
    """
    members = []
    for ptn in ptns:
        member, _ = get_fn(ptn, sign)
        members.append(member)

    def fn(c):
        votes = [member(c) for member in members]
        return agg(votes)

    return fn

# Set to true to roll all text pattern functions into a single function
# (since there can be > 30 of them)
use_single_text_pattern_fn = True

# Loop-invariant lookups, built once: relation class -> abbreviation, and label sign -> value
cmap = {v: k for k, v in REL_ABBRS.items()}
sign_values = {'positive': 1, 'negative': -1}

for rcls in LF_REGEX:
    for sign in LF_REGEX[rcls]:
        val = sign_values[sign]
        ptns = LF_REGEX[rcls][sign]
        if use_single_text_pattern_fn:
            # One aggregate LF per (relation, sign); nothing to register without patterns
            if len(ptns) == 0:
                continue
            # Positive patterns fire if any matches (max); negative if any matches (min)
            agg = max if val == 1 else min
            add_lf(get_agg_fn(ptns, val, agg), 'LF_{}_txtptn_{}_all'.format(cmap[rcls], sign[:3]))
        else:
            # One LF per individual pattern, numbered from 1
            for i, ptn in enumerate(ptns):
                fn_name = 'LF_{}_txtptn_{}_{}'.format(cmap[rcls], sign[:3], i + 1)
                fn, sub_ptn = get_fn(ptn, val)
                print(fn_name, ':\n\t', ptn[0], '\n\t', sub_ptn)
                add_lf(fn, fn_name)


In [6]:
# text = 'The current {{A}}-(pos) cells'
# pattern = regex_pattern(r'{{A}}(\+|-)?(positive|negative|\(pos\)|neg|hi|lo)')
# print(pattern)
# m = re.search(pattern, text, flags=re.I)
# m is not None, m

### Heuristic Functions

In [7]:
def is_ref_too_far(c, max_words=25, allow_positive=False):
    """Vote against candidates whose two references are far apart.

    Counts the tokens yielded by get_between_tokens(c). More than `max_words`
    tokens returns -1 (references that distant are unlikely to be related);
    otherwise returns 1 when `allow_positive` is set and 0 (abstain) when not.
    """
    n_between = sum(1 for _ in get_between_tokens(c))
    if n_between > max_words:
        return -1
    return 1 if allow_positive else 0

def _add_dist_fn(abbr, dist_wrd):
    add_lf(
        lambda c: is_ref_too_far(c, dist_wrd, allow_positive=True), 
        'LF_' + abbr + '_heur_distref_' + str(dist_wrd)
    )
    
# Register the distance heuristics once per relation abbreviation.
for abbr in REL_ABBRS:
    # Default variant (25-word limit, abstains instead of voting positive). A fresh lambda is
    # used per relation because add_lf overwrites __name__ on whatever function it is given.
    add_lf(lambda c: is_ref_too_far(c), 'LF_' + abbr + '_heur_distref')
    # Threshold variants that also vote +1 when the references are within the limit
    for dist_wrd in [5, 10, 15, 20, 25, 32, 50]:
        _add_dist_fn(abbr, dist_wrd)

def get_kwds(term_prefixes):
    """Collect the individual words from all TERMS entries matching the given prefixes.

    Args:
        term_prefixes: iterable of key prefixes (e.g. ['r_diff']); every TERMS
            key starting with one of them contributes its words.

    Returns:
        Set of word tokens (runs of \\w characters) found in the matching
        regex fragments, so an alternation like '(a|b)' yields {'a', 'b'}.
    """
    return {
        kw
        for prefix in term_prefixes
        for k, v in TERMS.items() if k.startswith(prefix)
        # Raw string: '\w' in a plain literal is an invalid escape sequence
        for kw in re.findall(r'\w+', v)
    }

INDCK_KWS = get_kwds(['r_diff'])
SECCK_KWS = get_kwds(['r_secr'])
INDTF_KWS = get_kwds(['r_diff'])
# Keep only keywords unique to each cytokine relation. The overlap must be taken from the
# unfiltered sets: filtering SECCK_KWS against an already-filtered INDCK_KWS removes nothing.
_SHARED_CK_KWS = INDCK_KWS.intersection(SECCK_KWS)
INDCK_KWS = INDCK_KWS.difference(_SHARED_CK_KWS)
SECCK_KWS = SECCK_KWS.difference(_SHARED_CK_KWS)

             
def has_kwds(c, kwds, window=5):
    """Return True when any keyword in `kwds` appears around or between the candidate's references.

    The searched tokens are the `window` tokens to the left, all tokens in
    between, and the `window` tokens to the right, as produced by the Snorkel
    token helpers.
    """
    nearby = [
        *get_left_tokens(c, window=window),
        *get_between_tokens(c),
        *get_right_tokens(c, window=window),
    ]
    return bool(kwds.intersection(nearby))

def indck_kwds(c, window):
    """Keyword vote for the inducing-cytokine relation.

    Returns 1 when only induction keywords are found near the candidate,
    -1 when only secretion keywords are found, and 0 when both or neither occur.
    """
    has_induction = has_kwds(c, INDCK_KWS, window=window)
    has_secretion = has_kwds(c, SECCK_KWS, window=window)
    if has_induction == has_secretion:
        # Both or neither present: no usable signal
        return 0
    return 1 if has_induction else -1

def LF_indck_heur_kwds_01(c):
    """Inducing-cytokine keyword vote using a 5-token window."""
    return indck_kwds(c, window=5)

def LF_indck_heur_kwds_02(c):
    """Inducing-cytokine keyword vote using a 10-token window."""
    return indck_kwds(c, window=10)

def LF_indck_heur_kwds_03(c):
    """Inducing-cytokine keyword vote using a 25-token window."""
    return indck_kwds(c, window=25)
    
def LF_secck_heur_kwds_01(c):
    """Secreted-cytokine keyword vote: the negation of LF_indck_heur_kwds_01."""
    indck_vote = LF_indck_heur_kwds_01(c)
    return -1 * indck_vote

def LF_secck_heur_kwds_02(c):
    """Secreted-cytokine keyword vote: the negation of LF_indck_heur_kwds_02."""
    indck_vote = LF_indck_heur_kwds_02(c)
    return -1 * indck_vote

def LF_secck_heur_kwds_03(c):
    """Secreted-cytokine keyword vote: the negation of LF_indck_heur_kwds_03."""
    indck_vote = LF_indck_heur_kwds_03(c)
    return -1 * indck_vote

def LF_indtf_heur_kwds_01(c):
    """Vote 1 if a differentiation keyword occurs within a 5-token window, else -1 (never abstains)."""
    if has_kwds(c, INDTF_KWS, window=5):
        return 1
    return -1

def LF_indtf_heur_kwds_02(c):
    """Vote 1 if a differentiation keyword occurs within a 10-token window, else -1 (never abstains)."""
    if has_kwds(c, INDTF_KWS, window=10):
        return 1
    return -1

def LF_indtf_heur_kwds_03(c):
    """Vote 1 if a differentiation keyword occurs within a 25-token window, else -1 (never abstains)."""
    if has_kwds(c, INDTF_KWS, window=25):
        return 1
    return -1
        
def LF_indck_heur_closer_ck_to_ct(c):
    """Negative vote when has_closer_reference(c, right=True) is truthy, else 0.

    has_closer_reference is not defined in this notebook (presumably provided by
    the tcre star-imports); the direction implied by `right` should be confirmed there.
    """
    closer = has_closer_reference(c, right=True)
    return -1 * closer

def LF_indck_heur_closer_ct_to_ck(c):
    """Negative vote when has_closer_reference(c, right=False) is truthy, else 0."""
    closer = has_closer_reference(c, right=False)
    return -1 * closer

def LF_indck_heur_closer_ref(c):
    """Negative vote when either directional closer-reference check fires."""
    votes = (LF_indck_heur_closer_ck_to_ct(c), LF_indck_heur_closer_ct_to_ck(c))
    return min(votes)

def LF_secck_heur_closer_ck_to_ct(c):
    """Negative vote when has_closer_reference(c, right=True) is truthy, else 0.

    Same logic as the indck variant; it exists under its own name so it is
    collected with the other 'LF_secck' functions.
    """
    closer = has_closer_reference(c, right=True)
    return -1 * closer

def LF_secck_heur_closer_ct_to_ck(c):
    """Negative vote when has_closer_reference(c, right=False) is truthy, else 0."""
    closer = has_closer_reference(c, right=False)
    return -1 * closer

def LF_secck_heur_closer_ref(c):
    """Negative vote when either directional closer-reference check fires."""
    votes = (LF_secck_heur_closer_ck_to_ct(c), LF_secck_heur_closer_ct_to_ck(c))
    return min(votes)

def LF_indtf_heur_closer_tf_to_ct(c):
    """Negative vote when has_closer_reference(c, right=True) is truthy, else 0.

    Same logic as the cytokine variants; it exists under its own name so it is
    collected with the other 'LF_indtf' functions.
    """
    closer = has_closer_reference(c, right=True)
    return -1 * closer

def LF_indtf_heur_closer_ct_to_tf(c):
    """Negative vote when has_closer_reference(c, right=False) is truthy, else 0."""
    closer = has_closer_reference(c, right=False)
    return -1 * closer

def LF_indtf_heur_closer_ref(c):
    """Negative vote when either directional closer-reference check fires."""
    votes = (LF_indtf_heur_closer_tf_to_ct(c), LF_indtf_heur_closer_ct_to_tf(c))
    return min(votes)


### Heuristics for identifying off-target sentences 

def _valid_cand(**kwargs):
    return lambda c: -1 if supervision.is_invalid_candidate(c, **kwargs) else 0

def _cmplx_cand(**kwargs):
    return lambda c: -1 if supervision.is_complex_candidate(c, **kwargs) else 0

def _hypot_cand(**kwargs):
    return lambda c: -1 if supervision.is_hypothesis_candidate(c, **kwargs) else 0

def _prexp_cand(**kwargs):
    return lambda c: -1 if supervision.is_expressed_protein(c, **kwargs) else 0

# (factory, thresholds, LF name suffix), listed in registration order
_OFFTARGET_LF_SPECS = [
    (_valid_cand, dict(fig_kw_ct_thresh=1, punc_ct_thresh=5, newline_ct_thresh=3, char_ct_thresh=2000), '_heur_valid_cand_01'),
    (_valid_cand, dict(fig_kw_ct_thresh=2, punc_ct_thresh=8, newline_ct_thresh=8, char_ct_thresh=2000), '_heur_valid_cand_02'),
    (_valid_cand, dict(fig_kw_ct_thresh=4, punc_ct_thresh=12, newline_ct_thresh=10, char_ct_thresh=2000), '_heur_valid_cand_03'),
    (_cmplx_cand, dict(entity_ct_thresh=3, char_ct_thresh=500), '_heur_complex_cand_01'),
    (_cmplx_cand, dict(entity_ct_thresh=5, char_ct_thresh=800), '_heur_complex_cand_02'),
    (_hypot_cand, dict(prefix_ct_thresh=1, token_ct_thresh=1), '_heur_hypothesis_cand_01'),
    (_hypot_cand, dict(prefix_ct_thresh=1, token_ct_thresh=3), '_heur_hypothesis_cand_02'),
]

for abbr in REL_ABBRS:
    # A fresh LF is built per relation because add_lf renames the function it receives
    for factory, thresholds, suffix in _OFFTARGET_LF_SPECS:
        add_lf(factory(**thresholds), 'LF_' + abbr + suffix)
    # The expressed-protein heuristic is registered for every relation except secreted cytokines
    if abbr != REL_SECCK:
        add_lf(_prexp_cand(), 'LF_' + abbr + '_heur_prexpression_cand_01')
    
### Heuristics for dependency parse tree processing

# Single parser instance shared by the LF_*_parse_tree_01 functions defined next
dep_parser = supervision.DependencyParseTree()

def LF_indck_parse_tree_01(c):
    """Dependency-parse vote for inducing-cytokine candidates.

    Returns 1 only when the parser flags the induction relation and does not
    also flag the secretion relation; every other case returns -1.

    Raises:
        ValueError: if `c` is not an inducing-cytokine candidate.
    """
    if c.type == REL_FIELD_INDUCING_CYTOKINE:
        induces = dep_parser.is_candidate_relation(c, REL_FIELD_INDUCING_CYTOKINE)
        secretes = dep_parser.is_candidate_relation(c, REL_FIELD_SECRETED_CYTOKINE)
        # A hit on both related classes is treated as ambiguous and voted down
        return 1 if (induces and not secretes) else -1
    raise ValueError(f'Type {c.type} not supported')

def LF_secck_parse_tree_01(c):
    """Dependency-parse vote for secreted-cytokine candidates.

    Returns 1 only when the parser flags the secretion relation and does not
    also flag the induction relation; every other case returns -1.

    Raises:
        ValueError: if `c` is not a secreted-cytokine candidate.
    """
    if c.type == REL_FIELD_SECRETED_CYTOKINE:
        induces = dep_parser.is_candidate_relation(c, REL_FIELD_INDUCING_CYTOKINE)
        secretes = dep_parser.is_candidate_relation(c, REL_FIELD_SECRETED_CYTOKINE)
        # A hit on both related classes is treated as ambiguous and voted down
        return 1 if (secretes and not induces) else -1
    raise ValueError(f'Type {c.type} not supported')

def LF_indtf_parse_tree_01(c):
    """Dependency-parse vote for inducing-transcription-factor candidates.

    Returns 1 when the parser flags the relation and -1 otherwise.

    Raises:
        ValueError: if `c` is not an inducing-transcription-factor candidate.
    """
    if c.type == REL_FIELD_INDUCING_TRANSCRIPTION_FACTOR:
        if dep_parser.is_candidate_relation(c, REL_FIELD_INDUCING_TRANSCRIPTION_FACTOR):
            return 1
        return -1
    raise ValueError(f'Type {c.type} not supported')

### Distant Supervision Functions

In [8]:
from tcre.ix import IXDB
# CSV loaded by IXDB below; the path suggests immuneXpresso-derived data -- TODO confirm provenance
ix_data_file = osp.join(SUPERVISION_DATA_DIR, 'immunexpresso', 'data.csv')

def add_ix_lfs(ix, lf_suffix):
    """Register the two distant-supervision LFs backed by the lookup object `ix`.

    Registers 'LF_indck_dsup_imexpresso_<lf_suffix>' and
    'LF_secck_dsup_imexpresso_<lf_suffix>'. Each one abstains (0) for candidates
    of another relation type or when the primary lookup returns None. Otherwise
    it votes 1 when only the primary lookup is positive, -1 when the primary
    lookup is negative and a competing lookup is positive, and 0 in the
    remaining cases.

    The string pairs passed to ix.is_candidate_relation look like
    (actor, category) values -- confirm against tcre.ix.IXDB.
    """

    def _vote(c, field, primary, competing):
        # Shared decision logic; lookups run in the same order for both relations
        if c.type != field:
            return 0
        supported = ix.is_candidate_relation(c, *primary)
        if supported is None:
            return 0
        contradicted = any([ix.is_candidate_relation(c, *args) for args in competing])
        if supported:
            return 0 if contradicted else 1
        return -1 if contradicted else 0

    def indck_fn(c):
        return _vote(
            c,
            classes.inducing_cytokine.field,
            ('cytokine', 'Positive'),
            [('cytokine', 'Negative'), ('cell', 'Positive')],
        )

    def secck_fn(c):
        return _vote(
            c,
            classes.secreted_cytokine.field,
            ('cell', 'Positive'),
            [('cytokine', 'Positive'), ('cytokine', 'Negative')],
        )

    add_lf(indck_fn, 'LF_indck_dsup_imexpresso_' + lf_suffix)
    add_lf(secck_fn, 'LF_secck_dsup_imexpresso_' + lf_suffix)
    
# One lookup database per minimum-paper threshold, each with its own pair of LFs
# (suffixes mp04, mp08, mp12, mp20)
ixs = {}
for min_papers in [4, 8, 12, 20]:
    ix_db = IXDB(ix_data_file, min_papers).initialize()
    ixs[min_papers] = ix_db
    add_ix_lfs(ix_db, f'mp{min_papers:02d}')
# Summarize the least restrictive database
ixs[4].df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 157 entries, (CT00B5D38A1594A953, CK25044A239340F089) to (CTFE4015328B70D553, CKF8C95AB8BB638BAC)
Data columns (total 8 columns):
actor             157 non-null object
category          157 non-null object
cell_id           157 non-null object
cell_label        157 non-null object
cytokine_id       157 non-null object
cytokine_label    157 non-null object
num_papers        157 non-null int64
score             157 non-null float64
dtypes: float64(1), int64(1), object(6)
memory usage: 10.6+ KB


### Composite Functions

In [9]:
def _global_lfs():
    return {
        k:v for k, v in globals().items() 
        if callable(v) and k.startswith('LF_')
    }

def get_lfs(filter_fn=None):
    """Return the registered labeling functions as a name-sorted OrderedDict.

    Args:
        filter_fn: optional predicate on the LF name; only names for which it
            returns a truthy value are kept. All LFs are returned when omitted.
    """
    registry = _global_lfs()
    keep = filter_fn if filter_fn is not None else (lambda n: True)
    selected = collections.OrderedDict()
    for name in sorted(registry):
        if keep(name):
            selected[name] = registry[name]
    return selected

# Positive text-pattern LFs per relation. get_lfs reads globals() when called, so these are
# snapshots: only functions registered before this point are included.
fns_indck_comp_pos = get_lfs(lambda k: k.startswith('LF_indck_txtptn_pos'))
fns_secck_comp_pos = get_lfs(lambda k: k.startswith('LF_secck_txtptn_pos'))
fns_indtf_comp_pos = get_lfs(lambda k: k.startswith('LF_indtf_txtptn_pos'))

def summarize_fns(name, fns):
    """Print how many functions the mapping `fns` holds, followed by one function name per line."""
    listing = '\n'.join(fns.keys())
    message = '{}: Num functions found = {}, Names:\n{}'.format(name, len(fns), listing)
    print(message)
# Report which positive text-pattern LFs were picked up for each relation
for label, fns in [
    ('fns_indck_comp_pos', fns_indck_comp_pos),
    ('fns_secck_comp_pos', fns_secck_comp_pos),
    ('fns_indtf_comp_pos', fns_indtf_comp_pos),
]:
    summarize_fns(label, fns)

def LF_indck_comp_neg_sec(c):
    """Vote -1 for an inducing-cytokine candidate when any positive secretion pattern fires.

    Returns -1 or 0: the strongest positive secretion vote is negated and
    capped at 0, so it never produces a positive label.
    """
    strongest_secretion = max([f(c) for f in fns_secck_comp_pos.values()])
    return min(-1 * strongest_secretion, 0)

def LF_secck_comp_neg_ind(c):
    """Vote -1 for a secreted-cytokine candidate when any positive induction pattern fires.

    Returns -1 or 0: the strongest positive induction vote is negated and
    capped at 0, so it never produces a positive label.
    """
    strongest_induction = max([f(c) for f in fns_indck_comp_pos.values()])
    return min(-1 * strongest_induction, 0)

# Negative-evidence LFs per relation, selected by name prefix. str.startswith accepts a tuple
# of prefixes, and the 'heur_distref' prefix also matches every '_distref_<N>' variant.
fns_indck_comp_neg = get_lfs(lambda k: k.startswith((
    'LF_indck_txtptn_neg',
    'LF_indck_heur_distref',
    'LF_indck_heur_closer_ct_to_ck',
    'LF_indck_comp_neg_sec',
    'LF_indck_heur_complex_cand_02',
    'LF_indck_heur_valid_cand_03',
)))
fns_secck_comp_neg = get_lfs(lambda k: k.startswith((
    'LF_secck_txtptn_neg',
    'LF_secck_heur_distref',
    'LF_secck_heur_closer_ct_to_ck',
    'LF_secck_comp_neg_ind',
    'LF_secck_heur_complex_cand_02',
    'LF_secck_heur_valid_cand_03',
)))
fns_indtf_comp_neg = get_lfs(lambda k: k.startswith((
    'LF_indtf_txtptn_neg',
    'LF_indtf_heur_distref',
    'LF_indtf_heur_closer_ct_to_tf',
    'LF_indtf_heur_complex_cand_02',
    'LF_indtf_heur_valid_cand_03',
)))
for label, fns in [
    ('fns_indck_comp_neg', fns_indck_comp_neg),
    ('fns_secck_comp_neg', fns_secck_comp_neg),
    ('fns_indtf_comp_neg', fns_indtf_comp_neg),
]:
    summarize_fns(label, fns)

def apply_composite_xor_indck(c):
    """Composite vote separating induction from secretion, seen from the induction side.

    Positive evidence is considered first: 1 when only induction patterns fire,
    -1 when only secretion patterns fire, 0 when both do. When neither fires,
    negative evidence decides: -1 when only induction is ruled out, 1 when only
    secretion is ruled out, and 0 when both or neither are.
    """
    # All four groups are evaluated up front, in this order
    is_secck = max([f(c) for f in fns_secck_comp_pos.values()]) > 0
    is_indck = max([f(c) for f in fns_indck_comp_pos.values()]) > 0
    not_secck = min([f(c) for f in fns_secck_comp_neg.values()]) < 0
    not_indck = min([f(c) for f in fns_indck_comp_neg.values()]) < 0

    if is_indck or is_secck:
        # +1 for induction only, -1 for secretion only, 0 when the two conflict
        return int(is_indck) - int(is_secck)

    if not_indck == not_secck:
        return 0
    return -1 if not_indck else 1

def LF_indck_comp_xor(c):
    """Induction-vs-secretion composite vote for inducing-cytokine candidates."""
    vote = apply_composite_xor_indck(c)
    return vote

def LF_secck_comp_xor(c):
    """Same composite with the sign flipped, for secreted-cytokine candidates."""
    vote = apply_composite_xor_indck(c)
    return -1 * vote

def LF_indtf_comp_xor(c):
    """Composite vote for inducing-transcription-factor candidates.

    Returns 1 when only positive evidence is present, -1 when only negative
    evidence is present, and 0 when the two conflict or neither is present.
    """
    is_indtf = max([f(c) for f in fns_indtf_comp_pos.values()]) > 0
    not_indtf = min([f(c) for f in fns_indtf_comp_neg.values()]) < 0
    if is_indtf == not_indtf:
        # Conflicting or absent evidence: abstain
        return 0
    return 1 if is_indtf else -1

# Distant supervision composites
def LF_indck_comp_imexpresso_nonneg(c):
    """Distant-supervision vote (mp08) for inducing cytokines, suppressed to 0 when any negative LF fires."""
    if min([f(c) for f in fns_indck_comp_neg.values()]) < 0:
        return 0
    return LF_indck_dsup_imexpresso_mp08(c)

def LF_secck_comp_imexpresso_nonneg(c):
    """Distant-supervision vote (mp08) for secreted cytokines, suppressed to 0 when any negative LF fires."""
    if min([f(c) for f in fns_secck_comp_neg.values()]) < 0:
        return 0
    return LF_secck_dsup_imexpresso_mp08(c)

fns_indck_comp_pos: Num functions found = 1, Names:
LF_indck_txtptn_pos_all
fns_secck_comp_pos: Num functions found = 1, Names:
LF_secck_txtptn_pos_all
fns_indtf_comp_pos: Num functions found = 1, Names:
LF_indtf_txtptn_pos_all
fns_indck_comp_neg: Num functions found = 13, Names:
LF_indck_comp_neg_sec
LF_indck_heur_closer_ct_to_ck
LF_indck_heur_complex_cand_02
LF_indck_heur_distref
LF_indck_heur_distref_10
LF_indck_heur_distref_15
LF_indck_heur_distref_20
LF_indck_heur_distref_25
LF_indck_heur_distref_32
LF_indck_heur_distref_5
LF_indck_heur_distref_50
LF_indck_heur_valid_cand_03
LF_indck_txtptn_neg_all
fns_secck_comp_neg: Num functions found = 13, Names:
LF_secck_comp_neg_ind
LF_secck_heur_closer_ct_to_ck
LF_secck_heur_complex_cand_02
LF_secck_heur_distref
LF_secck_heur_distref_10
LF_secck_heur_distref_15
LF_secck_heur_distref_20
LF_secck_heur_distref_25
LF_secck_heur_distref_32
LF_secck_heur_distref_5
LF_secck_heur_distref_50
LF_secck_heur_valid_cand_03
LF_secck_txtptn_neg_all
fns_in

In [10]:
# Final LF set per relation class, selected by name prefix (sorted by name via get_lfs)
LFs = collections.OrderedDict()
for rel_class, prefix in [
    (REL_CLASS_INDUCING_CYTOKINE, 'LF_indck'),
    (REL_CLASS_SECRETED_CYTOKINE, 'LF_secck'),
    (REL_CLASS_INDUCING_TRANSCRIPTION_FACTOR, 'LF_indtf'),
]:
    # Bind the prefix as a default so each filter keeps its own value
    LFs[rel_class] = get_lfs(lambda k, prefix=prefix: k.startswith(prefix)).values()

for k in LFs:
    print('{} labeling functions:'.format(k))
    for f in LFs[k]:
        print(f.__name__)
    print()

InducingCytokine labeling functions:
LF_indck_comp_imexpresso_nonneg
LF_indck_comp_neg_sec
LF_indck_comp_xor
LF_indck_dsup_imexpresso_mp04
LF_indck_dsup_imexpresso_mp08
LF_indck_dsup_imexpresso_mp12
LF_indck_dsup_imexpresso_mp20
LF_indck_heur_closer_ck_to_ct
LF_indck_heur_closer_ct_to_ck
LF_indck_heur_closer_ref
LF_indck_heur_complex_cand_01
LF_indck_heur_complex_cand_02
LF_indck_heur_distref
LF_indck_heur_distref_10
LF_indck_heur_distref_15
LF_indck_heur_distref_20
LF_indck_heur_distref_25
LF_indck_heur_distref_32
LF_indck_heur_distref_5
LF_indck_heur_distref_50
LF_indck_heur_hypothesis_cand_01
LF_indck_heur_hypothesis_cand_02
LF_indck_heur_kwds_01
LF_indck_heur_kwds_02
LF_indck_heur_kwds_03
LF_indck_heur_prexpression_cand_01
LF_indck_heur_valid_cand_01
LF_indck_heur_valid_cand_02
LF_indck_heur_valid_cand_03
LF_indck_parse_tree_01
LF_indck_txtptn_neg_all
LF_indck_txtptn_pos_all

SecretedCytokine labeling functions:
LF_secck_comp_imexpresso_nonneg
LF_secck_comp_neg_ind
LF_secck_comp_xo

#### LF Testing

In [11]:
# from snorkel.viewer import SentenceNgramViewer
# from snorkel.annotations import load_gold_labels

# candidate_class = classes.inducing_cytokine
# #candidate_class = classes.secreted_cytokine
# #candidate_class = classes.inducing_transcription_factor
# # Load all dev labels for this relation class
# L_dev = supervision.get_gold_labels(session, candidate_class, split=SPLIT_DEV)

# labeled, gold = [], []
# for cand_id, v in L_dev.items():
#     c = session.query(Candidate).filter(Candidate.id == cand_id).one()
#     if LF_indck_parse_tree_01(c) == 1:
#         labeled.append(c)
#         if v > 0:
#             gold.append(c)

# print("Number labeled =", len(labeled), ", num gold =", len(gold))
# # - ID in hover state is sentence id (c.get_parent().id)
# # - Cell type label will be blue if candidate is NOT in "gold" set and red otherwise
# SentenceNgramViewer(labeled, session, gold=gold, height=1000)

In [12]:
# from tcre.query import DocToCand
# sent_id = 325453
# df_doc_cand = DocToCand.all(session, classes)
# cand_ids = df_doc_cand.set_index(['cand_type', 'sentence_id']).sort_index().loc[(candidate_class.field, sent_id)]['cand_id']
# c = session.query(Candidate).filter(Candidate.id == int(cand_ids.values[0])).one()
# c

## Apply Labeling to Candidates

In [13]:
# Report the number of candidates per relation class and split.
# NOTE: the loop variable rebinds the global `c` assigned in the earlier sanity-check cell.
for c in classes.values():
    for split in [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, SPLIT_TRAIN, SPLIT_INFER]:
        n = session.query(c.subclass).filter(c.subclass.split == split).count()
        print('Candidate counts: {} (split {}) -> {}'.format(c.name, split, n))

Candidate counts: InducingCytokine (split 1) -> 1025
Candidate counts: InducingCytokine (split 3) -> 278
Candidate counts: InducingCytokine (split 2) -> 371
Candidate counts: InducingCytokine (split 0) -> 10000
Candidate counts: InducingCytokine (split 9) -> 80165
Candidate counts: SecretedCytokine (split 1) -> 1019
Candidate counts: SecretedCytokine (split 3) -> 272
Candidate counts: SecretedCytokine (split 2) -> 364
Candidate counts: SecretedCytokine (split 0) -> 10000
Candidate counts: SecretedCytokine (split 9) -> 80184
Candidate counts: InducingTranscriptionFactor (split 1) -> 627
Candidate counts: InducingTranscriptionFactor (split 3) -> 225
Candidate counts: InducingTranscriptionFactor (split 2) -> 300
Candidate counts: InducingTranscriptionFactor (split 0) -> 10000
Candidate counts: InducingTranscriptionFactor (split 9) -> 25383


In [14]:
from tcre import labeling
# Clear labels from previous runs before re-applying the LFs below. The displayed dict
# presumably reports removed rows per table -- confirm in tcre.labeling.
labeling.clear_labeling_functions(session)

{'Label': 23052, 'LabelKey': 32}

In [15]:
# Labeling results keyed by relation class name, then by split
res_label = collections.defaultdict(dict)
# Splits to label; narrow this list (e.g. [SPLIT_DEV] or [SPLIT_TRAIN]) for a faster partial run
splits_to_label = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, SPLIT_TRAIN]
for c in classes:
    # Skip relation classes that have no labeling functions defined
    if c not in LFs:
        continue
    for split in splits_to_label:
        res_label[c][split] = labeling.apply_labeling_functions(session, classes[c], split, LFs[c])

  0%|          | 2/1025 [00:00<01:25, 11.97it/s]

Running UDF...


100%|██████████| 1025/1025 [01:04<00:00, 15.98it/s]
  1%|          | 3/278 [00:00<00:11, 24.79it/s]

Running UDF...


100%|██████████| 278/278 [00:18<00:00, 15.29it/s]
  1%|          | 3/371 [00:00<00:17, 21.31it/s]

Running UDF...


100%|██████████| 371/371 [00:26<00:00, 15.08it/s]
  0%|          | 3/10000 [00:00<06:36, 25.19it/s]

Running UDF...


100%|██████████| 10000/10000 [11:40<00:00, 14.28it/s]
  0%|          | 3/1019 [00:00<00:44, 23.08it/s]

Running UDF...


100%|██████████| 1019/1019 [01:03<00:00, 17.62it/s]
  1%|          | 3/272 [00:00<00:11, 24.20it/s]

Running UDF...


100%|██████████| 272/272 [00:18<00:00, 16.83it/s]
  1%|          | 3/364 [00:00<00:15, 24.03it/s]

Running UDF...


100%|██████████| 364/364 [00:25<00:00, 14.12it/s]
  0%|          | 3/10000 [00:00<06:43, 24.77it/s]

Running UDF...


100%|██████████| 10000/10000 [11:39<00:00, 17.16it/s]
  1%|          | 4/627 [00:00<00:18, 34.49it/s]

Running UDF...


100%|██████████| 627/627 [00:20<00:00, 27.39it/s]
  1%|▏         | 3/225 [00:00<00:09, 24.05it/s]

Running UDF...


100%|██████████| 225/225 [00:08<00:00, 26.66it/s]
  1%|          | 3/300 [00:00<00:12, 23.72it/s]

Running UDF...


100%|██████████| 300/300 [00:11<00:00, 29.80it/s]
  0%|          | 3/10000 [00:00<05:34, 29.90it/s]

Running UDF...


100%|██████████| 10000/10000 [05:59<00:00, 27.84it/s]


In [23]:
# [(k, len(set(v))) for k, v in OFFTARGET_SPLITS.items()]

In [16]:
from IPython.display import display
# Show per-LF statistics for the dev and train splits only
for c in res_label:
    for split in res_label[c]:
        if split in [SPLIT_DEV, SPLIT_TRAIN]:
            print('Stats {} (split = {} ({})):'.format(c, SPLIT_MAP[split], split))
            # The third element of each labeling result is the statistics frame
            df = res_label[c][split][2]
            # Accuracy is only highlighted when the frame has that column
            subset = ['Coverage']
            if 'Empirical Acc.' in df:
                subset = subset + ['Empirical Acc.']
            display(df.style.background_gradient(subset=subset))

Stats InducingCytokine (split = dev (1)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.,Empirical F1
LF_indck_comp_imexpresso_nonneg,0,0.0165854,0.0165854,0.0165854,1,1,0,15,0.941176,0.666667
LF_indck_comp_neg_sec,1,0.131707,0.131707,0.130732,0,0,7,128,0.948148,
LF_indck_comp_xor,2,0.257561,0.257561,0.256585,64,68,5,127,0.723485,0.636816
LF_indck_dsup_imexpresso_mp04,3,0.217561,0.217561,0.210732,38,29,6,150,0.843049,0.684685
LF_indck_dsup_imexpresso_mp08,4,0.134634,0.134634,0.12878,16,7,3,112,0.927536,0.761905
LF_indck_dsup_imexpresso_mp12,5,0.12,0.12,0.114146,16,7,3,97,0.918699,0.761905
LF_indck_dsup_imexpresso_mp20,6,0.0956098,0.0956098,0.0858537,16,7,9,66,0.836735,0.666667
LF_indck_heur_closer_ck_to_ct,7,0.347317,0.347317,0.254634,0,0,33,323,0.907303,
LF_indck_heur_closer_ct_to_ck,8,0.253659,0.253659,0.156098,0,0,10,250,0.961538,
LF_indck_heur_closer_ref,9,0.427317,0.427317,0.323902,0,0,41,397,0.906393,


Stats InducingCytokine (split = train (0)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_indck_comp_imexpresso_nonneg,0,0.0117,0.0117,0.0116
LF_indck_comp_neg_sec,1,0.1442,0.1442,0.1438
LF_indck_comp_xor,2,0.167,0.167,0.1662
LF_indck_dsup_imexpresso_mp04,3,0.2039,0.2039,0.1935
LF_indck_dsup_imexpresso_mp08,4,0.1358,0.1358,0.1271
LF_indck_dsup_imexpresso_mp12,5,0.1088,0.1088,0.1008
LF_indck_dsup_imexpresso_mp20,6,0.0905,0.0905,0.0862
LF_indck_heur_closer_ck_to_ct,7,0.2585,0.2585,0.2041
LF_indck_heur_closer_ct_to_ck,8,0.1912,0.1912,0.1484
LF_indck_heur_closer_ref,9,0.3445,0.3445,0.2843


Stats SecretedCytokine (split = dev (1)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.,Empirical F1
LF_secck_comp_imexpresso_nonneg,0,0.0245339,0.0245339,0.0225711,9,13,0,3,0.48,0.580645
LF_secck_comp_neg_ind,1,0.133464,0.133464,0.133464,0,0,0,136,1.0,
LF_secck_comp_xor,2,0.247301,0.247301,0.243376,72,47,0,133,0.813492,0.753927
LF_secck_dsup_imexpresso_mp04,3,0.244357,0.244357,0.244357,54,99,5,91,0.582329,0.509434
LF_secck_dsup_imexpresso_mp08,4,0.154073,0.154073,0.15211,35,98,0,24,0.375796,0.416667
LF_secck_dsup_imexpresso_mp12,5,0.13739,0.13739,0.135427,24,92,0,24,0.342857,0.342857
LF_secck_dsup_imexpresso_mp20,6,0.101079,0.101079,0.0991168,22,57,0,24,0.446602,0.435644
LF_secck_heur_closer_ck_to_ct,7,0.335623,0.335623,0.266928,0,0,13,329,0.961988,
LF_secck_heur_closer_ct_to_ck,8,0.249264,0.249264,0.181551,0,0,4,250,0.984252,
LF_secck_heur_closer_ref,9,0.412169,0.412169,0.334642,0,0,17,403,0.959524,


Stats SecretedCytokine (split = train (0)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_secck_comp_imexpresso_nonneg,0,0.02,0.02,0.0186
LF_secck_comp_neg_ind,1,0.025,0.025,0.025
LF_secck_comp_xor,2,0.1649,0.1649,0.1625
LF_secck_dsup_imexpresso_mp04,3,0.2184,0.2184,0.2133
LF_secck_dsup_imexpresso_mp08,4,0.128,0.128,0.1257
LF_secck_dsup_imexpresso_mp12,5,0.1079,0.1079,0.1059
LF_secck_dsup_imexpresso_mp20,6,0.0892,0.0892,0.0878
LF_secck_heur_closer_ck_to_ct,7,0.257,0.257,0.2203
LF_secck_heur_closer_ct_to_ck,8,0.2014,0.2014,0.1692
LF_secck_heur_closer_ref,9,0.3477,0.3477,0.3055


Stats InducingTranscriptionFactor (split = dev (1)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.,Empirical F1
LF_indtf_comp_xor,0,0.800638,0.800638,0.787879,15,9,47,431,0.888446,0.348837
LF_indtf_heur_closer_ct_to_tf,1,0.307815,0.307815,0.304625,0,0,4,189,0.979275,
LF_indtf_heur_closer_ref,2,0.385965,0.385965,0.37799,0,0,13,229,0.946281,
LF_indtf_heur_closer_tf_to_ct,3,0.207337,0.207337,0.202552,0,0,9,121,0.930769,
LF_indtf_heur_complex_cand_01,4,0.936204,0.936204,0.92504,0,0,89,498,0.848382,
LF_indtf_heur_complex_cand_02,5,0.708134,0.708134,0.69697,0,0,63,381,0.858108,
LF_indtf_heur_distref,6,0.208931,0.208931,0.197767,0,0,3,128,0.977099,
LF_indtf_heur_distref_10,7,1.0,1.0,0.985646,84,215,23,305,0.620415,0.413793
LF_indtf_heur_distref_15,8,1.0,1.0,0.985646,95,293,12,227,0.513557,0.383838
LF_indtf_heur_distref_20,9,1.0,1.0,0.985646,101,341,6,179,0.446571,0.367942


Stats InducingTranscriptionFactor (split = train (0)):


Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_indtf_comp_xor,0,0.773,0.773,0.7472
LF_indtf_heur_closer_ct_to_tf,1,0.1633,0.1633,0.1525
LF_indtf_heur_closer_ref,2,0.2569,0.2569,0.2405
LF_indtf_heur_closer_tf_to_ct,3,0.1452,0.1452,0.1343
LF_indtf_heur_complex_cand_01,4,0.8832,0.8832,0.8593
LF_indtf_heur_complex_cand_02,5,0.5269,0.5269,0.5038
LF_indtf_heur_distref,6,0.1749,0.1749,0.1508
LF_indtf_heur_distref_10,7,1.0,1.0,0.9739
LF_indtf_heur_distref_15,8,1.0,1.0,0.9739
LF_indtf_heur_distref_20,9,1.0,1.0,0.9739
