In [20]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import label_improve

In [21]:
%load_ext autoreload
%autoreload 2
import label_improve as li

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Loading the data 
dataset_name = "chemprot"

idx_to_label = json.load(open(f"../weak_datasets/{dataset_name}/label.json"))
label_to_idx = {l:i for i,l in idx_to_label.items()}
valid_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/valid.json", "r")))
train_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/train.json", "r")))
test_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/test.json", "r")))

# Sample a dev set to help seed ideas for LFs
dev_df = train_df.sample(250, random_state=123)

In [30]:
# chemprot functions:

ABSTAIN = -1
### Keyword based labeling functions ###

## Part of
#0
@labeling_function()
def lf_amino_acid(x):
    return 0 if 'amino acid' in x.sentence.lower() else ABSTAIN
#1
@labeling_function()
def lf_replace(x):
    return 0 if 'replace' in x.sentence.lower() else ABSTAIN
#2
@labeling_function()
def lf_mutant(x):
    return 0 if 'mutant' in x.sentence.lower() or 'mutat' in x.sentence.lower() else ABSTAIN
#3
## Regulator
@labeling_function()
def lf_bind(x):
    return 1 if 'bind' in x.sentence.lower() else ABSTAIN
#4
@labeling_function()
def lf_interact(x):
    return 1 if 'interact' in x.sentence.lower() else ABSTAIN
#5
@labeling_function()
def lf_affinity(x):
    return 1 if 'affinit' in x.sentence.lower() else ABSTAIN
#6
## Upregulator
# Activator
@labeling_function()
def lf_activate(x):
    return 2 if 'activat' in x.sentence.lower() else ABSTAIN
#7
@labeling_function()
def lf_increase(x):
    return 2 if 'increas' in x.sentence.lower() else ABSTAIN
#8 
@labeling_function()
def lf_induce(x):
    return 2 if 'induc' in x.sentence.lower() else ABSTAIN
#9
@labeling_function()
def lf_stimulate(x):
    return 2 if 'stimulat' in x.sentence.lower() else ABSTAIN
#10
@labeling_function()
def lf_upregulate(x):
    return 2 if 'upregulat' in x.sentence.lower() else ABSTAIN
#11
## Downregulator
@labeling_function()
def lf_downregulate(x):
    return 3 if 'downregulat' in x.sentence.lower() or 'down-regulat' in x.sentence.lower() else ABSTAIN
#12
@labeling_function()
def lf_reduce(x):
    return 3 if 'reduc' in x.sentence.lower() else ABSTAIN
#13
@labeling_function()
def lf_inhibit(x):
    return 3 if 'inhibit' in x.sentence.lower() else ABSTAIN
#14
@labeling_function()
def lf_decrease(x):
    return 3 if 'decreas' in x.sentence.lower() else ABSTAIN
#15
## Agonist
@labeling_function()
def lf_agonist(x):
    return 4 if ' agoni' in x.sentence.lower() or "\tagoni" in x.sentence.lower() else ABSTAIN

#16
## Antagonist
@labeling_function()
def lf_antagonist(x):
    return 5 if 'antagon' in x.sentence.lower() else ABSTAIN

#17
## Modulator
# TODO: Delete this LF, or change this to modulator ??
@labeling_function()
def lf_modulate(x):
    return 6 if 'modulat' in x.sentence.lower() else ABSTAIN

#18
@labeling_function()
def lf_allosteric(x):
    return 6 if 'allosteric' in x.sentence.lower() else ABSTAIN
#19
## Cofactor
@labeling_function()
def lf_cofactor(x):
    return 7 if 'cofactor' in x.sentence.lower() else ABSTAIN
#20
## Substrate/Product
@labeling_function()
def lf_substrate(x):
    return 8 if 'substrate' in x.sentence.lower() else ABSTAIN
#21
@labeling_function()
def lf_transport(x):
    return 8 if 'transport' in x.sentence.lower() else ABSTAIN
#22
@labeling_function()
def lf_catalyze(x):
    return 8 if 'catalyz' in x.sentence.lower() or 'catalys' in x.sentence.lower() else ABSTAIN
#23
@labeling_function()
def lf_product(x):
    return 8 if "produc" in x.sentence.lower() else ABSTAIN
#24
@labeling_function()
def lf_convert(x):
    return 8 if "conver" in x.sentence.lower() else ABSTAIN
#25
## NOT
@labeling_function()
def lf_not(x):
    return 9 if 'not' in x.sentence.lower() else ABSTAIN

# 26 
@labeling_function()
def lf_combined_modulator(x):
    sentence_lower = x.text.lower()

    # Specific Modulation Terms
    specific_terms = ['allosteric modulator', 'positive modulator', 'negative modulator', 'non-competitive modulator', 'positive allosteric modulator']
    if any(term in sentence_lower for term in specific_terms):
        return 6

    # Entity-Proximity Modulation
    modulating_terms = ['modulat', 'allosteric', 'potentiate', 'regulate']
    for term in modulating_terms:
        if term in sentence_lower:
            term_index = sentence_lower.index(term)
            if abs(term_index - x.entity1_index) < 20 or abs(term_index - x.entity2_index) < 20:
                return 6

    # Combined Entity and Action Modulation
    if x.entity1 in sentence_lower and x.entity2 in sentence_lower:
        entity1_index = sentence_lower.index(x.entity1.lower())
        entity2_index = sentence_lower.index(x.entity2.lower())
        between_entities = sentence_lower[min(entity1_index, entity2_index):max(entity1_index, entity2_index)]
        if 'modulate' in between_entities or 'regulate' in between_entities:
            return 6

    # Positive and Modulator Proximity
    if 'positive' in sentence_lower and 'modulator' in sentence_lower:
        pos_indices = [i for i, word in enumerate(sentence_lower.split()) if word == 'positive']
        mod_indices = [i for i, word in enumerate(sentence_lower.split()) if 'modulator' in word]
        min_distance = min(abs(p - m) for p in pos_indices for m in mod_indices)
        if min_distance <= 3:
            return 6

    return ABSTAIN


In [31]:
# show the first row of the dataframe
train_dev = train_df.head(30)
train_dev

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels
10305,"Selective inhibition of PDE5 is a rational therapeutic approach in ED, as proved by the clinical success of sildenafil.",3,PDE5,sildenafil,"[24, 28]","[108, 118]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
8422,"Furthermore, knockdown of OPN enhanced cell death caused by other drugs, including paclitaxel, doxorubicin, actinomycin-D, and rapamycin, which are also P-gp substrates.",8,P-gp,paclitaxel,"[153, 157]","[83, 93]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1, -1]"
8486,"Furthermore, no impact on cytokine release (i.e., on IL-10, IL-6, IL-12/23p40 and TNFα levels) was seen in LPS-stimulated human PBMCs, except with JWH-210 and JWH-122 which caused a decrease of TNFα and IL-12/23p40.",3,IL-12,JWH-210,"[203, 208]","[147, 154]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
6254,"Among neuroleptics, the four most potent compounds at the human serotonin transporter were triflupromazine, fluperlapine, chlorpromazine, and ziprasidone (K(D) 24-39 nM); and at the norepinephrine transporter, chlorpromazine, zotepine, chlorprothixene, and promazine (K(D) 19-25 nM).",1,norepinephrine transporter,chlorpromazine,"[182, 208]","[210, 224]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1]"
1801,"Epidermal growth factor receptor inhibitors currently under investigation include the small molecules gefitinib (Iressa, ZD1839) and erlotinib (Tarceva, OSI-774), as well as monoclonal antibodies such as cetuximab (IMC-225, Erbitux).",3,Epidermal growth factor receptor,Tarceva,"[0, 32]","[144, 151]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
82,"The high-affinity binding of L-noradrenaline to phenylalanine hydroxylase, as studied by equilibrium microdialysis (anaerobically) and ultrafiltration (aerobically), shows positive cooperativity (h = 1.9); at pH 7.2 and 20 degrees C the rat enzyme binds about 0.5 mol L-noradrenaline/mol subunit with a half-maximal binding (S50) at 0.25 microM L-noradrenaline.",1,phenylalanine hydroxylase,L-noradrenaline,"[48, 73]","[268, 283]","[-1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
5981,"Tyrosinase catalyzes an unusual oxidative decarboxylation of 3,4-dihydroxymandelate.",8,Tyrosinase,"3,4-dihydroxymandelate","[0, 10]","[61, 83]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1]"
6865,"Jo2-induced activation of caspase-3 or -9 in liver tissues was inhibited by minocycline pretreatment, and yet the direct addition of minocycline to liver extracts from Jo2-challenged mice failed to block caspase activation in vitro.",9,caspase,minocycline,"[204, 211]","[133, 144]","[-1, -1, -1, -1, -1, -1, 2, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
9540,Lintitript markedly increased postprandial plasma CCK release (P<0.001) while distinctly reducing postprandial PP levels (P<0.01) as compared to placebo.,2,CCK,Lintitript,"[50, 53]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
13166,"Progestin induction of the cyclin D1 gene, which lacks a progesterone response element, was dependent on PR activation of the Src/MAPK pathway, whereas induction of the Sgk (serum and glucocorticoid regulated kinase) gene that contains a functional progesterone response element was unaffected by mutations that interfere with PR activation of Src.",1,Sgk,Progestin,"[169, 172]","[0, 9]","[-1, -1, 0, -1, -1, -1, 2, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [32]:
df = li.chemprot_enhanced(train_dev)
df

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
10305,"Selective inhibition of PDE5 is a rational therapeutic approach in ED, as proved by the clinical success of sildenafil.",3,PDE5,sildenafil,"[24, 28]","[108, 118]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",3.0,18.0
8422,"Furthermore, knockdown of OPN enhanced cell death caused by other drugs, including paclitaxel, doxorubicin, actinomycin-D, and rapamycin, which are also P-gp substrates.",8,P-gp,paclitaxel,"[153, 157]","[83, 93]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1, -1]",20.0,12.0
8486,"Furthermore, no impact on cytokine release (i.e., on IL-10, IL-6, IL-12/23p40 and TNFα levels) was seen in LPS-stimulated human PBMCs, except with JWH-210 and JWH-122 which caused a decrease of TNFα and IL-12/23p40.",3,IL-12,JWH-210,"[203, 208]","[147, 154]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",32.0,22.0
6254,"Among neuroleptics, the four most potent compounds at the human serotonin transporter were triflupromazine, fluperlapine, chlorpromazine, and ziprasidone (K(D) 24-39 nM); and at the norepinephrine transporter, chlorpromazine, zotepine, chlorprothixene, and promazine (K(D) 19-25 nM).",1,norepinephrine transporter,chlorpromazine,"[182, 208]","[210, 224]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1]",24.5,26.0
1801,"Epidermal growth factor receptor inhibitors currently under investigation include the small molecules gefitinib (Iressa, ZD1839) and erlotinib (Tarceva, OSI-774), as well as monoclonal antibodies such as cetuximab (IMC-225, Erbitux).",3,Epidermal growth factor receptor,Tarceva,"[0, 32]","[144, 151]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1.5,17.0
82,"The high-affinity binding of L-noradrenaline to phenylalanine hydroxylase, as studied by equilibrium microdialysis (anaerobically) and ultrafiltration (aerobically), shows positive cooperativity (h = 1.9); at pH 7.2 and 20 degrees C the rat enzyme binds about 0.5 mol L-noradrenaline/mol subunit with a half-maximal binding (S50) at 0.25 microM L-noradrenaline.",1,phenylalanine hydroxylase,L-noradrenaline,"[48, 73]","[268, 283]","[-1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",6.5,37.0
5981,"Tyrosinase catalyzes an unusual oxidative decarboxylation of 3,4-dihydroxymandelate.",8,Tyrosinase,"3,4-dihydroxymandelate","[0, 10]","[61, 83]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1]",0.0,7.0
6865,"Jo2-induced activation of caspase-3 or -9 in liver tissues was inhibited by minocycline pretreatment, and yet the direct addition of minocycline to liver extracts from Jo2-challenged mice failed to block caspase activation in vitro.",9,caspase,minocycline,"[204, 211]","[133, 144]","[-1, -1, -1, -1, -1, -1, 2, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",30.0,20.0
9540,Lintitript markedly increased postprandial plasma CCK release (P<0.001) while distinctly reducing postprandial PP levels (P<0.01) as compared to placebo.,2,CCK,Lintitript,"[50, 53]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",5.0,0.0
13166,"Progestin induction of the cyclin D1 gene, which lacks a progesterone response element, was dependent on PR activation of the Src/MAPK pathway, whereas induction of the Sgk (serum and glucocorticoid regulated kinase) gene that contains a functional progesterone response element was unaffected by mutations that interfere with PR activation of Src.",1,Sgk,Progestin,"[169, 172]","[0, 9]","[-1, -1, 0, -1, -1, -1, 2, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",26.0,0.0


In [33]:
lfs = [lf_combined_modulator]

# Initialize the applier
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
L_train

100%|██████████| 30/30 [00:00<00:00, 23466.83it/s]


array([[-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1]])

In [34]:
# Cell to inspect the dev set for a given label 
label = 'Modulator' # Change this variable
pd.set_option('display.max_colwidth', 1000)
label6 = train_df[train_df['labels'] == int(label_to_idx[label])]
# save the dev set to a file
df2 = li.chemprot_enhanced(label6)
df2

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
12446,"While SAR within the HTS series was very shallow and unable to be optimized, grafting the phenethyl ether linkage onto the ML129/ML172 cores led to the first sub-micromolar M5 PAM, ML326 (VU0467903), (human and rat M5 EC50s of 409nM and 500nM, respectively) with excellent mAChR selectivity (M1-M4 EC50s >30μM) and a robust 20-fold leftward shift of the ACh CRC.",6,M5,ML172,"[173, 175]","[129, 134]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",28.0,21.0
3477,"Anxiolytic- but not antidepressant-like activity of Lu AF21934, a novel, selective positive allosteric modulator of the mGlu₄ receptor.",6,mGlu₄,Lu AF21934,"[120, 125]","[52, 62]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 6, -1, -1, -1, -1, -1, -1, 9]",16.0,6.5
8655,"Conversely, ovarian PRA and PRB were positively regulated by ethanol and ethanol-melatonin combination, whereas PRA was down-regulated in the uterus and oviduct after ethanol consumption.",6,PRB,melatonin,"[28, 31]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1]",4.0,11.0
3387,"BDZs and other positive GABA(A)R modulators, including barbiturates, ethanol, and neurosteroids, can also inhibit L-type voltage-gated calcium channels (L-VGCCs), which could contribute to reduced neuronal excitability.",6,GABA(A)R,BDZs,"[24, 32]","[0, 4]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",4.0,0.0
8654,"Conversely, ovarian PRA and PRB were positively regulated by ethanol and ethanol-melatonin combination, whereas PRA was down-regulated in the uterus and oviduct after ethanol consumption.",6,PRA,melatonin,"[20, 23]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1]",2.0,11.0
13836,The recently discovered hyperinsulinism/hyperammonemia disorder showed that the loss of allosteric inhibition of GDH by GTP causes excessive secretion of insulin.,6,GDH,GTP,"[113, 116]","[120, 123]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",13.0,15.0
5378,The implications of our finding for mechanism of in vivo actions of rapamycin and for design of novel allosteric drugs targeting the proteasome are discussed.,6,proteasome,rapamycin,"[133, 143]","[68, 77]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",22.0,12.0
3606,Ifenprodil is an allosteric inhibitor of GluN1/GluN2B N-methyl-D-aspartate receptors.,6,GluN1,Ifenprodil,"[41, 46]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",6.0,0.0
15281,Synthesis and structure-activity relationships of indazole arylsulfonamides as allosteric CC-chemokine receptor 4 (CCR4) antagonists.,6,CC-chemokine receptor 4,indazole arylsulfonamides,"[90, 113]","[50, 75]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, 6, -1, -1, -1, -1, -1, -1, -1]",10.0,5.5
3381,Benzodiazepines (BDZs) depress neuronal excitability via positive allosteric modulation of inhibitory GABA(A) receptors (GABA(A)R).,6,GABA(A)R,BDZs,"[121, 129]","[17, 21]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, 6, 6, -1, -1, -1, -1, -1, -1, -1]",13.0,1.0


In [41]:
count = 0
for i in range(len(df2)):
    weak_label = df2.iloc[i]['weak_labels'][17]
    if weak_label != -1:
        print(i)
        count+=1
print('count', count)

1
3
9
29
33
36
38
40
41
49
53
count 11


In [35]:
L_train = applier.apply(df=df2)
L_train

100%|██████████| 55/55 [00:00<00:00, 26154.96it/s]


array([[-1],
       [ 6],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [ 6],
       [-1],
       [-1],
       [ 6],
       [-1]])

In [57]:


# Cell to inspect the dev set for a given label 
label = 'Modulator' # Change this variable
pd.set_option('display.max_colwidth', 1000)


def get_18th_element(lst):
    return lst[16] if len(lst) >16  else None

# Create a copy of the DataFrame
train_df_copy = train_df.copy()

# Apply the function to the copy
train_df_copy['18th_weak_label'] = train_df_copy['weak_labels'].apply(get_18th_element)

# Filter the copy
label6_all = train_df[train_df_copy['18th_weak_label'] == 6]

# Continue with your code
df3 = li.chemprot_enhanced(label6_all)
df3

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,18th_weak_label


## Main Work

In [18]:
df2.to_csv(f"./dev_{label}.csv")
chemprot6 = li.df_to_chemprot(li.chemprot_enhanced(label6))
li.save_dataset(chemprot6, f"./dev_{label}.json")


In [12]:
# Cell to inspect dev set for a given keyword (to inspect conflicts). Note: this doesn't work with "+" 
keyword = 'iPad' # Change this word
pd.set_option('display.max_colwidth', 1000)
dev_df[dev_df['text'].str.contains(keyword)]

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels


## Evaluation

In [235]:
L_train = apply_LFs(lfs, train_df)
L_valid = apply_LFs(lfs, valid_df)
L_test = apply_LFs(lfs, test_df)

print("Train Coverage:", calc_coverage(L_train))
print("Valid Coverage:", calc_coverage(L_valid))
print("Test Coverage:", calc_coverage(L_test))

lf_analysis = LFAnalysis(L=L_valid, lfs=lfs).lf_summary()

# Calculates how many of an LFs votes result in conflicts (helpful signal for debugging LFs)
lf_analysis['Conflict Ratio'] = lf_analysis['Conflicts'] / lf_analysis['Coverage']
lf_analysis

100%|██████████| 131781/131781 [02:54<00:00, 755.05it/s]
100%|██████████| 5805/5805 [00:07<00:00, 754.68it/s]
100%|██████████| 17402/17402 [00:22<00:00, 756.63it/s]


Train Coverage: 0.6395003832115403
Valid Coverage: 0.6396210163652024
Test Coverage: 0.6375703942075623


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Conflict Ratio
lf_blouse,0,[0],0.001206,0.000861,0.000517,0.428571
lf_shirt,1,[0],0.009991,0.006202,0.005857,0.586207
lf_scarf,2,[0],0.001378,0.000689,0.000689,0.500000
lf_jacket,3,[0],0.004479,0.002239,0.002239,0.500000
lf_underwear,4,[0],0.001206,0.000517,0.000345,0.285714
...,...,...,...,...,...,...
lf_bracelet+watch,225,[29],0.003101,0.003101,0.003101,1.000000
lf_electronic+case,226,[30],0.001206,0.001034,0.001034,0.857143
lf_ipod,227,[30],0.012231,0.008786,0.007407,0.605634
lf_ipad,228,[30],0.008613,0.006546,0.005168,0.600000


In [236]:
# List LFs for which 'Conflict Ratio' is above some threshold (helpful for debugging)
lf_analysis[lf_analysis['Conflict Ratio'] > 0.8]['Conflict Ratio'].sort_values(ascending=False)

lf_poncho              1.000000
lf_portable battery    1.000000
lf_bracelet+watch      1.000000
lf_card game           1.000000
lf_tent+daughter       1.000000
lf_car+1:              1.000000
lf_athletic            1.000000
lf_t-shirt             1.000000
lf_video+conversion    1.000000
lf_basketball+shoe     1.000000
lf_cage                1.000000
lf_laptop stand        1.000000
lf_folders             1.000000
lf_headphone+studio    1.000000
lf_pop+classic         1.000000
lf_toners              1.000000
lf_acne treatment      1.000000
lf_skin care           1.000000
lf_waist trimmer       1.000000
lf_nail+paint          1.000000
lf_gummi               1.000000
lf_laptop+battery      0.909091
lf_punch               0.875000
lf_air+drain           0.875000
lf_electronic+case     0.857143
lf_tent+son            0.842105
lf_laptop+case         0.833333
Name: Conflict Ratio, dtype: float64

In [237]:
# Calculate accuracy on the validation set (Ideally do this only at the end)
majority_model = MajorityLabelVoter(31)
preds_valid = majority_model.predict(L=L_valid)
(preds_valid[preds_valid != -1] == valid_df[preds_valid != -1].labels.values).mean()

0.6649519890260631

In [238]:
json.dump(keywords, open("amazon_LFs_v1.json", "w"), indent=8)

In [None]:
# Replace the LFs for a given dataset (in wrench format)
# dataset_name = "dbpedia"

# train_json = json.load(open(f"../weak_datasets/{dataset_name}/train.json", "r"))
# for idx in train_json:
#     train_json[idx]['weak_labels'] = [int(i) for i in list(L_train[int(idx)])]
    
# valid_json = json.load(open(f"../weak_datasets/{dataset_name}/valid.json", "r"))
# for idx in valid_json:
#     valid_json[idx]['weak_labels'] = [int(i) for i in list(L_valid[int(idx)])]
    
# test_json = json.load(open(f"../weak_datasets/{dataset_name}/test.json", "r"))
# for idx in test_json:
#     test_json[idx]['weak_labels'] = [int(i) for i in list(L_test[int(idx)])]

# json.dump(train_json, open(f"../weak_datasets/{dataset_name}/train.json", 'w'), indent=4)
# json.dump(valid_json, open(f"../weak_datasets/{dataset_name}/valid.json", 'w'), indent=4)
# json.dump(test_json, open(f"../weak_datasets/{dataset_name}/test.json", 'w'), indent=4)