In [6]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
import re
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import label_improve

In [7]:
%load_ext autoreload
%autoreload 2
import label_improve as li

In [8]:
# Loading the data 
dataset_name = "chemprot"
idx_to_label = json.load(open(f"../weak_datasets/{dataset_name}/label.json"))
label_to_idx = {l:i for i,l in idx_to_label.items()}
valid_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/valid.json", "r")))
train_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/train.json", "r")))
test_df = li.chemprot_to_df(json.load(open(f"../weak_datasets/{dataset_name}/test.json", "r")))

# Sample a dev set to help seed ideas for LFs
dev_df = train_df.sample(250, random_state=123)

In [9]:
train_df.head()

Unnamed: 0,text,label,entity1,entity2,span1,span2,weak_labels
10305,Selective inhibition of PDE5 is a rational the...,3,PDE5,sildenafil,"[24, 28]","[108, 118]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
8422,"Furthermore, knockdown of OPN enhanced cell de...",8,P-gp,paclitaxel,"[153, 157]","[83, 93]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
8486,"Furthermore, no impact on cytokine release (i....",3,IL-12,JWH-210,"[203, 208]","[147, 154]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1..."
6254,"Among neuroleptics, the four most potent compo...",1,norepinephrine transporter,chlorpromazine,"[182, 208]","[210, 224]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1801,Epidermal growth factor receptor inhibitors cu...,3,Epidermal growth factor receptor,Tarceva,"[0, 32]","[144, 151]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [142]:

# chemprot functions:

ABSTAIN = -1
### Keyword based labeling functions ###

## Part of
#0
@labeling_function()
def lf_amino_acid(x):
    return 0 if 'amino acid' in x.text.lower() else ABSTAIN
#1
@labeling_function()
def lf_replace(x):
    return 0 if 'replace' in x.text.lower() else ABSTAIN
#2 TODO: 0.1988
@labeling_function()
def lf_mutant(x):
    def find_word_index(words, target):
        for i, word in enumerate(words):
            if target in word:
                return i
        return -1

    words = x.text.lower().split()
    if any('mutant' in word or 'mutat' in word for word in words):
        # if mutant is between the two entities
        if x.entity1_index == -1 or x.entity2_index == -1:
            return ABSTAIN
        if isinstance(x.entity1_index, int) and isinstance(x.entity2_index, int):
            if x.entity1_index < x.entity2_index:
                if any('mutant' in word or 'mutat' in word for word in words[x.entity1_index:x.entity2_index]):
                    return 0
            else:
                if any('mutant' in word or 'mutat' in word for word in words[x.entity2_index:x.entity1_index]):
                    return 0
        # if mutant is close to either of the entities
        mutant_index = find_word_index(words, 'mutant')
        mutat_index = find_word_index(words, 'mutat')
        if (mutant_index != -1 and (abs(x.entity1_index - mutant_index) < 4 or abs(x.entity2_index - mutant_index) < 4)) or \
           (mutat_index != -1 and (abs(x.entity1_index - mutat_index) < 4 or abs(x.entity2_index - mutat_index) < 4)):
            return 0
    return ABSTAIN

#3
## Regulator
@labeling_function()
def lf_bind(x):
    return 1 if 'bind' in x.text.lower() else ABSTAIN
#4
@labeling_function()
def lf_interact(x):
    return 1 if 'interact' in x.text.lower() else ABSTAIN
#5
@labeling_function()
def lf_affinity(x):
    return 1 if 'affinit' in x.text.lower() else ABSTAIN
#6 TODO: 0.3578
## Upregulator
# Activator
@labeling_function()
def lf_activate(x):
    return 2 if 'activat' in x.text.lower() else ABSTAIN
#7
@labeling_function()
def lf_increase(x):
    return 2 if 'increas' in x.text.lower() else ABSTAIN
#8 TODO: 
@labeling_function()
def lf_induce(x):
    return 2 if 'induc' in x.text.lower() else ABSTAIN
#9 TODO: 
@labeling_function()
def lf_stimulate(x):
    return 2 if 'stimulat' in x.text.lower() else ABSTAIN
#10
@labeling_function()
def lf_upregulate(x):
    if ('upregulat' in x.text.lower() or 'up-regulat' in x.text.lower()) and ('downregulat' in x.text.lower() or 'down-regulat' in x.text.lower()):
        entity1_index = x.text.lower().index(x.entity1.lower())
        entity2_index = x.text.lower().index(x.entity2.lower())
        # if up regulate is between the two entities
        if isinstance(entity1_index, int) and isinstance(entity2_index, int):
            if entity1_index < entity2_index:
                if x.text[entity1_index:entity2_index].count('upregulat') > 0 or x.text[entity1_index:entity2_index].count('up-regulat') > 0:
                    return 2
            else:
                if x.text[entity2_index:entity1_index].count('upregulat') > 0 or x.text[entity2_index:entity1_index].count('up-regulat') > 0:
                    return 2
        return ABSTAIN
    else:
        return 2 if 'upregulat' in x.text.lower() or 'up-regulat' in x.text.lower() else ABSTAIN
#11
## Downregulator
@labeling_function()
def lf_downregulate(x):
    if('downregulat' in x.text.lower() or 'down-regulat' in x.text.lower()) and ('upregulat' in x.text.lower() or 'up-regulat' in x.text.lower()):
        if x.entity1_index == -1 or x.entity2_index == -1:
            return ABSTAIN
        # if up regulate is between the two entities
        if isinstance(x.entity1_index, int) and isinstance(x.entity2_index, int):
            if x.entity1_index < x.entity2_index:
                if x.text[x.entity1_index:x.entity2_index].count('downregulat') > 0 or x.text[x.entity1_index:x.entity2_index].count('down-regulat') > 0:
                    return 3
            else:
                if x.text[x.entity2_index:x.entity1_index].count('downregulat') > 0 or x.text[x.entity2_index:x.entity1_index].count('down-regulat') > 0:
                    return 3
        return ABSTAIN
    return 3 if 'downregulat' in x.text.lower() or 'down-regulat' in x.text.lower() else ABSTAIN
#12
@labeling_function()
def lf_reduce(x):
    return 3 if 'reduc' in x.text.lower() else ABSTAIN
#13
@labeling_function()
def lf_inhibit(x):
    return 3 if 'inhibit' in x.text.lower() else ABSTAIN
#14
@labeling_function()
def lf_decrease(x):
    return 3 if 'decreas' in x.text.lower() else ABSTAIN
#15
## Agonist
@labeling_function()
def lf_agonist(x):
    return 4 if ' agoni' in x.text.lower() or "\tagoni" in x.text.lower() else ABSTAIN

#16
## Antagonist
@labeling_function()
def lf_antagonist(x):
    return 5 if 'antagon' in x.text.lower() else ABSTAIN

#17
## Modulator
# TODO: Delete this LF, or change this to modulator ??
@labeling_function()
def lf_modulate(x):
    return 6 if 'modulat' in x.text.lower() else ABSTAIN

#18
@labeling_function()
def lf_allosteric(x):
    return 6 if 'allosteric' in x.text.lower() else ABSTAIN
#19
## Cofactor
@labeling_function()
def lf_cofactor(x):
    return 7 if 'cofactor' in x.text.lower() else ABSTAIN
#20
## Substrate/Product
@labeling_function()
def lf_substrate(x):
    return 8 if 'substrate' in x.text.lower() else ABSTAIN
#21
@labeling_function()
def lf_transport(x):
    return 8 if 'transport' in x.text.lower() else ABSTAIN
#22
@labeling_function()
def lf_catalyze(x):
    return 8 if 'catalyz' in x.text.lower() or 'catalys' in x.text.lower() else ABSTAIN
#23
@labeling_function()
def lf_product(x):
    return 8 if "produc" in x.text.lower() else ABSTAIN
#24
@labeling_function()
def lf_convert(x):
    return 8 if "conver" in x.text.lower() else ABSTAIN
#25
## NOT
@labeling_function()
def lf_not(x):
    entity1_index = x.text.lower().index(x.entity1.lower())
    entity2_index = x.text.lower().index(x.entity2.lower())
    # if the two entities are close to the word 'not'
    
    if 'not' in x.text.lower():
        if abs(entity1_index - x.text.lower().index('not')) < 20 or abs(entity2_index - x.text.lower().index('not')) < 20:
            return 9
        # if not is between the two entities
        if abs(entity1_index - x.text.lower().index('not')) < 40 or abs(entity2_index - x.text.lower().index('not')) < 40:
            if entity1_index < entity2_index:
                if x.text[entity1_index:entity2_index].count('not') > 0:
                    return 9
            else:
                if x.text[entity2_index:entity1_index].count('not') > 0:
                    return 9
    return ABSTAIN

# 26 replace the 17 (18)
@labeling_function()
def lf_combined_modulator(x):
    sentence_lower = x.text.lower()

    specific_terms = ['allosteric modulator', 'positive modulator', 'negative modulator', 'non-competitive modulator', 'positive allosteric modulator']
    if any(term in sentence_lower for term in specific_terms):
        return 6

    modulating_terms = ['modulat', 'allosteric', 'potentiate']
    for term in modulating_terms:
        if term in sentence_lower:
            term_index = sentence_lower.index(term)
            if x.entity1_index == -1 or x.entity2_index == -1:
                return ABSTAIN
            if abs(term_index - x.entity1_index) < 20 or abs(term_index - x.entity2_index) < 20:
                return 6
    # check the first indcidences of modulator and positive
    if x.entity1 in sentence_lower and x.entity2 in sentence_lower:
        entity1_index = sentence_lower.index(x.entity1.lower())
        entity2_index = sentence_lower.index(x.entity2.lower())
        between_entities = sentence_lower[min(entity1_index, entity2_index):max(entity1_index, entity2_index)]
        if 'modulate' in between_entities:
            return 6

    if 'positive' in sentence_lower and 'modulator' in sentence_lower:
        pos_indices = [i for i, word in enumerate(sentence_lower.split()) if word == 'positive']
        mod_indices = [i for i, word in enumerate(sentence_lower.split()) if 'modulator' in word]
        if len(pos_indices) == 0 or len(mod_indices) == 0:
            return ABSTAIN
        min_distance = min(abs(p - m) for p in pos_indices for m in mod_indices)
        if min_distance <= 3:
            return 6

    return ABSTAIN

lfs = [lf_amino_acid, lf_replace, lf_mutant, lf_bind, lf_interact, lf_affinity, lf_activate, lf_increase, lf_stimulate, lf_upregulate, lf_downregulate, lf_reduce, lf_inhibit, lf_decrease, lf_agonist, lf_antagonist, lf_combined_modulator, lf_allosteric, lf_cofactor, lf_substrate, lf_transport, lf_catalyze, lf_product, lf_convert, lf_not]

In [147]:
new_train = li.chemprot_df_with_new_lf(train_df, lfs)
chemprot = li.df_to_chemprot(new_train)
li.save_dataset(chemprot, "../weak_datasets/chemprot2/train.json")
new_test = li.chemprot_df_with_new_lf(test_df, lfs)
chemprot = li.df_to_chemprot(new_test)
li.save_dataset(chemprot, "../weak_datasets/chemprot2/test.json")
new_valid = li.chemprot_df_with_new_lf(valid_df, lfs)
chemprot = li.df_to_chemprot(new_valid)
li.save_dataset(chemprot, "../weak_datasets/chemprot2/valid.json")



100%|██████████| 12861/12861 [00:01<00:00, 6824.43it/s]
100%|██████████| 1607/1607 [00:00<00:00, 6861.98it/s]
100%|██████████| 1607/1607 [00:00<00:00, 6837.29it/s]


In [135]:
mutant, num = li.see_label_function(train_df, [lf_activate])
print(num)
print(len(train_df))
mutant = li.df_to_chemprot(mutant)
li.save_dataset(mutant, "./activate.json")
    

100%|██████████| 12861/12861 [00:00<00:00, 139400.42it/s]


1491
12861


In [136]:
cheack_df = li.chemprot_to_df(json.load(open(f"./chemprot_lfs.json", "r")))

In [137]:
cheack_df.head()

Unnamed: 0,text,label,entity1,entity2,span1,span2,weak_labels
10305,Selective inhibition of PDE5 is a rational the...,3,PDE5,sildenafil,"[24, 28]","[108, 118]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
8422,"Furthermore, knockdown of OPN enhanced cell de...",8,P-gp,paclitaxel,"[153, 157]","[83, 93]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
8486,"Furthermore, no impact on cytokine release (i....",3,IL-12,JWH-210,"[203, 208]","[147, 154]","[-1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1..."
6254,"Among neuroleptics, the four most potent compo...",1,norepinephrine transporter,chlorpromazine,"[182, 208]","[210, 224]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1801,Epidermal growth factor receptor inhibitors cu...,3,Epidermal growth factor receptor,Tarceva,"[0, 32]","[144, 151]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [138]:
# show the first row of the dataframe
train_dev = train_df

train_dev = li.chemprot_enhanced(train_dev)
L_dev = li.apply_LFs(lfs, train_dev)
L_dev

L_dev[1]

100%|██████████| 12861/12861 [00:01<00:00, 6873.08it/s]


array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  8, -1, -1, -1, -1, -1])

In [139]:
for i in range(train_dev.shape[0]):
    if L_dev[i][2] == 0:
        print(f"LFs: {L_dev[i]}")
        print(f"Text: {train_dev.iloc[i].text}")
        print(f"Entities: {train_dev.iloc[i].entity1}, {train_dev.iloc[i].entity2}")
        print(f"True label: {train_dev.iloc[i].label}")
        print("\n\n")

LFs: [-1 -1  0 -1 -1 -1 -1  2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
Text: Circular dichroism of mutant cTnCs revealed a trend where increased alpha-helical content correlated with increased Ca(2+) sensitivity in skinned fibers and vice versa.
Entities: cTnCs, Ca(2+)
True label: 1



LFs: [-1 -1  0  1 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
Text: The refolding kinetics of guanidine-denatured disulfide-intact bovine pancreatic ribonuclease A (RNase A) and its proline-42-to-alanine mutant (Pro42Ala) have been studied by monitoring tyrosine burial and 2'-cytidine monophosphate (2'CMP) inhibitor binding.
Entities: RNase A, alanine
True label: 0



LFs: [-1 -1  0 -1 -1 -1 -1  2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
Text: In contrast, the mutants T844A, F972A and Q975A showed increased K(i) for cilostazol but no difference for milrinone from the recombinant PDE3A.
Entities: F972A, milrinone
True label: 3



LFs: [-1 -1  0 -1 -1 -1 -1 -1 -

In [140]:
print("Test Coverage:", li.calc_coverage(L_dev))
lf_analysis = LFAnalysis(L_dev, lfs).lf_summary()
# Calculates how many of an LFs votes result in conflicts (helpful signal for debugging LFs)
lf_analysis['Conflict Ratio'] = lf_analysis['Conflicts'] / lf_analysis['Coverage']
lf_analysis

Test Coverage: 0.8128450353782755


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Conflict Ratio
lf_amino_acid,0,[0],0.02076,0.014462,0.014074,0.677903
lf_replace,1,[0],0.002877,0.001089,0.000778,0.27027
lf_mutant,2,[0],0.013529,0.009486,0.009253,0.683908
lf_bind,3,[1],0.102092,0.072312,0.063603,0.623001
lf_interact,4,[1],0.02667,0.017728,0.015162,0.568513
lf_affinity,5,[1],0.045253,0.03048,0.02356,0.520619
lf_activate,6,[2],0.115932,0.073867,0.065625,0.566063
lf_increase,7,[2],0.098826,0.066402,0.059171,0.598741
lf_stimulate,8,[2],0.037633,0.030169,0.025737,0.683884
lf_upregulate,9,[2],0.014307,0.008708,0.007464,0.521739


In [141]:
# Calculate accuracy on the validation set (Ideally do this only at the end)
majority_model = MajorityLabelVoter(10)
preds_valid = majority_model.predict(L=L_dev)




print((preds_valid[preds_valid != -1] == train_dev[preds_valid != -1].label.values).mean())
print((preds_valid == train_dev.label.values).mean())

incorrect_indices = np.where(preds_valid!= train_dev.label.values)[0]

incorrect_predictions_df = train_dev.iloc[incorrect_indices]

for i in range(incorrect_predictions_df.shape[0]):
    print(L_dev[i])
    print(incorrect_predictions_df.iloc[i].weak_labels)
    print(incorrect_predictions_df.iloc[i].text)
    print("True label:", incorrect_predictions_df.iloc[i].label)
    print("Predicted label:", preds_valid[incorrect_indices[i]])
    print("\n")

0.6734463276836158
0.37073322447710133
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Furthermore, no impact on cytokine release (i.e., on IL-10, IL-6, IL-12/23p40 and TNFα levels) was seen in LPS-stimulated human PBMCs, except with JWH-210 and JWH-122 which caused a decrease of TNFα and IL-12/23p40.
True label: 3
Predicted label: -1


[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 -1
 -1]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1]
Among neuroleptics, the four most potent compounds at the human serotonin transporter were triflupromazine, fluperlapine, chlorpromazine, and ziprasidone (K(D) 24-39 nM); and at the norepinephrine transporter, chlorpromazine, zotepine, chlorprothixene, and promazine (K(D) 19-25 nM).
True label: 1
Predicted label: 8


[-1 -1 -1 -1 -1 -1 -1 -1  2

 -1


[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 9]
However, MS A2756G was significantly associated with cobalamin levels (AA genotype: 290 +/- 122 pmol/l; AG: 381 +/- 151 pmol/l and GG: 415 +/- 100 pmol/l), as was MTRR A66G (AA: 478 +/- 219 pmol/l, AG: 306 +/- 124 pmol/l and GG: 306 +/- 123 pmol/l).
True label: 1
Predicted label: 9


[-1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
[-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Molecular modeling of the kinase domain of mutant c-Kit (V654A) and AXL showed no binding to IM but efficient binding to MP470, a novel c-Kit/AXL kinase inhibitor.
True label: 9
Predicted label: -1


[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 

In [129]:
lfs = [lf_amino_acid, lf_replace, lf_mutant, lf_bind, lf_interact, lf_affinity, lf_activate, lf_increase, lf_induce, lf_stimulate, lf_upregulate, lf_downregulate, lf_reduce, lf_inhibit, lf_decrease, lf_agonist, lf_antagonist, lf_modulate, lf_allosteric, lf_cofactor, lf_substrate, lf_transport, lf_catalyze, lf_product, lf_convert, lf_not]

# show the first row of the dataframe
train_dev = train_df
train_dev = li.chemprot_enhanced(train_dev)
L_dev2 = li.apply_LFs(lfs, train_dev)
L_dev2

100%|██████████| 12861/12861 [00:01<00:00, 6885.06it/s]


array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]])

In [130]:
difference = np.where(L_dev2 !=  L_dev)[0]

  """Entry point for launching an IPython kernel.


In [131]:
count = 0
for i in difference:
    print(train_dev.iloc[i])
    print(train_dev.iloc[i].text)
    print(L_dev[i])
    print(L_dev2[i])
    count +=1
    print("\n")
print(count)

text             Selective inhibition of PDE5 is a rational the...
label                                                            3
entity1                                                       PDE5
entity2                                                 sildenafil
span1                                                     [24, 28]
span2                                                   [108, 118]
weak_labels      [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...
entity1_index                                                    3
entity2_index                                                   18
Name: 10305, dtype: object
Selective inhibition of PDE5 is a rational therapeutic approach in ED, as proved by the clinical success of sildenafil.
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1]


1


In [132]:
# Calculate accuracy on the validation set (Ideally do this only at the end)
majority_model = MajorityLabelVoter(10)
preds_valid = majority_model.predict(L=L_dev2)

print((preds_valid[preds_valid != -1] == train_dev[preds_valid != -1].label.values).mean())
print((preds_valid == train_dev.label.values).mean())

incorrect_indices = np.where(preds_valid[preds_valid != -1] != train_dev[preds_valid != -1].label.values)[0]
incorrect_predictions_df = train_dev.iloc[incorrect_indices]
label_six_df = incorrect_predictions_df[incorrect_predictions_df.label == 6]
label_six_df

0.6506108692599354
0.36023637353238475


Unnamed: 0,text,label,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
8655,"Conversely, ovarian PRA and PRB were positivel...",6,PRB,melatonin,"[28, 31]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3...",4.0,11.0
3387,"BDZs and other positive GABA(A)R modulators, i...",6,GABA(A)R,BDZs,"[24, 32]","[0, 4]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",4.0,0.0
8654,"Conversely, ovarian PRA and PRB were positivel...",6,PRA,melatonin,"[20, 23]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3...",2.0,11.0
13836,The recently discovered hyperinsulinism/hypera...,6,GDH,GTP,"[113, 116]","[120, 123]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",13.0,15.0
15281,Synthesis and structure-activity relationships...,6,CC-chemokine receptor 4,indazole arylsulfonamides,"[90, 113]","[50, 75]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",10.0,5.5
13869,Because these speeds are significantly faster ...,6,M(2) receptor,gallamine,"[198, 211]","[106, 115]","[-1, -1, -1, -1, -1, 1, -1, -1, 2, -1, -1, -1,...",28.5,15.0
5381,Rapamycin is a canonical allosteric inhibitor ...,6,kinase,Rapamycin,"[58, 64]","[0, 9]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",9.0,0.0
13508,Allosteric interaction of the neuromuscular bl...,6,human muscarinic M2 receptors,vecuronium,"[97, 126]","[53, 63]","[-1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1...",12.5,6.0
5377,We hypothesize that the rapamycin and related ...,6,proteasome,rapamycin,"[105, 115]","[24, 33]","[-1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1...",17.0,4.0
11238,A number of agents are being developed that ta...,6,ER,tamoxifen,"[160, 162]","[125, 134]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",26.0,20.0


# DO NOT GO BEYOND THIS LINE

# ------------------------------------------------------------------------------------ #

In [71]:
df = li.chemprot_enhanced(train_dev)
df

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
10305,"Selective inhibition of PDE5 is a rational therapeutic approach in ED, as proved by the clinical success of sildenafil.",3,PDE5,sildenafil,"[24, 28]","[108, 118]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",3.0,18.0
8422,"Furthermore, knockdown of OPN enhanced cell death caused by other drugs, including paclitaxel, doxorubicin, actinomycin-D, and rapamycin, which are also P-gp substrates.",8,P-gp,paclitaxel,"[153, 157]","[83, 93]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1, -1]",20.0,12.0
8486,"Furthermore, no impact on cytokine release (i.e., on IL-10, IL-6, IL-12/23p40 and TNFα levels) was seen in LPS-stimulated human PBMCs, except with JWH-210 and JWH-122 which caused a decrease of TNFα and IL-12/23p40.",3,IL-12,JWH-210,"[203, 208]","[147, 154]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",32.0,22.0
6254,"Among neuroleptics, the four most potent compounds at the human serotonin transporter were triflupromazine, fluperlapine, chlorpromazine, and ziprasidone (K(D) 24-39 nM); and at the norepinephrine transporter, chlorpromazine, zotepine, chlorprothixene, and promazine (K(D) 19-25 nM).",1,norepinephrine transporter,chlorpromazine,"[182, 208]","[210, 224]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1]",24.5,26.0
1801,"Epidermal growth factor receptor inhibitors currently under investigation include the small molecules gefitinib (Iressa, ZD1839) and erlotinib (Tarceva, OSI-774), as well as monoclonal antibodies such as cetuximab (IMC-225, Erbitux).",3,Epidermal growth factor receptor,Tarceva,"[0, 32]","[144, 151]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1.5,17.0
...,...,...,...,...,...,...,...,...,...
9350,"Tamsulosin, the first prostate-selective alpha 1A-adrenoceptor antagonist.",5,alpha 1A-adrenoceptor,Tamsulosin,"[41, 62]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1]",4.5,0.0
13234,"The GRIP1 reduction was inhibited by MK-801, an N-methyl-d-aspartate (NMDA) receptor antagonist, but not by 6-cyano-7-nitroquinoxaline-2,3-dione (CNQX), an AMPA receptor antagonist.",2,GRIP1,MK-801,"[4, 9]","[37, 43]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, 9]",1.0,6.0
1837,The reciprocal inhibition of SR-BI and ABCA1 by BLT-4 and glyburide raises the possibility that these proteins may share similar or common steps in their mechanisms of lipid transport.,3,ABCA1,glyburide,"[39, 44]","[58, 67]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1, -1, -1]",6.0,10.0
11290,"Firstly, transgenic plants overexpressing formate dehydrogenase (FDH, EC 1.2.1.2) were used to continue our previous studies on the function of FDH in formate metabolism.",8,EC 1.2.1.2,formate,"[70, 80]","[151, 158]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",7.5,22.0


In [72]:
lfs = [lf_combined_modulator]

# Initialize the applier
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
L_train

100%|██████████| 1000/1000 [00:00<00:00, 66886.27it/s]


array([[-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],

In [73]:
# Cell to inspect the dev set for a given label 
label = 'Modulator' # Change this variable
pd.set_option('display.max_colwidth', 1000)
label6 = train_df[train_df['labels'] == int(label_to_idx[label])]
# save the dev set to a file
df2 = li.chemprot_enhanced(label6)
df2

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
12446,"While SAR within the HTS series was very shallow and unable to be optimized, grafting the phenethyl ether linkage onto the ML129/ML172 cores led to the first sub-micromolar M5 PAM, ML326 (VU0467903), (human and rat M5 EC50s of 409nM and 500nM, respectively) with excellent mAChR selectivity (M1-M4 EC50s >30μM) and a robust 20-fold leftward shift of the ACh CRC.",6,M5,ML172,"[173, 175]","[129, 134]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",28.0,21.0
3477,"Anxiolytic- but not antidepressant-like activity of Lu AF21934, a novel, selective positive allosteric modulator of the mGlu₄ receptor.",6,mGlu₄,Lu AF21934,"[120, 125]","[52, 62]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 6, -1, -1, -1, -1, -1, -1, 9]",16.0,6.5
8655,"Conversely, ovarian PRA and PRB were positively regulated by ethanol and ethanol-melatonin combination, whereas PRA was down-regulated in the uterus and oviduct after ethanol consumption.",6,PRB,melatonin,"[28, 31]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1]",4.0,11.0
3387,"BDZs and other positive GABA(A)R modulators, including barbiturates, ethanol, and neurosteroids, can also inhibit L-type voltage-gated calcium channels (L-VGCCs), which could contribute to reduced neuronal excitability.",6,GABA(A)R,BDZs,"[24, 32]","[0, 4]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",4.0,0.0
8654,"Conversely, ovarian PRA and PRB were positively regulated by ethanol and ethanol-melatonin combination, whereas PRA was down-regulated in the uterus and oviduct after ethanol consumption.",6,PRA,melatonin,"[20, 23]","[81, 90]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1]",2.0,11.0
13836,The recently discovered hyperinsulinism/hyperammonemia disorder showed that the loss of allosteric inhibition of GDH by GTP causes excessive secretion of insulin.,6,GDH,GTP,"[113, 116]","[120, 123]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",13.0,15.0
5378,The implications of our finding for mechanism of in vivo actions of rapamycin and for design of novel allosteric drugs targeting the proteasome are discussed.,6,proteasome,rapamycin,"[133, 143]","[68, 77]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",22.0,12.0
3606,Ifenprodil is an allosteric inhibitor of GluN1/GluN2B N-methyl-D-aspartate receptors.,6,GluN1,Ifenprodil,"[41, 46]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1]",6.0,0.0
15281,Synthesis and structure-activity relationships of indazole arylsulfonamides as allosteric CC-chemokine receptor 4 (CCR4) antagonists.,6,CC-chemokine receptor 4,indazole arylsulfonamides,"[90, 113]","[50, 75]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, 6, -1, -1, -1, -1, -1, -1, -1]",10.0,5.5
3381,Benzodiazepines (BDZs) depress neuronal excitability via positive allosteric modulation of inhibitory GABA(A) receptors (GABA(A)R).,6,GABA(A)R,BDZs,"[121, 129]","[17, 21]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, 6, 6, -1, -1, -1, -1, -1, -1, -1]",13.0,1.0


In [9]:
count = 0
for i in range(len(df2)):
    weak_label = df2.iloc[i]['weak_labels'][17]
    if weak_label != -1:
        print(i)
        count+=1
print('count', count)

1
3
9
29
33
36
38
40
41
49
53
count 11


In [10]:
L_train = applier.apply(df=df2)
L_train

100%|██████████| 55/55 [00:00<00:00, 22625.22it/s]


array([[-1],
       [ 6],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 6],
       [ 6],
       [-1],
       [-1],
       [ 6],
       [-1]])

In [11]:


# Cell to inspect the dev set for a given label 
label = 'Modulator' # Change this variable
pd.set_option('display.max_colwidth', 1000)


def get_18th_element(lst):
    return lst[17] if len(lst) > 17  else None

# Create a copy of the DataFrame
train_df_copy = train_df.copy()

# Apply the function to the copy
train_df_copy['18th_weak_label'] = train_df_copy['weak_labels'].apply(get_18th_element)

# Filter the copy
label6_all = train_df[train_df_copy['18th_weak_label'] == 6]

# Continue with your code
df3 = li.chemprot_enhanced(label6_all)
df3

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels,entity1_index,entity2_index
15523,This effect appeared to be due to both competition between S-nitrosocysteine and Prx1 for the Trx system and direct modulation by S-nitrosocysteine of Trx reductase activity.,1,Trx,S-nitrosocysteine,"[94, 97]","[59, 76]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",15.0,10.0
6532,"Thalidomide--removed from widespread clinical use by 1962 because of severe teratogenicity--has anti-angiogenic and immunomodulatory effects, including the inhibition of TNF alpha.",3,TNF alpha,Thalidomide,"[170, 179]","[0, 11]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",19.5,0.0
5676,Data suggest that bisphosphonates via modulation of the activity of small-GTPases induce apoptosis in neoplastic cells by DNA-CpG-demethylation and stimulation of FAS-expression.,2,FAS,bisphosphonates,"[163, 166]","[18, 33]","[-1, -1, -1, -1, -1, -1, -1, -1, 2, 2, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",21.0,3.0
3367,"BDZs and other positive GABA(A)R modulators, including barbiturates, ethanol, and neurosteroids, can also inhibit L-type voltage-gated calcium channels (L-VGCCs), which could contribute to reduced neuronal excitability.",3,L-VGCCs,neurosteroids,"[153, 160]","[82, 95]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",18.0,10.0
5005,"This study is the first to identify Class V CGPs with their distinctive methine or trimethine linkage between two disubstituted pyrylium moieties as a particularly potent class of MRP modulators and also show that within this core structure, differences in the electronegativity associated with a chalcogen atom can be the sole determinant of whether a compound will stimulate or inhibit MRP2.",1,MRP,methine,"[180, 183]","[72, 79]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",28.0,13.0
...,...,...,...,...,...,...,...,...,...
5116,"Taken together, our findings indicate that 5HHMF suppresses NO production through modulation of iNOS, consequently suppressing NF-κB activity and induction of Nrf2-dependent HO-1 activity.",3,NF-κB,NO,"[127, 132]","[60, 62]","[-1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, 8, -1, -1]",16.0,8.0
491,"There is, however, much information on the direct (acute and chronic) effects of alcohol on the binding properties of opioid receptors, as well as modulation of opioid peptide synthesis and secretion (e.g.",1,opioid peptide,alcohol,"[161, 175]","[81, 88]","[-1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",26.5,13.0
1225,"Raloxifene is a selective ER modulator with less uterine estrogen agonist activity than tamoxifen, and it is hoped that it will result in fewer uterine cancers but will be equally efficacious in reducing the risk of breast cancer.",1,ER,Raloxifene,"[26, 28]","[0, 10]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, 4, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",4.0,0.0
3384,"BDZs and other positive GABA(A)R modulators, including barbiturates, ethanol, and neurosteroids, can also inhibit L-type voltage-gated calcium channels (L-VGCCs), which could contribute to reduced neuronal excitability.",6,GABA(A)R,barbiturates,"[24, 32]","[55, 67]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1]",4.0,7.0


In [12]:
df3 = df3.head(30)
count = 0
for i in range(len(df3)):
    label = df3.iloc[i]['labels']
    if label == 6:
        print(i)
        count+=1
print('count', count)

5
11
count 2


In [13]:
L_train = applier.apply(df=df3)
count = 0
for i in range(len(L_train)):
    if L_train[i][0] != -1:
        print(i)
        count+=1
print('count', count)


100%|██████████| 30/30 [00:00<00:00, 23232.85it/s]

2
3
5
7
11
17
20
21
26
29
count 10





## Main Work

In [14]:
df2.to_csv(f"./dev_{label}.csv")
chemprot6 = li.df_to_chemprot(li.chemprot_enhanced(label6))
li.save_dataset(chemprot6, f"./dev_{label}.json")


In [15]:
# Cell to inspect dev set for a given keyword (to inspect conflicts). Note: this doesn't work with "+" 
keyword = 'iPad' # Change this word
pd.set_option('display.max_colwidth', 1000)
dev_df[dev_df['text'].str.contains(keyword)]

Unnamed: 0,text,labels,entity1,entity2,span1,span2,weak_labels


## Evaluation

In [16]:
L_train = apply_LFs(lfs, train_df)
L_valid = apply_LFs(lfs, valid_df)
L_test = apply_LFs(lfs, test_df)

print("Train Coverage:", calc_coverage(L_train))
print("Valid Coverage:", calc_coverage(L_valid))
print("Test Coverage:", calc_coverage(L_test))

lf_analysis = LFAnalysis(L=L_valid, lfs=lfs).lf_summary()

# Calculates how many of an LFs votes result in conflicts (helpful signal for debugging LFs)
lf_analysis['Conflict Ratio'] = lf_analysis['Conflicts'] / lf_analysis['Coverage']
lf_analysis

NameError: name 'apply_LFs' is not defined

In [None]:
# List LFs for which 'Conflict Ratio' is above some threshold (helpful for debugging)
lf_analysis[lf_analysis['Conflict Ratio'] > 0.8]['Conflict Ratio'].sort_values(ascending=False)

In [16]:
# Calculate accuracy on the validation set (Ideally do this only at the end)
majority_model = MajorityLabelVoter(10)
preds_valid = majority_model.predict(L)
(preds_valid[preds_valid != -1] == valid_df[preds_valid != -1].labels.values).mean()

NameError: name 'L_valid' is not defined

In [None]:
json.dump(keywords, open("amazon_LFs_v1.json", "w"), indent=8)

In [None]:
# Replace the LFs for a given dataset (in wrench format)
# dataset_name = "dbpedia"

# train_json = json.load(open(f"../weak_datasets/{dataset_name}/train.json", "r"))
# for idx in train_json:
#     train_json[idx]['weak_labels'] = [int(i) for i in list(L_train[int(idx)])]
    
# valid_json = json.load(open(f"../weak_datasets/{dataset_name}/valid.json", "r"))
# for idx in valid_json:
#     valid_json[idx]['weak_labels'] = [int(i) for i in list(L_valid[int(idx)])]
    
# test_json = json.load(open(f"../weak_datasets/{dataset_name}/test.json", "r"))
# for idx in test_json:
#     test_json[idx]['weak_labels'] = [int(i) for i in list(L_test[int(idx)])]

# json.dump(train_json, open(f"../weak_datasets/{dataset_name}/train.json", 'w'), indent=4)
# json.dump(valid_json, open(f"../weak_datasets/{dataset_name}/valid.json", 'w'), indent=4)
# json.dump(test_json, open(f"../weak_datasets/{dataset_name}/test.json", 'w'), indent=4)