In [10]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
import re
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import warnings
# Suppress every FutureWarning raised from here on so that deprecation notices
# do not clutter the cell outputs below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [11]:
# Load IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2
# Helper module used throughout this notebook under the alias `li`
# (data loading, LF construction and analysis).
import label_improve as li

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Load the label names and the train/valid/test splits of the dataset.
dataset_name = "chemprot2.1"
dataset_dir = f"../weak_datasets/{dataset_name}"


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left for the garbage collector.
    """
    with open(path, "r") as fh:
        return json.load(fh)


# label.json maps label index -> label name; JSON object keys are strings,
# so the indices in both dictionaries below are strings, not ints.
idx_to_label = _load_json(f"{dataset_dir}/label.json")
label_to_idx = {l: i for i, l in idx_to_label.items()}

# Each split is converted to a DataFrame by the project helper.
valid_df = li.chemprot_to_df(_load_json(f"{dataset_dir}/valid.json"))
train_df = li.chemprot_to_df(_load_json(f"{dataset_dir}/train.json"))
test_df = li.chemprot_to_df(_load_json(f"{dataset_dir}/test.json"))

# Sample a dev set to help seed ideas for LFs (fixed seed for reproducibility)
dev_df = train_df.sample(250, random_state=123)

In [19]:
# Human-readable name for every label index (keys are the stringified index).
index_to_label = {
    "0": "Part of",
    "1": "Regulator",
    "2": "Upregulator",
    "3": "Downregulator",
    "4": "Agonist",
    "5": "Antagonist",
    "6": "Modulator",
    "7": "Cofactor",
    "8": "Substrate/Product",
    "9": "NOT"
}

# For each label in train_df draw 10 examples (fixed seed) and create a new
# DataFrame from them with a fresh 0..n-1 index.
train_df_sampled = pd.concat(
    [
        train_df[train_df["label"] == label_id].sample(10, random_state=123)
        for label_id in range(len(index_to_label))
    ]
).reset_index(drop=True)

# Replace the integer label of the sampled rows with its name, using the
# dictionary defined above (previously that dictionary was defined but never
# used). An unknown label raises KeyError rather than passing silently.
train_df_sampled["label"] = train_df_sampled["label"].apply(
    lambda label_id: index_to_label[str(label_id)]
)

# Store the sampled data
train_df_sampled.to_csv("./llm/chemprot_examples.csv", index=False)


In [13]:
# Get rid of the "weak_labels" column.
# errors="ignore" makes this cell safe to re-run: dev_df is reassigned here, so
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
dev_df = dev_df.drop(columns=["weak_labels"], errors="ignore")
dev_df.head()


Unnamed: 0,text,label,entity1,entity2,span1,span2
10408,"Unlike OFQ II(1-17), high concentrations of it...",1,mu (mu) opioid receptor,naloxone,"[124, 147]","[196, 204]"
12234,Photolytic release of free alanine results in ...,1,ASCT2,alanine,"[136, 141]","[27, 34]"
14360,Our results showed that bone remodeling was si...,5,CCR1,Met,"[166, 170]","[128, 131]"
10387,Experimental evidence from the use of agents w...,1,BuChE,MF-8622,"[75, 80]","[103, 110]"
2750,"PURPOSE: Dasatinib (BMS-354825), a potent oral...",3,ABL,BMS-354825,"[99, 102]","[20, 30]"


In [14]:
# Write the dev sample to CSV, leaving out the DataFrame index
dev_df.to_csv("./llm/chemprot.csv", index=False)

In [6]:
# Candidate keywords for each relation label, keyed by label name.
# The same keyword may be listed under several labels (e.g. "receptor"),
# and "Modulator" has no keywords at all.
keywords = {
    "Part of": [
        "binding", "domain", "terminal", "amino", "protein", "sequence",
        "receptor", "peptide", "phe", "pro",
    ],
    "Regulator": [
        "binding", "receptor", "cells", "cell", "inhibitor", "induced",
        "both", "acid", "expression", "factor",
    ],
    "Upregulator": [
        "induced", "expression", "activation", "cells", "activity",
        "phosphorylation", "dose", "increased", "alpha", "receptor",
    ],
    "Downregulator": [
        "inhibitor", "acid", "induced", "cells", "inhibition", "kinase",
        "activity", "inhibitors", "factor", "against",
    ],
    "Agonist": [
        "adrenoceptor", "agonist", "alpha", "selective", "antagonist",
        "against", "noradrenaline", "pkb", "activity", "example",
    ],
    "Antagonist": [
        "antagonist", "receptor", "agonist", "treatment", "selective",
        "antagonists", "effects", "cox", "aryl", "urinary",
    ],
    "Modulator": [],
    "Cofactor": [
        "plp", "oms", "bond", "activation", "methylmalonyl", "coa",
        "mutase", "stabilization", "post", "homolysis",
    ],
    "Substrate/Product": [
        "cells", "enzyme", "acid", "membrane", "such", "enzymes",
        "aminopeptidase", "suggest", "well", "human",
    ],
    "NOT": [
        "levels", "both", "protein", "receptors", "did", "agonist",
        "nor", "receptor", "inactive", "micelles",
    ],
}


In [7]:
# Report coverage and accuracy of the weak labels on the test split.
# NOTE(review): the second argument (10) presumably is the number of label
# classes -- confirm against label_improve.
li.analysis_LFs_with_weak_labels(test_df, 10)

Test Coverage: 0.8214063472308649
acuracy for the not abstains
0.7014270032930845
acuracy for all
0.39763534536403233


In [9]:
# Re-key the keyword lists by integer label index instead of label name;
# int() is applied because the indices are read from label_to_idx.
keywords_by_idx = {
    int(label_to_idx[label_name]): label_keywords
    for label_name, label_keywords in keywords.items()
}
# Turn the keyword lists into labeling functions.
# NOTE(review): presumably one LF per (label, keyword) pair -- confirm in
# label_improve.
lfs = li.keywords_to_LFs(keywords_by_idx)
# Evaluate the labeling functions on the dev sample; the trailing 10 is
# presumably the number of label classes -- TODO confirm.
li.analysis_LFs(lfs, dev_df, 10)

  0%|          | 0/250 [00:00<?, ?it/s]

100%|██████████| 250/250 [00:00<00:00, 2851.55it/s]

Test Coverage: 0.956
acuracy for the not abstains
0.5608108108108109
acuracy for all
0.332





Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.,Conflict Ratio
lf_binding,0,[0],0.100,0.100,0.100,6,19,0.240000,1.000000
lf_domain,1,[0],0.028,0.028,0.028,5,2,0.714286,1.000000
lf_terminal,2,[0],0.036,0.036,0.036,6,3,0.666667,1.000000
lf_amino,3,[0],0.048,0.048,0.048,4,8,0.333333,1.000000
lf_protein,4,[0],0.096,0.096,0.096,6,18,0.250000,1.000000
...,...,...,...,...,...,...,...,...,...
lf_agonist_3,85,[9],0.120,0.120,0.120,2,28,0.066667,1.000000
lf_nor,86,[9],0.036,0.028,0.024,2,7,0.222222,0.666667
lf_receptor_5,87,[9],0.156,0.156,0.156,2,37,0.051282,1.000000
lf_inactive,88,[9],0.012,0.012,0.012,2,1,0.666667,1.000000
