In [2]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
import re
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
%load_ext autoreload
%autoreload 2
import label_improve as li

In [4]:
# Load the label map and the train / valid / test splits for one dataset.
dataset_name = "amazon31"


def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector, and it
    is decoded as UTF-8 explicitly rather than with the platform default.
    """
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# JSON object keys are always strings, so the indices stored as values of
# `label_to_idx` are strings too; later cells cast them with int().
idx_to_label = _read_json(f"../weak_datasets/{dataset_name}/label.json")
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

valid_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/valid.json"))
train_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/train.json"))
test_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/test.json"))

# Sample a dev set to help seed ideas for LFs (fixed seed keeps it reproducible).
# NOTE(review): sample() raises ValueError if train_df has fewer than 250 rows.
dev_df = train_df.sample(250, random_state=123)

In [5]:
# Get rid of the "weak_labels" column.
# This cell overwrites its own input, so without errors="ignore" a second run
# would raise KeyError because the column is already gone. With it, the cell
# can be re-run safely and always yields the same frame.
dev_df = dev_df.drop(columns=["weak_labels"], errors="ignore")
dev_df.head()

Unnamed: 0,text,label
89824,Kensun HID Kit Computer Warning Canceller & An...,1
27188,Polaroid Lens Cap Strap Useful A useful access...,4
97610,Genluna Women Ladies Long Sleeve Embroidered C...,0
50672,Triple Pet EZ Dog Toothbrush Too large This to...,21
10886,Majestic MLB 'Team Dad' Mens T-shirt Sizing an...,24


In [6]:
# Save the dev sample to CSV. The file is named after `dataset_name` so that
# switching datasets cannot silently write another dataset's rows under the
# "amazon31" name.
# NOTE(review): assumes the ./llm directory already exists; to_csv does not
# create missing directories.
dev_df.to_csv(f"./llm/{dataset_name}.csv", index=False)

In [7]:
# Seed keywords per product category, later turned into keyword labeling functions.
# Keys are category names; a later cell looks each one up in `label_to_idx`, so
# they must match the label names loaded from label.json exactly.
# Every category lists five keywords, and many tokens are shared between
# categories (e.g. "br", "great", "love", "like"), so the same word ends up
# associated with several different classes.
# NOTE(review): "br", "quot", "ve" and "don" look like HTML / tokenisation residue
# ("<br />", "&quot;", "I've", "don't") rather than real category signals --
# confirm, and consider filtering them out before building LFs.
# NOTE(review): entry order presumably determines the LF index `j` reported by
# the analysis table; avoid reordering if those indices are referenced elsewhere.
keywords = {
    "Apparel": ["fit", "large", "love", "size", "small"],
    "Automotive": ["battery", "br", "clear", "lens", "window"],
    "Baby": ["baby", "car", "like", "seat", "use"],
    "Books": ["book", "interesting", "nix", "vampire", "years"],
    "Camera": ["batteries", "battery", "br", "canon", "sony"],
    "Electronics": ["ear", "great", "happy", "headphones", "works"],
    "Gift Card": ["amazon", "card", "com", "gift", "greeting"],
    "Grocery": ["br", "don", "good", "like", "ounce"],
    "Health & Personal Care": ["love", "nail", "product", "sweat", "work"],
    "Home": ["desk", "great", "nice", "perfect", "smell"],
    "Jewelry": ["earrings", "pretty", "really", "ring", "silver"],
    "Kitchen": ["br", "coffee", "fact", "great", "vitamix"],
    "Lawn and Garden": ["br", "did", "sit", "smoker", "use"],
    "Luggage": ["bag", "card", "handle", "luggage", "price"],
    "Major Appliances": ["br", "buy", "fridge", "kitchen", "months"],
    "Mobile_Apps": ["app", "game", "play", "sports", "time"],
    "Music": ["great", "love", "music", "quot", "song"],
    "Musical Instruments": ["banjo", "good", "guitar", "headphones", "play"],
    "Office Products": ["address", "br", "cw", "labels", "ve"],
    "Outdoors": ["bottle", "great", "hat", "love", "water"],
    "PC": ["br", "great", "old", "screen", "tablet"],
    "Pet Products": ["bed", "dog", "great", "pet", "treats"],
    "Shoes": ["boots", "fit", "kenzie", "pair", "shoes"],
    "Software": ["br", "deluxe", "quicken", "software", "turbotax"],
    "Sports": ["good", "great", "like", "stars", "work"],
    "Tools": ["br", "great", "knife", "quality", "sharp"],
    "Toys": ["doll", "fun", "good", "loves", "product"],
    "Video": ["br", "good", "guys", "vhs", "workout"],
    "Video Games": ["card", "download", "game", "great", "xbox"],
    "Watches": ["br", "case", "like", "watch", "watches"],
    "Wireless": ["advertised", "case", "iphone", "item", "works"]
}


In [8]:
# Baseline: evaluate the weak labels that ship with the dataset on the test split.
# Per the original author's note, the second argument (a class count) needs to be
# higher than the number of classes in the dataset; 77 is used here (77 > 31).
# NOTE(review): how the helper uses this value is not visible from this notebook --
# confirm against label_improve before changing it.
li.analysis_LFs_with_weak_labels(test_df,77)

Test Coverage: 0.634409837949661
acuracy for the not abstains
0.7047542489664677
acuracy for all
0.35266061372256063


In [9]:
# Re-key the keyword lists by integer class index. `label_to_idx` stores the
# indices as values that need an int() cast before use as class ids.
keywords_by_idx = {
    int(label_to_idx[class_name]): class_keywords
    for class_name, class_keywords in keywords.items()
}

# Build the keyword labeling functions, then score them on the test split.
# The analysis call stays last so its returned table is rendered as the cell output.
lfs = li.keywords_to_LFs(keywords_by_idx)
li.analysis_LFs(lfs, test_df, 77)

100%|██████████| 17402/17402 [00:10<00:00, 1638.94it/s]


Test Coverage: 0.9963222618089874
acuracy for the not abstains
0.2599450311008245
acuracy for all
0.10326399264452361


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.,Conflict Ratio
lf_fit,0,[0],0.101655,0.101655,0.101655,188,1581,0.106275,1.000000
lf_large,1,[0],0.046776,0.046776,0.046719,62,752,0.076167,0.998771
lf_love,2,[0],0.198770,0.198770,0.198770,143,3316,0.041341,1.000000
lf_size,3,[0],0.072808,0.072808,0.072750,141,1126,0.111287,0.999211
lf_small,4,[0],0.070796,0.070509,0.070509,81,1151,0.065747,0.995942
...,...,...,...,...,...,...,...,...,...
lf_advertised,150,[30],0.009769,0.009769,0.009769,2,168,0.011765,1.000000
lf_case_2,151,[30],0.053442,0.053442,0.053442,209,721,0.224731,1.000000
lf_iphone,152,[30],0.011838,0.011838,0.011838,127,79,0.616505,1.000000
lf_item,153,[30],0.040283,0.040225,0.040225,37,664,0.052782,0.998573
