In [None]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd

: 

In [221]:
# Helper Functions

# This converts a wrench format dataset to pandas df, makes it easier to apply Snorkel library
def wrench_to_df(dataset):
    """Convert a wrench-format dataset dict into a pandas DataFrame.

    Parameters
    ----------
    dataset : dict
        Mapping from a string example id to a record that holds the raw text
        under ``record['data']['text']`` and the gold label under
        ``record['label']``.

    Returns
    -------
    pandas.DataFrame
        One row per example with columns ``text`` and ``labels``, indexed by
        the integer value of each example id.
    """
    # Walk the keys once so that index, text and label always come from the
    # same record, even when ids are not stored in 0..n-1 order or have gaps.
    keys = list(dataset.keys())
    indices = [int(k) for k in keys]
    text = [dataset[k]['data']['text'] for k in keys]
    labels = [dataset[k]['label'] for k in keys]
    return pd.DataFrame(data={'text': text, 'labels': labels}, index=indices)
    
# Calculates coverage given a label matrix
def calc_coverage(L):
    """Return the fraction of rows of ``L`` that received at least one vote.

    A row counts as covered when its largest entry is greater than -1,
    i.e. when not every labeling function abstained on it.
    """
    strongest_vote = L.max(axis=1)
    is_covered = strongest_vote > -1
    return is_covered.mean()

# Applies a set of LFs (functions) to a dataset (in df form)
def apply_LFs(lfs, dataset):
    """Run every labeling function in ``lfs`` over the DataFrame ``dataset``.

    Builds a ``PandasLFApplier`` from ``lfs`` and returns whatever its
    ``apply`` call produces for ``dataset``.
    """
    return PandasLFApplier(lfs=lfs).apply(df=dataset)

# Helper function for converting an individual keyowrd into an LF
def _keyword_LF(x, keyword=None, label=None):
    # The + allows for conjunctions of keyword LFs
    if "+" in keyword:
        keywords = keyword.split("+")
        return label if all([k in x.text.lower() for k in keywords]) else -1 
    else:
        return label if keyword in x.text.lower() else -1

# Allows us to convert from a keyword_dict {class: [keyword list]} to a set of LFs
def keywords_to_LFs(keyword_dict):
    """Build one ``LabelingFunction`` per (class, keyword) pair.

    Each function is named ``lf_<keyword>`` and wraps ``_keyword_LF`` with the
    keyword and its class passed in as resources. The result follows the
    iteration order of ``keyword_dict`` and of each keyword list.
    """
    return [
        LabelingFunction(name=f"lf_{kw}", f=_keyword_LF, resources={'keyword':kw, 'label':cls})
        for cls, kw_list in keyword_dict.items()
        for kw in kw_list
    ]

In [222]:
# Loading the data
dataset_name = "amazon31"


def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path, "r") as f:
        return json.load(f)


# label.json maps a class index (as a string) to its class name; invert it so
# that class names can be turned back into indices.
idx_to_label = _load_json(f"datasets/{dataset_name}/label.json")
label_to_idx = {l:i for i,l in idx_to_label.items()}
valid_df = wrench_to_df(_load_json(f"datasets/{dataset_name}/valid.json"))
train_df = wrench_to_df(_load_json(f"datasets/{dataset_name}/train.json"))
test_df = wrench_to_df(_load_json(f"datasets/{dataset_name}/test.json"))

# Sample a dev set to help seed ideas for LFs (fixed seed keeps the sample reproducible)
dev_df = train_df.sample(250, random_state=123)


## Main Work

In [234]:
### MAIN WORKZONE: To write LFs, simply change elements in the keywords dict ###
# Structure: {class name: [keyword, ...]}. Every class name is looked up in
# label_to_idx below, so it must be spelled exactly as in label.json.
# Each keyword becomes one labeling function that votes for its class when the
# keyword is a substring of the lower-cased text:
#   * "a+b" is a conjunction: both "a" and "b" must occur somewhere in the text.
#   * Leading/trailing spaces (e.g. ' cat ') are part of the substring, so they
#     keep the keyword from matching inside longer words.
# NOTE(review): keywords are compared with lower-cased text, so prefer writing
#   them in lower case; '[Download]' under 'Software' contains capitals.
# NOTE(review): 'egg+craker' under 'Kitchen' looks like a typo for 'egg+cracker' - confirm.
keywords = {   
'Apparel':['blouse', 'shirt', 'scarf', 'jacket', 'underwear', 'rib tank', 'strapdowns', 'pants', 'sleeveless', 'apparel', 'poncho'],
'Automotive':['honda', 'suv', 'rv', 'clearance light', 'headlight', 'fuel injector', 'tire'],
'Baby':['hands free gate', 'baby', 'drying+rack', 'motions glider', 'musical+snail'],
'Books':['book', 'workbook', 'writing', 'story'],
'Camera':['camera','lens cap', 'photographer', 'air blaster', 'reflector', 'camera+battery'],
'Electronics':['earphone', 'headphone', 'portable battery', ' laptop ', 'ear+bud'],
'Gift Card':['gift+card'],
'Grocery':['apple+sauce', 'sandwich', 'vanilla', 'gluten+free','cookies', 'coconut+oil', 'gummi', 'candy', 'grocery', 'chocolate', 'truffle', 'rocher', 'ferrero', 'cheddar'],
'Health & Personal Care':['stop+snore', 'snore+less','nail+paint', 'massage+sheet', 'conditioner', 'waist trimmer', 'skin care', 
                          'acne treatment', 'toners', 'face+lotion', 'serums', 'drug store'],
'Home':[' desk ', 'room+divider', 'shoji screen', 'mattress', 'tool kit', 'cutting machine', 'scissor', 'duralast+cartridge'],
'Jewelry':['bracelet', 'cross+pin', ' ring ', 'gemstone', 'earring', 'piercing', 'necklace', 'jewelry', 'wedding+band', 'crystals', 'chain+men'],
'Kitchen':['egg+craker', 'egg separator', 'spoon', 'vitamix', 'blender', 'coffee+filter', 'lunch bag', 'food+container', 'picnic tote', 'salt+shaker', 'pepper+shaker', 'tong','kitchen'], 
'Lawn and Garden':['cooling+umbrella', 'generator', 'garden', 'birdhouse', 'smoker', 'ladybug', 'metal detector'],
'Luggage':['backpack', 'card+holder', 'luggage', 'laptop+case', 'satchel', 'suitcase'],
'Major Appliances':['beer+tap', 'refrigerator', 'freezer', 'dryer', 'heater', 'dishwasher', 'bake element', 'range hood', 'water filter', 'microwave'],
'Mobile_Apps':[' app '], # needs more work, many games
'Music':['album', 'song', ' cd ', ' music ', 'pop+classic'],
'Musical Instruments':['jaw harp', 'jew\'s harp', 'guitar', 'bass', 'headphone+studio', ' capo ', 'guitar+picks'],
'Office Products':['ink+cartridge', 'printer', 'cartridge+replacement', 'ink+replacement', 'paper+card', 'stapler', 'address labels', 'folders',
                    'mouse pad', ' pens ', 'pencils',  'cardcase', 'cordless+phone'],
'Outdoors':['cooler', 'water+bottle', 'bike', 'bicycle', ' hat '],
'PC':['wi-fi', 'tablet', 'laptop+battery', 'wireless+adapter', 'keyboard', 'desktop', 'laptop stand'],
'Pet Products':[' cat ', ' dog', 'pet', 'bird+cage', 'cage', 'puppy' ],
'Shoes':['basketball+shoe', 'sandal' , ' boot', 'shoe', 'sneaker'],
'Software':['software', '[Download]', 'internet+security', 'virus', 'turbo+tax', 'video+conversion'],
'Sports':['t-shirt', 'workout', 'cleat', 'stepper', 'fly rod', 'fly fishing', 'athletic', 'basketball'],
'Tools':['rolling workshop','punch', 'tape measure', 'air+drain', 'fold+knife' , 'wrench', 'kniv+tool',
         'storage box', 'battery pack', 'metal marker', 'plier', 'flashlight', 'knife', 'miter saw'],
'Toys':['bop bag', 'toy+car', 'car+1:', 'toy', 'doll', 'bunny', 'tent+kid', 'tent+son', 'tent+daughter', 'ziggle', 'card game', 'saucer chair'],  # sometimes overlap with baby
'Video':['[vhs]', 'movie', 'film', 'episode', ' dvd ', 'video '],
'Video Games':['game', 'playstation', 'xbox', 'nintendo'],
'Watches':['watch ', 'watches ', 'wrist+watch', 'military+watch', 'sport+watch', 'strap', 'bracelet+watch'],
'Wireless':['electronic+case', 'ipod', 'ipad', 'iphone']
}
# Re-key the dict by integer class index (label_to_idx values are converted with int()).
keywords_by_idx = {int(label_to_idx[k]):v for k,v in keywords.items()}
# One labeling function per (class index, keyword) pair.
lfs = keywords_to_LFs(keywords_by_idx)

In [227]:
# Show every dev-set row whose gold label is the chosen class
label = 'PC' # Change this variable
pd.set_option('display.max_colwidth', 1000)
label_mask = dev_df['labels'] == int(label_to_idx[label])
dev_df.loc[label_mask]

Unnamed: 0,text,labels
48112,"Fire HD 7, 7"" HD Display, Wi-Fi, 8 GB I love it. I am 83yrs old and find the ... I love it. I am 83yrs old and find the directions very easy to follow. Sorry I didn't get sooner. Thanks for a great product.<br />,",20
73465,Dragon Touch M10X 10.1-Inch 16GB Tablet Dragon touch m10x tablet Had problems with it in the beginning but I'm starting to really like this tablet and it's working good now,20
42750,"Lenovo IdeaTab K3 Lynx 11.6-Inch 64 GB Tablet Smudgy screen brings it shy of 5 stars.... I'm only going to review the tablet, and not Windows 8, Office, etc...the os and program behavior on the tablet is identical to ANY machine running full Windows 8, so no need for ANOTHER Windows 8 review!<br />.<br />So far I'm not bothered by any so-called &#34;flex&#34; others are talking about. Yes, it's plastic and therefore plasticity, but it feels firm enough to me. I'm willing to sacrifice ultra firmness (like on the iPad) for something as light as this.<br /><br />The screen is bigger than the biggest iPad, which my wife has, but i can hold and read a digital magazine on the Lynx in portrait mode with one hand and not feel the strain like I do with the iPad.<br /><br />That being said, the screen IS an awkward size: in portrait mode, the magazines don't take up the full screen, leaving empty black space at the top and bottom.<br /><br />I put in a 64gb micro SDXC card the other day, and...",20
118024,Rain Design Five Stars It is Awsome !!,20
6864,Generic 2 X Mini USB Bluetooth V2.0 Dongle Wireless Adapter (Bluetoothx2) Good bargain why spend 10 time the price wholesale. Works ok on my windows 7 netbook. But on my old Vista desktop it only receives 1 file at a time. On my old Vista laptop forget using it. It causes the computer to freeze and crash. For the price not bad.,20
111199,"Anker 5200mAh/56Wh Laptop Battery for Toshiba PA3817U-1BRS PA3819U-1BRS Toshiba Satellite C655 L600 L675 L675D L700 L745 L750 L750D L755 L755D M640 M645 P745 Series Pretty good! Finally getting to 100% battery charge after a few tries. Get´s charged way too slow, takes many more hours than publicized. After sale service so far great, even offered me to exchange it, but don´t think I need to.",20
22391,SANOXY Android Keyboard Case Stand To big for my tablet It was too big and the tablet kept sliding out of it and I had to go purchase another one.,20
81677,AMD Athlon 64 X2 4850e 2.5GHz 2x512KB Socket AM2 Dual-Core CPU Great upgrade for Inspiron 531 I upgraded my old desktop from a AMD sempron 2.2ghz single core to this baby and it was a huge difference performance. Great for multitasking a gaming. You don't need a fancy quad core unless u want a constant 60fps. This is all the power you need as it never goes above 70% usage while in games like GTA 4. This is a great product and the seller was fantastic and cooperative.,20


In [228]:
# Cell to inspect dev set for a given keyword (to inspect conflicts).
# Matching mirrors the keyword LFs: case-insensitive, literal substrings (no
# regex, so '[vhs]' or 'car+1:' are safe), and "a+b" requires both parts.
keyword = 'iPad' # Change this word
pd.set_option('display.max_colwidth', 1000)
dev_text_lower = dev_df['text'].str.lower()
keyword_mask = pd.Series(True, index=dev_df.index)
for keyword_part in keyword.lower().split('+'):
    # regex=False: treat the part as plain text instead of a regular expression
    keyword_mask &= dev_text_lower.str.contains(keyword_part, regex=False)
dev_df[keyword_mask]

Unnamed: 0,text,labels
42750,"Lenovo IdeaTab K3 Lynx 11.6-Inch 64 GB Tablet Smudgy screen brings it shy of 5 stars.... I'm only going to review the tablet, and not Windows 8, Office, etc...the os and program behavior on the tablet is identical to ANY machine running full Windows 8, so no need for ANOTHER Windows 8 review!<br />.<br />So far I'm not bothered by any so-called &#34;flex&#34; others are talking about. Yes, it's plastic and therefore plasticity, but it feels firm enough to me. I'm willing to sacrifice ultra firmness (like on the iPad) for something as light as this.<br /><br />The screen is bigger than the biggest iPad, which my wife has, but i can hold and read a digital magazine on the Lynx in portrait mode with one hand and not feel the strain like I do with the iPad.<br /><br />That being said, the screen IS an awkward size: in portrait mode, the magazines don't take up the full screen, leaving empty black space at the top and bottom.<br /><br />I put in a 64gb micro SDXC card the other day, and...",20


## Evaluation

In [235]:
# Build one label matrix per split (rows = examples, columns = LFs)
L_train, L_valid, L_test = [
    apply_LFs(lfs, split_df) for split_df in (train_df, valid_df, test_df)
]

# Share of examples in each split that got at least one LF vote
for caption, label_matrix in (
    ("Train Coverage:", L_train),
    ("Valid Coverage:", L_valid),
    ("Test Coverage:", L_test),
):
    print(caption, calc_coverage(label_matrix))

# Per-LF summary on the validation split, plus the share of each LF's votes
# that end up in a conflict (helpful signal for debugging LFs)
lf_analysis = LFAnalysis(L=L_valid, lfs=lfs).lf_summary()
conflict_ratio = lf_analysis['Conflicts'] / lf_analysis['Coverage']
lf_analysis['Conflict Ratio'] = conflict_ratio
lf_analysis

100%|██████████| 131781/131781 [02:54<00:00, 755.05it/s]
100%|██████████| 5805/5805 [00:07<00:00, 754.68it/s]
100%|██████████| 17402/17402 [00:22<00:00, 756.63it/s]


Train Coverage: 0.6395003832115403
Valid Coverage: 0.6396210163652024
Test Coverage: 0.6375703942075623


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Conflict Ratio
lf_blouse,0,[0],0.001206,0.000861,0.000517,0.428571
lf_shirt,1,[0],0.009991,0.006202,0.005857,0.586207
lf_scarf,2,[0],0.001378,0.000689,0.000689,0.500000
lf_jacket,3,[0],0.004479,0.002239,0.002239,0.500000
lf_underwear,4,[0],0.001206,0.000517,0.000345,0.285714
...,...,...,...,...,...,...
lf_bracelet+watch,225,[29],0.003101,0.003101,0.003101,1.000000
lf_electronic+case,226,[30],0.001206,0.001034,0.001034,0.857143
lf_ipod,227,[30],0.012231,0.008786,0.007407,0.605634
lf_ipad,228,[30],0.008613,0.006546,0.005168,0.600000


In [236]:
# LFs whose 'Conflict Ratio' exceeds the threshold, worst first (helpful for debugging)
high_conflict = lf_analysis['Conflict Ratio'] > 0.8
lf_analysis.loc[high_conflict, 'Conflict Ratio'].sort_values(ascending=False)

lf_poncho              1.000000
lf_portable battery    1.000000
lf_bracelet+watch      1.000000
lf_card game           1.000000
lf_tent+daughter       1.000000
lf_car+1:              1.000000
lf_athletic            1.000000
lf_t-shirt             1.000000
lf_video+conversion    1.000000
lf_basketball+shoe     1.000000
lf_cage                1.000000
lf_laptop stand        1.000000
lf_folders             1.000000
lf_headphone+studio    1.000000
lf_pop+classic         1.000000
lf_toners              1.000000
lf_acne treatment      1.000000
lf_skin care           1.000000
lf_waist trimmer       1.000000
lf_nail+paint          1.000000
lf_gummi               1.000000
lf_laptop+battery      0.909091
lf_punch               0.875000
lf_air+drain           0.875000
lf_electronic+case     0.857143
lf_tent+son            0.842105
lf_laptop+case         0.833333
Name: Conflict Ratio, dtype: float64

In [237]:
# Calculate accuracy on the validation set (Ideally do this only at the end)
# Take the number of classes from the loaded label map instead of hard-coding
# 31, so the cell stays correct when dataset_name is changed.
majority_model = MajorityLabelVoter(cardinality=len(idx_to_label))
preds_valid = majority_model.predict(L=L_valid)
# Accuracy is measured only on examples where the vote did not abstain (-1).
voted = preds_valid != -1
(preds_valid[voted] == valid_df.labels.values[voted]).mean()

0.6649519890260631

In [238]:
# Persist the keyword dict; the with-block closes the file so the JSON is
# fully flushed to disk before the cell finishes.
with open("amazon_LFs_v1.json", "w") as keywords_file:
    json.dump(keywords, keywords_file, indent=8)

In [None]:
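# Overwrite the 'weak_labels' stored in a dataset (in wrench format) with the LF votes computed above.
# Left commented out because it rewrites the train/valid/test JSON files in place.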
# dataset_name = "dbpedia"

# train_json = json.load(open(f"datasets/{dataset_name}/train.json", "r"))
# for idx in train_json:
#     train_json[idx]['weak_labels'] = [int(i) for i in list(L_train[int(idx)])]
    
# valid_json = json.load(open(f"datasets/{dataset_name}/valid.json", "r"))
# for idx in valid_json:
#     valid_json[idx]['weak_labels'] = [int(i) for i in list(L_valid[int(idx)])]
    
# test_json = json.load(open(f"datasets/{dataset_name}/test.json", "r"))
# for idx in test_json:
#     test_json[idx]['weak_labels'] = [int(i) for i in list(L_test[int(idx)])]

# json.dump(train_json, open(f"datasets/{dataset_name}/train.json", 'w'), indent=4)
# json.dump(valid_json, open(f"datasets/{dataset_name}/valid.json", 'w'), indent=4)
# json.dump(test_json, open(f"datasets/{dataset_name}/test.json", 'w'), indent=4)