In [94]:
import json
import pandas as pd
from snorkel.labeling import LabelingFunction, PandasLFApplier
ABSTAIN = -1
import random

# Keyword lexicon: coarse intent category -> substrings that vote for it.
# A "+" joins several substrings that must all occur in the utterance
# (see keyword_LF); spaces inside an entry are part of the substring matched.
keywords = {
    "alarm": ["alarm", "wake+up"],
    "audio": [" mute ", "volume", " loud", "quiet"],
    "iot": ["light", "wemo", "coffee"],
    "calendar": ["calendar", "schedule", "remind"],
    "play": ["play ", "podcast", "audiobook"],
    "general": ["good morning", "joke", "explain"],
    "datetime": ["date+today", "time+is", "date+is"],
    "takeaway": ["takeaway", "delivery", "order"],
    "news": ["news", "times", "headline"],
    "music": ["what+song", "save+song", "shuffle"],
    "weather": ["weather", "temperature", " rain", " snow"],
    "qa": ["stock", "what's", "define", "describe", "what is", "what+mean"],
    "social": ["message", "tweet", "twitter", "facebook", "complain", "status"],
    "recommendation": ["recommend", "suggest", "restaurant"],
    "cooking": ["recipe", "timer", "cook"],
    "transport": ["ticket", "train", "flight", "accident", "traffic"],
    "email": ["email", "inbox", "message+inbox", "message+email"],
    "lists": [" list", "create+list", "delete+list"],
}

# Coarse category -> the fine-grained label ids that category can stand for.
detailed_mapping = {
    "alarm": [0, 14, 15],
    "audio": [1, 23, 28, 26],
    "iot": [2, 3, 4, 5, 6, 27, 29, 30, 19],
    "calendar": [7, 38, 52],
    "play": [8, 32, 39, 40, 47],
    "general": [9, 10, 24],
    "datetime": [11, 12],
    "takeaway": [13, 20],
    "news": [16],
    "music": [17, 18, 22, 25],
    "weather": [21],
    "qa": [31, 36, 51, 58, 59],
    "social": [33, 41],
    "recommendation": [34, 37, 45],
    "cooking": [35, 50],
    "transport": [42, 44, 49, 57],
    "email": [43, 48, 54, 56],
    "lists": [46, 53, 55],
}

# One labeling function is created per keyword entry, so the weak-label
# vector needs exactly one slot per keyword across all categories.
number_of_weak_labels = sum(len(kw_list) for kw_list in keywords.values())

# Helper function for keyword LFs
def keyword_LF(x, keyword=None, labels=None):
    """Vote for one of ``labels`` when ``keyword`` occurs in the example text.

    Args:
        x: Row-like object; the text is read from ``x['data']['text']`` and
            lower-cased before matching.
        keyword: Substring to look for. If it contains "+", it is split on
            "+" and every resulting piece must occur in the text.
        labels: Candidate label ids; on a match one is drawn with
            ``random.choice``.

    Returns:
        A randomly chosen element of ``labels`` on a match, else ``ABSTAIN``.
    """
    utterance = x['data']['text'].lower()
    # A plain keyword is treated as a one-piece conjunction.
    required_parts = keyword.split("+") if "+" in keyword else [keyword]
    for part in required_parts:
        if part not in utterance:
            return ABSTAIN
    return random.choice(labels)

# Create labeling functions for each keyword
def create_keyword_LFs(keywords, detailed_mapping):
    """Build one ``LabelingFunction`` per keyword entry.

    Args:
        keywords: Mapping of category name -> list of keyword strings.
        detailed_mapping: Mapping of category name -> list of label ids; the
            list for a keyword's category is handed to ``keyword_LF`` as its
            ``labels`` resource.

    Returns:
        List of labeling functions named ``lf_<category>_<keyword>``, in the
        iteration order of ``keywords`` and of each keyword list.
    """
    built = []
    for category, terms in keywords.items():
        candidate_ids = detailed_mapping[category]
        built.extend(
            LabelingFunction(
                name=f"lf_{category}_{term}",
                f=keyword_LF,
                resources={'keyword': term, 'labels': candidate_ids},
            )
            for term in terms
        )
    return built

# Create labeling functions
lfs = create_keyword_LFs(keywords, detailed_mapping)

# Load datasets
dataset_name = "massive"
# Read both JSON files through context managers so the handles are closed as
# soon as parsing finishes instead of staying open for the kernel's lifetime.
with open(f"../weak_datasets/{dataset_name}/label.json") as label_file:
    idx_to_label = json.load(label_file)
# Inverse lookup: label name -> index key as stored in label.json.
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

with open(f"../weak_datasets/{dataset_name}/valid.json", "r") as valid_file:
    test_data = json.load(valid_file)

# Convert test data to DataFrame: one row per top-level key of valid.json
test_df = pd.DataFrame.from_dict(test_data, orient='index')

# Applies a set of LFs (functions) to a dataset (in df form)
def apply_LFs(lfs, dataset):
    """Apply the labeling functions ``lfs`` to the DataFrame ``dataset``.

    Returns whatever ``PandasLFApplier.apply`` produces; the rest of this
    cell iterates it row by row, one entry per labeling function.
    """
    return PandasLFApplier(lfs=lfs).apply(df=dataset)

# Generate weak labels for the test dataset.
# keyword_LF picks its label with random.choice, so without a fixed seed every
# run would write a different valid.json. Seed the global RNG right before
# applying the LFs to make the saved weak labels reproducible.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
L_test = apply_LFs(lfs, test_df)

# Convert the labeling matrix L_test to weak labels
def convert_to_weak_labels(L, num_labels):
    """Turn a label matrix into a list of plain-int weak-label vectors.

    Args:
        L: Iterable of rows; each row holds one vote per labeling function.
        num_labels: Length of every output vector.

    Returns:
        One list per row of ``L``, pre-filled with -1; position ``i`` is
        overwritten with ``int(vote)`` whenever the row's ``i``-th vote is
        not ``ABSTAIN``.
    """
    vectors = []
    for votes in L:
        dense = [-1] * num_labels
        # Lazily walk only the positions where a labeling function fired.
        fired = ((pos, vote) for pos, vote in enumerate(votes) if vote != ABSTAIN)
        for pos, vote in fired:
            dense[pos] = int(vote)
        vectors.append(dense)
    return vectors

# Attach one weak-label vector (one slot per labeling function) to every row
test_df['weak_labels'] = convert_to_weak_labels(L_test, number_of_weak_labels)

# Back to the {index: record} dictionary layout
new_test_data = test_df.to_dict(orient='index')

# Make sure every weak_labels entry is a list of built-in ints
for key, record in new_test_data.items():
    record['weak_labels'] = list(map(int, record['weak_labels']))

# Write the relabelled split under the new dataset name
dataset_name = 'massive_highcad'
with open(f"../weak_datasets/{dataset_name}/valid.json", "w") as f:
    json.dump(new_test_data, f, indent=2)

print("Updated dataset with new weak labels has been saved.")


  0%|          | 0/2033 [00:00<?, ?it/s]

100%|██████████| 2033/2033 [00:00<00:00, 6787.71it/s]


Updated dataset with new weak labels has been saved.


In [95]:
import label_improve as li

# NOTE: this cell relies on dataset_name still holding the value assigned by
# the cell that wrote the relabelled valid.json, so that cell must run first.
# The file is read through a context manager so the handle is closed once
# parsing finishes.
with open(f"../weak_datasets/{dataset_name}/valid.json", "r") as valid_file:
    test_df = li.massive_to_df(json.load(valid_file))

# NOTE(review): 60 is presumably the number of fine-grained classes
# (detailed_mapping spans ids 0-59) - confirm against
# li.analysis_LFs_with_weak_labels.
li.analysis_LFs_with_weak_labels(test_df, 60)

Test Coverage: 0.6222331529758977
acuracy for the not abstains
0.33395348837209304
acuracy for all
0.17658632562715199


In [6]:
import json
import pandas as pd
from snorkel.labeling import LabelingFunction, PandasLFApplier

ABSTAIN = -1

# Keyword lexicon: coarse intent category -> substrings that vote for it.
# A "+" joins several substrings that must all occur in the utterance
# (see _keyword_LF); spaces inside an entry are part of the substring matched.
keywords = {
    "alarm": ["alarm", "wake+up"],
    "audio": [" mute ", "volume", " loud", "quiet"],
    "iot": ["light", "wemo", "coffee"],
    "calendar": ["calendar", "schedule", "remind"],
    "play": ["play ", "podcast", "audiobook"],
    "general": ["good morning", "joke", "explain"],
    "datetime": ["date+today", "time+is", "date+is"],
    "takeaway": ["takeaway", "delivery", "order"],
    "news": ["news", "times", "headline"],
    "music": ["what+song", "save+song", "shuffle"],
    "weather": ["weather", "temperature", " rain", " snow"],
    "qa": ["stock", "what's", "define", "describe", "what is", "what+mean"],
    "social": ["message", "tweet", "twitter", "facebook", "complain", "status"],
    "recommendation": ["recommend", "suggest", "restaurant"],
    "cooking": ["recipe", "timer", "cook"],
    "transport": ["ticket", "train", "flight", "accident", "traffic"],
    "email": ["email", "inbox", "message+inbox", "message+email"],
    "lists": [" list", "create+list", "delete+list"],
}

# Coarse category -> the fine-grained label ids that category can stand for.
detailed_mapping = {
    "alarm": [0, 14, 15],
    "audio": [1, 23, 28, 26],
    "iot": [2, 3, 4, 5, 6, 27, 29, 30, 19],
    "calendar": [7, 38, 52],
    "play": [8, 32, 39, 40, 47],
    "general": [9, 10, 24],
    "datetime": [11, 12],
    "takeaway": [13, 20],
    "news": [16],
    "music": [17, 18, 22, 25],
    "weather": [21],
    "qa": [31, 36, 51, 58, 59],
    "social": [33, 41],
    "recommendation": [34, 37, 45],
    "cooking": [35, 50],
    "transport": [42, 44, 49, 57],
    "email": [43, 48, 54, 56],
    "lists": [46, 53, 55],
}

# One labeling function is created per (keyword, fine-grained label) pair, so
# each category contributes len(keywords) * len(label ids) slots.
number_of_weak_labels = sum(
    len(kw_list) * len(detailed_mapping[category])
    for category, kw_list in keywords.items()
)

# Helper function for converting an individual keyword into an LF
def _keyword_LF(x, keyword=None, label=None):
    """Vote ``label`` when ``keyword`` occurs in the example text.

    Args:
        x: Row-like object; the text is read from ``x['data']['text']`` and
            lower-cased before matching.
        keyword: Substring to look for. If it contains "+", it is split on
            "+" and every resulting piece must occur in the text.
        label: Value returned on a match.

    Returns:
        ``label`` on a match, otherwise ``ABSTAIN``.
    """
    utterance = x['data']['text'].lower()
    if "+" in keyword:
        matched = all(part in utterance for part in keyword.split("+"))
    else:
        matched = keyword in utterance
    if matched:
        return label
    return ABSTAIN

# Allows us to convert from a keyword_dict {class: [keyword list]} to a set of LFs 
def keywords_to_LFs(keyword_dict, detailed_mapping):
    """Build one ``LabelingFunction`` per (keyword, fine-grained label) pair.

    Args:
        keyword_dict: Mapping of category name -> list of keyword strings.
        detailed_mapping: Mapping of category name -> list of label ids; each
            keyword of a category is paired with every id of that category.

    Returns:
        List of labeling functions named ``lf_<keyword>_<label>``, ordered by
        category, then keyword, then label id.
    """
    labeling_functions = []
    for category, terms in keyword_dict.items():
        fine_ids = detailed_mapping[category]
        labeling_functions += [
            LabelingFunction(
                name=f"lf_{term}_{fine_id}",
                f=_keyword_LF,
                resources={'keyword': term, 'label': fine_id},
            )
            for term in terms
            for fine_id in fine_ids
        ]
    return labeling_functions

# Create labeling functions
lfs = keywords_to_LFs(keywords, detailed_mapping)

# Load datasets
dataset_name = "massive"
# Read both JSON files through context managers so the handles are closed as
# soon as parsing finishes instead of staying open for the kernel's lifetime.
with open(f"../weak_datasets/{dataset_name}/label.json") as label_file:
    idx_to_label = json.load(label_file)
# Inverse lookup: label name -> index key as stored in label.json.
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

with open(f"../weak_datasets/{dataset_name}/valid.json", "r") as valid_file:
    test_data = json.load(valid_file)

# Convert test data to DataFrame: one row per top-level key of valid.json
test_df = pd.DataFrame.from_dict(test_data, orient='index')

# Applies a set of LFs (functions) to a dataset (in df form)
def apply_LFs(lfs, dataset):
    """Apply the labeling functions ``lfs`` to the DataFrame ``dataset``.

    Returns whatever ``PandasLFApplier.apply`` produces; the rest of this
    cell iterates it row by row, one entry per labeling function.
    """
    return PandasLFApplier(lfs=lfs).apply(df=dataset)

# Run every (keyword, label) labeling function over the loaded split
L_test = apply_LFs(lfs=lfs, dataset=test_df)

# Convert the labeling matrix L_test to weak labels
def convert_to_weak_labels(L, num_labels):
    """Turn a label matrix into a list of plain-int weak-label vectors.

    Args:
        L: Iterable of rows; each row holds one vote per labeling function.
        num_labels: Length of every output vector.

    Returns:
        One list per row of ``L``, pre-filled with -1; position ``i`` is
        overwritten with ``int(vote)`` whenever the row's ``i``-th vote is
        not ``ABSTAIN``.
    """
    vectors = []
    for votes in L:
        dense = [-1] * num_labels
        # Lazily walk only the positions where a labeling function fired.
        fired = ((pos, vote) for pos, vote in enumerate(votes) if vote != ABSTAIN)
        for pos, vote in fired:
            dense[pos] = int(vote)
        vectors.append(dense)
    return vectors

# Attach one weak-label vector (one slot per labeling function) to every row
test_df['weak_labels'] = convert_to_weak_labels(L_test, number_of_weak_labels)

# Back to the {index: record} dictionary layout
new_test_data = test_df.to_dict(orient='index')

# Make sure every weak_labels entry is a list of built-in ints
for key, record in new_test_data.items():
    record['weak_labels'] = list(map(int, record['weak_labels']))

# NOTE(review): this name is spelled 'higcad2', while the other labeling cell
# writes to 'massive_highcad' - confirm the directory name is intended.
dataset_name = 'massive_higcad2'
# Write the relabelled split under the new dataset name
with open(f"../weak_datasets/{dataset_name}/valid.json", "w") as f:
    json.dump(new_test_data, f, indent=2)

print("Updated dataset with new weak labels has been saved.")


  0%|          | 0/2033 [00:00<?, ?it/s]

100%|██████████| 2033/2033 [00:00<00:00, 2064.00it/s]


Updated dataset with new weak labels has been saved.
