In [1]:
import json
import os
import warnings

import numpy as np
import pandas as pd
import snorkel
from googletrans import Translator
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
%load_ext autoreload
%autoreload 2
import label_improve as li

In [3]:
# Keyword table: intent name -> list of trigger strings.
# The intent names are looked up in label_to_idx further down, so they must match the
# label names of the dataset. Leading/trailing spaces inside a keyword are part of the
# keyword and must be kept as written.
# NOTE(review): "a+b" entries presumably require both terms to be present; the exact
# matching rule lives in li.keywords_to_LFs - confirm there.
keywords = {
    "alarm": ["alarm", "wake+up"],
    "audio": [" mute ", "volume", " loud", "quiet"],
    "iot": ["light", "wemo", "coffee"],
    "calendar": ["calendar", "schedule", "remind"],
    "play": ["play ", "podcast", "audiobook"],
    "general": ["good morning", "joke", "explain"],
    "datetime": ["date+today", "time+is", "date+is"],
    "takeaway": ["takeaway", "delivery", "order"],
    "news": ["news", "times", "headline"],
    "music": ["what+song", "save+song", "shuffle"],
    "weather": ["weather", "temperature", " rain", " snow"],
    "qa": ["stock", "what's", "define", "describe", "what is", "what+mean"],
    "social": ["message", "tweet", "twitter", "facebook", "complain", "status"],
    "recommendation": ["recommend", "suggest", "restaurant"],
    "cooking": ["recipe", "timer", "cook"],
    "transport": ["ticket", "train", "flight", "accident", "traffic"],
    "email": ["email", "inbox", "message+inbox", "message+email"],
    "lists": [" list", "create+list", "delete+list"],
}

In [50]:
# Loading the data
dataset_name = "massive-en"
dataset_dir = f"../weak_datasets/{dataset_name}"

# Each file is opened in a context manager so its handle is closed as soon as the JSON is
# parsed; UTF-8 is requested explicitly so the read does not depend on the platform default.
with open(f"{dataset_dir}/label.json", encoding="utf-8") as label_file:
    idx_to_label = json.load(label_file)
label_to_idx = {l:i for i,l in idx_to_label.items()}

# Same load order as before: valid, train, test.
en_splits = {}
for split_name in ("valid", "train", "test"):
    with open(f"{dataset_dir}/{split_name}.json", "r", encoding="utf-8") as split_file:
        en_splits[split_name] = li.massive_to_df(json.load(split_file))
valid_df = en_splits["valid"]
train_df = en_splits["train"]
test_df = en_splits["test"]

# Sample a dev set to help seed ideas for LFs
dev_df = train_df.sample(250, random_state=123)

In [5]:
# Rebind dev_df to the full test split: from here on, everything computed "on dev_df"
# (label matrix, LF summary, majority-vote accuracy) is computed on test_df, and the
# 250-row train sample drawn in the loading cell is no longer used.
dev_df = test_df

In [6]:
# Re-key the keyword table by integer label id (label_to_idx values are cast with int()),
# then let the helper module turn it into labeling functions.
keywords_by_idx = {}
for intent_name, intent_keywords in keywords.items():
    keywords_by_idx[int(label_to_idx[intent_name])] = intent_keywords
lfs = li.keywords_to_LFs(keywords_by_idx)

In [7]:
# Apply every labeling function to dev_df. L_dev is used below as the label matrix for
# LFAnalysis and MajorityLabelVoter.
L_dev = li.apply_LFs(lfs, dev_df)

100%|██████████| 1652/1652 [00:00<00:00, 3578.18it/s]


In [8]:
print("Test Coverage:", li.calc_coverage(L_dev))
# The gold labels must come from the same frame the label matrix was built from
# (L_dev = li.apply_LFs(lfs, dev_df)); taking them from dev_df keeps the rows of L_dev and
# Y aligned even when dev_df is not the test split.
lf_analysis = LFAnalysis(L_dev, lfs = lfs).lf_summary(Y = dev_df.label.values)
# Conflicts relative to coverage, per LF. An LF with zero coverage yields NaN here.
lf_analysis['Conflict Ratio'] = lf_analysis['Conflicts'] / lf_analysis['Coverage']
# Show one row per LF instead of a truncated table (global pandas display option).
pd.set_option('display.max_rows', None)
lf_analysis

Test Coverage: 0.610774818401937


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.,Conflict Ratio
lf_alarm,0,[0],0.029661,0.002421,0.001816,49,0,1.0,0.061224
lf_wake+up,1,[0],0.001211,0.000605,0.0,2,0,1.0,0.0
lf_ mute,2,[1],0.001211,0.0,0.0,2,0,1.0,0.0
lf_volume,3,[1],0.010291,0.0,0.0,17,0,1.0,0.0
lf_ loud,4,[1],0.000605,0.0,0.0,1,0,1.0,0.0
lf_quiet,5,[1],0.001816,0.0,0.0,3,0,1.0,0.0
lf_light,6,[2],0.039346,0.002421,0.002421,62,3,0.953846,0.061538
lf_wemo,7,[2],0.002421,0.0,0.0,4,0,1.0,0.0
lf_coffee,8,[2],0.007869,0.0,0.0,13,0,1.0,0.0
lf_calendar,9,[3],0.029056,0.003027,0.001211,48,0,1.0,0.041667


In [9]:
# Write the per-LF summary table to CSV in the working directory, keeping the LF names
# (the frame's index) as the first column.
lf_analysis.to_csv("lf_analysis.csv", index=True)

In [10]:
# Score a plain majority vote over the LF outputs against the gold labels of dev_df
# (ideally do this only at the end). 18 is the number of classes passed to the voter.
majority_model = MajorityLabelVoter(cardinality=18)
preds_valid = majority_model.predict(L=L_dev)

gold_labels = dev_df.label.values
has_vote = preds_valid != -1  # -1 marks rows where the vote abstained

print("acuracy for the not abstains")
print((preds_valid[has_vote] == gold_labels[has_vote]).mean())
print("acuracy for all")
print((preds_valid == gold_labels).mean())

acuracy for the not abstains
0.9107929515418502
acuracy for all
0.5006053268765133


In [11]:

# Positions of the rows on which the majority vote abstained (prediction == -1).
absent = np.where(preds_valid == -1)[0]
print("Number of absent predictions", len(absent))
absent_df = dev_df.iloc[absent]

# Keep only the rows whose stored 'weak_labels' are all -1. A single boolean mask replaces
# the row-by-row DataFrame.drop loop (which rebuilt the frame on every hit) and is
# positional, so it does not depend on the index labels being unique.
keep_mask = np.fromiter(
    (all(label == -1 for label in labels) for labels in absent_df['weak_labels']),
    dtype=bool,
    count=len(absent_df),
)
absent_df = absent_df.loc[keep_mask]

# Re-label the remaining rows with the current LFs and export them to CSV.
new_absent = li.massive_df_with_new_lf(absent_df, lfs)
new_absent.to_csv("massive.csv", index=False)


Number of absent predictions 744


  0%|          | 0/744 [00:00<?, ?it/s]

100%|██████████| 744/744 [00:00<00:00, 3532.11it/s]


# Apply the labeling functions to every split

In [12]:
# update df with new weak labels
# One call per split. The third call previously recomputed new_train, which left
# new_valid undefined for the English data.
new_test = li.massive_df_with_new_lf(test_df, lfs)
new_train = li.massive_df_with_new_lf(train_df, lfs)
new_valid = li.massive_df_with_new_lf(valid_df, lfs)

  0%|          | 0/1652 [00:00<?, ?it/s]

100%|██████████| 1652/1652 [00:00<00:00, 3589.03it/s]
100%|██████████| 11564/11564 [00:03<00:00, 3538.07it/s]
100%|██████████| 3305/3305 [00:00<00:00, 3585.84it/s]


# Analysis

In [13]:

# Export the test rows for the "social" intent.
# The label id is looked up by name instead of being hard-coded: the literal 1 used before
# is the id the LF summary reports for the audio keywords (lf_volume has polarity [1]),
# which does not match the variable and file name.
# NOTE(review): if the audio rows were actually wanted, pass label_to_idx["audio"] and
# rename the variable/file instead.
social_idx = int(label_to_idx["social"])
social_media = li.df_with_label(new_test, social_idx)
social_media.to_csv("social_media.csv", index=False)

# Save datasets

In [14]:
# Persist the re-labelled English splits: train, valid, test (same set of files as the
# other language sections write). The train split used to be saved twice and the
# validation split not at all.
massive = li.df_to_massive(new_train)
li.save_dataset(massive,  "../weak_datasets/massive-EN/train.json")
# The validation split is re-labelled right here so this cell only needs valid_df and lfs.
massive = li.df_to_massive(li.massive_df_with_new_lf(valid_df, lfs))
li.save_dataset(massive,  "../weak_datasets/massive-EN/valid.json")
massive = li.df_to_massive(new_test)
li.save_dataset(massive,  "../weak_datasets/massive-EN/test.json")

# Chinese setup


In [15]:
dataset_name = "massive-cn"
dataset_dir = f"../weak_datasets/{dataset_name}"

# Context managers close each file once parsed; the explicit UTF-8 matters here because
# the Chinese text must not be decoded with a platform-specific default encoding.
with open(f"{dataset_dir}/label.json", encoding="utf-8") as label_file:
    idx_to_label = json.load(label_file)
label_to_idx = {l:i for i,l in idx_to_label.items()}

# Same load order as before: valid, train, test.
cn_splits = {}
for split_name in ("valid", "train", "test"):
    with open(f"{dataset_dir}/{split_name}.json", "r", encoding="utf-8") as split_file:
        cn_splits[split_name] = li.massive_to_df(json.load(split_file))
cn_valid_df = cn_splits["valid"]
cn_train_df = cn_splits["train"]
cn_test_df = cn_splits["test"]

In [16]:
# Pair each Chinese split with the English split of the same name and hand both to
# li.df_to_df_with_new_lf together with the LFs (order kept: test, train, valid).
# NOTE(review): presumably the LFs run on the English frame and the resulting labels are
# attached to the Chinese rows - confirm in label_improve.
new_test, new_train, new_valid = [
    li.df_to_df_with_new_lf(cn_split, en_split, lfs)
    for cn_split, en_split in (
        (cn_test_df, test_df),
        (cn_train_df, train_df),
        (cn_valid_df, valid_df),
    )
]

  0%|          | 0/1652 [00:00<?, ?it/s]

100%|██████████| 1652/1652 [00:00<00:00, 3537.99it/s]
100%|██████████| 11564/11564 [00:03<00:00, 3486.72it/s]
100%|██████████| 3305/3305 [00:00<00:00, 3535.54it/s]


In [17]:
# Preview the first rows of new_test (Chinese text with the freshly assigned weak labels).
new_test.head()

Unnamed: 0,text,label,weak_labels
0,你能把灯打开吗,2,"[-1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1..."
1,个人数字助理是什么意思,11,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
2,和下午八点还差多少,6,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
3,我有收到带有追踪号码的邮件吗,16,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
4,妈妈三十分钟前发的电子邮件说了什么,16,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [18]:
# Convert each re-labelled Chinese split back to the MASSIVE layout and write it out,
# in the order train, valid, test.
for split_name, split_df in (("train", new_train), ("valid", new_valid), ("test", new_test)):
    massive = li.df_to_massive(split_df)
    li.save_dataset(massive, f"../weak_datasets/massive-CN/{split_name}.json")

# Machine-translated version (Chinese → English)

In [19]:
# Preview the raw Chinese training split as loaded from massive-cn.
cn_train_df.head()

Unnamed: 0,text,label,weak_labels
0,打开我的清单,17,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,关闭随机播放,9,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
2,我今天有什么清单吗,17,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
3,今天我偶然碰见老朋友大强了,5,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
4,是一个忙碌的一天,5,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [21]:
# Translate the Chinese test split to English with li.translate_df_to_english (source
# language 'zh-cn'), pass the Chinese and translated frames to li.df_to_df_with_new_lf with
# the LFs, and save the result under massive-CN2.
# The translation is slow: the recorded run below took about 27 minutes for 1652 rows.
cn_en_test = li.translate_df_to_english(cn_test_df, 'zh-cn')
new_test = li.df_to_df_with_new_lf(cn_test_df, cn_en_test, lfs)
massive = li.df_to_massive(new_test)
li.save_dataset(massive,  "../weak_datasets/massive-CN2/test.json")

100%|██████████| 1652/1652 [27:44<00:00,  1.01s/it]
100%|██████████| 1652/1652 [00:00<00:00, 3522.74it/s]


In [22]:
# Preview the machine-translated (English) version of the Chinese test split.
cn_en_test.head()


Unnamed: 0,text,label,weak_labels
0,Can you turn on the light?,2,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,Personal digital assistant What does it mean,11,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
2,How much worse than 8 pm,6,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
3,Have I received an email with a tracking number?,16,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
4,What did my mother say 30 minutes ago?,16,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [27]:
# Same pipeline as for the test split, applied to the Chinese validation split:
# translate, pass both frames to li.df_to_df_with_new_lf, save under massive-CN2.
# NOTE(review): the language code is spelled 'zh-CN' here but 'zh-cn' in the test-split
# cell - confirm li.translate_df_to_english treats both the same.
# The recorded run below took about 58 minutes.
cn_en_valid = li.translate_df_to_english(cn_valid_df, 'zh-CN')
new_valid = li.df_to_df_with_new_lf(cn_valid_df, cn_en_valid, lfs)
massive = li.df_to_massive(new_valid)
li.save_dataset(massive,  "../weak_datasets/massive-CN2/valid.json")

Translating:   0%|          | 0/133 [00:00<?, ?it/s]

Translating: 100%|██████████| 133/133 [58:04<00:00, 26.20s/it]
100%|██████████| 3305/3305 [00:00<00:00, 3464.07it/s]


In [32]:
# The translation API key is read from the environment instead of being typed into the
# notebook (the literal used to be an empty string). Set DEEPL_AUTH_KEY before running;
# a missing variable fails immediately with KeyError rather than at the first API call.
auth_key = os.environ["DEEPL_AUTH_KEY"]
# Translate the Chinese training split (source language 'ZH'), pass the Chinese and
# translated frames to li.df_to_df_with_new_lf with the LFs, and save under massive-CN2.
cn_en_train = li.deep_translate_df_to_english(cn_train_df, auth_key, 'ZH')
new_train = li.df_to_df_with_new_lf(cn_train_df, cn_en_train, lfs)
massive = li.df_to_massive(new_train)
li.save_dataset(massive,  "../weak_datasets/massive-CN2/train.json")

Translating:   0%|          | 0/11564 [00:00<?, ?it/s]

Translating: 100%|██████████| 11564/11564 [00:20<00:00, 557.32it/s]
100%|██████████| 11564/11564 [00:03<00:00, 3439.00it/s]


In [34]:

# Save the translated training frame itself (English text) to the working directory.
# Note: the next cell writes to the same path and replaces this file.
massive = li.df_to_massive(cn_en_train)
li.save_dataset(massive,  "./translated_train.json")

In [35]:
# Re-label the translated (English) training frame with the LFs and save it.
# The former leading `cn_en_train.head(19)` was dropped: as a non-final expression its
# result was discarded, so it never displayed anything.
# Note: this overwrites ./translated_train.json written by the previous cell.
new_train = li.massive_df_with_new_lf(cn_en_train, lfs)
massive = li.df_to_massive(new_train)
li.save_dataset(massive,  "./translated_train.json")


  0%|          | 0/11564 [00:00<?, ?it/s]

100%|██████████| 11564/11564 [00:03<00:00, 3666.90it/s]


In [36]:
# Run the LF analysis helper on the re-labelled translated training frame; it prints
# coverage and majority-vote accuracy (see output). 18 is presumably the number of
# classes - it matches the 18 intents in `keywords`.
lf_analysis = li.analysis_LFs(lfs, new_train, 18)

100%|██████████| 11564/11564 [00:03<00:00, 3693.72it/s]


Test Coverage: 0.562953995157385
acuracy for the not abstains
0.36666097935505887
acuracy for all
0.18583535108958837


In [40]:
# Same report, but via the helper that takes only the frame (no LFs are passed);
# the printed numbers are identical to the previous cell's.
lf_analysis = li.analysis_LFs_with_weak_labels(new_train, 18)

Test Coverage: 0.562953995157385
acuracy for the not abstains
0.36666097935505887
acuracy for all
0.18583535108958837


In [42]:
# Reload the Chinese training split that was saved under massive-CN.
# The file is opened in a context manager so the handle is closed after parsing, and read
# as UTF-8 explicitly.
with open("../weak_datasets/massive-CN/train.json", "r", encoding="utf-8") as train_file:
    cn_train = li.massive_to_df(json.load(train_file))

In [43]:
# Coverage / accuracy report for the reloaded massive-CN training split.
lf_analysis = li.analysis_LFs_with_weak_labels(cn_train, 18)

Test Coverage: 0.6162227602905569
acuracy for the not abstains
0.901840490797546
acuracy for all
0.4957627118644068


# Norwegian setup

In [55]:
dataset_name = "massive-nb"
dataset_dir = f"../weak_datasets/{dataset_name}"

# Context managers close each file once parsed; UTF-8 is requested explicitly so
# non-ASCII Norwegian characters do not depend on the platform default encoding.
with open(f"{dataset_dir}/label.json", encoding="utf-8") as label_file:
    idx_to_label = json.load(label_file)
label_to_idx = {l:i for i,l in idx_to_label.items()}

# Same load order as before: valid, train, test.
nb_splits = {}
for split_name in ("valid", "train", "test"):
    with open(f"{dataset_dir}/{split_name}.json", "r", encoding="utf-8") as split_file:
        nb_splits[split_name] = li.massive_to_df(json.load(split_file))
nb_valid_df = nb_splits["valid"]
nb_train_df = nb_splits["train"]
nb_test_df = nb_splits["test"]

In [56]:
# Pair each Norwegian split with the English split of the same name and hand both to
# li.df_to_df_with_new_lf together with the LFs (order kept: test, train, valid).
new_test, new_train, new_valid = [
    li.df_to_df_with_new_lf(nb_split, en_split, lfs)
    for nb_split, en_split in (
        (nb_test_df, test_df),
        (nb_train_df, train_df),
        (nb_valid_df, valid_df),
    )
]

100%|██████████| 1652/1652 [00:00<00:00, 3692.78it/s]
100%|██████████| 11564/11564 [00:03<00:00, 3653.92it/s]
100%|██████████| 3305/3305 [00:00<00:00, 3697.77it/s]


In [57]:
# Convert each re-labelled Norwegian split back to the MASSIVE layout and write it out,
# in the order train, valid, test.
for split_name, split_df in (("train", new_train), ("valid", new_valid), ("test", new_test)):
    massive = li.df_to_massive(split_df)
    li.save_dataset(massive, f"../weak_datasets/massive-NB/{split_name}.json")

In [58]:
# Print the coverage / accuracy report for each re-labelled Norwegian split, in the
# order train, valid, test (the three blocks of output below follow that order).
li.analysis_LFs_with_weak_labels(new_train, 18)
li.analysis_LFs_with_weak_labels(new_valid, 18)
li.analysis_LFs_with_weak_labels(new_test, 18)

Test Coverage: 0.6162227602905569
acuracy for the not abstains
0.901840490797546
acuracy for all
0.4957627118644068
Test Coverage: 0.6084720121028744
acuracy for the not abstains
0.9073972602739726
acuracy for all
0.5010590015128593
Test Coverage: 0.610774818401937
acuracy for the not abstains
0.9107929515418502
acuracy for all
0.5006053268765133


In [59]:
# The translation API key must not live in the notebook: it is read from the environment.
# Set DEEPL_AUTH_KEY before running; a missing variable fails immediately with KeyError.
# The key that used to be hard-coded here has been exposed and should be revoked.
auth_key = os.environ["DEEPL_AUTH_KEY"]
# Translate the Norwegian test split (source language 'NB'), pass the Norwegian and
# translated frames to li.df_to_df_with_new_lf with the LFs, and save under massive-NB2.
nb_en_test = li.deep_translate_df_to_english(nb_test_df, auth_key, 'NB')
new_test = li.df_to_df_with_new_lf(nb_test_df, nb_en_test, lfs)
massive = li.df_to_massive(new_test)
li.save_dataset(massive,  "../weak_datasets/massive-NB2/test.json")

Translating: 100%|██████████| 1652/1652 [00:05<00:00, 288.46it/s]
100%|██████████| 1652/1652 [00:00<00:00, 3662.25it/s]


In [61]:
# Coverage / accuracy report for the test split labelled via the machine translation.
# NOTE(review): accuracy on non-abstains is about 0.37 here versus about 0.91 when the
# English split itself is used for labelling - worth checking that the translated rows
# stay aligned with the original rows before trusting these labels.
lf_analysis = li.analysis_LFs_with_weak_labels(new_test, 18)

Test Coverage: 0.5901937046004843
acuracy for the not abstains
0.3691275167785235
acuracy for all
0.19975786924939468


In [63]:
# The translation API key must not live in the notebook: it is read from the environment.
# Set DEEPL_AUTH_KEY before running; a missing variable fails immediately with KeyError.
# The key that used to be hard-coded here has been exposed and should be revoked.
auth_key = os.environ["DEEPL_AUTH_KEY"]
# Translate the Norwegian training split (source language 'NB'), pass the Norwegian and
# translated frames to li.df_to_df_with_new_lf with the LFs, and save under massive-NB2.
nb_en_train = li.deep_translate_df_to_english(nb_train_df, auth_key, 'NB')
new_train = li.df_to_df_with_new_lf(nb_train_df, nb_en_train, lfs)
massive = li.df_to_massive(new_train)
li.save_dataset(massive,  "../weak_datasets/massive-NB2/train.json")

Translating:   0%|          | 0/11564 [00:00<?, ?it/s]

Translating: 100%|██████████| 11564/11564 [00:29<00:00, 398.34it/s]
100%|██████████| 11564/11564 [00:03<00:00, 3627.39it/s]


In [64]:
# Coverage / accuracy report for the training split labelled via the machine translation.
lf_analysis = li.analysis_LFs_with_weak_labels(new_train, 18)

Test Coverage: 0.5915773088896575
acuracy for the not abstains
0.35142118863049093
acuracy for all
0.18817018332756832


In [66]:
# The translation API key must not live in the notebook: it is read from the environment.
# Set DEEPL_AUTH_KEY before running; a missing variable fails immediately with KeyError.
# The key that used to be hard-coded here has been exposed and should be revoked.
auth_key = os.environ["DEEPL_AUTH_KEY"]
# Translate the Norwegian validation split (source language 'NB'), pass the Norwegian and
# translated frames to li.df_to_df_with_new_lf with the LFs, and save under massive-NB2.
nb_en_valid = li.deep_translate_df_to_english(nb_valid_df, auth_key, 'NB')
new_valid = li.df_to_df_with_new_lf(nb_valid_df, nb_en_valid, lfs)
massive = li.df_to_massive(new_valid)
li.save_dataset(massive,  "../weak_datasets/massive-NB2/valid.json")

Translating:   0%|          | 0/3305 [00:00<?, ?it/s]

Translating: 100%|██████████| 3305/3305 [00:10<00:00, 328.98it/s]
100%|██████████| 3305/3305 [00:00<00:00, 3667.86it/s]


In [67]:
# Coverage / accuracy report for the validation split labelled via the machine translation.
lf_analysis = li.analysis_LFs_with_weak_labels(new_valid, 18)

Test Coverage: 0.5897125567322239
acuracy for the not abstains
0.40404040404040403
acuracy for all
0.2178517397881997


In [68]:
# Re-label the translated (English) Norwegian training frame itself with the LFs and
# save it under temp/translate_nb. Unlike the massive-NB2 files, the frame passed in here
# is the translated one (nb_en_train), not the Norwegian original.
new_train = li.massive_df_with_new_lf(nb_en_train, lfs)
massive = li.df_to_massive(new_train)
li.save_dataset(massive,  "../weak_datasets/temp/translate_nb/train.json")

100%|██████████| 11564/11564 [00:03<00:00, 3632.39it/s]


In [69]:
# Same as for the training split: re-label the translated test frame (nb_en_test) and
# save it under temp/translate_nb.
new_test = li.massive_df_with_new_lf(nb_en_test, lfs)
massive = li.df_to_massive(new_test)
li.save_dataset(massive,  "../weak_datasets/temp/translate_nb/test.json")

100%|██████████| 1652/1652 [00:00<00:00, 3712.34it/s]


In [70]:
# Same as for the training split: re-label the translated validation frame (nb_en_valid)
# and save it under temp/translate_nb.
new_valid = li.massive_df_with_new_lf(nb_en_valid, lfs)
massive = li.df_to_massive(new_valid)
li.save_dataset(massive,  "../weak_datasets/temp/translate_nb/valid.json")

100%|██████████| 3305/3305 [00:00<00:00, 3710.79it/s]
