In [22]:
import pandas as pd
import numpy as np
import random
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter

In [23]:
# Unlabeled training numbers; the labeling functions below vote on each one.
data = [
    5, 21, 1, 29, 32, 37, 10, 20, 10, 26,
    2, 37, 34, 11, 22, 36, 12, 20, 31, 25,
]
# Single-column frame: every labeling function reads record["Number"].
df = pd.DataFrame({"Number": data})

In [24]:
# Integer votes a labeling function may return.
PRIME = 1       # the function votes that the number is prime
NON_PRIME = 0   # the function votes that the number is not prime
ABSTAIN = -1    # the function casts no vote for this record

In [25]:
from snorkel.labeling import labeling_function

@labeling_function()
def is_even(record):
    """Vote NON_PRIME when ``record["Number"]`` is divisible by 2, else abstain.

    Note that this heuristic also votes NON_PRIME for 2 itself.
    """
    divisible_by_two = record["Number"] % 2 == 0
    return NON_PRIME if divisible_by_two else ABSTAIN

In [26]:
@labeling_function()
def is_odd(record):
    """Abstain when ``record["Number"] % 2`` equals 1, otherwise vote NON_PRIME.

    For integer values this casts exactly the same votes as ``is_even``:
    odd numbers are left undecided and everything else is marked NON_PRIME.
    """
    remainder = record["Number"] % 2
    if remainder != 1:
        return NON_PRIME
    return ABSTAIN

In [27]:
@labeling_function()
def is_two(record):
    """Vote PRIME only when ``record["Number"]`` equals 2; abstain otherwise."""
    return PRIME if record["Number"] == 2 else ABSTAIN

In [28]:
# Lookup table of "known" primes (every prime below 30).
known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]


@labeling_function()
def is_known_prime(record):
    """Vote PRIME when ``record["Number"]`` is listed in ``known_primes``.

    Numbers missing from the list get ABSTAIN, not NON_PRIME, so primes
    outside the table are simply left undecided by this function.
    """
    listed = record["Number"] in known_primes
    return PRIME if listed else ABSTAIN

In [29]:
from snorkel.labeling import PandasLFApplier
# Labeling functions in column order: column j of the label matrix holds the
# votes of lfs[j].
lfs = [is_odd, is_even, is_two, is_known_prime]

# Run every labeling function over every row of the training frame.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)


100%|██████████| 20/20 [00:00<00:00, 10911.30it/s]


In [30]:
from snorkel.labeling import LFAnalysis
# Per-function polarity, coverage, overlaps and conflicts on the training votes.
(
    LFAnalysis(L=L_train, lfs=lfs)
    .lf_summary()
)


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.55,0.05
is_even,1,[0],0.55,0.55,0.05
is_two,2,[1],0.05,0.05,0.05
is_known_prime,3,[1],0.2,0.05,0.05


In [31]:
# Small hand-labeled validation set.
validation = [22, 11, 7, 2, 32]
df_val = pd.DataFrame({"Number": validation})

# Ground truth for each validation number, in the same order as `validation`.
true_labels = np.array([NON_PRIME, PRIME, PRIME, PRIME, NON_PRIME])

# Collect the labeling functions' votes on the validation rows.
L_valid = applier.apply(df_val)

# Passing the ground truth adds Correct / Incorrect / Emp. Acc. columns.
LFAnalysis(L_valid, lfs).lf_summary(true_labels)


100%|██████████| 5/5 [00:00<00:00, 2480.37it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_odd,0,[0],0.6,0.6,0.2,2,1,0.666667
is_even,1,[0],0.6,0.6,0.2,2,1,0.666667
is_two,2,[1],0.2,0.2,0.2,1,0,1.0
is_known_prime,3,[1],0.6,0.2,0.2,3,0,1.0


In [20]:
# Rebuild the validation frame and attach one column per labeling function so
# the individual votes can be read next to each number.
df_val = pd.DataFrame({"Number": validation})
for column, lf in (
    ("is_odd", is_odd),
    ("is_even", is_even),
    ("is_two", is_two),
    ("is_known_prime", is_known_prime),
):
    # Row-wise application: each labeling function receives one record.
    df_val[column] = df_val.apply(lf, axis=1)
df_val

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime
0,22,0,0,-1,-1
1,11,-1,-1,-1,1
2,7,-1,-1,-1,1
3,2,0,0,1,1
4,32,0,0,-1,-1


In [21]:
# Fit the label model on the training label matrix; the fixed seed keeps the
# fit reproducible across runs.
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=200, seed=100)

# Predict once per split. The previous version predicted each split twice,
# recomputed L_valid after it had already been used, and built an LFAnalysis
# summary whose result was discarded because it was not the cell's last
# expression.
preds_train_labelingModel = label_model.predict(L=L_train)
preds_valid_labelingModel = label_model.predict(L=L_valid)

# Older names kept as aliases so any code that still refers to them works.
preds_train_label = preds_train_labelingModel
preds_valid_label = preds_valid_labelingModel

# Attach the model's training predictions to the training frame.
df["preds_labelingModel"] = preds_train_labelingModel

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/200 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.701]
INFO:root:[10 epochs]: TRAIN:[loss=0.208]
  6%|▌         | 11/200 [00:00<00:01, 102.77epoch/s]INFO:root:[20 epochs]: TRAIN:[loss=0.065]
INFO:root:[30 epochs]: TRAIN:[loss=0.025]
INFO:root:[40 epochs]: TRAIN:[loss=0.018]
INFO:root:[50 epochs]: TRAIN:[loss=0.007]
INFO:root:[60 epochs]: TRAIN:[loss=0.005]
INFO:root:[70 epochs]: TRAIN:[loss=0.004]
INFO:root:[80 epochs]: TRAIN:[loss=0.004]
 40%|████      | 81/200 [00:00<00:00, 442.53epoch/s]INFO:root:[90 epochs]: TRAIN:[loss=0.003]
INFO:root:[100 epochs]: TRAIN:[loss=0.003]
INFO:root:[110 epochs]: TRAIN:[loss=0.003]
INFO:root:[120 epochs]: TRAIN:[loss=0.003]
INFO:root:[130 epochs]: TRAIN:[loss=0.003]
INFO:root:[140 epochs]: TRAIN:[loss=0.003]
INFO:root:[150 epochs]: TRAIN:[loss=0.003]
INFO:root:[160 epochs]: TRAIN:[loss=0.003]
INFO:root:[170 epochs]: TRAIN:[loss=0.003]
INFO:root:[180 epochs]: TRAIN:[loss=0.

In [38]:
# Evaluate the label model on the validation split with ONE `score` call.
# `score` takes a list of metric names and returns a dict keyed by name, so
# there is no need to re-run prediction once per metric.
metric_names = ["f1_micro", "accuracy", "recall", "precision"]
valid_scores = label_model.score(L_valid, true_labels, metrics=metric_names)

# Each variable stays a one-entry dict, exactly as before, so the printed line
# and any later use of these names are unchanged.
f1_micro, accuracy, recall, precision = (
    {name: valid_scores[name]} for name in metric_names
)
print("{} {} {} {}".format(f1_micro, accuracy, recall, precision))



{'f1_micro': 0.8000000000000002} {'accuracy': 0.8} {'recall': 0.6666666666666666} {'precision': 1.0}


In [39]:
# Evaluate the label model on the validation split with ONE `score` call.
# `score` takes a list of metric names and returns a dict keyed by name, so
# there is no need to re-run prediction once per metric.
metric_names = ["f1_micro", "accuracy", "recall", "precision"]
valid_scores = label_model.score(L_valid, true_labels, metrics=metric_names)

# Each variable stays a one-entry dict, exactly as before, so the printed line
# and any later use of these names are unchanged.
f1_micro, accuracy, recall, precision = (
    {name: valid_scores[name]} for name in metric_names
)
print("{} {} {} {}".format(f1_micro, accuracy, recall, precision))



{'f1_micro': 0.8000000000000002} {'accuracy': 0.8} {'recall': 0.6666666666666666} {'precision': 1.0}
