In [146]:
import pandas as pd
import numpy as np
import random

from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter

### Create a small example dataset

In [147]:
data = [5, 21, 1, 29, 32, 37, 10, 20, 10, 26, 2, 37, 34, 11, 22, 36, 12, 20, 31, 25]

In [148]:
df = pd.DataFrame(data, columns=['Number'])

### Creating a small validation set

In [149]:
validation = [22, 11, 7, 2, 32]

In [150]:
df_val = pd.DataFrame(validation, columns=['Number'])

Ground truth array for the validation set. 

22 -> not prime [0]<br>
11 -> prime [1]<br>
7 -> prime [1]<br>
2 -> prime [1]<br>
32 -> not prime [0]<br>

In [151]:
true_labels = np.array([0, 1, 1, 1, 0])

In [152]:
df_tl = pd.DataFrame(true_labels, columns=['Number'])

### Labeling functions

In [153]:
ABSTAIN = -1
NON_PRIME = 0
PRIME = 1

In [154]:
# if odd, abstain else non-prime.
@labeling_function()
def is_odd(record):
    if record["Number"]%2 == 1:
        return ABSTAIN
    else:
        return NON_PRIME

In [155]:
# if even, then non-prime else abstain
# @labeling_function()
# def is_even(record):
#     if record["Number"]%2 == 0:
#         return NON_PRIME
#     else:
#         return ABSTAIN

In [156]:
@labeling_function()
def is_two(record):
    if record["Number"] == 2:
        return PRIME
    else:
        return ABSTAIN

In [157]:
@labeling_function()
def is_known_prime(record):
    known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
    if record["Number"] in known_primes:
        return PRIME
    else:
        return ABSTAIN

### Calculating: Polarity, Coverage, Overlaps and Conflics for the labeling functions

In [158]:
lfs = [
        is_odd,
#        is_even, 
        is_two,
        is_known_prime
      ]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 20/20 [00:00<00:00, 9049.20it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.05,0.05
is_two,1,[1],0.05,0.05,0.05
is_known_prime,2,[1],0.2,0.05,0.05


#### Applying the labeling functions to the dataset, just for illustration of calcualting the above metrics

In [159]:
df["is_odd"] = df.apply(is_odd, axis=1)
# df["is_even"] = df.apply(is_even, axis=1)
df["is_two"] = df.apply(is_two, axis=1)
df["is_known_prime"] = df.apply(is_known_prime, axis=1)

In [160]:
# df[["Number", "is_odd", "is_even", "is_two", "is_known_prime"]]
df[["Number", "is_odd", "is_two", "is_known_prime"]]

Unnamed: 0,Number,is_odd,is_two,is_known_prime
0,5,-1,-1,1
1,21,-1,-1,-1
2,1,-1,-1,-1
3,29,-1,-1,1
4,32,0,-1,-1
5,37,-1,-1,-1
6,10,0,-1,-1
7,20,0,-1,-1
8,10,0,-1,-1
9,26,0,-1,-1


##### Polarity

In [161]:
LFAnalysis(L=L_train, lfs=lfs).lf_polarities()

[[0], [1], [1]]

##### Coverage

In [162]:
print(df[df!= -1].count()/df.shape[0])

Number            1.00
is_odd            0.55
is_two            0.05
is_known_prime    0.20
dtype: float64


#### Overlaps

In [163]:
LFAnalysis(L=L_train, lfs=lfs).lf_overlaps()

array([0.05, 0.05, 0.05])

#### Conflicts

In [164]:
LFAnalysis(L=L_train, lfs=lfs).lf_conflicts()

array([0.05, 0.05, 0.05])

### Calculating: Correct, Incorrect and Empirical Accuracy on the validation set

In [165]:
# apply the labeling functions to the validation set; returns a label from each L.F. for each data point. 
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary(true_labels)

100%|██████████| 5/5 [00:00<00:00, 4897.60it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_odd,0,[0],0.6,0.2,0.2,2,1,0.666667
is_two,1,[1],0.2,0.2,0.2,1,0,1.0
is_known_prime,2,[1],0.6,0.2,0.2,3,0,1.0


#### Applying the labeling functions to the validation set, just for illustration of calculating the above metrics

In [166]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,ground_truth
0,22,0,-1,-1,0
1,11,-1,-1,1,1
2,7,-1,-1,1,1
3,2,0,1,1,1
4,32,0,-1,-1,0


### Using Random Voter to determine the label

In [167]:
from snorkel.labeling.model import RandomVoter

In [168]:
random_model = RandomVoter()
preds_train_random = random_model.predict(L=L_train, tie_break_policy='abstain')
preds_valid_random = random_model.predict(L=L_valid)
df["preds_train_random"] = preds_train_random
#df[df["Number"] == 2]
df

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random
0,5,-1,-1,1,1
1,21,-1,-1,-1,0
2,1,-1,-1,-1,1
3,29,-1,-1,1,0
4,32,0,-1,-1,1
5,37,-1,-1,-1,0
6,10,0,-1,-1,1
7,20,0,-1,-1,0
8,10,0,-1,-1,1
9,26,0,-1,-1,1


Calculating the accuracy of the RandomVoter

In [169]:
metrics = random_model.score(preds_valid_random, true_labels, metrics=['accuracy'])
metrics



{'accuracy': 0.4}

### MajorityClassVoter

In [170]:
from snorkel.labeling.model import MajorityClassVoter
"""Predict probabilities using majority class.
        Assign majority class vote to each datapoint.
        In case of multiple majority classes, assign equal probabilities among them."""

'Predict probabilities using majority class.\n        Assign majority class vote to each datapoint.\n        In case of multiple majority classes, assign equal probabilities among them.'

In [171]:
majorityClass_model = MajorityClassVoter()
majorityClass_model.fit(balance=np.array([0.7, 0.3]))
majorityClass_train_random = majorityClass_model.predict(L=L_train)
majorityClass_valid_random = majorityClass_model.predict(L=L_valid)
df["majorityClass_pred"] = majorityClass_train_random
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred
10,2,0,1,1,0,0


In [172]:
metrics = majorityClass_model.score(majorityClass_valid_random, true_labels, metrics=['accuracy'])
metrics



{'accuracy': 0.4}

### Using MajorityLabelVoter to determine the label

In [173]:
majority_model = MajorityLabelVoter()
preds_train_majority = majority_model.predict(L=L_train)
preds_valid_majority = majority_model.predict(L=L_valid)
df["majorityLabel_pred"] = preds_train_majority

In [189]:
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred,majorityLabel_pred,preds_labelingModel,preds_labelingModel_wclassBalance
0,5,-1,-1,1,1,0,1,1,1
1,21,-1,-1,-1,0,0,-1,-1,0
2,1,-1,-1,-1,1,0,-1,-1,0
3,29,-1,-1,1,0,0,1,1,1
4,32,0,-1,-1,1,0,0,0,0
5,37,-1,-1,-1,0,0,-1,-1,0
6,10,0,-1,-1,1,0,0,0,0
7,20,0,-1,-1,0,0,0,0,0
8,10,0,-1,-1,1,0,0,0,0
9,26,0,-1,-1,1,0,0,0,0


In [188]:
# doesn't work
# metrics2 = majority_model.score(preds_valid_majority, true_labels, metrics=['accuracy'])
# metrics2
(preds_valid_majority == true_labels).mean()


1.0

In [190]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["pred_majority"] = preds_valid_majority
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,pred_majority,ground_truth
0,22,0,-1,-1,0,0
1,11,-1,-1,1,1,1
2,7,-1,-1,1,1,1
3,2,0,1,1,1,1
4,32,0,-1,-1,0,0


In [192]:
# np.round(majority_model.get_weights(), 2)
# 'MajorityLabelVoter' object has no attribute 'get_weights'

### Using LabelingModel to determine the label

In [175]:
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=200, seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1413.40epoch/s]
100%|██████████| 5/5 [00:00<00:00, 3870.00it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.6,0.2,0.2
is_two,1,[1],0.2,0.2,0.2
is_known_prime,2,[1],0.6,0.2,0.2


In [176]:

preds_train_labelingModel = label_model.predict(L=L_train)
preds_valid_labelingModel = label_model.predict(L=L_valid)

Examine the weights of the label_model for each classification source (labeling function).  Labeling functions that make more mistakes would be expected to have lower weights.

In [177]:
df["preds_labelingModel"] = preds_train_labelingModel

In [178]:
df[["Number", "preds_train_random", "majorityClass_pred", "majorityLabel_pred", "preds_labelingModel"]]

Unnamed: 0,Number,preds_train_random,majorityClass_pred,majorityLabel_pred,preds_labelingModel
0,5,1,0,1,1
1,21,0,0,-1,-1
2,1,1,0,-1,-1
3,29,0,0,1,1
4,32,1,0,0,0
5,37,0,0,-1,-1
6,10,1,0,0,0
7,20,0,0,0,0
8,10,1,0,0,0
9,26,1,0,0,0


In [179]:
# 7/20 prime numbers
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=200, class_balance = [0.7, 0.3], seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1145.32epoch/s]
100%|██████████| 5/5 [00:00<00:00, 4744.69it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.6,0.2,0.2
is_two,1,[1],0.2,0.2,0.2
is_known_prime,2,[1],0.6,0.2,0.2


In [180]:

preds_train_labelingModel = label_model.predict(L=L_train)
df["preds_labelingModel_wclassBalance"] = preds_train_labelingModel

In [181]:
df[["Number", "preds_labelingModel", "preds_labelingModel_wclassBalance"]]

Unnamed: 0,Number,preds_labelingModel,preds_labelingModel_wclassBalance
0,5,1,1
1,21,-1,0
2,1,-1,0
3,29,1,1
4,32,0,0
5,37,-1,0
6,10,0,0
7,20,0,0
8,10,0,0
9,26,0,0


Looking at the weights of the label_model

In [182]:
np.round(label_model.get_weights(), 2)

array([0.94, 0.93, 0.77])

And the actual conditional probability values placed in a matrix with dimensions [number of labeling function, number of labels + 1 (for abstain), number of classes], rounded are as follows:


def get_conditional_probs(self) -> np.ndarray:

    r"""Return the estimated conditional probabilities table.

    Return the estimated conditional probabilites table cprobs, where cprobs is an
    (m, k+1, k)-dim np.ndarray with:

        cprobs[i, j, k] = P(\lf_i = j-1 | Y = k)

    where m is the number of LFs, k is the cardinality, and cprobs includes the
    conditional abstain probabilities P(\lf_i = -1 | Y = y).

    Returns
    -------
    np.ndarray
        An [m, k + 1, k] np.ndarray conditional probabilities table.
    """
```

array([
labeling function 1    
       [[
        P(lf=1|Y!=y) = 0.114, P(lf=|Y=y) = 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],
labeling function 2
       [[0.114, 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],

       [[0.91 , 0.923],
        [0.01 , 0.01 ],
        [0.08 , 0.067]],

       [[0.869, 0.58 ],
        [0.01 , 0.01 ],
        [0.121, 0.41 ]]])
```

Looking at the conditional probabilities 

In [183]:
np.round(label_model.get_conditional_probs(), 3)

array([[[0.257, 0.865],
        [0.733, 0.125],
        [0.01 , 0.01 ]],

       [[0.934, 0.859],
        [0.01 , 0.01 ],
        [0.056, 0.131]],

       [[0.918, 0.501],
        [0.01 , 0.01 ],
        [0.072, 0.489]]])