## Practical Weak Supervision (book)

Chapter 2 labeling example, modified

In [1]:
import pandas as pd
import numpy as np
import random
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter

  from .autonotebook import tqdm as notebook_tqdm


### Create a small example dataset

In [2]:
data =    [5, 21, 1, 29, 32, 37, 10, 20, 10, 26, 2, 37, 34, 11, 22, 36, 12, 20, 31, 25]
# for logistic regression
y_train = [1,  0, 1, 1,   0,   1,  0,  0,  0,  0, 1,  1,  0,  1,  0,   0,   0,  0,  1,  0]

In [3]:
df = pd.DataFrame(data, columns=['Number'])

### Creating a small validation set

Ground truth array for the validation set. 

22 -> not prime [0]<br>
11 -> prime [1]<br>
7 -> prime [1]<br>
2 -> prime [1]<br>
32 -> not prime [0]<br>
etc...

In [4]:
import is_it_prime  # custom code
# validation = [22, 11, 7, 2, 32]  # from original example
validation = [22, 11, 7, 2, 32, 101, 102]
# true_labels = np.array([0, 1, 1, 1, 0])
true_labels = is_it_prime.array_map(validation)

In [5]:
df_val = pd.DataFrame(validation, columns=['Number'])

In [6]:
df_tl = pd.DataFrame(true_labels, columns=['Number'])

### Labeling functions

In [7]:
ABSTAIN = -1
NON_PRIME = 0
PRIME = 1

In [8]:
# if odd, abstain else non-prime.
@labeling_function()
def is_odd(record):
    if record["Number"]%2 == 1:
        return ABSTAIN
    else:
        return NON_PRIME

In [9]:
# deprecated; if even, then non-prime else abstain; duplicates is_odd() labeling function
@labeling_function()
def is_even(record):
    if record["Number"]%2 == 0:
        return NON_PRIME
    else:
        return ABSTAIN

In [10]:
@labeling_function()
def is_two(record):
    if record["Number"] == 2:
        return PRIME
    else:
        return ABSTAIN

In [11]:
@labeling_function()
def is_known_prime(record):
    # known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]  # original list from book code
    known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 101]
    if record["Number"] in known_primes:
        return PRIME
    else:
        return ABSTAIN

In [12]:
# if > 3 and not evenly divisible by 3 ABSTAIN else NON_PRIME.
@labeling_function()
def gt3_ndb_3(record):
    if record["Number"]>3 and record["Number"]%3 == 0:
        return NON_PRIME
    else:
        return ABSTAIN

### Calculating: Polarity, Coverage, Overlaps and Conflics for the labeling functions

In [13]:
lfs = [
        is_odd,
#        is_even, # duplicates is_odd!
        is_two,
        is_known_prime,
        # gt3_ndb_3
      ]
# apply labeling function s with PandasLFApplier to get a two-dimensional matrix of labels.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 20/20 [00:00<00:00, 6139.20it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.05,0.05
is_two,1,[1],0.05,0.05,0.05
is_known_prime,2,[1],0.2,0.05,0.05


#### Applying the labeling functions to the dataset, just for illustration of calculating the above metrics

In [14]:
df["is_odd"] = df.apply(is_odd, axis=1)
# df["is_even"] = df.apply(is_even, axis=1)
df["is_two"] = df.apply(is_two, axis=1)
df["is_known_prime"] = df.apply(is_known_prime, axis=1)
if gt3_ndb_3 in lfs:
    df["gt3_ndb_3"] = df.apply(gt3_ndb_3, axis=1)


In [15]:
# df[["Number", "is_odd", "is_even", "is_two", "is_known_prime"]]
# df[["Number", "is_odd", "is_two", "is_known_prime", "gt3_ndb_3"]]
df

Unnamed: 0,Number,is_odd,is_two,is_known_prime
0,5,-1,-1,1
1,21,-1,-1,-1
2,1,-1,-1,-1
3,29,-1,-1,1
4,32,0,-1,-1
5,37,-1,-1,-1
6,10,0,-1,-1
7,20,0,-1,-1
8,10,0,-1,-1
9,26,0,-1,-1


##### Polarity

In [16]:
LFAnalysis(L=L_train, lfs=lfs).lf_polarities()

[[0], [1], [1]]

##### Coverage

In [17]:
print(df[df!= -1].count()/df.shape[0])

Number            1.00
is_odd            0.55
is_two            0.05
is_known_prime    0.20
dtype: float64


#### Overlaps

In [18]:
LFAnalysis(L=L_train, lfs=lfs).lf_overlaps()

array([0.05, 0.05, 0.05])

#### Conflicts

In [19]:
LFAnalysis(L=L_train, lfs=lfs).lf_conflicts()

array([0.05, 0.05, 0.05])

### Calculating: Correct, Incorrect and Empirical Accuracy on the validation set

In [20]:
# apply the labeling functions to the validation set; returns a label from each L.F. for each data point. 
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary(true_labels)

100%|██████████| 7/7 [00:00<00:00, 5667.98it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_odd,0,[0],0.571429,0.142857,0.142857,3,1,0.75
is_two,1,[1],0.142857,0.142857,0.142857,1,0,1.0
is_known_prime,2,[1],0.571429,0.142857,0.142857,4,0,1.0


#### Applying the labeling functions to the validation set, just for illustration of calculating the above metrics

In [21]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,ground_truth
0,22,0,-1,-1,False
1,11,-1,-1,1,True
2,7,-1,-1,1,True
3,2,0,1,1,True
4,32,0,-1,-1,False
5,101,-1,-1,1,True
6,102,0,-1,-1,False


### Using Random Voter to determine the label

In [22]:
from snorkel.labeling.model import RandomVoter

In [23]:
random_model = RandomVoter()
preds_train_random = random_model.predict(L=L_train, tie_break_policy='abstain')
preds_valid_random = random_model.predict(L=L_valid)
df["preds_train_random"] = preds_train_random
#df[df["Number"] == 2]
df

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random
0,5,-1,-1,1,1
1,21,-1,-1,-1,0
2,1,-1,-1,-1,0
3,29,-1,-1,1,0
4,32,0,-1,-1,1
5,37,-1,-1,-1,1
6,10,0,-1,-1,1
7,20,0,-1,-1,0
8,10,0,-1,-1,1
9,26,0,-1,-1,1


Calculating the accuracy of the RandomVoter

In [24]:
#  METRICS = { 
#      "accuracy": Metric(skmetrics.accuracy_score), 
#      "coverage": Metric(_coverage_score, ["preds"]), 
#      "precision": Metric(skmetrics.precision_score), 
#      "recall": Metric(skmetrics.recall_score), 
#      "f1": Metric(_f1_score, ["golds", "preds"]), 
#      "f1_micro": Metric(_f1_micro_score, ["golds", "preds"]), 
#      "f1_macro": Metric(_f1_macro_score, ["golds", "preds"]), 
#      "fbeta": Metric(skmetrics.fbeta_score), 
#      "matthews_corrcoef": Metric(skmetrics.matthews_corrcoef), 
#      "roc_auc": Metric(_roc_auc_score, ["golds", "probs"]), 
#  } 
metrics = random_model.score(preds_valid_random, true_labels, metrics = ['accuracy', 'coverage', 'precision', 'recall', 'f1'])  #, metrics- =['accuracy'])

accuracy_random_model = metrics["accuracy"]

df_valid_metrics = pd.DataFrame
# df_valid_metrics["accur_rand_mod"] = np.ndarray(int(accuracy_random_model))
metrics



{'accuracy': 0.2857142857142857,
 'coverage': 1.0,
 'precision': 0.3333333333333333,
 'recall': 0.25,
 'f1': 0.28571428571428575}

### MajorityClassVoter

Predicts probabilities using just the majority class.  Assign majority class vote to each datapoint. In case of multiple majority classes, assign equal probabilities among them.  For example, if balance=[0.7,0.3], then all the data points will be labeled 0.  


In [25]:
from snorkel.labeling.model import MajorityClassVoter

In [26]:
majorityClass_model = MajorityClassVoter()
majorityClass_model.fit(balance=np.array([0.7, 0.3]))
# majorityClass_model.fit(balance=np.array([0.3, 0.7]))
majorityClass_train_random = majorityClass_model.predict(L=L_train)
majorityClass_valid_random = majorityClass_model.predict(L=L_valid)
df["majorityClass_pred"] = majorityClass_train_random
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred
10,2,0,1,1,1,0


In [27]:
metrics = majorityClass_model.score(majorityClass_valid_random, true_labels, metrics=['accuracy'])
accuracy_majority_class_model = metrics["accuracy"]
metrics



{'accuracy': 0.42857142857142855}

### Using MajorityLabelVoter to determine the label

In [28]:
majority_model = MajorityLabelVoter()
preds_train_majority_label = majority_model.predict(L=L_train)
preds_valid_majority_label = majority_model.predict(L=L_valid)
df["majorityLabel_pred"] = preds_train_majority_label

In [29]:
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred,majorityLabel_pred
10,2,0,1,1,1,0,1


In [30]:
# doesn't work:
# metrics2 = majority_model.score(preds_valid_majority, true_labels, metrics=['accuracy'])
# metrics2
val_accuracy_maj_label_voter_model = (preds_valid_majority_label == true_labels).mean()
val_accuracy_maj_label_voter_model

1.0

In [31]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
if is_even in lfs:
    df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
if gt3_ndb_3 in lfs:
    df_val["gt3_ndb_3"] = df_val.apply(gt3_ndb_3, axis=1)
df_val["pred_maj_label"] = preds_valid_majority_label
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,pred_maj_label,ground_truth
0,22,0,-1,-1,0,False
1,11,-1,-1,1,1,True
2,7,-1,-1,1,1,True
3,2,0,1,1,1,True
4,32,0,-1,-1,0,False
5,101,-1,-1,1,1,True
6,102,0,-1,-1,0,False


In [32]:
# np.round(majority_model.get_weights(), 2)
# 'MajorityLabelVoter' object has no attribute 'get_weights'

### Using LabelingModel to determine the label

Documentation: https://snorkel.readthedocs.io/en/v0.9.3/packages/_autosummary/labeling/snorkel.labeling.LabelModel.html

A model for learning the LF accuracies and combining their output labels.

This class learns a model of the labeling functions’ conditional probabilities of outputting the true (unobserved) label Y, P(lf | Y), and uses this learned model to re-weight and combine their output labels.

This class is based on the approach in [Training Complex Models with Multi-Task Weak Supervision](https://arxiv.org/abs/1810.02840), published in AAAI‘19. In this approach, we compute the inverse generalized covariance matrix of the junction tree of a given LF dependency graph, and perform a matrix completion-style approach with respect to these empirical statistics. The result is an estimate of the conditional LF probabilities, P(lf | Y), which are then set as the parameters of the label model used to re-weight and combine the labels output by the LFs.

Currently this class uses a conditionally independent label model, in which the LFs are assumed to be conditionally independent given Y.



In [33]:
label_model = LabelModel()
# no y_train data!
label_model.fit(L_train=L_train, n_epochs=200, seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1031.04epoch/s]
100%|██████████| 7/7 [00:00<00:00, 4701.38it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.571429,0.142857,0.142857
is_two,1,[1],0.142857,0.142857,0.142857
is_known_prime,2,[1],0.571429,0.142857,0.142857


In [34]:

preds_train_labelingModel = label_model.predict(L=L_train)
preds_valid_labelingModel = label_model.predict(L=L_valid)

Examine the weights of the label_model for each classification source (labeling function).  Labeling functions that make more mistakes would be expected to have lower weights.

In [35]:
df["preds_label_model"] = preds_train_labelingModel

In [36]:
#df[["Number", "preds_train_random", "majorityClass_pred", "majorityLabel_pred", "preds_labelingModel"]]
df

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred,majorityLabel_pred,preds_label_model
0,5,-1,-1,1,1,0,1,1
1,21,-1,-1,-1,0,0,-1,-1
2,1,-1,-1,-1,0,0,-1,-1
3,29,-1,-1,1,0,0,1,1
4,32,0,-1,-1,1,0,0,0
5,37,-1,-1,-1,1,0,-1,-1
6,10,0,-1,-1,1,0,0,0
7,20,0,-1,-1,0,0,0,0
8,10,0,-1,-1,1,0,0,0
9,26,0,-1,-1,1,0,0,0


#### LabelModel with class balance

In [37]:
# 7/20 prime numbers
label_model_wcb = LabelModel()
# no Y_train data!!
label_model_wcb.fit(L_train=L_train, n_epochs=200, class_balance = [0.7, 0.3], seed=100)
preds_train_label_wcb = label_model_wcb.predict(L=L_train)
preds_valid_label_wcb = label_model_wcb.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1409.39epoch/s]
100%|██████████| 7/7 [00:00<00:00, 6769.69it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.571429,0.142857,0.142857
is_two,1,[1],0.142857,0.142857,0.142857
is_known_prime,2,[1],0.571429,0.142857,0.142857


In [38]:
# with class balance
# preds_train_labelingModel_wcb = label_model_wcb.predict(L=L_train)
df["preds_label_model_wcb"] = preds_train_label_wcb

In [39]:
df[["Number", "preds_label_model", "preds_label_model_wcb"]]

Unnamed: 0,Number,preds_label_model,preds_label_model_wcb
0,5,1,1
1,21,-1,0
2,1,-1,0
3,29,1,1
4,32,0,0
5,37,-1,0
6,10,0,0
7,20,0,0
8,10,0,0
9,26,0,0


Looking at the weights of the label_model

In [40]:
np.round(label_model_wcb.get_weights(), 2)

array([0.94, 0.93, 0.77])

In [41]:
accuracy_labeling_model_wcbal = (preds_valid_label == true_labels).mean()
accuracy_labeling_model_wcbal

1.0

And the actual conditional probability values placed in a matrix with dimensions [number of labeling function, number of labels + 1 (for abstain), number of classes], rounded are as follows:


def get_conditional_probs(self) -> np.ndarray:

    r"""Return the estimated conditional probabilities table.

    Return the estimated conditional probabilites table cprobs, where cprobs is an
    (m, k+1, k)-dim np.ndarray with:

        cprobs[i, j, k] = P(\lf_i = j-1 | Y = k)

    where m is the number of LFs, k is the cardinality, and cprobs includes the
    conditional abstain probabilities P(\lf_i = -1 | Y = y).

    Returns
    -------
    np.ndarray
        An [m, k + 1, k] np.ndarray conditional probabilities table.
    """
```
array([
labeling function 1  (m=0)
       [[
(j = 0)        P( lf=1 | Y = 0 ) = 0.114, P( lf=1 | Y=1 ) = 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],
labeling function 2
       [[0.114, 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],

       [[0.91 , 0.923],
        [0.01 , 0.01 ],
        [0.08 , 0.067]],

       [[0.869, 0.58 ],
        [0.01 , 0.01 ],
        [0.121, 0.41 ]]])
```

Looking at the conditional probabilities 

In [42]:
np.round(label_model.get_conditional_probs(), 3)

array([[[0.179, 0.699],
        [0.811, 0.291],
        [0.01 , 0.01 ]],

       [[0.957, 0.864],
        [0.01 , 0.01 ],
        [0.033, 0.126]],

       [[0.98 , 0.571],
        [0.01 , 0.01 ],
        [0.01 , 0.419]]])

### Use the model to generate a larger set of labeled data

In [43]:
# new_data_to_be_labeled = range(50, 150)
# df_new_data = pd.DataFrame(new_data_to_be_labeled, columns=['Number'])
# L_train = applier.apply(df=df_new_data)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.05,0.05
is_two,1,[1],0.05,0.05,0.05
is_known_prime,2,[1],0.2,0.05,0.05


## Logistic Regression

Now that we have a model (using Snorkel LabelModel with class balance) let's label some data.

Sources include ML Bookcamp, Snorkel docs

Filtering out unlabeled data points

As we saw earlier, some of the data points in our train set received no labels from any of our LFs. These data points convey no supervision signal and tend to hurt performance, so we filter them out before training using a built-in utility.


In [44]:

# from snorkel.labeling import filter_unlabeled_dataframe

# df_train = pd.DataFrame(data, columns=['Number'])
# df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
#     X=df_train, y=preds_train_label_wcb, L=L_train
# )

The output of the Snorkel LabelModel is a set of labels which can be used with most popular libraries for performing supervised learning, such as TensorFlow, Keras, PyTorch, Scikit-Learn, Ludwig, and XGBoost. In the Snorkel spam tutorial we use the well-known library Scikit-Learn. Note that typically, Snorkel is used (and really shines!) with much more complex, training data-hungry models, but we will use Logistic Regression here for simplicity of exposition.  Source: https://www.snorkel.org/use-cases/01-spam-tutorial

the LabelModel outputs probabilistic (float) labels. If the classifier we are training accepts target labels as floats, we can train on these labels directly (see describe the properties of this type of “noise-aware” loss in our NeurIPS 2016 paper).

If we want to use a library or model that doesn’t accept probabilistic labels (such as Scikit-Learn), we can instead replace each label distribution with the label of the class that has the maximum probability. This can easily be done using the probs_to_preds helper method. We do note, however, that this transformation is lossy, as we no longer have values for our confidence in each label.
 

In [45]:
# from snorkel.utils import probs_to_preds
# preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
logistic_regr_model = LogisticRegression(solver='liblinear', random_state=1)
logistic_regr_model.fit(L_train, preds_train_label_wcb)  # ) y_train)

In [48]:
validation_pred_probs = logistic_regr_model.predict_proba(L_valid)
validation_pred_probs

array([[0.93715619, 0.06284381],
       [0.22348245, 0.77651755],
       [0.22348245, 0.77651755],
       [0.11021325, 0.88978675],
       [0.93715619, 0.06284381],
       [0.22348245, 0.77651755],
       [0.93715619, 0.06284381]])

In [49]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["gt3_ndb_3"] = df_val.apply(gt3_ndb_3, axis=1)
df_val["pred_majority"] = preds_valid_majority_label
df_val["ground_truth"] = true_labels
df_val["log_reg_p(1)"] = validation_pred_probs[:,1]
gt_one_half = lambda x: (x > 0.5)
df_val["log_reg_pred"] = np.multiply(gt_one_half(df_val["log_reg_p(1)"]),1 )
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,gt3_ndb_3,pred_majority,ground_truth,log_reg_p(1),log_reg_pred
0,22,0,-1,-1,-1,0,False,0.062844,0
1,11,-1,-1,1,-1,1,True,0.776518,1
2,7,-1,-1,1,-1,1,True,0.776518,1
3,2,0,1,1,-1,1,True,0.889787,1
4,32,0,-1,-1,-1,0,False,0.062844,0
5,101,-1,-1,1,-1,1,True,0.776518,1
6,102,0,-1,-1,0,0,False,0.062844,0


In [50]:
accuracy_val_log_regr = (df_val["log_reg_pred"] == df_val["ground_truth"]).mean()
print(accuracy_val_log_regr) 

1.0


### Use Logistic Regression model to label new, larger data set

In [51]:
#new_data_to_be_labeled = range(50, 150)
# df_primes = is_it_prime.make_primes_df(200)
# df_new_data_to_be_labeled_by_regr_model = df_primes.loc[50:150, ["Number", "ground_truth"]].reset_index(drop=True)
# df_gr_truth_new_data = df_primes.loc[50:150, ["ground_truth"]].reset_index(drop=True)
# df_new_data = pd.DataFrame(new_data_to_be_labeled_by_regr_model, columns=['Number'])
# df2 = df_new_data_to_be_labeled_by_regr_model["Number"].reset_index(drop=True)
applier2 = PandasLFApplier(lfs=lfs)
nums, labels = is_it_prime.make_list_of_num_and_labels()
L = applier2.apply(pd.DataFrame(nums))
LFAnalysis(L, lfs).lf_summary()
# L = applier.apply(df=df2)
# LFAnalysis(L=L, lfs=lfs).lf_summary()

  0%|          | 1/200 [00:00<00:00, 1675.04it/s]


KeyError: 'Number'

In [None]:
new_data_pred_probs = logistic_regr_model.predict_proba(L)

gt_one_half = lambda x: (x > 0.5)
df_new_data_to_be_labeled_by_regr_model["log_reg_pred"] = np.multiply(gt_one_half(new_data_pred_probs[:, 1]),1 )

