In [95]:
import pandas as pd
import numpy as np
import random

from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter

### Create a small example dataset

In [96]:
data =    [5, 21, 1, 29, 32, 37, 10, 20, 10, 26, 2, 37, 34, 11, 22, 36, 12, 20, 31, 25]
y_train = [1,  0, 1, 1,   0,   1,  0,  0,  0,  0, 1,  1,  0,  1,  0,   0,   0,  0,  1,  0]

In [97]:
df = pd.DataFrame(data, columns=['Number'])

### Creating a small validation set

In [154]:
import is_it_prime
# validation = [22, 11, 7, 2, 32]
validation = [22, 11, 7, 2, 32, 101, 102]

# true_labels = np.array([0, 1, 1, 1, 0])
# true_labels = np.array([0, 1, 1, 1, 0, 1, 0])
true_labels = is_it_prime.array_map(validation)

In [152]:
#np.fromfunction(lambda i: i * i, validation, dtype=int)




TypeError: <lambda>() takes 1 positional argument but 7 were given

Ground truth array for the validation set. 

22 -> not prime [0]<br>
11 -> prime [1]<br>
7 -> prime [1]<br>
2 -> prime [1]<br>
32 -> not prime [0]<br>

In [99]:
df_val = pd.DataFrame(validation, columns=['Number'])

In [100]:
df_tl = pd.DataFrame(true_labels, columns=['Number'])

### Labeling functions

In [101]:
ABSTAIN = -1
NON_PRIME = 0
PRIME = 1

In [102]:
# if odd, abstain else non-prime.
@labeling_function()
def is_odd(record):
    if record["Number"]%2 == 1:
        return ABSTAIN
    else:
        return NON_PRIME

In [103]:
# if even, then non-prime else abstain
# @labeling_function()
# def is_even(record):
#     if record["Number"]%2 == 0:
#         return NON_PRIME
#     else:
#         return ABSTAIN

In [104]:
@labeling_function()
def is_two(record):
    if record["Number"] == 2:
        return PRIME
    else:
        return ABSTAIN

In [105]:
@labeling_function()
def is_known_prime(record):
    # known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
    known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 101]
    if record["Number"] in known_primes:
        return PRIME
    else:
        return ABSTAIN

In [106]:
# if > 3 and not evenly divisible by 3 abstain else non-prime.
@labeling_function()
def gt_3_and_3_not_a_factor(record):
    if record["Number"]>3 and record["Number"]%2 == 1:
        return ABSTAIN
    else:
        return NON_PRIME

### Calculating: Polarity, Coverage, Overlaps and Conflics for the labeling functions

In [107]:
lfs = [
        is_odd,
#        is_even, 
        is_two,
        is_known_prime
      ]
# apply labeling function s with PandasLFApplier to get a two-dimensional matrix of labels.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 20/20 [00:00<00:00, 9484.01it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.05,0.05
is_two,1,[1],0.05,0.05,0.05
is_known_prime,2,[1],0.2,0.05,0.05


#### Applying the labeling functions to the dataset, just for illustration of calcualting the above metrics

In [108]:
df["is_odd"] = df.apply(is_odd, axis=1)
# df["is_even"] = df.apply(is_even, axis=1)
df["is_two"] = df.apply(is_two, axis=1)
df["is_known_prime"] = df.apply(is_known_prime, axis=1)

In [109]:
# df[["Number", "is_odd", "is_even", "is_two", "is_known_prime"]]
df[["Number", "is_odd", "is_two", "is_known_prime"]]

Unnamed: 0,Number,is_odd,is_two,is_known_prime
0,5,-1,-1,1
1,21,-1,-1,-1
2,1,-1,-1,-1
3,29,-1,-1,1
4,32,0,-1,-1
5,37,-1,-1,-1
6,10,0,-1,-1
7,20,0,-1,-1
8,10,0,-1,-1
9,26,0,-1,-1


##### Polarity

In [110]:
LFAnalysis(L=L_train, lfs=lfs).lf_polarities()

[[0], [1], [1]]

##### Coverage

In [111]:
print(df[df!= -1].count()/df.shape[0])

Number            1.00
is_odd            0.55
is_two            0.05
is_known_prime    0.20
dtype: float64


#### Overlaps

In [112]:
LFAnalysis(L=L_train, lfs=lfs).lf_overlaps()

array([0.05, 0.05, 0.05])

#### Conflicts

In [113]:
LFAnalysis(L=L_train, lfs=lfs).lf_conflicts()

array([0.05, 0.05, 0.05])

### Calculating: Correct, Incorrect and Empirical Accuracy on the validation set

In [114]:
# apply the labeling functions to the validation set; returns a label from each L.F. for each data point. 
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary(true_labels)

100%|██████████| 7/7 [00:00<00:00, 6464.14it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_odd,0,[0],0.571429,0.142857,0.142857,3,1,0.75
is_two,1,[1],0.142857,0.142857,0.142857,1,0,1.0
is_known_prime,2,[1],0.571429,0.142857,0.142857,4,0,1.0


#### Applying the labeling functions to the validation set, just for illustration of calculating the above metrics

In [115]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,ground_truth
0,22,0,-1,-1,0
1,11,-1,-1,1,1
2,7,-1,-1,1,1
3,2,0,1,1,1
4,32,0,-1,-1,0
5,101,-1,-1,1,1
6,102,0,-1,-1,0


### Using Random Voter to determine the label

In [116]:
from snorkel.labeling.model import RandomVoter

In [117]:
random_model = RandomVoter()
preds_train_random = random_model.predict(L=L_train, tie_break_policy='abstain')
preds_valid_random = random_model.predict(L=L_valid)
df["preds_train_random"] = preds_train_random
#df[df["Number"] == 2]
df

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random
0,5,-1,-1,1,1
1,21,-1,-1,-1,0
2,1,-1,-1,-1,1
3,29,-1,-1,1,0
4,32,0,-1,-1,1
5,37,-1,-1,-1,0
6,10,0,-1,-1,1
7,20,0,-1,-1,0
8,10,0,-1,-1,1
9,26,0,-1,-1,1


Calculating the accuracy of the RandomVoter

In [147]:
#  METRICS = { 
#      "accuracy": Metric(skmetrics.accuracy_score), 
#      "coverage": Metric(_coverage_score, ["preds"]), 
#      "precision": Metric(skmetrics.precision_score), 
#      "recall": Metric(skmetrics.recall_score), 
#      "f1": Metric(_f1_score, ["golds", "preds"]), 
#      "f1_micro": Metric(_f1_micro_score, ["golds", "preds"]), 
#      "f1_macro": Metric(_f1_macro_score, ["golds", "preds"]), 
#      "fbeta": Metric(skmetrics.fbeta_score), 
#      "matthews_corrcoef": Metric(skmetrics.matthews_corrcoef), 
#      "roc_auc": Metric(_roc_auc_score, ["golds", "probs"]), 
#  } 
metrics = random_model.score(preds_valid_random, true_labels, metrics = ['accuracy', 'coverage', 'precision', 'recall', 'f1'])  #, metrics- =['accuracy'])

accuracy_random_model = metrics["accuracy"]
metrics



{'accuracy': 0.5714285714285714,
 'coverage': 1.0,
 'precision': 0.6666666666666666,
 'recall': 0.5,
 'f1': 0.5714285714285715}

### MajorityClassVoter

In [119]:
from snorkel.labeling.model import MajorityClassVoter
"""Predict probabilities using majority class.
        Assign majority class vote to each datapoint.
        In case of multiple majority classes, assign equal probabilities among them."""

'Predict probabilities using majority class.\n        Assign majority class vote to each datapoint.\n        In case of multiple majority classes, assign equal probabilities among them.'

In [120]:
majorityClass_model = MajorityClassVoter()
majorityClass_model.fit(balance=np.array([0.7, 0.3]))
majorityClass_train_random = majorityClass_model.predict(L=L_train)
majorityClass_valid_random = majorityClass_model.predict(L=L_valid)
df["majorityClass_pred"] = majorityClass_train_random
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred
10,2,0,1,1,0,0


In [121]:
metrics = majorityClass_model.score(majorityClass_valid_random, true_labels, metrics=['accuracy'])
accuracy_majority_class_model = metrics["accuracy"]
metrics



{'accuracy': 0.42857142857142855}

### Using MajorityLabelVoter to determine the label

In [122]:
majority_model = MajorityLabelVoter()
preds_train_majority = majority_model.predict(L=L_train)
preds_valid_majority = majority_model.predict(L=L_valid)
df["majorityLabel_pred"] = preds_train_majority

In [123]:
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_two,is_known_prime,preds_train_random,majorityClass_pred,majorityLabel_pred
10,2,0,1,1,0,0,1


In [124]:
# doesn't work
# metrics2 = majority_model.score(preds_valid_majority, true_labels, metrics=['accuracy'])
# metrics2
accuracy_maj_voter_model = (preds_valid_majority == true_labels).mean()
accuracy_maj_voter_model

1.0

In [125]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["pred_majority"] = preds_valid_majority
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,pred_majority,ground_truth
0,22,0,-1,-1,0,0
1,11,-1,-1,1,1,1
2,7,-1,-1,1,1,1
3,2,0,1,1,1,1
4,32,0,-1,-1,0,0
5,101,-1,-1,1,1,1
6,102,0,-1,-1,0,0


In [126]:
# np.round(majority_model.get_weights(), 2)
# 'MajorityLabelVoter' object has no attribute 'get_weights'

### Using LabelingModel to determine the label

In [127]:
label_model = LabelModel()
# no y_train data!
label_model.fit(L_train=L_train, n_epochs=200, seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1552.08epoch/s]
100%|██████████| 7/7 [00:00<00:00, 6260.16it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.571429,0.142857,0.142857
is_two,1,[1],0.142857,0.142857,0.142857
is_known_prime,2,[1],0.571429,0.142857,0.142857


In [128]:

preds_train_labelingModel = label_model.predict(L=L_train)
preds_valid_labelingModel = label_model.predict(L=L_valid)

Examine the weights of the label_model for each classification source (labeling function).  Labeling functions that make more mistakes would be expected to have lower weights.

In [129]:
df["preds_labelingModel"] = preds_train_labelingModel

In [130]:
df[["Number", "preds_train_random", "majorityClass_pred", "majorityLabel_pred", "preds_labelingModel"]]

Unnamed: 0,Number,preds_train_random,majorityClass_pred,majorityLabel_pred,preds_labelingModel
0,5,1,0,1,1
1,21,0,0,-1,-1
2,1,1,0,-1,-1
3,29,0,0,1,1
4,32,1,0,0,0
5,37,0,0,-1,-1
6,10,1,0,0,0
7,20,0,0,0,0
8,10,1,0,0,0
9,26,1,0,0,0


In [131]:
# 7/20 prime numbers
label_model = LabelModel()
# no Y_train data!!
label_model.fit(L_train=L_train, n_epochs=200, class_balance = [0.7, 0.3], seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()


100%|██████████| 200/200 [00:00<00:00, 1559.77epoch/s]
100%|██████████| 7/7 [00:00<00:00, 7029.00it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.571429,0.142857,0.142857
is_two,1,[1],0.142857,0.142857,0.142857
is_known_prime,2,[1],0.571429,0.142857,0.142857


In [132]:

preds_train_labelingModel = label_model.predict(L=L_train)
df["preds_labelingModel_wclassBalance"] = preds_train_labelingModel

In [133]:
df[["Number", "preds_labelingModel", "preds_labelingModel_wclassBalance"]]

Unnamed: 0,Number,preds_labelingModel,preds_labelingModel_wclassBalance
0,5,1,1
1,21,-1,0
2,1,-1,0
3,29,1,1
4,32,0,0
5,37,-1,0
6,10,0,0
7,20,0,0
8,10,0,0
9,26,0,0


Looking at the weights of the label_model

In [134]:
np.round(label_model.get_weights(), 2)

array([0.94, 0.93, 0.77])

In [135]:
accuracy_labeling_model = (preds_valid_label == true_labels).mean()
accuracy_labeling_model

1.0

And the actual conditional probability values placed in a matrix with dimensions [number of labeling function, number of labels + 1 (for abstain), number of classes], rounded are as follows:


def get_conditional_probs(self) -> np.ndarray:

    r"""Return the estimated conditional probabilities table.

    Return the estimated conditional probabilites table cprobs, where cprobs is an
    (m, k+1, k)-dim np.ndarray with:

        cprobs[i, j, k] = P(\lf_i = j-1 | Y = k)

    where m is the number of LFs, k is the cardinality, and cprobs includes the
    conditional abstain probabilities P(\lf_i = -1 | Y = y).

    Returns
    -------
    np.ndarray
        An [m, k + 1, k] np.ndarray conditional probabilities table.
    """
```

array([
labeling function 1    
       [[
        P(lf=1|Y!=y) = 0.114, P(lf=|Y=y) = 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],
labeling function 2
       [[0.114, 0.98 ],
        [0.876, 0.01 ],
        [0.01 , 0.01 ]],

       [[0.91 , 0.923],
        [0.01 , 0.01 ],
        [0.08 , 0.067]],

       [[0.869, 0.58 ],
        [0.01 , 0.01 ],
        [0.121, 0.41 ]]])
```

Looking at the conditional probabilities 

In [136]:
np.round(label_model.get_conditional_probs(), 3)

array([[[0.257, 0.865],
        [0.733, 0.125],
        [0.01 , 0.01 ]],

       [[0.934, 0.859],
        [0.01 , 0.01 ],
        [0.056, 0.131]],

       [[0.918, 0.501],
        [0.01 , 0.01 ],
        [0.072, 0.489]]])

## Logistic Regression

From ML Bookcamp

In [137]:
from sklearn.linear_model import LogisticRegression

In [138]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(L_train, y_train)

In [139]:
validation_pred_probs = model.predict_proba(L_valid)
validation_pred_probs

array([[0.82825155, 0.17174845],
       [0.1104765 , 0.8895235 ],
       [0.1104765 , 0.8895235 ],
       [0.18750053, 0.81249947],
       [0.82825155, 0.17174845],
       [0.1104765 , 0.8895235 ],
       [0.82825155, 0.17174845]])

In [140]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
# df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["pred_majority"] = preds_valid_majority
df_val["ground_truth"] = true_labels
df_val["log_reg_p=1"] = validation_pred_probs[:,1]
x = lambda x: (x > 0.5)
df_val["log_reg_pred"] = np.multiply(x(df_val["log_reg_p=1"]),1 )
df_val

Unnamed: 0,Number,is_odd,is_two,is_known_prime,pred_majority,ground_truth,log_reg_p=1,log_reg_pred
0,22,0,-1,-1,0,0,0.171748,0
1,11,-1,-1,1,1,1,0.889523,1
2,7,-1,-1,1,1,1,0.889523,1
3,2,0,1,1,1,1,0.812499,1
4,32,0,-1,-1,0,0,0.171748,0
5,101,-1,-1,1,1,1,0.889523,1
6,102,0,-1,-1,0,0,0.171748,0


In [141]:
accuracy_log_regr = (df_val["log_reg_pred"] == df_val["ground_truth"]).mean()