In [None]:
!pip install snorkel

# Load Data

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_pickle(
    "https://github.com/khuyentran1401/Data-science/blob/master/feature_engineering/snorkel_example/train_fake_jobs.pkl?raw=true"
)
test_df = pd.read_pickle(
    "https://github.com/khuyentran1401/Data-science/blob/master/feature_engineering/snorkel_example/test_fake_jobs.pkl?raw=true"
)

# Create Labeling Functions

In [3]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

In [4]:
FAKE = 1
REAL = 0
ABSTAIN = -1

In [5]:
FAKE_COMPANIES = [
    "Aker Solutions",
    "Aptitude Staffing Solutions",
    "Gary Cartwright",
    "Edison International and Refined Resources",
    "Le Meridien",
]

In [6]:
@labeling_function()
def no_requirements(x: pd.Series):

    return FAKE if x.requirements == "" else ABSTAIN


@labeling_function()
def requirements_less_than_10(x: pd.Series):
    num_words = len(x.requirements.split(" "))
    return FAKE if num_words <= 10 else ABSTAIN


@labeling_function()
def requirements_less_than_20(x: pd.Series):
    num_words = len(x.requirements.split(" "))
    return FAKE if num_words <= 20 else ABSTAIN


@labeling_function()
def no_company_profile(x: pd.Series):
    return FAKE if x.company_profile == "" else ABSTAIN


@labeling_function()
def no_company_logo(x: pd.Series):
    return FAKE if x.has_company_logo == 0 else ABSTAIN


@labeling_function()
def suspicious_company(x: pd.Series):
    return (
        FAKE
        if any(
            company.lower() in x.company_profile.lower() for company in FAKE_COMPANIES
        )
        else ABSTAIN
    )


@labeling_function()
def has_background_check(x: pd.Series):
    return REAL if "background check" in x.requirements else ABSTAIN


@labeling_function()
def required_experience(x: pd.Series):
    return REAL if pd.notna(x.required_experience) else ABSTAIN


@labeling_function()
def required_education(x: pd.Series):
    return REAL if pd.notna(x.required_education) else ABSTAIN

# Apply Labeling Functions to the Data

In [165]:
lfs = [
    no_company_profile,
    suspicious_company,
    no_company_logo,
    has_background_check,
    required_experience,
    required_education,
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)

100%|█████████████████| 13410/13410 [00:00<00:00, 14877.06it/s]


<IPython.core.display.Javascript object>

# Evaluate Labeling Functions

In [166]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary(Y=train_df.fraudulent.values)



Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
no_company_profile,0,[1],0.186204,0.170694,0.110365,459,2038,0.183821
suspicious_company,1,[1],0.006861,0.005667,0.005667,92,0,1.0
no_company_logo,2,[1],0.205742,0.161894,0.101566,459,2300,0.166365
has_background_check,3,[0],0.035496,0.032438,0.002759,464,12,0.97479
required_experience,4,[0],0.60701,0.507755,0.119165,7809,331,0.959337
required_education,5,[0],0.551603,0.489485,0.10179,7075,322,0.956469


<IPython.core.display.Javascript object>

## Evaluate Conflicts

In [167]:
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(L_train[:, 2], L_train[:, 4])

res = train_df.iloc[buckets[(FAKE, REAL)]].sample(10, random_state=1)[
    ["has_company_logo", "required_experience", "fraudulent"]
]

res

Unnamed: 0,has_company_logo,required_experience,fraudulent
16877,0,Mid-Senior level,0
17068,0,Mid-Senior level,0
16816,0,Mid-Senior level,0
12186,0,Not Applicable,0
16808,0,Mid-Senior level,0
678,0,Not Applicable,0
3706,0,Entry level,0
14150,0,Entry level,0
2199,0,Entry level,0
4411,0,Entry level,0


<IPython.core.display.Javascript object>

# Combine Predictions

## MajorityLabelVoter

In [168]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

<IPython.core.display.Javascript object>

## LabelModel

In [174]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=1)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                               | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.841]
INFO:root:[100 epochs]: TRAIN:[loss=0.012]
 22%|████▏              | 109/500 [00:00<00:00, 1087.56epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.011]
INFO:root:[300 epochs]: TRAIN:[loss=0.010]
 61%|███████████▋       | 307/500 [00:00<00:00, 1610.15epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.010]
100%|███████████████████| 500/500 [00:00<00:00, 1683.12epoch/s]
INFO:root:Finished Training


<IPython.core.display.Javascript object>

In [170]:
L_test = applier.apply(df=test_df)

100%|███████████████████| 4470/4470 [00:00<00:00, 14329.87it/s]


<IPython.core.display.Javascript object>

In [171]:
Y_train = train_df["fraudulent"]
Y_test = test_df["fraudulent"]

<IPython.core.display.Javascript object>

In [172]:
majority_acc = majority_model.score(
    L=L_test,
    Y=Y_test,
)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   80.8%


<IPython.core.display.Javascript object>

In [173]:
label_model_acc = label_model.score(L=L_test, Y=Y_test)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     71.9%


<IPython.core.display.Javascript object>