In [None]:
!pip install snorkel 

# Load Data

In [15]:
import pandas as pd

In [16]:
# SECURITY: unpickling can execute arbitrary code embedded in the file. These
# pickles are downloaded over the network, so only run this cell if you trust
# the source repository.
DATA_URL = (
    "https://github.com/khuyentran1401/Data-science/blob/master/"
    "feature_engineering/snorkel_example/{name}.pkl?raw=true"
)

# Train and test splits of the job-postings data share the same URL layout.
train_df = pd.read_pickle(DATA_URL.format(name="train_fake_jobs"))
test_df = pd.read_pickle(DATA_URL.format(name="test_fake_jobs"))

In [17]:
# Display the training frame; the rendered output elides the middle rows.
train_df

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
12276,12277,Big Data Analyst,"GB, WSM, London",Product Ops,,Founded in 2010 by a team from Google’s London...,Qubit: Cutting Edge Big Data EngineeringQubit ...,"What you'll need: A background in consulting, ...",Plenty of perks:As well as the opportunity to ...,0,1,1,Full-time,Associate,Bachelor's Degree,Internet,Product Management,0
14680,14681,Instructional Advocate,"US, GA, Savannah",,,We are an after-school program committed to as...,21st Century Community Learning Centers is an ...,Bachelor's Degree or an Associate's Degree; or...,,0,1,0,Part-time,,,,Education,0
16518,16519,Software Developer,"US, FL, Gainesville",,,352 Inc. is a full-service digital agency crea...,We partner with great clients to build smart s...,3-5 years of great c# work Experience in mvc o...,What You’ll GetFreedom: We trust you to do you...,0,1,0,Full-time,Mid-Senior level,,Computer Software,Information Technology,0
15478,15479,Internship in India,"IN, , Bangalore",,,,"London is a fast paced city of culture, divers...",,As specialists in delivering high quality and ...,0,1,0,,,,,,0
16348,16349,Web Developer Backend Microservices (m/f),"DE, BE, 10969",Engineering,,airfy prägt sicheres und einfach zu bedienende...,Design and develop a microservice platform for...,"Senior level experience with web backends, esp...",Flat hierarchies and a productive work environ...,0,1,0,Full-time,Associate,Bachelor's Degree,Internet,Engineering,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,9226,Senior Tax Professional for Product Manager,"US, CA, San Francisco",Product,100000-150000,Balanced Labs exists to provide accountants an...,Never before has a job in tax looked more inte...,10+ years tax experience. Extensive knowledge ...,,0,1,1,Full-time,Director,Bachelor's Degree,Accounting,Strategy/Planning,0
13123,13124,QA Engineer,"US, CA, San Mateo",,,Tile is one of the most successful crowd-funde...,"The RoleHere at Tile, we are looking for a sma...","Mandatory Qualities Tech enthusiast, you love ...",Own equity in the company. Every employee is a...,0,1,0,Full-time,Associate,Bachelor's Degree,Consumer Electronics,Engineering,0
9845,9846,Senior QA Engineer,"IN, , Hyderabad",Information Technology,,#url_ddb080358fa5eecf5a67c649cfb4ffc343c484389...,Responsibilities:* Acquire and maintain a comp...,Required Skills and Experience: Minimum of fiv...,What Is OfferedOur client's core values drive ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Marketing and Advertising,Engineering,1
10799,10800,Customer Service Associate,"US, CT, Hartford",,,"Novitex Enterprise Solutions, formerly Pitney ...",We are currently seeking a Customer Service As...,Minimum Requirements: Minimum of 2 years custo...,,0,1,0,Full-time,Entry level,High School or equivalent,Facilities Services,Customer Service,0


# Create Labeling Functions

In [18]:
from snorkel.labeling import labeling_function

In [36]:
# Integer votes a labeling function may return.
ABSTAIN = -1  # the function has no opinion on this posting
REAL = 0      # the posting looks legitimate
FAKE = 1      # the posting looks fraudulent

In [20]:
@labeling_function()
def no_company_profile(x: pd.Series) -> int:
    """Vote FAKE when a posting has no company profile, otherwise abstain.

    Args:
        x: A single posting; only its ``company_profile`` field is read.

    Returns:
        ``FAKE`` if the profile is missing (NaN/None) or an empty string,
        ``ABSTAIN`` otherwise.
    """
    profile = x.company_profile
    # A missing profile can show up either as "" or as NaN/None. Comparing
    # only against "" would silently abstain on the latter.
    if pd.isna(profile) or profile == "":
        return FAKE
    return ABSTAIN


@labeling_function()
def no_company_logo(x: pd.Series):
    """Vote FAKE when ``has_company_logo`` equals 0; abstain otherwise."""
    if x.has_company_logo == 0:
        return FAKE
    return ABSTAIN


@labeling_function()
def has_background_check(x: pd.Series) -> int:
    """Vote REAL when the requirements text mentions a background check.

    Args:
        x: A single posting; only its ``requirements`` field is read.

    Returns:
        ``REAL`` if the substring ``"background check"`` occurs in the
        requirements text, ``ABSTAIN`` otherwise (including when the field
        is not a string at all).
    """
    requirements = x.requirements
    # `in` raises TypeError on a non-string value (e.g. a NaN float standing
    # in for a missing field), so abstain rather than crash the applier.
    if not isinstance(requirements, str):
        return ABSTAIN
    return REAL if "background check" in requirements else ABSTAIN


@labeling_function()
def required_experience(x: pd.Series):
    """Vote REAL when ``required_experience`` is filled in; abstain if it is missing."""
    if pd.isna(x.required_experience):
        return ABSTAIN
    return REAL


@labeling_function()
def required_education(x: pd.Series):
    """Vote REAL when ``required_education`` is filled in; abstain if it is missing."""
    education_given = pd.notna(x.required_education)
    if education_given:
        return REAL
    return ABSTAIN

# Apply Labeling Functions to the Data

In [21]:
from snorkel.labeling import PandasLFApplier

# Order matters: column j of the resulting label matrix holds the votes of lfs[j].
lfs = [
    no_company_profile,
    no_company_logo,
    has_background_check,
    required_experience,
    required_education,
]

# Apply every labeling function to the training frame to build the label matrix.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)


100%|██████████| 13410/13410 [00:00<00:00, 42601.73it/s]


In [30]:
# Build the label matrix for the test frame with the same applier (same labeling functions, same column order).
L_test = applier.apply(df=test_df)

100%|██████████| 4470/4470 [00:00<00:00, 39809.03it/s]


In [47]:
# Dimensions of the training label matrix: (rows of train_df, number of labeling functions).
L_train.shape

(13410, 5)

In [48]:
# First two rows of the label matrix; -1 is ABSTAIN, 0 is REAL, 1 is FAKE.
L_train[0:2]

array([[-1, -1, -1,  0,  0],
       [-1, -1, -1, -1, -1]])

# Evaluate Labeling Functions

In [24]:
from snorkel.labeling import LFAnalysis

# Per-function summary; passing the gold labels as Y adds the Correct / Incorrect / Emp. Acc. columns.
LFAnalysis(L=L_train, lfs=lfs).lf_summary(Y=train_df.fraudulent.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
no_company_profile,0,[1],0.186204,0.170694,0.110365,459,2038,0.183821
no_company_logo,1,[1],0.205742,0.161894,0.101566,459,2300,0.166365
has_background_check,2,[0],0.035496,0.032438,0.002759,464,12,0.97479
required_experience,3,[0],0.60701,0.507308,0.114392,7809,331,0.959337
required_education,4,[0],0.551603,0.488591,0.09657,7075,322,0.956469


$$
\text{Accuracy}=\frac{\text{Correct}}{\text{Correct}+\text{Incorrect}}
$$

In [49]:
# Same summary without gold labels: only Polarity, Coverage, Overlaps and Conflicts are reported.
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
no_company_profile,0,[1],0.186204,0.170694,0.110365
no_company_logo,1,[1],0.205742,0.161894,0.101566
has_background_check,2,[0],0.035496,0.032438,0.002759
required_experience,3,[0],0.60701,0.507308,0.114392
required_education,4,[0],0.551603,0.488591,0.09657


## Evaluate Conflicts

In [43]:
from snorkel.analysis import get_label_buckets

# Group training-row positions by the pair of votes from column 1 (no_company_logo)
# and column 3 (required_experience) of the label matrix.
buckets = get_label_buckets(L_train[:, 1], L_train[:, 3])
buckets


{(-1, 0): array([    0,     2,     4, ..., 13406, 13407, 13408]),
 (-1, -1): array([    1,     3,     7, ..., 13398, 13401, 13403]),
 (1, 0): array([    6,     8,    20, ..., 13361, 13370, 13389]),
 (1, -1): array([   10,    21,    26, ..., 13385, 13404, 13409])}

In [50]:
# Row positions where the first bucketed function voted FAKE and the second voted REAL.
conflicted_buckets = buckets[(FAKE, REAL)]

# Take a reproducible sample of ten conflicting rows, then keep only the fields
# the two labeling functions read, alongside the gold label.
conflict_rows = train_df.iloc[conflicted_buckets]
conflict_sample = conflict_rows.sample(10, random_state=1)
res = conflict_sample[["has_company_logo", "required_experience", "fraudulent"]]
res


Unnamed: 0,has_company_logo,required_experience,fraudulent
16877,0,Mid-Senior level,0
17068,0,Mid-Senior level,0
16816,0,Mid-Senior level,0
12186,0,Not Applicable,0
16808,0,Mid-Senior level,0
678,0,Not Applicable,0
3706,0,Entry level,0
14150,0,Entry level,0
2199,0,Entry level,0
4411,0,Entry level,0


# Combine Predictions

## MajorityLabelVoter

In [31]:
# Gold labels taken from the `fraudulent` column of each split.
Y_train = train_df["fraudulent"]
Y_test = test_df["fraudulent"]

In [32]:
from snorkel.labeling.model import MajorityLabelVoter

# Baseline combiner for the labeling-function votes.
majority_model = MajorityLabelVoter()
# NOTE(review): preds_train is not read by any cell visible here — confirm it is still needed.
preds_train = majority_model.predict(L=L_train)

In [33]:
# Score the majority-vote baseline on the test label matrix and report its accuracy.
# NOTE(review): rows where the model abstains may be excluded from this metric — confirm against snorkel's scoring docs.
majority_metrics = majority_model.score(L=L_test, Y=Y_test)
majority_acc = majority_metrics["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   80.7%


## LabelModel

In [34]:
from snorkel.labeling.model import LabelModel

# Two-class label model, fitted on the training label matrix only (no gold labels are passed).
# seed=1 pins the run; training progress is logged every 100 of the 500 epochs.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=1)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.841]
INFO:root:[100 epochs]: TRAIN:[loss=0.012]
INFO:root:[200 epochs]: TRAIN:[loss=0.011]
 41%|████      | 206/500 [00:00<00:00, 2056.46epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.010]
INFO:root:[400 epochs]: TRAIN:[loss=0.010]
100%|██████████| 500/500 [00:00<00:00, 2339.73epoch/s]
INFO:root:Finished Training


In [35]:
# Score the fitted label model on the test label matrix and report its accuracy.
label_model_metrics = label_model.score(L=L_test, Y=Y_test)
label_model_acc = label_model_metrics["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     72.0%
