In [1]:
%matplotlib inline
import os

# Process-level environment switches, set before the heavier libraries load.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
# assigning it here only affects processes launched afterwards - confirm
# that this is the intent.
os.environ.update(
    {
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "PYTHONHASHSEED": "0",
    }
)

In [2]:
import pandas as pd

# Flip to True to show text columns in full instead of truncating them.
DISPLAY_ALL_TEXT = False

# pandas documents None (not 0) as the "unlimited" value for
# display.max_colwidth; otherwise cells are cut off after 50 characters.
pd.set_option("display.max_colwidth", None if DISPLAY_ALL_TEXT else 50)

In [3]:
# !python3 -m spacy download en_core_web_sm

In [4]:
from utils import load_spam_dataset

# Load the train/test frames via the project helper and keep the test
# labels as a plain array for scoring later on.
df_train, df_test = load_spam_dataset()
Y_test = df_test["label"].values

Function running
Reading data/Youtube01-Psy.csv ...
Reading data/Youtube02-KatyPerry.csv ...
Reading data/Youtube03-LMFAO.csv ...
Reading data/Youtube04-Eminem.csv ...
Reading data/Youtube05-Shakira.csv ...


In [5]:
# Integer votes returned by the labeling functions below.
ABSTAIN = -1
HAM = 0
SPAM = 1

In [6]:
from snorkel.labeling import labeling_function, LabelingFunction, PandasLFApplier

In [7]:
import re

# Link marker: an http:// or https:// scheme, or a leading "www." (case-insensitive).
URL_RE = re.compile(r"(https?://|www\.)", re.IGNORECASE)

# Clock-style token such as 1:23 or 12:05, bounded by word boundaries.
TIME_RE = re.compile(r"\b\d{1,2}:\d{2}\b")

# E-mail-shaped token: local part, "@", domain, and a TLD of two or more letters.
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.IGNORECASE)

In [8]:
# Lower-case substrings treated as promotional calls to action.
PROMO_PHRASES = [
    "subscribe",
    "sub back",
    "follow me",
    "check my",
    "free giveaway",
    "promo code",
    "buy now",
    "discount",
    "click my",
    "like and subscribe",
]

# Lower-case substrings where the author points at their own content.
SELF_PROMO = [
    "my channel",
    "my video",
    "new video on",
    "check my channel",
]

# Lower-case substrings used to spot short, friendly reactions.
CASUAL_PRAISE = [
    "nice",
    "cool",
    "great",
    "love",
    "cute",
    "amazing",
    "awesome",
    "fire",
    "lol",
    "lmao",
    "haha",
]

# Lower-case substrings of questions about the video itself.
CONTENT_QS = [
    "what song",
    "song name",
    "who is",
    "where was this",
    "which camera",
    "how did you make",
]

In [9]:
@labeling_function()
def lf_has_url(x):
    """Vote SPAM when URL_RE finds a link marker in ``x.text``; otherwise abstain."""
    if URL_RE.search(x.text):
        return SPAM
    return ABSTAIN

@labeling_function()
def lf_has_email(x):
    """Vote SPAM when EMAIL_RE finds an e-mail-shaped token in ``x.text``; otherwise abstain."""
    if EMAIL_RE.search(x.text):
        return SPAM
    return ABSTAIN

@labeling_function()
def lf_promo_verbs(x):
    """Vote SPAM when the lower-cased text contains any PROMO_PHRASES entry as a substring."""
    lowered = x.text.lower()
    for phrase in PROMO_PHRASES:
        if phrase in lowered:
            return SPAM
    return ABSTAIN

@labeling_function()
def lf_self_promo(x):
    """Vote SPAM when the lower-cased text contains any SELF_PROMO entry as a substring."""
    lowered = x.text.lower()
    for phrase in SELF_PROMO:
        if phrase in lowered:
            return SPAM
    return ABSTAIN

@labeling_function()
def lf_many_exclamations(x):
    """Vote SPAM when the text holds four or more "!" characters; otherwise abstain."""
    if x.text.count("!") < 4:
        return ABSTAIN
    return SPAM

@labeling_function()
def lf_repeated_tokens(x):
    """Vote SPAM when any token longer than two characters occurs three or more times.

    The text is lower-cased and split on whitespace. Tokens of length <= 2
    are ignored. Returns SPAM on a hit, ABSTAIN otherwise.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # One counting pass instead of calling list.count() once per distinct
    # token, which was quadratic in the number of tokens.
    counts = Counter(tok for tok in x.text.lower().split() if len(tok) > 2)
    return SPAM if any(n >= 3 for n in counts.values()) else ABSTAIN

@labeling_function()
def lf_many_mentions(x):
    """Vote SPAM when the text holds two or more "@" characters; otherwise abstain."""
    if x.text.count("@") < 2:
        return ABSTAIN
    return SPAM

@labeling_function()
def lf_too_many_hashtags(x):
    """Vote SPAM when the text holds three or more "#" characters; otherwise abstain."""
    if x.text.count("#") < 3:
        return ABSTAIN
    return SPAM

@labeling_function()
def lf_uppercase_shout(x):
    """Vote SPAM when at least 60% of the letters are upper-case.

    Only alphabetic characters are considered, and texts with fewer than
    eight letters (including none at all) always abstain.
    """
    alpha = [ch for ch in x.text if ch.isalpha()]
    # Too few letters to judge; this also covers the empty case, so the
    # division below is safe.
    if len(alpha) < 8:
        return ABSTAIN
    n_upper = sum(1 for ch in alpha if ch.isupper())
    if n_upper / len(alpha) >= 0.6:
        return SPAM
    return ABSTAIN

@labeling_function()
def lf_low_char_entropy(x):
    """Vote SPAM when fewer than 10% of the characters are distinct.

    Uses the ratio of unique characters to total characters of the
    lower-cased text. Texts shorter than 20 characters abstain.
    """
    lowered = x.text.lower()
    total = len(lowered)
    if total < 20:
        return ABSTAIN
    # total is at least 20 here, so the ratio is always well defined.
    distinct = len(set(lowered))
    if distinct / total < 0.1:
        return SPAM
    return ABSTAIN

In [10]:
@labeling_function()
def lf_has_timestamp(x):
    """Vote HAM when TIME_RE finds a clock-style token (e.g. 1:23) in ``x.text``; otherwise abstain."""
    if TIME_RE.search(x.text):
        return HAM
    return ABSTAIN

@labeling_function()
def lf_short_casual_praise(x):
    """Vote HAM for texts of at most four tokens containing a CASUAL_PRAISE substring.

    Matching is by substring on the lower-cased text, not whole words.
    """
    lowered = x.text.lower()
    if len(lowered.split()) > 4:
        return ABSTAIN
    for word in CASUAL_PRAISE:
        if word in lowered:
            return HAM
    return ABSTAIN

@labeling_function()
def lf_question_about_content(x):
    """Vote HAM when the lower-cased text contains any CONTENT_QS entry as a substring."""
    lowered = x.text.lower()
    for question in CONTENT_QS:
        if question in lowered:
            return HAM
    return ABSTAIN

@labeling_function()
def lf_balanced_punctuation(x):
    """Vote HAM for texts of 5-40 whitespace tokens with at most two "!" and at most two "?"."""
    text = x.text
    n_tokens = len(text.split())
    if n_tokens < 5 or n_tokens > 40:
        return ABSTAIN
    if text.count("!") > 2 or text.count("?") > 2:
        return ABSTAIN
    return HAM

@labeling_function()
def lf_no_links_reasonable_len(x):
    """Vote HAM when the text has no URL_RE match and is 10-240 characters long."""
    text = x.text
    if URL_RE.search(text):
        return ABSTAIN
    if len(text) < 10 or len(text) > 240:
        return ABSTAIN
    return HAM

In [11]:
# All labeling functions, in the column order of the label matrices.
LF_LIST = [
    # Functions that vote SPAM
    lf_has_url,
    lf_has_email,
    lf_promo_verbs,
    lf_self_promo,
    lf_many_exclamations,
    lf_repeated_tokens,
    lf_many_mentions,
    lf_too_many_hashtags,
    lf_uppercase_shout,
    lf_low_char_entropy,
    # Functions that vote HAM
    lf_has_timestamp,
    lf_short_casual_praise,
    lf_question_about_content,
    lf_balanced_punctuation,
    lf_no_links_reasonable_len,
]

In [12]:
# Run every labeling function over both splits, train first, giving one
# label matrix per split.
applier = PandasLFApplier(lfs=LF_LIST)
L_train, L_test = (applier.apply(df=split) for split in (df_train, df_test))
L_train.shape, L_test.shape


100%|██████████| 1586/1586 [00:00<00:00, 24881.58it/s]
100%|██████████| 250/250 [00:00<00:00, 25034.64it/s]


((1586, 15), (250, 15))

In [13]:
from snorkel.labeling import LFAnalysis
# Per-LF summary table computed on the training label matrix.
lf_analysis = LFAnalysis(L=L_train, lfs=LF_LIST)
lf_summary = lf_analysis.lf_summary()
lf_summary

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_has_url,0,[1],0.12232,0.076923,0.066204
lf_has_email,1,[],0.0,0.0,0.0
lf_promo_verbs,2,[1],0.14691,0.14628,0.121059
lf_self_promo,3,[1],0.084489,0.083859,0.068726
lf_many_exclamations,4,[1],0.053594,0.053594,0.035939
lf_repeated_tokens,5,[1],0.098361,0.094578,0.042875
lf_many_mentions,6,[],0.0,0.0,0.0
lf_too_many_hashtags,7,[1],0.010719,0.010088,0.003153
lf_uppercase_shout,8,[1],0.057377,0.057377,0.053594
lf_low_char_entropy,9,[1],0.062421,0.058008,0.008827


In [14]:
from snorkel.labeling.model import MajorityLabelVoter, LabelModel

# Majority-vote baseline over the labeling-function outputs.
mv = MajorityLabelVoter(cardinality=2)
mv_train_preds = mv.predict(L=L_train)

# LabelModel fitted on the training label matrix, with a fixed seed.
label_model_fit_options = {"n_epochs": 400, "log_freq": 100, "seed": 42}
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L_train=L_train, **label_model_fit_options)


100%|██████████| 400/400 [00:00<00:00, 6145.05epoch/s]


In [15]:
# Score both label aggregators on the test label matrix with identical settings.
score_options = {"L": L_test, "Y": Y_test, "tie_break_policy": "random"}
mv_acc = mv.score(**score_options)["accuracy"]
lm_acc = label_model.score(**score_options)["accuracy"]
print(f"Majority Vote Acc: {mv_acc:.3f}")
print(f"Label Model   Acc: {lm_acc:.3f}")

Majority Vote Acc: 0.712
Label Model   Acc: 0.636


In [None]:
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.utils import probs_to_preds
# Probabilistic training labels from the fitted label model.
probs_train = label_model.predict_proba(L=L_train)

# Keep only the rows (and their probabilities) that
# filter_unlabeled_dataframe retains, based on the label matrix.
filtered_pair = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
df_train_filtered, probs_train_filtered = filtered_pair
print("Filtered train size:", df_train_filtered.shape)

# Collapse the probabilities into a single predicted class per row.
hard_labels = probs_to_preds(probs=probs_train_filtered)

Filtered train size: (1555, 5)


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
# Word 1- to 3-gram TF-IDF features: vocabulary fitted on the filtered
# training texts only, then reused unchanged for the test texts.
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.95)
train_texts = df_train_filtered["text"].tolist()
test_texts = df_test["text"].tolist()
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

In [23]:
# Linear SVM trained on the label-model hard labels, scored on the gold test labels.
svm = LinearSVC(class_weight="balanced", random_state=42)
svm.fit(X_train, hard_labels)
svm_test_accuracy = svm.score(X=X_test, y=Y_test)
print(f"Test Accuracy: {svm_test_accuracy * 100:.1f}%")

Test Accuracy: 64.4%


In [24]:
# Logistic regression trained on the same hard labels, scored on the gold test labels.
logit = LogisticRegression(
    C=3.0,
    max_iter=200,
    n_jobs=None,
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
)
logit.fit(X_train, hard_labels)
logit_test_accuracy = logit.score(X=X_test, y=Y_test)
print(f"Test Accuracy: {logit_test_accuracy * 100:.1f}%")

Test Accuracy: 67.6%


In [None]:
def evaluate(name, model, X, y_true):
    """Print a classification report and confusion matrix for ``model`` on ``X``.

    Args:
        name: Label printed in the "Model: ..." header line.
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y_true: Ground-truth labels aligned with the rows of ``X``.

    Returns:
        The predictions returned by ``model.predict(X)``.
    """
    # Pin the label set to HAM=0 / SPAM=1. Without it, classification_report
    # raises when the observed labels are not exactly these two classes
    # (e.g. a stray -1 or a single-class prediction), because two
    # target_names are supplied; the confusion matrix shape would vary too.
    class_labels = [0, 1]
    class_names = ["HAM(0)", "SPAM(1)"]

    y_pred = model.predict(X)
    print(f"\nModel: {name}")
    print(classification_report(y_true, y_pred, labels=class_labels, target_names=class_names))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred, labels=class_labels))
    return y_pred

# Detailed per-class metrics for both downstream classifiers on the test split.
y_pred_svm = evaluate(name="LinearSVC", model=svm, X=X_test, y_true=Y_test)
y_pred_logit = evaluate(name="LogisticRegression", model=logit, X=X_test, y_true=Y_test)


Model: LinearSVC
              precision    recall  f1-score   support

      HAM(0)       0.63      0.78      0.70       132
     SPAM(1)       0.67      0.49      0.57       118

    accuracy                           0.64       250
   macro avg       0.65      0.64      0.63       250
weighted avg       0.65      0.64      0.64       250

Confusion matrix:
 [[103  29]
 [ 60  58]]

Model: LogisticRegression
              precision    recall  f1-score   support

      HAM(0)       0.65      0.83      0.73       132
     SPAM(1)       0.72      0.51      0.60       118

    accuracy                           0.68       250
   macro avg       0.69      0.67      0.66       250
weighted avg       0.69      0.68      0.67       250

Confusion matrix:
 [[109  23]
 [ 58  60]]
