In [1]:
from utils import load_spam_dataset
import pandas as pd

# Load Data

In [2]:
# Load the spam dataset; the returned pair is unpacked into the train and test frames
# that every later cell works on.
# NOTE(review): load_train_labels=True presumably keeps the gold labels on the
# training split — confirm against utils.load_spam_dataset.
df_train, df_test = load_spam_dataset(load_train_labels=True)

# Writing Slicing Functions (SFs)

In [3]:
# Register the slicing functions (SFs) to monitor.
# Import by name rather than `import *` so it is obvious where `short_comment`
# comes from and the notebook namespace is not silently filled with other names.
from SpamSlicingFunctions import short_comment

# List of SFs handed to the applier further down; it starts with a single slice.
sfs = [short_comment]

In [4]:
# Preview a handful of test rows that belong to the short_comment slice
from snorkel.slicing import slice_dataframe

short_comment_df = slice_dataframe(df_test, short_comment)
cols = ["text", "label"]
short_comment_df.loc[:, cols].head()

100%|█████████████████████████████████████| 250/250 [00:00<00:00, 40206.13it/s]


Unnamed: 0,text,label
194,super music﻿,0
2,I like shakira..﻿,0
110,subscribe to my feed,1
263,Awesome ﻿,0
77,Nice,0


# Monitor Slice Performance

In [5]:
# Fit a bag-of-words logistic-regression classifier on the training split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from utils import df_to_features

# Unigram counts only. The same vectorizer object is passed for both splits, and
# the train call comes first.
# NOTE(review): presumably "train" fits the vocabulary and "test" only
# transforms — confirm in utils.df_to_features.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train, Y_train = df_to_features(vectorizer, df_train, "train")
X_test, Y_test = df_to_features(vectorizer, df_test, "test")

# Small C means strong regularisation (C is the inverse regularisation strength).
sklearn_model = LogisticRegression(solver="liblinear", C=0.001)
sklearn_model.fit(X_train, Y_train)

LogisticRegression(C=0.001, solver='liblinear')

In [6]:
# Overall test-set F1 of the trained classifier
from sklearn.metrics import f1_score
from snorkel.utils import preds_to_probs

preds_test = sklearn_model.predict(X_test)
# Probabilistic form of the hard predictions (2 classes); passed as `probs`
# to the slice scorer in the cells that follow.
probs_test = preds_to_probs(preds_test, 2)

test_f1 = f1_score(Y_test, preds_test)
print(f"Test set F1: {100*test_f1:.1f}%")

Test set F1: 92.5%


In [7]:
# Score the classifier on the full test set and on every registered slice
from snorkel.analysis import Scorer
from snorkel.slicing import PandasSFApplier

# Apply every SF in `sfs` to the test frame; the result is what score_slices takes as S.
applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

scorer = Scorer(metrics=["f1"])
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

100%|█████████████████████████████████████| 250/250 [00:00<00:00, 38398.13it/s]


Unnamed: 0,f1
overall,0.925
short_comment,0.666667


# Additional SFs & Performance

In [8]:
# Polarity slicing function
# Explicit imports instead of `import *`: textblob_polarity is used right here,
# and the other three SFs are used by the evaluation cell that follows.
from SpamSlicingFunctions import (
    keyword_please,
    regex_check_out,
    short_link,
    textblob_polarity,
)

# Preview a few test rows selected by the polarity slice.
polarity_df = slice_dataframe(df_test, textblob_polarity)
polarity_df[["text", "label"]].head()

100%|██████████████████████████████████████| 250/250 [00:00<00:00, 1312.98it/s]


Unnamed: 0,text,label
263,Awesome ﻿,0
240,Shakira is the best dancer,0
261,OMG LISTEN TO THIS ITS SOO GOOD!! :D﻿,0
14,Shakira is very beautiful,0
114,awesome,0


In [9]:
# Re-score the classifier with the full set of slicing functions
extra_sfs = [keyword_please, regex_check_out, short_link, textblob_polarity]
sfs = [short_comment, *extra_sfs]
slice_names = [fn.name for fn in sfs]

# Rebuild the applier so S_test covers every SF in the extended list.
applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

scorer.score_slices(
    S=S_test,
    golds=Y_test,
    preds=preds_test,
    probs=probs_test,
    as_dataframe=True,
)

100%|██████████████████████████████████████| 250/250 [00:00<00:00, 8953.23it/s]


Unnamed: 0,f1
overall,0.925
short_comment,0.666667
keyword_please,1.0
regex_check_out,1.0
short_link,0.5
textblob_polarity,0.727273


# Improving Slice Performance