In [11]:
import pandas as pd
import numpy as np

## Load in the Data

In [12]:
data_path = "~/downloads/sms+spam+collection/SMSSpamCollection"
df = pd.read_csv(data_path, sep="\t", header=None, names=["label", "text"])
print(df.label.value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


Dataset is imbalanced due to the proportion of positive to negative labels, which will determine the choice of metrics in the future.

## Create a Holdout Set

Most tutorials use test/train splits for easier data processing, but in production a holdout set of unseen cases helps evaluate your model more accurately.

In [13]:
from sklearn.model_selection import train_test_split

# Splitting data into a training validation set and a holdout set
df_train_val, df_holdout = train_test_split(df, 
                                            test_size=0.1,  # splits 10% of the data into the holdout set
                                            stratify=df['label'],   # stratify ensures that the correct class distribution is preserved
                                            random_state=42)

In [23]:
df_train_val.to_csv('./data/raw/spam_train_val.csv', index=False)
df_holdout.to_csv('./data/raw/spam_holdout.csv', index=False)

## Preprocess the Text

Text data is noisy in terms of "win" vs "winning" and other nuances that are not important for the meaning of the text, especially in the context of spam detection. 

In [29]:
from nltk.corpus import stopwords   # Words like "the", "a", "but", "and" w/ no meaning 
from nltk.stem import PorterStemmer # Stemming = removing the "ing" so the meaning is still preserved
from nltk.tokenize import word_tokenize
import string, nltk

nltk.download('punkt_tab')  # Punctuation
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))    # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

df_train_val['clean_text'] = df_train_val['text'].apply(preprocess_text)
df_train_val['label_num'] = df_train_val['label'].map({'ham': 0, 'spam': 1})

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kalindadhikari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kalindadhikari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
X = df_train_val['clean_text']
y = df_train_val['label_num']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Train and Compare Multiple Models

Logistic Regression, SVM, Multinomial Naive Bayes, Random Forest

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


models = {
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'SVM': SVC(kernel='linear', class_weight='balanced', probability=True),
    'MulinomialNB': MultinomialNB(),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced'),
}

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

for name, clf in models.items():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', clf),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"\nModel: {name}")
    print(classification_report(y_test, y_pred))


Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       869
           1       0.92      0.92      0.92       134

    accuracy                           0.98      1003
   macro avg       0.96      0.95      0.95      1003
weighted avg       0.98      0.98      0.98      1003


Model: SVM
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       869
           1       0.97      0.90      0.93       134

    accuracy                           0.98      1003
   macro avg       0.98      0.95      0.96      1003
weighted avg       0.98      0.98      0.98      1003


Model: MulinomialNB
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       869
           1       1.00      0.68      0.81       134

    accuracy                           0.96      1003
   macro avg       0.98      0.84      0.89      1003
weighted avg  

Based on these results, LogisticRegression looks like the best option. 

## Grid Search for Spam Recall

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__class_weight': [None, 'balanced'],
}

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear')),
])
grid = GridSearchCV(pipeline, param_grid, scoring=make_scorer(recall_score, pos_label=1), cv=5)
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"{'clf__C': [0.01, 0.1, ...], 'clf__class_weight': [None, 'balanced']}"
,scoring,"make_scorer(r..., pos_label=1)"
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,1000


Our initial LogReg model gave us a higher recall score, and that's what we want since detecting spam >>>> potentially letting one spam leak. 

recall = true_positives / (true_positives + false_negatives)

## Retrain on Full Data and Save the Model

In [38]:
x_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

In [47]:
import pickle

final_model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000)),
])
final_model.fit(x_full, y_full)
with open('./models/logreg_spam_pipeline.pkl', 'wb') as f:
    pickle.dump(final_model, f)

## Final Check on Holdout Set

In [49]:
df_holdout = pd.read_csv('./data/raw/spam_holdout.csv')
df_holdout['label_num'] = df_holdout['label'].map({'ham': 0, 'spam': 1})
df_holdout['clean_text'] = df_holdout['text'].apply(preprocess_text)

In [50]:
with open('./models/logreg_spam_pipeline.pkl', 'rb') as f:
    model = pickle.load(f)
X_holdout = df_holdout['clean_text']
y_holdout = df_holdout['label_num']
y_pred = model.predict(X_holdout)
print(classification_report(y_holdout, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       483
           1       0.88      0.89      0.89        75

    accuracy                           0.97       558
   macro avg       0.93      0.94      0.93       558
weighted avg       0.97      0.97      0.97       558

