In [None]:
! pip install pyro-ppl spacy transformers
! python -m spacy download en_core_web_sm
! pip install torch==2.3.0
! pip install torchtext

# Data loading
Data source: https://www.kaggle.com/datasets/venky73/spam-mails-dataset/data
Only two columns are used: `text` (the raw email text) and `label_num` (the binary class label).

In [None]:
import pandas as pd

# Read the spam/ham CSV (path relative to the working directory) into a DataFrame.
df = pd.read_csv("spam_ham_dataset.csv")

# Cleaning and tokenization
The raw text of an email can include:
1. A subject line (`Subject: ...`)
2. Header fields (From | To | CC | BCC)
3. Attachment references (e.g., filenames like "hplnol09.xls")
4. Numbers and extra whitespace

In [None]:
import re
import spacy

# Load the small English spaCy pipeline once; clean_email reuses this module-level object.
nlp = spacy.load(name="en_core_web_sm")

def clean_email(text):
    """Strip header lines and noise from a raw email, then lemmatize it.

    Steps: drop ``Subject:`` and ``From/To/CC/BCC:`` header lines, drop
    "see attached file: <name>.<ext>" references, keep only ASCII letters and
    whitespace, lowercase, then lemmatize with the module-level spaCy ``nlp``
    pipeline while discarding stop words.

    Args:
        text: Raw email text. Non-string values (e.g. NaN from an empty CSV
            cell) yield an empty string instead of raising.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Remove header lines. The header value is matched with [^\n]* so the
    # pattern can never run past its own line; the earlier `\s*[^\n]+` form
    # skipped over the newline of an empty header and deleted the next line too.
    text = re.sub(r"Subject:[^\n]*\n", "", text)
    text = re.sub(r"\b(?:From|To|CC|BCC):[^\n]*\n", "", text, flags=re.I)

    # Remove attachment references such as "(see attached file: report.xls)".
    text = re.sub(r"\(?\s*see attached file:\s*[^)]+\.\w{3,4}\s*\)?", "", text, flags=re.I)

    # Keep letters and whitespace only, then collapse whitespace and lowercase.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip().lower()

    # Lemmatize with spaCy, skipping stop words and blank lemmas.
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.strip()]

    return " ".join(tokens)

# Store the cleaned, lemmatized form of every email in a new "cleaned_text" column.
df["cleaned_text"] = df["text"].map(clean_email)

# Feature engineering
TF-IDF features over word n-grams (unigrams, bigrams, and trigrams)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word 1- to 3-grams, limited to 2000 features.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    max_features=2000,
)

# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split performed later in the notebook, so test documents
# contribute to the vocabulary and IDF weights.
X = tfidf.fit_transform(df["cleaned_text"])
y = df["label_num"].to_numpy()

# Bayesian Logistic Regression with HMC
Prior: coef, bias ~ Normal(0, 1)
Likelihood: y ~ Bernoulli(sigmoid(X @ coef + bias))

In [None]:
import torch
import pyro
import pyro.distributions as dist
from pyro.infer.mcmc import HMC, MCMC
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state fixes the shuffle so a
# Restart & Run All reproduces the same split; stratify=y keeps the class
# ratio of label_num the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to PyTorch tensors: densify the sparse TF-IDF matrices and cast to float32.
X_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test.toarray(), dtype=torch.float32)
# y_test stays a NumPy array; it is only compared against NumPy predictions later.

# Define Bayesian model
def model(X, y=None):
    """Bayesian logistic regression with Normal(0, 1) priors.

    Args:
        X: Feature tensor; ``X.shape[0]`` is used as the number of observations
            and ``X.shape[1]`` as the number of coefficients.
        y: Observed labels for the ``obs`` site, or ``None`` to sample them.

    Returns:
        The linear predictor ``X @ coef + bias`` (the logits).
    """
    # to_event(1) declares the coefficient vector as one multivariate draw, so
    # its dimension is not mistaken for a batch dimension outside any plate.
    coef = pyro.sample("coef", dist.Normal(0, 1).expand([X.shape[1]]).to_event(1))
    bias = pyro.sample("bias", dist.Normal(0, 1))
    logits = X @ coef + bias

    with pyro.plate("data", X.shape[0]):
        # Parameterize by logits rather than sigmoid(logits): the log-likelihood
        # is then evaluated without saturating probabilities at 0 or 1.
        pyro.sample("obs", dist.Bernoulli(logits=logits), obs=y)
    return logits

# Seed the random number generators so repeated runs draw the same chain.
pyro.set_rng_seed(0)

# Run HMC: 200 warm-up iterations followed by 1000 retained samples.
hmc_kernel = HMC(model, step_size=0.001, trajectory_length=1)
mcmc = MCMC(hmc_kernel, num_samples=1000, warmup_steps=200)
mcmc.run(X_train, y_train)

# Get posterior samples (indexed by sample-site name, e.g. "coef" and "bias").
posterior = mcmc.get_samples()

# Prediction & accuracy testing

In [None]:

# Score the test set with the posterior-mean coefficients and bias (a point estimate).
with torch.no_grad():
    coef_mean = posterior["coef"].mean(dim=0)
    bias_mean = posterior["bias"].mean()
    logits_test = torch.matmul(X_test, coef_mean) + bias_mean
    prob_spam = torch.sigmoid(logits_test).numpy()

# Accuracy: share of test rows whose rounded probability equals the true label.
is_correct = prob_spam.round() == y_test
accuracy = is_correct.mean()
print(f"Test Accuracy: {accuracy:.2f}")