In [5]:
import sys
!{sys.executable} -m pip install pandas scikit-learn numpy nltk


Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.10.23-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [7]:
# =============================================================================
# SVM Preprocessing Pipeline for the Enron Spam Dataset
# -----------------------------------------------------------------------------
# This notebook cleans, preprocesses, and vectorizes emails so they can be 
# used with a Support Vector Machine or other classical ML models.
# =============================================================================

import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz

# ----------------------------------------------------------------------
# 1. Reproducibility setup
SEED = 42
# NOTE: Python reads PYTHONHASHSEED once, at interpreter start-up, so this
# assignment cannot change string hashing in the already-running kernel; it is
# only inherited by child processes started afterwards. To pin hashing for the
# kernel itself, export the variable before launching Jupyter.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)

# ----------------------------------------------------------------------
# 2. Download NLTK resources (only needs to be done once; they are cached on disk)
for nltk_resource in ("stopwords", "wordnet"):
    nltk.download(nltk_resource)

# ----------------------------------------------------------------------
# 3. Load the Enron Spam dataset (CSV format)
# Point DATA_PATH at wherever enron_spam_data.csv is stored.
DATA_PATH = "enron_spam_data.csv"
# Columns the preprocessing steps below read from the raw frame.
REQUIRED_COLUMNS = ("Subject", "Message", "Spam/Ham")

df = pd.read_csv(DATA_PATH)
print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist())

# Fail fast with a readable message instead of a bare KeyError several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_columns}")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nanthansr/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nanthansr/nltk_data...


Initial shape: (33716, 5)
Columns: ['Message ID', 'Subject', 'Message', 'Spam/Ham', 'Date']


In [8]:

# ----------------------------------------------------------------------
# 4. Combine 'Subject' and 'Message' into a single text field
# Either field may be missing; fillna('') keeps the concatenation from producing
# NaN, so "text" is always a string (possibly just the single joining space).
df["text"] = df["Subject"].fillna('') + " " + df["Message"].fillna('')

# ----------------------------------------------------------------------
# 5. Encode labels: ham → 0, spam → 1
# Normalise case and surrounding whitespace first so variants such as "Spam"
# or " ham" are encoded instead of silently turning into NaN and being dropped.
LABEL_MAP = {"ham": 0, "spam": 1}
df["label"] = df["Spam/Ham"].str.strip().str.lower().map(LABEL_MAP)

# Drop rows whose label could not be mapped. "text" stays in the subset as a
# safeguard, but it cannot be NaN after the fillna above.
n_rows_before = len(df)
df = (
    df.dropna(subset=["label", "text"])
      # map() yields floats as soon as one value is unmapped; keep labels integral.
      .astype({"label": "int64"})
)
if len(df) < n_rows_before:
    print(f"Dropped {n_rows_before - len(df)} row(s) with a missing or unrecognised label")

print("After cleaning:", df.shape)
print(df["label"].value_counts())  # check class distribution

# ----------------------------------------------------------------------
# 6. Text cleaning function
# Lowercases, removes URLs, numbers, punctuation, stopwords, and lemmatizes
# Both resources are built once here so they are not re-created for every email.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Built once at definition time so they are not re-created for every email.
_URL_RE = re.compile(r"http\S+|www\S+")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text, stopword_set=None, lemmatize_fn=None):
    """Normalise one email into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, delete URLs (``http...`` / ``www...``), delete
    digit runs, delete ASCII punctuation, split on whitespace, drop stop words,
    and lemmatize every remaining token.

    Args:
        text: Raw email text. Anything that is not a ``str`` (for example
            ``None`` or a NaN float coming out of pandas) is treated as an
            empty document instead of raising ``AttributeError``.
        stopword_set: Optional container of tokens to discard. Defaults to the
            module-level ``stop_words``.
        lemmatize_fn: Optional callable mapping a token to its lemma. Defaults
            to the ``lemmatize`` method of the module-level ``lemmatizer``.

    Returns:
        The surviving tokens joined by single spaces; ``""`` if none survive.

    Note:
        Punctuation is deleted before the stop-word check, so a contraction
        such as "don't" is compared as "dont" and is only removed when the
        stop-word collection contains that apostrophe-free spelling.
    """
    if not isinstance(text, str):
        return ""
    # Resolve the defaults lazily so the module-level objects are only required
    # when the caller does not supply replacements.
    if stopword_set is None:
        stopword_set = stop_words
    if lemmatize_fn is None:
        lemmatize_fn = lemmatizer.lemmatize

    text = text.lower()
    text = _URL_RE.sub("", text)      # Remove URLs
    text = _DIGITS_RE.sub("", text)   # Remove digits
    text = text.translate(_PUNCT_TABLE)
    return " ".join(
        lemmatize_fn(token) for token in text.split() if token not in stopword_set
    )

# Apply cleaning
df["clean_text"] = df["text"].apply(clean_text)

# Optionally drop extremely long emails to reduce noise. The comparison is
# strict: only emails with fewer than MAX_WORDS cleaned tokens are kept, so an
# email with exactly MAX_WORDS tokens is removed as well.
MAX_WORDS = 2000
# Tokenise once and reuse the counts for both the filter and the report below.
word_counts = df["clean_text"].str.split().apply(len)
is_short_enough = word_counts < MAX_WORDS
# .copy() detaches the result from the unfiltered frame so later column
# assignments operate on an independent DataFrame.
df = df[is_short_enough].copy()
print("After filtering:", df.shape)
print("Max words now:", word_counts[is_short_enough].max())


After cleaning: (33716, 7)
label
1    17171
0    16545
Name: count, dtype: int64
After filtering: (33603, 8)
Max words now: 1984


In [9]:

# ----------------------------------------------------------------------
# 7. Train/test split
# Stratifying on the label keeps the class ratio the same in both partitions
X, y = df["clean_text"], df["label"]

split_options = dict(test_size=0.2, random_state=SEED, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ----------------------------------------------------------------------
# 8. TF-IDF vectorization
# The vocabulary is capped at 5,000 features drawn from unigrams and bigrams
vectorizer_options = dict(max_features=5000, ngram_range=(1, 2))
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the training partition only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and apply the already-fitted vectorizer to the held-out partition
X_test_tfidf = tfidf.transform(X_test)

print("TF‑IDF Train Shape:", X_train_tfidf.shape)
print("TF‑IDF Test Shape:", X_test_tfidf.shape)

# ----------------------------------------------------------------------
# 9. Persist the processed artifacts for the modeling stage
# The feature matrices are sparse, so they are written in .npz format
for npz_name, sparse_matrix in (
    ("X_train_tfidf.npz", X_train_tfidf),
    ("X_test_tfidf.npz", X_test_tfidf),
):
    save_npz(npz_name, sparse_matrix)

# Label vectors are written as plain NumPy arrays
for npy_name, label_series in (("y_train.npy", y_train), ("y_test.npy", y_test)):
    np.save(npy_name, label_series.to_numpy())

# Keep the fitted vectorizer so new text can be transformed the same way later.
# NOTE: pickle files execute code when loaded; only unpickle this from a trusted source.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# One print call, one line per argument: same text as four separate prints.
print(
    "\nPreprocessing complete! Files saved:",
    "- X_train_tfidf.npz, X_test_tfidf.npz (vectorized features)",
    "- y_train.npy, y_test.npy (label arrays)",
    "- tfidf_vectorizer.pkl (fitted vectorizer)",
    sep="\n",
)

TF‑IDF Train Shape: (26882, 5000)
TF‑IDF Test Shape: (6721, 5000)

Preprocessing complete! Files saved:
- X_train_tfidf.npz, X_test_tfidf.npz (vectorized features)
- y_train.npy, y_test.npy (label arrays)
- tfidf_vectorizer.pkl (fitted vectorizer)
