In [None]:

# File: preprocessing_pipeline.ipynb
# Cleans, preprocesses, and vectorizes the Enron Spam dataset.
# Importing libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
import random
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz

# --- Reproducibility ---------------------------------------------------------
# A single seed feeds every random number generator touched by this notebook
# so that repeated runs give the same results.
SEED = 42
random.seed(SEED)     # Python's built-in RNG
np.random.seed(SEED)  # NumPy's global RNG
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
# assigning it here presumably only reaches child processes -- confirm whether
# it should be exported before the kernel is launched instead.
os.environ["PYTHONHASHSEED"] = str(SEED)

# --- NLTK resources ----------------------------------------------------------
# Fetch the corpora used further down for stop-word removal and lemmatization.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)


# Read the raw Enron spam/ham export from CSV.
df = pd.read_csv("enron_spam_data.csv")
print("Initial shape:", df.shape)  # (rows, columns) exactly as loaded
print("Columns:", df.columns.tolist())  # column names found in the file

# Derive the two columns the rest of the pipeline works with:
#   text  - subject line, a single space, then the message body; missing
#           subjects/bodies are replaced by "" first so the result is never NaN
#   label - numeric target taken from 'Spam/Ham' (ham -> 0, spam -> 1); any
#           value outside that mapping becomes NaN
# Rows that still lack a label or text afterwards are discarded.
df = df.assign(
    text=df["Subject"].fillna("") + " " + df["Message"].fillna(""),
    label=df["Spam/Ham"].map({"ham": 0, "spam": 1}),
).dropna(subset=["label", "text"])

print("After cleaning:", df.shape)  # shape once incomplete rows are gone
print(df["label"].value_counts())  # per-class counts, to spot class imbalance

# Shared resources for text cleaning

# English stop words, kept in a set so each membership check is a hash lookup.
stop_words = {*stopwords.words("english")}
# WordNet lemmatizer: maps tokens to their dictionary form for the classical ML
# models. Per the original author's note, it is not needed for the BERT
# transfer-learning variant.
lemmatizer = WordNetLemmatizer()

# Built once instead of on every call: clean_text runs once per email, so
# rebuilding the translation table and patterns each time is wasted work.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_DIGIT_PATTERN = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize one piece of email text for vectorization.

    Steps, in order: lowercase, strip URLs, strip digit runs, delete ASCII
    punctuation, split on whitespace, drop English stop words, and lemmatize
    the remaining tokens.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or a NaN coming from
            a missing cell) are treated as empty text instead of raising.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the cleaning.
    """
    if not isinstance(text, str):
        # Mirrors the fillna('') treatment applied when the text column is
        # built: a missing value means "no text", not an error.
        return ""
    text = _URL_PATTERN.sub("", text.lower())  # remove URLs
    text = _DIGIT_PATTERN.sub("", text)        # remove numbers
    # Punctuation is deleted (not replaced by a space) before the stop-word
    # check, so a contraction such as "don't" reaches that check as "dont".
    text = text.translate(_PUNCT_TABLE)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

# Run the cleaner over every combined subject + body string.
df["clean_text"] = df["text"].map(clean_text)

# Keep only the emails whose cleaned text has fewer than 2000
# whitespace-separated tokens; anything at or above that length is treated as
# noise and dropped.
df = df.loc[lambda frame: frame["clean_text"].str.split().map(len) < 2000]

print("After filtering:", df.shape)
print("Max words now:", df["clean_text"].str.split().map(len).max())


# Hold-out split: 80% train / 20% test. Stratifying on the label keeps the
# spam/ham ratio the same in both partitions, and SEED fixes the shuffle.
X, y = df["clean_text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)


# TF-IDF features (model inputs X) for the traditional ML models.
# The vocabulary is capped at 5000 entries and is built from unigrams and
# bigrams only.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and merely applied to the test split, so no test information leaks into
# the fitted vectorizer.
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF Train Shape:", X_train_tfidf.shape)  # (n_train_samples, n_features)
print("TF-IDF Test Shape:", X_test_tfidf.shape)  # (n_test_samples, n_features)


# Persist the processed artifacts

# The vectorizer outputs are sparse matrices; .npz stores them efficiently.
save_npz("X_train_tfidf.npz", X_train_tfidf)
save_npz("X_test_tfidf.npz", X_test_tfidf)

# Labels are written as plain NumPy arrays in the compact binary .npy format.
np.save("y_train.npy", y_train.to_numpy())
np.save("y_test.npy", y_test.to_numpy())

# Pickle the fitted vectorizer so it can be reloaded later without rerunning
# the preprocessing and vectorization steps above.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

print("\n All Preprocessing steps complete! and Files are saved:")
print(" X_train_tfidf.npz, X_test_tfidf.npz")  # sparse matrices
print(" y_train.npy, y_test.npy")  # label arrays in binary format
print(" tfidf_vectorizer.pkl")  # fitted vectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial shape: (33716, 5)
Columns: ['Message ID', 'Subject', 'Message', 'Spam/Ham', 'Date']
After cleaning: (33716, 7)
label
1    17171
0    16545
Name: count, dtype: int64
After filtering: (33603, 8)
Max words now: 1984
TF-IDF Train Shape: (26882, 5000)
TF-IDF Test Shape: (6721, 5000)

 All Preprocessing steps complete! and Files are saved:
 X_train_tfidf.npz, X_test_tfidf.npz
 y_train.npy, y_test.npy
 tfidf_vectorizer.pkl
