In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
import re

try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

In [None]:
df = pd.read_csv("data/spam.csv", encoding="latin1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Data Cleaning and Label Encoding

In [12]:
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df.columns = ["label", "message"]
df.head()

df["label"] = df["label"].map({"ham": 0, "spam": 1})

df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Text Cleaning and Preprocessing

In [13]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    try:
        text = text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
        pass
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\d+", "", text)
    text = text.lower()
    words = [word for word in text.split() if word not in stopwords.words("english")]
    words = [word for word in words if len(word) > 2]
    return " ".join(words)


df["cleaned_text"] = df["message"].apply(clean_text)
df.head()

Unnamed: 0,label,message,cleaned_text
0,0,"Go until jurong point, crazy.. Available only ...","jurong point, crazy.. available bugis great wo..."
1,0,Ok lar... Joking wif u oni...,lar... joking wif oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,0,U dun say so early hor... U c already then say...,dun say early hor... already say...
4,0,"Nah I don't think he goes to usf, he lives aro...","nah think goes usf, lives around though"


Model Training and Evaluation

In [16]:
def train_models(X, y, title):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(eval_metric="logloss"),
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{title} - {name}: {acc:.4f}")

Vectorization and Ensemble Model Training

In [17]:
vectorizers = {"BoW": CountVectorizer(), "TF-IDF": TfidfVectorizer()}

y = df["label"]

for vec_name, vectorizer in vectorizers.items():
    print(f"\nTraining with {vec_name}")
    X_raw = vectorizer.fit_transform(df["message"])
    X_clean = vectorizer.fit_transform(df["cleaned_text"])

    train_models(X_raw, y, f"{vec_name} (Raw Text)")
    train_models(X_clean, y, f"{vec_name} (Cleaned Text)")

voting_clf = VotingClassifier(
    estimators=[
        ("nb", MultinomialNB()),
        ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("xgb", XGBClassifier(eval_metric="logloss")),
    ],
    voting="hard",
)

X_final = vectorizers["TF-IDF"].fit_transform(df["cleaned_text"])
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print(f"\nEnsemble Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Training with BoW
BoW (Raw Text) - Naive Bayes: 0.9803
BoW (Raw Text) - Random Forest: 0.9740
BoW (Raw Text) - XGBoost: 0.9794
BoW (Cleaned Text) - Naive Bayes: 0.9713
BoW (Cleaned Text) - Random Forest: 0.9758
BoW (Cleaned Text) - XGBoost: 0.9767

Training with TF-IDF
TF-IDF (Raw Text) - Naive Bayes: 0.9623
TF-IDF (Raw Text) - Random Forest: 0.9749
TF-IDF (Raw Text) - XGBoost: 0.9803
TF-IDF (Cleaned Text) - Naive Bayes: 0.9659
TF-IDF (Cleaned Text) - Random Forest: 0.9758
TF-IDF (Cleaned Text) - XGBoost: 0.9785

Ensemble Model Accuracy: 0.9758
