In [8]:
import pandas as pd
import string
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# -----------------------------
# 1. Load Data
# -----------------------------
def load_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(
        path,
        sep="\t",
        names=["label", "message"]
    )
    return df


# -----------------------------
# 2. Text Cleaning
# -----------------------------
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    return text


def preprocess(df: pd.DataFrame):
    df["clean_message"] = df["message"].apply(clean_text)
    df["label_num"] = df["label"].map({"ham": 0, "spam": 1})
    return df


# -----------------------------
# 3. Train Models
# -----------------------------
def train_models(X_train, y_train):
    vectorizer = TfidfVectorizer(stop_words="english")

    X_train_tfidf = vectorizer.fit_transform(X_train)

    nb_model = MultinomialNB()
    nb_model.fit(X_train_tfidf, y_train)

    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train_tfidf, y_train)

    return vectorizer, nb_model, lr_model


# -----------------------------
# 4. Evaluation
# -----------------------------
def evaluate_model(model, X_test_tfidf, y_test, model_name="Model"):
    y_pred = model.predict(X_test_tfidf)

    print(f"\n--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


# -----------------------------
# 5. Prediction Function
# -----------------------------
def predict_spam(text: str, model, vectorizer) -> str:
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction == 1 else "Ham"


# -----------------------------
# 6. Main Pipeline
# -----------------------------
def main():
    df = load_data("../data/SMSSpamCollection")
    df = preprocess(df)

    X = df["clean_message"]
    y = df["label_num"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    vectorizer, nb_model, lr_model = train_models(X_train, y_train)

    X_test_tfidf = vectorizer.transform(X_test)

    evaluate_model(nb_model, X_test_tfidf, y_test, "Naive Bayes")
    evaluate_model(lr_model, X_test_tfidf, y_test, "Logistic Regression")

    # Save best model (example: Naive Bayes)
    joblib.dump(nb_model, "spam_model.pkl")
    joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

    # Demo prediction
    example = "Congratulations! You have won a free iPhone. Click now!"
    print("\nExample Prediction:", predict_spam(example, nb_model, vectorizer))


if __name__ == "__main__":
    main()




--- Naive Bayes ---
Accuracy: 0.9641255605381166
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.73      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115

Confusion Matrix:
 [[966   0]
 [ 40 109]]

--- Logistic Regression ---
Accuracy: 0.9632286995515695
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Confusion Matrix:
 [[966   0]
 [ 41 108]]

Example Prediction: Spam


In [2]:
import pandas as pd

df = pd.read_csv(
    "../data/SMSSpamCollection",
    sep="t",
    names=["label","message"]
)

df.head()
df['label'].value_counts()
df['message'] = df['message'].str.lower()
df['message'].head()




ParserError: Error tokenizing data. C error: Expected 8 fields in line 3, saw 15
