In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# ===== 1. LOAD DATA =====
# Two labelled training files plus one unlabelled test file, read in that order.
train1, train2, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./SpamEmailDetection/spam_train1.csv",
        "./SpamEmailDetection/spam_train2.csv",
        "./SpamEmailDetection/spam_test.csv",
    )
)

# ===== 2. CLEAN STRUCTURE =====
# Goal: give every frame a 'text' column (and the training frames a 'label' column).

# --- Train 1 ---
# Keep only v1 (label, 'ham'/'spam') and v2 (message text); the other columns are mostly NaN.
train1_clean = (
    train1.loc[:, ['v1', 'v2']]
    .copy()
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# --- Train 2 ---
# Already uses 'label' ('ham'/'spam') and 'text' (email body); the index column and
# the numeric label_num column are dropped by selecting just these two.
train2_clean = train2.loc[:, ['label', 'text']].copy()

# --- Test ---
# Text only, no label: just rename 'message' so it matches the training frames.
test_clean = test.rename(columns={'message': 'text'}).copy()

import re

def clean_text(s: object) -> str:
    """
    Normalise one raw message into a lowercase, whitespace-collapsed string.

    Steps, in order: lowercase, replace HTML-like tags with a space, replace
    URLs with the token ``url``, replace e-mail addresses with the token
    ``email``, replace every character outside ``a-z 0-9 ' $ % & * @ #`` and
    whitespace with a space, then collapse runs of whitespace and strip.

    Parameters
    ----------
    s : object
        The raw text. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    # A missing cell carries no text. Without this guard str() would turn it into
    # the literal word "none"/"nan", which would then become a real token downstream.
    if s is None or (isinstance(s, float) and s != s):
        return ""

    # convert to string (in case anything weird got in) and lowercase
    s = str(s).lower()
    # remove HTML tags
    s = re.sub(r'<[^>]+>', ' ', s)
    # replace URLs with a placeholder token
    s = re.sub(r'http\S+|www\.\S+', ' url ', s)
    # replace email addresses with a placeholder token
    s = re.sub(r'\S+@\S+', ' email ', s)
    # remove non-alphanumeric characters (keep some useful symbols)
    s = re.sub(r"[^a-z0-9'$%&*@#\s]", " ", s)
    # collapse multiple spaces
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# Apply cleaning (same normalisation for every corpus, so train and test share tokens)
train1_clean['clean_text'] = train1_clean['text'].apply(clean_text)
train2_clean['clean_text'] = train2_clean['text'].apply(clean_text)
test_clean['clean_text']   = test_clean['text'].apply(clean_text)

# Combine both labelled sources into one training frame
train_all = pd.concat([train1_clean, train2_clean], ignore_index=True)

# Map labels to 0/1
label_map = {'ham': 0, 'spam': 1}

# Series.map() silently yields NaN for any value that is not a key of label_map
# (different casing, stray whitespace, missing label, ...). Fail loudly here rather
# than handing a float target containing NaN to the models.
y_numeric = train_all['label'].map(label_map)
unknown_labels = list(train_all.loc[y_numeric.isna(), 'label'].unique())
if unknown_labels:
    raise ValueError(
        f"Unexpected label values {unknown_labels}; expected one of {sorted(label_map)}"
    )

y_train = y_numeric.astype('int64').to_numpy()          # numeric labels
X_text_train = train_all['clean_text'].to_numpy()       # cleaned text for training
X_text_test  = test_clean['clean_text'].to_numpy()      # cleaned text for test


# ===== 3. TF-IDF VECTORIZATION =====

# Unigrams + bigrams, ignoring terms seen in fewer than 2 documents,
# capped at 20000 features.
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=2
)

# Fit on ALL training text, then transform train & test.
# NOTE(review): because the vocabulary/IDF weights are learned from the full training
# set here, any cross-validation run later on X_train sees features that were fit
# on its own validation folds; CV scores may be slightly optimistic.
X_train = tfidf.fit_transform(X_text_train)
X_test  = tfidf.transform(X_text_test)

print("X_train shape:", X_train.shape)  # (num_train_samples, num_features)
print("X_test shape: ", X_test.shape)
print("y_train shape:", y_train.shape)

X_train shape: (4296, 20000)
X_test shape:  (6447, 20000)
y_train shape: (4296,)


In [3]:
import numpy as np
from pprint import pprint

from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# =====================================================================
# Helper function: cross-validate an estimator and report its classification metrics
# =====================================================================
def evaluate_model_cv(name, estimator, X, y, cv_splits=5):
    """
    Cross-validate ``estimator`` and report binary-classification metrics.

    A shuffled ``StratifiedKFold`` (``random_state=42``) produces out-of-fold
    predictions for every sample; all metrics are computed once over those
    pooled predictions. Scores for ROC-AUC come from ``predict_proba``
    (column 1) when the estimator has it, otherwise from
    ``decision_function``, otherwise the hard labels are used.

    Note that ``cross_val_predict`` is called once for the labels and once
    more for the scores, so the estimator is cross-validated twice.

    Parameters
    ----------
    name : str
        Display name, used in the printed header and in the returned dict.
    estimator : scikit-learn classifier
        Unfitted estimator handed to ``cross_val_predict``.
    X, y : features and labels (positive class is ``1``).
    cv_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` (``nan`` if ROC-AUC raised ``ValueError``).
    """
    print(f"\n=== {name}: Cross-Validation ({cv_splits}-fold) ===")

    splitter = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # Hard label for each sample, predicted by the model that did not train on it
    oof_labels = cross_val_predict(estimator, X, y, cv=splitter, method='predict')

    # Pick the first continuous-score method the estimator offers (same priority
    # order: probabilities first, then decision scores).
    score_method = next(
        (m for m in ("predict_proba", "decision_function") if hasattr(estimator, m)),
        None,
    )
    if score_method is None:
        oof_scores = oof_labels
    else:
        oof_scores = cross_val_predict(estimator, X, y, cv=splitter, method=score_method)
        if score_method == "predict_proba":
            # keep only the positive-class column
            oof_scores = oof_scores[:, 1]

    results = {
        "model": name,
        "accuracy": accuracy_score(y, oof_labels),
        "precision": precision_score(y, oof_labels, pos_label=1),
        "recall": recall_score(y, oof_labels, pos_label=1),
        "f1": f1_score(y, oof_labels, pos_label=1),
    }
    try:
        results["roc_auc"] = roc_auc_score(y, oof_scores)
    except ValueError:
        results["roc_auc"] = np.nan

    summary_rows = (
        ("Accuracy :", "accuracy"),
        ("Precision:", "precision"),
        ("Recall   :", "recall"),
        ("F1-score :", "f1"),
        ("ROC-AUC  :", "roc_auc"),
    )
    for caption, key in summary_rows:
        print(caption, results[key])

    print("\nClassification report:")
    print(classification_report(y, oof_labels, target_names=["Ham (0)", "Spam (1)"]))

    return results

In [4]:
# ======================================
# 2. MODEL SELECTION (base models, no tuning)
# ======================================

# Decision Tree baseline: Gini splits, no depth limit, balanced class weights
dt_base = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=None,
    criterion="gini",
)
# Left as the cell's last expression so the returned metrics dict is displayed
evaluate_model_cv(
    name="Decision Tree (base)",
    estimator=dt_base,
    X=X_train,
    y=y_train,
)


=== Decision Tree (base): Cross-Validation (5-fold) ===
Accuracy : 0.9117783985102421
Precision: 0.7829218106995884
Recall   : 0.8191603875134553
F1-score : 0.8006312467122567
ROC-AUC  : 0.8782466624231963

Classification report:
              precision    recall  f1-score   support

     Ham (0)       0.95      0.94      0.94      3367
    Spam (1)       0.78      0.82      0.80       929

    accuracy                           0.91      4296
   macro avg       0.87      0.88      0.87      4296
weighted avg       0.91      0.91      0.91      4296



{'model': 'Decision Tree (base)',
 'accuracy': 0.9117783985102421,
 'precision': 0.7829218106995884,
 'recall': 0.8191603875134553,
 'f1': 0.8006312467122567,
 'roc_auc': 0.8782466624231963}

In [5]:
# Linear-kernel SVM baseline with balanced class weights.
# probability=True exposes predict_proba, which the evaluation helper prefers
# over decision_function when computing ROC-AUC.
svm_base = SVC(
    random_state=42,
    class_weight="balanced",
    probability=True,
    C=1.0,
    kernel="linear",
)
# Left as the cell's last expression so the returned metrics dict is displayed
evaluate_model_cv(
    name="SVM (base)",
    estimator=svm_base,
    X=X_train,
    y=y_train,
)


=== SVM (base): Cross-Validation (5-fold) ===
Accuracy : 0.9718342644320298
Precision: 0.954954954954955
Recall   : 0.9128094725511302
F1-score : 0.9334067143643369
ROC-AUC  : 0.9937198344087472

Classification report:
              precision    recall  f1-score   support

     Ham (0)       0.98      0.99      0.98      3367
    Spam (1)       0.95      0.91      0.93       929

    accuracy                           0.97      4296
   macro avg       0.97      0.95      0.96      4296
weighted avg       0.97      0.97      0.97      4296



{'model': 'SVM (base)',
 'accuracy': 0.9718342644320298,
 'precision': 0.954954954954955,
 'recall': 0.9128094725511302,
 'f1': 0.9334067143643369,
 'roc_auc': 0.9937198344087472}

In [6]:
# Small neural-network baseline: two hidden layers of 10 ReLU units each,
# trained with Adam, regularisation strength alpha=0.1, up to 1000 iterations
mlp_base = MLPClassifier(
    random_state=42,
    max_iter=1000,
    alpha=0.1,
    solver="adam",
    activation="relu",
    hidden_layer_sizes=(10, 10),
)
# Left as the cell's last expression so the returned metrics dict is displayed
evaluate_model_cv(
    name="MLP (base)",
    estimator=mlp_base,
    X=X_train,
    y=y_train,
)


=== MLP (base): Cross-Validation (5-fold) ===
Accuracy : 0.9695065176908753
Precision: 0.9761336515513126
Recall   : 0.8805166846071044
F1-score : 0.9258630447085455
ROC-AUC  : 0.9941977842946627

Classification report:
              precision    recall  f1-score   support

     Ham (0)       0.97      0.99      0.98      3367
    Spam (1)       0.98      0.88      0.93       929

    accuracy                           0.97      4296
   macro avg       0.97      0.94      0.95      4296
weighted avg       0.97      0.97      0.97      4296



{'model': 'MLP (base)',
 'accuracy': 0.9695065176908753,
 'precision': 0.9761336515513126,
 'recall': 0.8805166846071044,
 'f1': 0.9258630447085455,
 'roc_auc': 0.9941977842946627}

In [7]:
# ============================
# Tuning grid for Linear SVM
# ============================

# Same fold layout as the baseline evaluation: 5 shuffled stratified folds, seed 42
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator; only C is varied by the search below
svm = SVC(
    random_state=42,
    class_weight="balanced",
    probability=True,
    kernel="linear",
)

# Candidate regularisation strengths
param_grid = dict(C=[0.01, 0.1, 1, 5, 10])

# Candidates are ranked by F1 on the positive (spam) class; all cores are used
grid = GridSearchCV(
    svm,
    param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=cv,
    verbose=1,
)

# Run GridSearch
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best F1 score:  ", grid.best_score_)

# Retrieve tuned model (the estimator built from the best-scoring parameters)
svm_best = grid.best_estimator_


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'C': 1}
Best F1 score:   0.9333983822783054


In [10]:
# Refit the tuned SVM on the full training matrix, then label the test set.
# (fit() returns the estimator itself, so the two steps can be chained.)
y_pred_test = svm_best.fit(X_train, y_train).predict(X_test)

# Write the predictions as integers, one per line
np.savetxt(
    "./SpamEmailDetection/Dao_Corona_SpamDectection.txt",
    y_pred_test,
    fmt="%d",
)