# Spam Classifier

### Preliminary Preparations

In [1]:
import pandas as pd 
df= pd.read_csv('./data/emails.csv')
X= df.iloc[:, :-1].values
y= df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Vectorization

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df['text'])

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Multinomial Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9869109947643979
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       885
           1       0.96      0.99      0.97       261

    accuracy                           0.99      1146
   macro avg       0.98      0.99      0.98      1146
weighted avg       0.99      0.99      0.99      1146



In [5]:
import joblib
joblib.dump(model, 'spam_classifier_nb.pkl')
joblib.dump(cv, 'vectorizer.pkl')

['vectorizer.pkl']

### Support Vector Machine

In [6]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.9851657940663177
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       885
           1       0.97      0.96      0.97       261

    accuracy                           0.99      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146





In [7]:
joblib.dump(svm_model, 'spam_classifier_svm.pkl')

['spam_classifier_svm.pkl']

### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

log_reg = LogisticRegression(
    solver='liblinear',
    max_iter=2000,  
    class_weight=None
)
log_reg.fit(X_train, y_train)

y_pred_lr   = log_reg.predict(X_test)
y_proba_lr  = log_reg.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_lr))
print(classification_report(y_test, y_pred_lr))

Accuracy: 0.9886561954624782
ROC-AUC: 0.9990951793406498
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       885
           1       0.97      0.98      0.98       261

    accuracy                           0.99      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146



In [9]:
joblib.dump(log_reg, 'spam_classifier_logreg.pkl')

['spam_classifier_logreg.pkl']

### Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,      # try 200–800
    max_depth=None,       # try 20–60 to curb overfitting
    max_features='sqrt',  # good default for RF
    min_samples_split=2,
    n_jobs=-1,
    random_state=0
)

rf.fit(X_train, y_train)

y_pred_rf  = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.9738219895287958
ROC-AUC: 0.997458709440007
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       885
           1       0.98      0.90      0.94       261

    accuracy                           0.97      1146
   macro avg       0.98      0.95      0.96      1146
weighted avg       0.97      0.97      0.97      1146



In [11]:
joblib.dump(log_reg, 'spam_classifier_randforest.pkl')

['spam_classifier_randforest.pkl']