In [68]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the training dataset from a path relative to the notebook's working directory.
data = pd.read_csv("data/final_dataset.csv")

In [3]:
# Preview the first three rows to check the columns that were read.
data.head(3)

Unnamed: 0,spam,subject,length,message
0,1,naturally irresistible your corporate identit...,338,naturally irresistible corporate identity lt r...
1,1,the stock trading gunslinger fanny is merril...,103,stock trading merrill like group try kansa yes...
2,1,unbelievable new homes made easy im wanting ...,103,unbelievable new home made easy im wanting sho...


In [15]:
# Drop every row that has a missing value in any column.
# `data` is rebound, so the unfiltered frame is no longer available after this cell.
data = data.dropna()

In [4]:
# Class balance of the label column.
# NOTE(review): the execution counts show this cell was run before the dropna
# cell, so the displayed counts may include rows that were dropped afterwards.
data['spam'].value_counts()

spam
1    4439
0    3988
Name: count, dtype: int64

**Load a lightweight BERT model (DistilBERT)**

In [None]:
# Module-level DistilBERT tokenizer and model.
# NOTE(review): nothing else in the visible notebook references these two names
# (EmailEmbedder loads its own tokenizer/model) and this cell has no execution
# count, so it may be removable. Confirm before deleting.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

In [9]:
class EmailEmbedder:
    """Turn text into fixed-size vectors with a pretrained transformer.

    The tokenizer and model are loaded once at construction time and the
    model is moved to the selected device.
    """

    def __init__(self, model_name='distilbert-base-uncased', device=None):
        # Honour an explicit device; otherwise prefer CUDA when it is available.
        if device:
            self.device = device
        elif torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def embed_text(self, text):
        """Return the CLS-token embedding of ``text`` as a NumPy array."""
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        # Move every tokenizer output onto the same device as the model.
        on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            result = self.model(**on_device)
        # Index 0 along the sequence axis is the CLS token.
        first_token = result.last_hidden_state[:, 0, :]
        return first_token.cpu().numpy().squeeze()

    def embed_batch(self, messages):
        """Embed each message one at a time and stack the results into an array."""
        return np.array([self.embed_text(msg) for msg in messages])

In [16]:
# Initialize embedder (loads the default DistilBERT tokenizer and model)
embedder = EmailEmbedder()
# Feature matrix: one embedding per row of data['message'].
# embed_batch runs the model once per message, so this cell is slow on large frames.
X = embedder.embed_batch(data['message'])
# Label vector taken from the 'spam' column.
y = data['spam'].values

In [76]:
# --- Hold out 20% of the rows, stratified on the label, with a fixed seed ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [77]:
# L2-regularised logistic regression trained on the embeddings.
# class_weight='balanced' weights each class inversely to its frequency.
clf = LogisticRegression(
    penalty='l2',
    C=10,
    solver='saga',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

In [78]:
# Fit the classifier on the training split only.
clf.fit(X_train, y_train)



In [79]:
# --- Evaluate ---
# Predict on the hold-out split and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       798
           1       0.98      0.98      0.98       887

    accuracy                           0.98      1685
   macro avg       0.98      0.98      0.98      1685
weighted avg       0.98      0.98      0.98      1685



**Testing on a second dataset**

In [80]:
# Load a second CSV; the cells below evaluate the trained classifier on it.
df_test = pd.read_csv("data/spam_data.csv")

In [81]:
# Encode the string labels as integers (spam -> 1, ham -> 0).
# Guarded so the cell is safe to re-run: Series.map turns every value that is
# not a key of the dict into NaN, so mapping an already-encoded column a second
# time would wipe out all labels.
if df_test['Spam/Ham'].isin(['spam', 'ham']).any():
    df_test['Spam/Ham'] = df_test['Spam/Ham'].map({'spam': 1, 'ham': 0})

In [82]:
# Drop rows with a missing value in any column. This also removes rows whose
# label was not 'spam' or 'ham', because Series.map left those as NaN.
df_test = df_test.dropna()

In [83]:
# --- Labels for the full second dataset ---
# 'Spam/Ham' was already converted to 1/0 in an earlier cell. Applying the
# {'spam': 1, 'ham': 0} map again would turn every label into NaN, so the
# column is read directly.
# NOTE: this covers every row of df_test, not the 5000-row sample drawn below,
# and it rebinds the y_test name used for the hold-out split.
y_test = df_test['Spam/Ham'].values

# Separate spam and ham
spam_df = df_test[df_test['Spam/Ham'] == 1]
ham_df = df_test[df_test['Spam/Ham'] == 0]

# Draw 2500 rows from each class with a fixed seed.
# sklearn's resample samples WITH replacement by default, so the same row can
# appear more than once in a sample.
spam_sample = resample(spam_df, n_samples=2500, random_state=42)
ham_sample = resample(ham_df, n_samples=2500, random_state=42)

In [84]:
# Stack the spam and ham samples into one evaluation frame (spam rows first).
# ignore_index is not set, so each row keeps its index label from df_test.
test_data = pd.concat([spam_sample, ham_sample])

In [86]:
# --- Embed messages using the trained embedder ---
# X_test and y_pred are rebound here; the hold-out values from the train/test
# split are no longer available after this cell.
X_test = embedder.embed_batch(test_data['Message'])

# --- Predict using trained Logistic Regression classifier ---
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # probability of spam

# --- Evaluation ---
# accuracy_score and classification_report come from the import cell at the top.
print("Accuracy:", accuracy_score(test_data['Spam/Ham'], y_pred))
print(classification_report(test_data['Spam/Ham'], y_pred))

Accuracy: 0.9346
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      2500
           1       0.92      0.96      0.94      2500

    accuracy                           0.93      5000
   macro avg       0.94      0.93      0.93      5000
weighted avg       0.94      0.93      0.93      5000



In [98]:
# Sample email text used for a one-off prediction.
message = """Dear Albert:

Thanks to all who have responded to this call for volunteers. We have heard from a sufficient number of interested volunteers and have closed the call. We are reviewing the responses and will communicate next steps to those who have responded.

 

Do not reply to this email. This mailbox is not monitored for responses to calls for volunteers.

To unsubscribe from SWB emails, see Unsubscribe on the Technical Support page. """

# embed_batch iterates over its argument, so wrap the single string in a list
X_test = embedder.embed_batch([message])

# --- Score the message with the trained Logistic Regression classifier ---
y_pred = clf.predict(X_test)
spam_prob = clf.predict_proba(X_test)[:, 1]

# Label as spam (1) only when the spam probability is at least 0.9, else ham (0)
label = 1 if spam_prob[0] >= 0.9 else 0
print(spam_prob)
print(label)

[0.32839742]
0


**Saving the model to a file**

In [93]:
# Persist the fitted classifier so it can be reloaded without retraining.
# Only the LogisticRegression object is saved; an EmailEmbedder must be
# re-created at load time to produce its input features.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first (a no-op when it is already there).
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(clf, "models/logreg_spam_classifier.joblib")

['models/logreg_spam_classifier.joblib']