In [None]:
# Classification model for phishing email detection
# trained in collaboration with ChatGPT

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
# Load dataset


df = pd.read_csv("CEAS_08.csv", engine="python", on_bad_lines="skip", encoding="utf-8")

print(df.columns)

X = df["subject"].fillna('') + ' ' + df["body"].fillna('')
y = df["label"]

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        max_features=7000,
        ngram_range=(1, 2)
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced"
    ))
])

scores = cross_val_score(
    model,
    X,
    y,
    cv=3,
    scoring="f1_macro"
)

print("F1 scores:", scores)
print("Mean F1:", scores.mean())

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


email = """
Security alert:
We detected unusual activity on your account.
Please review immediately.
"""

probs = model.predict_proba([email])[0]
pred = model.predict([email])[0]
confidence = probs.max()

# Set a confidence threshold
threshold = 0.8

if confidence < threshold:
    pred_label = "Suspicious / Unknown"
else:
    pred_label = pred

print(f"Prediction: {pred_label}")
print(f"Confidence: {confidence:.2f}")

Index(['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls'], dtype='object')
F1 scores: [0.9623577  0.96453572 0.97077232]
Mean F1: 0.9658885805276215
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       179
           1       0.99      0.96      0.97       267

    accuracy                           0.97       446
   macro avg       0.96      0.97      0.97       446
weighted avg       0.97      0.97      0.97       446

Prediction: Suspicious / Unknown
Confidence: 0.64


In [24]:
# --- Save model for backend ---
import pickle
import os

# Ensure folder exists
os.makedirs("models", exist_ok=True)

model_output_path = os.path.join("models", "phishing_model.pkl")
pickle.dump(
    {
        "model": model.named_steps["clf"],
        "vectorizer": model.named_steps["tfidf"],
    },
    open(model_output_path, "wb"),
)
print(f"Model saved to {model_output_path}")

# Optional: single email test
email = """
Security alert:
We detected unusual activity on your account.
Please review immediately.
"""
probs = model.predict_proba([email])[0]
pred = model.predict([email])[0]
confidence = probs.max()

threshold = 0.8
pred_label = pred if confidence >= threshold else "Suspicious / Unknown"

print(f"Prediction: {pred_label}")
print(f"Confidence: {confidence:.2f}")

Model saved to models/phishing_model.pkl
Prediction: Suspicious / Unknown
Confidence: 0.64
