## Import required libraries

In [2]:
import pandas as pd
import numpy as np

## Load Dataset

In [3]:
# Load dataset
data = pd.read_csv("CEAS_08.csv")

# Display first 5 rows
data.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


## Understand Dataset

In [4]:
# Check columns
data.columns

Index(['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls'], dtype='object')

## Check Missing Values

In [5]:
# Check missing values
data.isnull().sum()

sender        0
receiver    462
date          0
subject      28
body          0
label         0
urls          0
dtype: int64

## Combine Email Content

In [8]:
# Convert everything to string before combining

data['combined_text'] = data['subject'].fillna('').astype(str) + " " + \
                        data['body'].fillna('').astype(str) + " " + \
                        data['urls'].fillna('').astype(str)

## Define Input & Output

In [10]:
# Input feature
X = data['combined_text']

# Output label
y = data['label']

## Convert Text into Numbers (NLP)  AI-Understandable Format

- Use TF-IDF (industry standard for phishing detection)   This removes noise words and improves detection quality.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text into numeric representation
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

X_vectorized = vectorizer.fit_transform(X)

## Train/Test Split

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y,
    test_size=0.2,
    random_state=42
)

## Train Multiple Models

### Model 1 — Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

### Model 2 — Naive Bayes (Great for text)

In [41]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

### Model 3 — Random Forest (Adds depth)

In [50]:
from sklearn.ensemble import RandomForestClassifier

# Fast version of Random Forest
rf = RandomForestClassifier(
    n_estimators=30,   # fewer trees = faster
    max_depth=15,      # limits tree size
    n_jobs=-1          # uses all CPU cores
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

### Compare Accuracy

In [52]:
from sklearn.metrics import accuracy_score

print("Logistic Regression:", accuracy_score(y_test, lr_pred))
print("Naive Bayes:", accuracy_score(y_test, nb_pred))
print("Fast Random Forest:", accuracy_score(y_test, rf_pred))

Logistic Regression: 0.9941259098454859
Naive Bayes: 0.9927212361128847
Fast Random Forest: 0.9463670029370451


## Evaluate Model

### Classification Report

In [54]:
from sklearn.metrics import classification_report

print("Classification Report:\n")
print(classification_report(y_test, lr_pred))

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3490
           1       0.99      1.00      0.99      4341

    accuracy                           0.99      7831
   macro avg       0.99      0.99      0.99      7831
weighted avg       0.99      0.99      0.99      7831



### Confusion Matrix

In [56]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, lr_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[3461   29]
 [  17 4324]]


## Save Best Model

In [58]:
import joblib

joblib.dump(lr, "phishing_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

## Real Testing

In [61]:
sample = ["Your account has been suspended. Click here to verify"]

sample_vector = vectorizer.transform(sample)

prediction = lr.predict(sample_vector)

print("Prediction:", prediction)

Prediction: [1]


## Interpret Result
- 0 = safe
1 = phishing

In [63]:
data['label'].value_counts()

label
1    21842
0    17312
Name: count, dtype: int64

## Feature Importance  
- Shows what AI looks for.
- This makes your project look analytical.

In [75]:
feature_names = vectorizer.get_feature_names_out()
coefficients = lr.coef_[0]

top_phishing_words = sorted(
    zip(coefficients, feature_names),
    reverse=True
)[:10]

print("Top Phishing Indicators:")
for coef, word in top_phishing_words:
    print(word)

Top Phishing Indicators:
com
love
cnn
health
men
http
life
livefilestore
watches
replica
