In [None]:
!pip install transformers -q
!pip install datasets -q
!pip install spacy -q
!pip install wordcloud -q

In [None]:
# Core Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from wordcloud import WordCloud

# NLP & Text Preprocessing
import re
from bs4 import BeautifulSoup
from collections import Counter

# Scikit-learn: Data Handling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Scikit-learn: Text Processing
from sklearn.feature_extraction.text import TfidfVectorizer

# Scikit-learn: Evaluation Metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# Scikit-learn: Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Utilities
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
pip install kagglehub -q

In [None]:
import kagglehub

# Pull the latest published version of the dataset; the call returns the local directory it was stored in
email_path = kagglehub.dataset_download(
    "naserabdullahalam/phishing-email-dataset"
)
print("Path to Email dataset files:", email_path)

In [None]:
import os

# Enumerate the entries found in the downloaded dataset directory
email_files = os.listdir(email_path)
print("Files in Email dataset:", email_files)

In [None]:
# Load Ling.csv from the downloaded dataset directory and preview the first rows.
# The frame is left as the cell's last expression so the notebook renders it as a
# table instead of printing its plain-text repr.
email_df = pd.read_csv(os.path.join(email_path, 'Ling.csv'))
email_df.head()

In [None]:
# (rows, columns) of the freshly loaded frame
email_df.shape

In [None]:
# Column dtypes and non-null counts of the raw frame (reveals columns with missing values)
email_df.info()

In [None]:
# Class balance of the target column (later cells treat 1 as spam and 0 as real e-mail)
email_df['label'].value_counts()

In [None]:
# Number of fully duplicated rows; this cell only counts them, it does not remove any
email_df.duplicated().sum()

In [None]:
# Merge subject and body into a single text field.
# Missing parts are treated as empty strings first: a plain `subject + ' ' + body`
# evaluates to NaN whenever either side is NaN, which would make the later dropna()
# discard e-mails that merely lack a subject line.
# Rows whose subject AND body are both empty are set back to NaN so they are still dropped.
email_text = (email_df['subject'].fillna('') + ' ' + email_df['body'].fillna('')).str.strip()
email_df['email'] = email_text.mask(email_text.eq(''))

In [None]:
# The combined 'email' column replaces the two source columns, so discard them
email_df = email_df.drop(['subject', 'body'], axis=1)

In [None]:
# Preview the frame after subject/body were merged into 'email'
email_df.head()

In [None]:
# Non-null counts after the merge, to see how many rows have a missing value
email_df.info()

In [None]:
# Keep only rows in which every column has a value
email_df = email_df.dropna(axis=0, how='any')

In [None]:
# Confirm that no missing values remain after dropna
email_df.info()

In [None]:
# Class balance after rows with missing values were removed
email_df['label'].value_counts()

In [None]:
# Fix
!python -m spacy download en_core_web_sm
!pip install lxml

In [None]:
# Load the spaCy English model.
# Named-entity recognition is switched off: preprocessing only reads token text,
# lemmas and stop-word flags, so running NER on every e-mail is wasted work.
# NOTE(review): the parser is deliberately kept — spaCy documents that the English
# attribute ruler can use dependency parses for its tag->POS mapping, which feeds the lemmatizer.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

# Purely alphabetic, lower-case sentinels that stand in for links and e-mail addresses.
# The previous '[URL]' / '[EMAIL]' markers were compared in upper case *after* the text
# had been lower-cased, so that comparison could never succeed.
URL_TOKEN = 'urltoken'
EMAIL_TOKEN = 'emailtoken'
_PLACEHOLDER_TOKENS = frozenset({URL_TOKEN, EMAIL_TOKEN})

# Compiled once instead of on every call
_URL_RE = re.compile(r'(https?://\S+|www\.\S+)')
_EMAIL_RE = re.compile(r'\S+@\S+')


def preprocess_email(text):
    """Normalise one raw e-mail into a space-separated string of tokens.

    Steps:
      1. Non-string input (e.g. NaN) yields an empty string.
      2. HTML markup is stripped with BeautifulSoup (lxml parser).
      3. Links are replaced by ``URL_TOKEN`` and e-mail addresses by ``EMAIL_TOKEN``.
      4. The text is lower-cased and run through the module-level spaCy pipeline ``nlp``.
      5. Placeholder tokens are kept verbatim; every other token is kept only if it
         is alphabetic and not a stop word, and is replaced by its lemma.

    Args:
        text: Raw e-mail text; any non-``str`` value is treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    # Handle missing values
    if not isinstance(text, str):
        return ""

    # Remove HTML tags, keeping a space where markup separated pieces of text
    text = BeautifulSoup(text, 'lxml').get_text(separator=' ')

    # Replace links and addresses with their sentinels; the padding guarantees each
    # sentinel is tokenised as a word of its own
    text = _URL_RE.sub(f' {URL_TOKEN} ', text)
    text = _EMAIL_RE.sub(f' {EMAIL_TOKEN} ', text)

    # Process with spaCy
    doc = nlp(text.lower())

    tokens = []
    for token in doc:
        if token.text in _PLACEHOLDER_TOKENS:
            # Keep sentinels as-is: no lemmatisation, no stop-word filtering
            tokens.append(token.text)
        elif token.is_alpha and not token.is_stop:
            tokens.append(token.lemma_)

    # Join tokens back to string
    return ' '.join(tokens)

# Run the cleaning function over every e-mail, element by element
email_df['cleaned_email'] = email_df['email'].map(preprocess_email)

In [None]:
# Preview the raw 'email' text next to its 'cleaned_email' counterpart
email_df.head()

In [None]:
# Concatenate the cleaned e-mails of each class (0 = real, 1 = spam) into one string per class;
# these feed the word-frequency bar plots
real_emails = ' '.join(email_df.loc[email_df['label'] == 0, 'cleaned_email'])
spam_emails = ' '.join(email_df.loc[email_df['label'] == 1, 'cleaned_email'])

def most_common_words(text, title, n, filename):
    """Plot the `n` most frequent whitespace-separated words of `text` as a bar chart.

    The chart is titled `title`, written to `filename` (300 dpi, tight bounding box)
    and then shown. When `text` is empty or yields no words, a message is printed
    and nothing is plotted.
    """
    # Count words only when there is text at all; otherwise fall through to the message
    top_words = Counter(text.split()).most_common(n) if text else []
    if not top_words:
        print(f'No words found for {title}')
        return

    labels, frequencies = zip(*top_words)

    plt.figure(figsize=(10,5))
    plt.bar(labels, frequencies, color='#33f')
    plt.title(title)
    plt.xticks(rotation=45)

    # ✅ Save as image
    plt.savefig(filename, dpi=300, bbox_inches='tight')

    plt.show()

# One bar chart of the 20 most frequent words per class
for corpus, chart_title, image_file in (
    (real_emails, 'Top Real Words', 'top_real_Email_Words.png'),
    (spam_emails, 'Top Spam Words', 'top_spam_Email_Words.png'),
):
    most_common_words(corpus, chart_title, 20, image_file)

In [None]:
# Word clouds of the most frequent words, one per class
real_WordCloud = WordCloud(
    width=800, height=400, background_color='white', colormap='Greens'
).generate(real_emails)
spam_WordCloud = WordCloud(
    width=800, height=400, background_color='white', colormap='Reds'
).generate(spam_emails)

plt.figure(figsize=(15,10))

# Draw both clouds side by side: real e-mails on the left, spam on the right
for panel, (cloud, heading) in enumerate(
    [
        (real_WordCloud, 'Most frequent Real Email Words'),
        (spam_WordCloud, 'Most frequent Spam Words'),
    ],
    start=1,
):
    plt.subplot(1, 2, panel)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')

plt.tight_layout()

# ✅ Save as image
plt.savefig('word_cloud.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Independent copy for the modelling cells; the last line displays its (rows, columns)
data = email_df.copy(deep=True)
data.shape

In [None]:
# Expansion 01
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Features / target, then an 80/20 split stratified on the label
X, y = data['cleaned_email'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23, stratify=y
)

# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000, random_state=23),
    'Linear SVM': LinearSVC(random_state=23),
    'Multinomial NB': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=23),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=23),
}

# name -> test accuracy
result = {}

for name, estimator in models.items():
    # TF-IDF sits inside the pipeline so its vocabulary is learned from the training split only
    pipeline = Pipeline(steps=[
        ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
        ('regressor', estimator),
    ])

    # Fit on the training split, predict the held-out split
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    result[name] = accuracy
    report = classification_report(y_test, y_pred)

    print(
        f'Model: {name}',
        f'Accuracy: {accuracy:.4f}',
        f'Classification Report:\n{report}\n',
        '-' * 30,
        sep='\n',
    )

In [None]:
# Expansion 02
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Features / target, then an 80/20 split stratified on the label
X, y = data['cleaned_email'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23, stratify=y
)

# Wider set of candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000, random_state=23),
    'Linear SVM': LinearSVC(random_state=23),
    'Multinomial NB': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=23),
    'SGD Classifier': SGDClassifier(loss='hinge', penalty='l2', random_state=23),
    'Passive Aggressive': PassiveAggressiveClassifier(max_iter=1000, random_state=23),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=23),
}

# name -> test accuracy
result = {}

for name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
        ('regressor', estimator),
    ])

    # Fit on the training split, predict the held-out split
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    result[name] = accuracy
    report = classification_report(y_test, y_pred)

    print(
        f'Model: {name}',
        f'Accuracy: {accuracy:.4f}',
        f'Classification Report\n {report}\n',
        '-' * 30,
        sep='\n',
    )

In [None]:
# Expansion 03
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Train-Test Split (80/20, stratified on the label)
X = data['cleaned_email']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23, stratify=y)

# Expanded Model Dictionary
models = {
    # Linear & Fast Models
    'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000, random_state=23),
    'Linear SVM': LinearSVC(random_state=23),
    'Ridge Classifier': RidgeClassifier(random_state=23),
    'SGD Classifier': SGDClassifier(loss='modified_huber', random_state=23), # modified_huber gives probas

    # Naive Bayes (Standard for Text)
    'Multinomial NB': MultinomialNB(),
    'Complement NB': ComplementNB(), # Better for imbalanced text data

    # Tree Ensembles
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=23),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=23),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=23),

    # Gradient Boosting (State-of-the-art)
    # The target is binary (labels 0/1), so the binary 'logloss' metric is used instead of the
    # multiclass 'mlogloss'. The obsolete `use_label_encoder` argument is no longer passed.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=23),
    'LightGBM': LGBMClassifier(random_state=23, verbose=-1)
}

# name -> test accuracy
result = {}

for name, classifier in models.items():
    # The step keeps its historical name 'regressor' so existing lookups by step name keep working
    pipeline = Pipeline([
        ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
        ('regressor', classifier)
    ])

    # Train and evaluate
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    result[name] = accuracy

    print(f'Model: {name}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Classification Report:\n {classification_report(y_test, y_pred)}\n')
    print('=' * 40)

In [None]:
# Original Expanded
# Features / target, then an 80/20 split stratified on the label
X, y = data['cleaned_email'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23, stratify=y
)

# The two baseline classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000, random_state=23),
    'SVM': LinearSVC(random_state=23),
}

# name -> test accuracy
result = {}

for name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
        ('regressor', estimator),
    ])

    # Fit on the training split, predict the held-out split
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    result[name] = accuracy
    report = classification_report(y_test, y_pred)

    print(
        f'Model: {name}',
        f'Accuracy: {accuracy}',
        f'Classification Report\n {report}\n',
        '-' * 30,
        sep='\n',
    )

In [None]:
# Pick the entry of `result` with the highest accuracy (the first one wins on ties)
best_model, best_accuracy = max(result.items(), key=lambda entry: entry[1])
print(f'The best model is {best_model}, with an accuracy of {best_accuracy}')

In [None]:
# Confusion Matrix
for name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
        ('regressor', estimator),
    ])

    # Refit on the training split and predict the held-out split
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Rows/columns are ordered spam (1) first, then real (0), matching display_labels
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['Spam Emails', 'Real Emails'])

    fig, ax = plt.subplots(figsize=(10,5))
    cm_display.plot(cmap='Blues', ax=ax)
    plt.title(f'Confusion Matrix: {name}')
    plt.tight_layout()

    # ✅ Save as image
    image_file = f'confusion_matrix_{name.replace(" ", "_").lower()}.png'
    plt.savefig(image_file, dpi=300, bbox_inches='tight')

    plt.show()

In [None]:
# Compare Model Accuracies
# The dict views are materialised as lists first: views are not sequences, and how
# seaborn treats non-sequence iterables as data vectors depends on the seaborn version.
model_names = list(result.keys())
accuracies = list(result.values())

plt.figure(figsize=(8,5))
sns.barplot(x=accuracies, y=model_names, color='#33f')
plt.xlabel('Accuracy')
plt.ylabel('Models')
plt.xticks(rotation=45)
plt.title('Model Accuracy Comparison')
plt.tight_layout()

# ✅ Save as image
plt.savefig('compare_model_accuracy.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Testing on hand-written example e-mails
from sklearn.base import clone

sample_emails=[
    'CONGRATULATIONS! Your email address has been selected as the winner of the $1,000,000 Microsoft Promotion. To claim your prize, reply with your bank details immediately.',
    'Hi Team, please find the minutes of our last meeting attached. We need to finalize the project budget by Friday. Let\'s meet on Zoom at 2 PM to discuss.',
    'webcam dating is hot - - - - - - - - - - - - - - - - please no more'
]

# The samples must go through the same cleaning as the training text
cleaned_samples = [preprocess_email(email) for email in sample_emails]

# Use the classifier that actually scored best in `result`, with the same TF-IDF
# settings it was evaluated with (previously a LinearSVC with a default vectorizer was
# hard-coded here). A separate name is used so the `best_model` string is not overwritten
# by a Pipeline object; clone() gives an unfitted copy so no estimator is shared.
best_name = max(result, key=result.get)
best_pipeline = Pipeline([
    ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
    ('regressor', clone(models[best_name]))
])

best_pipeline.fit(X_train, y_train)
preds = best_pipeline.predict(cleaned_samples)

for i, (raw_email, predicted_class) in enumerate(zip(sample_emails, preds)):
    label = "SPAM 🚨" if predicted_class == 1 else "REAL ✅"
    print(f'\n{i+1}. Email: {raw_email}')
    print(f'Prediction: {label} (Class {predicted_class})')

In [None]:
# Save the best model after training
from joblib import dump
from sklearn.base import clone

# Build and fit the pipeline to persist explicitly instead of dumping `pipeline`,
# which is just whatever the last iteration of an earlier loop left behind.
# The classifier with the highest accuracy in `result` is used, with the same
# TF-IDF settings as in the evaluation; clone() yields a fresh, unfitted copy.
saved_name = max(result, key=result.get)
saved_pipeline = Pipeline([
    ('preprocessor', TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)),
    ('regressor', clone(models[saved_name]))
])
saved_pipeline.fit(X_train, y_train)

dump(saved_pipeline, 'model.joblib')

print('✅ Pipeline trained and saved as model.joblib')