<a href="https://colab.research.google.com/github/krishnamalani1164/email-phishing-detection/blob/main/phishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split ,cross_val_score,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score ,roc_curve
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
print("Email Phishing Detection Pipeline")
print("=" * 50)

Email Phishing Detection Pipeline


In [19]:
def load_csv_data():
    """
    Load email spam detection dataset from CSV file: emails.csv
    Expected format:
    - Column 1: Email identifiers
    - Columns 2-3001: Word frequency features (3000 most common words)
    - Column 3002: Labels (1 for spam, 0 for not spam)
    """
    try:
        # Try to load emails.csv directly (if already uploaded to Colab)
        csv_filename = 'emails.csv'
        df = pd.read_csv(csv_filename)
        print(f"Dataset loaded successfully from {csv_filename}")
    except FileNotFoundError:
        # If file not found, prompt user to upload
        print("emails.csv not found. Please upload your dataset file:")
        from google.colab import files
        uploaded = files.upload()

        # Check if emails.csv was uploaded
        if 'emails.csv' in uploaded:
            csv_filename = 'emails.csv'
        else:
            # If a different filename was uploaded, use the first one and rename it
            uploaded_filename = list(uploaded.keys())[0]
            print(f"Renaming {uploaded_filename} to emails.csv")
            import os
            os.rename(uploaded_filename, 'emails.csv')
            csv_filename = 'emails.csv'

        # Load the dataset
        df = pd.read_csv(csv_filename)
        print(f"Dataset loaded successfully from {csv_filename}")

    print(f"Dataset shape: {df.shape}")

    # Verify the expected structure
    expected_cols = 3002
    if df.shape[1] != expected_cols:
        print(f"Warning: Expected {expected_cols} columns, found {df.shape[1]} columns")
        print("Please verify your dataset structure matches the expected format")

    # Extract features and labels according to your dataset structure
    email_ids = df.iloc[:, 0]  # First column: Email identifiers
    word_features = df.iloc[:, 1:-1]  # Columns 2-3001: Word frequency features
    labels = df.iloc[:, -1]  # Last column: Labels

    # Verify labels are binary (0 and 1)
    unique_labels = labels.unique()
    print(f"Unique labels found: {unique_labels}")
    if not all(label in [0, 1] for label in unique_labels):
        print("Warning: Labels should be binary (0 for not spam, 1 for spam)")

    print(f"Email IDs shape: {email_ids.shape}")
    print(f"Word features shape: {word_features.shape}")
    print(f"Labels shape: {labels.shape}")

    return df, word_features, labels, email_ids


In [20]:
print("Loading dataset...")
df, word_features, labels, email_ids = load_csv_data()

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
print(df.info())
print("\nClass Distribution:")
print(labels.value_counts())
print("\nClass Distribution Percentage:")
print(labels.value_counts(normalize=True) * 100)

Loading dataset...
Dataset loaded successfully from emails.csv
Dataset shape: (5172, 3002)
Unique labels found: [0 1]
Email IDs shape: (5172,)
Word features shape: (5172, 3000)
Labels shape: (5172,)
Dataset Shape: (5172, 3002)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB
None

Class Distribution:
Prediction
0    3672
1    1500
Name: count, dtype: int64

Class Distribution Percentage:
Prediction
0    70.99768
1    29.00232
Name: proportion, dtype: float64


In [21]:
# We'll use this as our feature matrix X
X = word_features.values
y = labels.values

In [22]:
print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Check data sparsity
sparsity = (X == 0).sum() / (X.shape[0] * X.shape[1])
print(f"Data sparsity: {sparsity:.2%}")

# Display some basic statistics
print(f"\nFeature statistics:")
print(f"Mean word count per email: {X.mean(axis=1).mean():.2f}")
print(f"Max word count in any email: {X.max()}")
print(f"Min word count in any email: {X.min()}")


Feature matrix shape: (5172, 3000)
Target vector shape: (5172,)
Data sparsity: 94.37%

Feature statistics:
Mean word count per email: 0.39
Max word count in any email: 2327
Min word count in any email: 0


In [23]:
# Show example of word frequency for first few emails
print(f"\nFirst 5 emails word count statistics:")
for i in range(min(5, len(X))):
    total_words = X[i].sum()
    non_zero_features = (X[i] > 0).sum()
    label_text = "Spam" if y[i] == 1 else "Not Spam"
    print(f"Email {i+1}: {total_words} total words, {non_zero_features} unique words, Label: {label_text}")


First 5 emails word count statistics:
Email 1: 53 total words, 34 unique words, Label: Not Spam
Email 2: 2203 total words, 276 unique words, Label: Not Spam
Email 3: 113 total words, 45 unique words, Label: Not Spam
Email 4: 1019 total words, 172 unique words, Label: Not Spam
Email 5: 1075 total words, 167 unique words, Label: Not Spam
