# Phishing URL EDA

This notebook creates:
- A class distribution plot
- Basic feature summary statistics
- Per-class feature means


In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
SRC_DIR = PROJECT_ROOT.joinpath('src')

# Register src/ on the import path only when it is absent, so re-running
# this cell does not append duplicate entries.
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.append(str(SRC_DIR))

from data import load_phishing_dataset, build_feature_matrix

# The dataset CSV is read from inside the src/ directory.
DATA_PATH = SRC_DIR.joinpath('phishing_site_urls.csv')
# Echo the resolved location so a wrong working directory is easy to spot.
print(f'Using dataset: {DATA_PATH}')


In [None]:
# Read the raw CSV directly so the label column can be inspected as-is.
raw_df = pd.read_csv(DATA_PATH)

# Lower-case the labels before counting so differently-cased spellings of the
# same label are tallied together; sorting by index fixes the bar order.
label_counts = (
    raw_df['Label']
    .str.lower()
    .value_counts()
    .sort_index()
)

# Bar chart of the per-label counts, drawn on a new 7x4 inch figure.
label_counts.plot.bar(color=['#4C78A8', '#F58518'], figsize=(7, 4))
plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Class Distribution')
plt.xticks(rotation=0)  # keep the tick labels horizontal
plt.tight_layout()
plt.show()

# Trailing bare expression so the notebook shows the counts as cell output.
label_counts


In [None]:
# Load the dataset through the project helpers and turn the inputs into a
# feature matrix.
inputs, labels = load_phishing_dataset(DATA_PATH)
X = build_feature_matrix(inputs)

# Column names for X.
# NOTE(review): presumably one name per column of X, in the order that
# build_feature_matrix emits them -- verify against that function.
feature_names = [
    'url_length',
    'host_length',
    'path_length',
    'query_length',
    'num_host_parts',
    'num_dots',
    'num_hyphens',
    'num_at',
    'num_question_marks',
    'num_equals',
    'num_slashes',
    'num_digits',
    'num_letters',
    'num_special_chars',
    'double_slash_in_path',
    'contains_https_text',
    'has_ipv4',
    'has_suspicious_keyword',
]

feature_df = pd.DataFrame(X, columns=feature_names)
feature_df['label'] = labels
# NOTE(review): assumes labels are encoded as the integers 0 and 1; any other
# value is mapped to NaN here -- confirm against load_phishing_dataset.
feature_df['label_name'] = feature_df['label'].map({0: 'good', 1: 'bad'})

# Summary statistics for the feature columns only, transposed so that each
# feature is a row.
feature_summary = feature_df[feature_names].describe().transpose()
feature_summary


In [None]:
# Mean of every feature within each label_name group, transposed so that
# features are rows and the label names are columns.
feature_means_by_class = (
    feature_df
    .groupby('label_name')[feature_names]
    .mean()
    .transpose()
)
feature_means_by_class
