## Naive Bayes

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.naive_bayes import GaussianNB
import random

# Load dataset
df = pd.read_csv('phishing_site_urls.csv')
urls = df['URL']
labels = df['Label']

# Map labels to binary values
label_mapping = {'good': 0, 'bad': 1}
y = labels.map(label_mapping).values

# Feature extraction
def extract_features(url):
    features = {}
    # Basic features
    features['https'] = 1 if 'https' in url else 0
    features['url_length'] = len(url)
    features['special_char_length'] = sum(1 for char in url if char in '!@#$%^&*()-=')
    # Additional features
    features['num_dots'] = url.count('.')
    features['num_slashes'] = url.count('/')
    features['total_special_chars'] = sum(url.count(char) for char in '!@#$%^&*()-=')
    # Domain type
    features['common_domain'] = 1 if any(domain in url for domain in ['.com', '.org', '.net', '.me']) else 0
    features['is_other_domain'] = 1 if not features['common_domain'] else 0
    return features

# Apply feature extraction
features_df = pd.DataFrame([extract_features(url) for url in urls])

# Add group-based features
features_df['domain'] = urls.apply(lambda x: x.split('.')[-1])  # Extract domain suffix (e.g., 'com', 'org')
features_df['feat1'] = features_df.groupby('domain')['url_length'].transform('mean')  # Mean of `url_length` by domain
features_df['feat2'] = features_df['url_length'] - features_df['feat1']  # Deviation from mean

# Prepare final feature matrix
X = features_df[['https', 'url_length', 'special_char_length', 'num_dots', 'num_slashes', 
                 'total_special_chars', 'common_domain', 'is_other_domain', 'feat1', 'feat2']]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize lists to store metrics
accuracies = []
roc_auc_scores = []
f1_scores = []

# Train and evaluate Naive Bayes model
for i in range(15):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=random.randint(0, 1000)
    )
    
    # Train Naive Bayes model
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = nb_model.predict(X_test)
    y_pred_proba = nb_model.predict_proba(X_test)[:, 1]  # Probability of the positive class
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    
    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)
    
    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")

# Prepare results DataFrame for Naive Bayes
results_df = pd.DataFrame({
    'nb_accuracy': accuracies,
    'nb_roc_auc': roc_auc_scores,
    'nb_f1_score': f1_scores
})

# Define the shared results file path
shared_results_file = '../results.csv'

# Save to shared results file
if os.path.exists(shared_results_file):
    # If the file exists, load it and append new results
    existing_results = pd.read_csv(shared_results_file)
    combined_results = pd.concat([existing_results, results_df.reset_index(drop=True)], axis=1)
else:
    # If the file doesn't exist, create it with the current results
    combined_results = results_df

# Save the combined results
combined_results.to_csv(shared_results_file, index=False)

print(f"\nMetrics saved to shared results file: {shared_results_file}")


Iteration 1: Accuracy = 0.7428, ROC AUC = 0.7334, F1 = 0.3423
Iteration 2: Accuracy = 0.7447, ROC AUC = 0.7352, F1 = 0.3373
Iteration 3: Accuracy = 0.7441, ROC AUC = 0.7367, F1 = 0.3470
Iteration 4: Accuracy = 0.7448, ROC AUC = 0.7366, F1 = 0.3454
Iteration 5: Accuracy = 0.7439, ROC AUC = 0.7382, F1 = 0.3474
Iteration 6: Accuracy = 0.7440, ROC AUC = 0.7364, F1 = 0.3394
Iteration 7: Accuracy = 0.7430, ROC AUC = 0.7351, F1 = 0.3449
Iteration 8: Accuracy = 0.7438, ROC AUC = 0.7351, F1 = 0.3407
Iteration 9: Accuracy = 0.7432, ROC AUC = 0.7373, F1 = 0.3372
Iteration 10: Accuracy = 0.7438, ROC AUC = 0.7387, F1 = 0.3417
Iteration 11: Accuracy = 0.7434, ROC AUC = 0.7349, F1 = 0.3370
Iteration 12: Accuracy = 0.7443, ROC AUC = 0.7346, F1 = 0.3460
Iteration 13: Accuracy = 0.7436, ROC AUC = 0.7365, F1 = 0.3422
Iteration 14: Accuracy = 0.7434, ROC AUC = 0.7360, F1 = 0.3461
Iteration 15: Accuracy = 0.7433, ROC AUC = 0.7347, F1 = 0.3282

Metrics saved to shared results file: ../results.csv
