## Naive Bayes

### Importing libraries

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.naive_bayes import GaussianNB
import random

### Preprocessing the data

In [None]:
# Read the phishing dataset from disk and pull out the two columns used below:
# the raw URL strings and their textual class labels.
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

# Encode the textual labels as integers ('good' -> 0, 'bad' -> 1) and keep the
# result as a plain NumPy array for scikit-learn.
label_mapping = dict(good=0, bad=1)
y = labels.map(label_mapping).values

# Extracting selected URL features
def extract_features(url):
    """Compute simple lexical features for a single URL string.

    Args:
        url: The URL text to analyse.

    Returns:
        A dict with these keys, in this order:
        ``https`` (1 if the substring ``'https'`` occurs anywhere in the URL),
        ``url_length``, ``special_char_length``, ``num_dots``,
        ``num_slashes``, ``total_special_chars``, ``common_domain`` (1 if any
        of ``.com``/``.org``/``.net``/``.me`` occurs anywhere in the URL) and
        ``is_other_domain`` (the complement of ``common_domain``).
    """
    special_chars = '!@#$%^&*()-='
    common_suffixes = ('.com', '.org', '.net', '.me')

    # Both special-character features count the same occurrences, so the
    # total is computed once and stored under both keys.
    special_total = sum(map(url.count, special_chars))

    has_common_suffix = 0
    for suffix in common_suffixes:
        if suffix in url:
            has_common_suffix = 1
            break

    return {
        'https': int('https' in url),
        'url_length': len(url),
        'special_char_length': special_total,
        'num_dots': url.count('.'),
        'num_slashes': url.count('/'),
        'total_special_chars': special_total,
        'common_domain': has_common_suffix,
        'is_other_domain': 1 - has_common_suffix,
    }

def _host_suffix(url):
    """Return the lower-cased text after the last '.' in the host part of *url*.

    The host is taken as everything before the first '/', '?' or '#', after
    removing an optional leading ``scheme://``, any ``user@`` prefix and any
    ``:port`` suffix. A host without a dot is returned whole.
    """
    scheme, sep, remainder = url.partition('://')
    # Only treat the prefix as a scheme when it is a plain token; a '://' that
    # appears later (e.g. inside a query string) must not be mistaken for one.
    host = remainder if sep and scheme.isalnum() else url
    for delimiter in '/?#':
        host = host.split(delimiter, 1)[0]
    host = host.rsplit('@', 1)[-1].split(':', 1)[0]
    return host.rsplit('.', 1)[-1].lower()


# One row of lexical features per URL
features_df = pd.DataFrame([extract_features(url) for url in urls])

# Group URLs by the suffix of their host. Splitting the whole URL on '.' would
# instead pick up file extensions and path fragments (e.g. 'php', 'com/login'),
# giving near-unique groups and making the two features below meaningless.
features_df['domain'] = urls.apply(_host_suffix)
# feat1: mean URL length within the same host-suffix group
features_df['feat1'] = features_df.groupby('domain')['url_length'].transform('mean')
# feat2: how much longer/shorter this URL is than its group's mean
features_df['feat2'] = features_df['url_length'] - features_df['feat1']

X = features_df[['https', 'url_length', 'special_char_length', 'num_dots', 'num_slashes',
                 'total_special_chars', 'common_domain', 'is_other_domain', 'feat1', 'feat2']]

# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


### Splitting, training, & testing

In [None]:
# Per-run metric histories; one value is appended for every split below
accuracies = []
roc_auc_scores = []
f1_scores = []

# Deterministic, distinct seeds: drawing the seed with random.randint made the
# reported numbers change on every execution and could reuse a seed, which
# would evaluate the exact same split twice.
BASE_SEED = 0
N_RUNS = 15

# NOTE(review): X_scaled was standardised using all rows before this split, so
# test rows contributed to the scaling statistics. Consider fitting the scaler
# on X_train only.

# Training 15 models
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=BASE_SEED + i
    )  # new test and train split for each model

    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Hard class predictions for accuracy/F1, class-1 probabilities for ROC AUC
    y_pred = nb_model.predict(X_test)
    y_pred_proba = nb_model.predict_proba(X_test)[:, 1]

    # Calculate scores
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")

Iteration 1: Accuracy = 0.7428, ROC AUC = 0.7334, F1 = 0.3423
Iteration 2: Accuracy = 0.7447, ROC AUC = 0.7352, F1 = 0.3373
Iteration 3: Accuracy = 0.7441, ROC AUC = 0.7367, F1 = 0.3470
Iteration 4: Accuracy = 0.7448, ROC AUC = 0.7366, F1 = 0.3454
Iteration 5: Accuracy = 0.7439, ROC AUC = 0.7382, F1 = 0.3474
Iteration 6: Accuracy = 0.7440, ROC AUC = 0.7364, F1 = 0.3394
Iteration 7: Accuracy = 0.7430, ROC AUC = 0.7351, F1 = 0.3449
Iteration 8: Accuracy = 0.7438, ROC AUC = 0.7351, F1 = 0.3407
Iteration 9: Accuracy = 0.7432, ROC AUC = 0.7373, F1 = 0.3372
Iteration 10: Accuracy = 0.7438, ROC AUC = 0.7387, F1 = 0.3417
Iteration 11: Accuracy = 0.7434, ROC AUC = 0.7349, F1 = 0.3370
Iteration 12: Accuracy = 0.7443, ROC AUC = 0.7346, F1 = 0.3460
Iteration 13: Accuracy = 0.7436, ROC AUC = 0.7365, F1 = 0.3422
Iteration 14: Accuracy = 0.7434, ROC AUC = 0.7360, F1 = 0.3461
Iteration 15: Accuracy = 0.7433, ROC AUC = 0.7347, F1 = 0.3282

Metrics saved to shared results file: ../results.csv


### Saving metrics

In [None]:
# Collect this notebook's per-run metrics, one row per training run
results_df = pd.DataFrame({
    'nb_accuracy': accuracies,
    'nb_roc_auc': roc_auc_scores,
    'nb_f1_score': f1_scores
})

# Save to final results .csv file
results_path = 'results.csv'
if os.path.exists(results_path):
    existing_results = pd.read_csv(results_path)
    # Remove columns written by an earlier run of this cell so that re-running
    # replaces them instead of appending duplicate 'nb_*' columns.
    existing_results = existing_results.drop(
        columns=list(results_df.columns), errors='ignore'
    )
    # Place the new columns next to whatever columns the file already holds
    combined_results = pd.concat([existing_results, results_df.reset_index(drop=True)], axis=1)
else:
    combined_results = results_df


combined_results.to_csv(results_path, index=False)