# Deep Neural Network

### Importing libraries and data

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import random

# Read the labelled URL dataset and pull out the URL and label columns
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

### Preprocessing the data

In [None]:
# Encode the string labels as integers: 'good' -> 0, 'bad' -> 1.
# Any label that is not a key of the mapping comes out of Series.map as NaN.
label_mapping = dict(good=0, bad=1)
y = labels.map(label_mapping).to_numpy()

# Feature extraction
def extract_features(url):
    """Return a dict of simple lexical features computed from one URL string.

    Keys, in insertion order:
        https               -- 1 if the substring 'https' occurs anywhere in the URL
        url_length          -- number of characters in the URL
        special_char_length -- number of characters that are one of !@#$%^&*()-=
        num_dots            -- number of '.' characters
        num_slashes         -- number of '/' characters
        total_special_chars -- same count as special_char_length
        common_domain       -- 1 if '.com', '.org', '.net' or '.me' occurs anywhere
        is_other_domain     -- logical complement of common_domain
    """
    special_chars = '!@#$%^&*()-='
    common_suffixes = ('.com', '.org', '.net', '.me')

    # The special characters are all distinct, so one pass over the URL gives
    # the value used by both special-character features.
    special_count = sum(1 for ch in url if ch in special_chars)
    has_common_suffix = int(any(suffix in url for suffix in common_suffixes))

    return {
        'https': int('https' in url),
        'url_length': len(url),
        'special_char_length': special_count,
        'num_dots': url.count('.'),
        'num_slashes': url.count('/'),
        'total_special_chars': special_count,
        'common_domain': has_common_suffix,
        'is_other_domain': 1 - has_common_suffix,
    }


# Build the per-URL feature table, one row per URL
features_df = pd.DataFrame(list(map(extract_features, urls)))

# Group-based features.
# 'domain' is simply the text after the last '.' of the whole URL string
# (or the whole URL when it contains no '.'), not a parsed hostname.
features_df['domain'] = urls.map(lambda u: u.rpartition('.')[2])
# feat1: mean `url_length` over all rows sharing the same 'domain' value
features_df['feat1'] = features_df.groupby('domain')['url_length'].transform('mean')
# feat2: how far this URL's length is from its group mean
features_df['feat2'] = features_df['url_length'] - features_df['feat1']

# Numeric model inputs; the string 'domain' column is only a grouping key
X = features_df[[
    'https',
    'url_length',
    'special_char_length',
    'num_dots',
    'num_slashes',
    'total_special_chars',
    'common_domain',
    'is_other_domain',
    'feat1',
    'feat2',
]]

# Standardise the url features.
# NOTE(review): the scaler (and the group means above) are computed on the
# full dataset, before any train/test split is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# float32 tensors for the model inputs and the targets
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


### Splitting, training and testing

In [None]:
# Per-repetition metric accumulators (one entry appended per split/train run)
accuracies, roc_auc_scores, f1_scores = [], [], []

# Define DNN model 
class DNNModel(nn.Module):
    """Fully connected binary classifier: stacked Linear+ReLU blocks, then a
    single-unit Linear layer followed by a Sigmoid.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of every hidden layer (default 30).
        num_hidden_layers: number of Linear+ReLU blocks before the output
            layer (default 10). With 0 the model is a single Linear+Sigmoid.

    Raises:
        ValueError: if ``hidden_dim`` is smaller than 1 or
            ``num_hidden_layers`` is negative.

    The layers live in ``self.network`` (an ``nn.Sequential``); with the
    default arguments the architecture is 10 hidden layers of 30 units.
    """

    def __init__(self, input_dim, hidden_dim=30, num_hidden_layers=10):
        super().__init__()
        if hidden_dim < 1:
            raise ValueError(f"hidden_dim must be >= 1, got {hidden_dim}")
        if num_hidden_layers < 0:
            raise ValueError(
                f"num_hidden_layers must be >= 0, got {num_hidden_layers}"
            )

        layers = []
        in_features = input_dim
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.ReLU())
            # Every block after the first consumes the previous hidden output.
            in_features = hidden_dim
        # Output head: one unit squashed to (0, 1).
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network; the last dimension of the result is 1."""
        return self.network(x)

# Repeat the split / train / evaluate cycle 15 times and record the metrics
# of every repetition.
for i in range(15):
    # Fresh random 80/20 split; the seed itself is drawn at random, so
    # repetitions (and reruns of this cell) use different splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X_tensor, y_tensor, test_size=0.2, random_state=random.randint(0, 1000)
    )

    train_data = TensorDataset(X_train, y_train)
    test_data = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

    # New model and optimizer per repetition so no weights carry over.
    model = DNNModel(input_dim=X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(5):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            # squeeze(-1) removes only the trailing output axis. A bare
            # squeeze() would also collapse a final batch of size 1 to a 0-d
            # tensor, which no longer matches the (1,)-shaped target.
            y_pred = model(batch_X).squeeze(-1)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()

    # testing
    model.eval()
    y_pred_proba = []
    y_true = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Keep the batch axis so tolist() always yields a list, even for
            # a single-sample batch.
            y_pred = model(batch_X).squeeze(-1)
            y_pred_proba.extend(y_pred.tolist())
            y_true.extend(batch_y.tolist())

    # Threshold the predicted probabilities at 0.5 to get class labels.
    y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred_proba]

    # calculating the results
    accuracy = accuracy_score(y_true, y_pred_binary)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    f1 = f1_score(y_true, y_pred_binary)

    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")




### Saving results

In [None]:
# Collect the per-repetition metrics, one row per repetition
results_df = pd.DataFrame({
    'dnn_accuracy': accuracies,
    'dnn_roc_auc': roc_auc_scores,
    'dnn_f1_score': f1_scores
})

shared_results_file = './results.csv'

# Save results to the results file, placing these columns next to whatever
# columns the file already holds.
if os.path.exists(shared_results_file):
    existing_results = pd.read_csv(shared_results_file)
    # Drop columns written by an earlier run of this cell; otherwise every
    # rerun would append another copy of the dnn_* columns to the file.
    stale_columns = existing_results.columns.intersection(results_df.columns)
    existing_results = existing_results.drop(columns=stale_columns)
    combined_results = pd.concat([existing_results, results_df.reset_index(drop=True)], axis=1)
else:
    combined_results = results_df

combined_results.to_csv(shared_results_file, index=False)

print(f"\nMetrics saved to shared results file: {shared_results_file}")