# Multi-Layer Perceptron Neural Network

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import random

# Read the raw table and pull out the URL strings and their textual labels.
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

# Encode the textual classes numerically: 'good' -> 0, 'bad' -> 1.
# Series.map yields NaN for any label that is not a key of the mapping.
label_mapping = dict(good=0, bad=1)
y = labels.map(label_mapping).to_numpy()

# Feature extraction
def extract_features(url):
    """Build the hand-crafted lexical feature dict for a single URL string.

    Returned keys (in insertion order):
      - 'https': 1 if the substring 'https' occurs anywhere in the URL, else 0
      - 'url_length': number of characters in the URL
      - 'special_char_length': count of characters drawn from '!@#$%^&*()-='
      - 'num_dots': number of '.' characters
      - 'num_slashes': number of '/' characters
      - 'total_special_chars': same special-character count, obtained by
        summing per-character occurrence counts
      - 'common_domain': 1 if '.com', '.org', '.net' or '.me' occurs anywhere
        in the URL, else 0
      - 'is_other_domain': logical complement of 'common_domain'
    """
    special_chars = '!@#$%^&*()-='

    # Substring test against the whole URL, not only its host part.
    has_common_suffix = False
    for suffix in ('.com', '.org', '.net', '.me'):
        if suffix in url:
            has_common_suffix = True
            break
    common_flag = int(has_common_suffix)

    return {
        # Basic features
        'https': int('https' in url),
        'url_length': len(url),
        'special_char_length': len([ch for ch in url if ch in special_chars]),
        # Additional features
        'num_dots': url.count('.'),
        'num_slashes': url.count('/'),
        'total_special_chars': sum(map(url.count, special_chars)),
        # Domain type
        'common_domain': common_flag,
        'is_other_domain': int(not common_flag),
    }

# Apply feature extraction: one row of lexical features per URL.
features_df = pd.DataFrame([extract_features(url) for url in urls])


def _domain_suffix(url):
    """Return the lower-cased last dot-separated label of the URL's host part.

    Splitting the whole URL on '.' returns path fragments such as 'html' or
    'php?id=1' whenever the path contains a dot, so the host is isolated first.
    """
    # Strip a leading scheme only when the text before '://' looks like one;
    # a '://' inside a query string of a scheme-less URL must be left alone.
    scheme, separator, remainder = url.partition('://')
    host = remainder if separator and scheme.isalnum() else url
    # The host ends at the first path, query or fragment delimiter.
    for delimiter in '/?#':
        host = host.split(delimiter, 1)[0]
    # Discard optional userinfo ('user@') and port (':8080').
    host = host.rsplit('@', 1)[-1].split(':', 1)[0]
    return host.rsplit('.', 1)[-1].lower()


# Add group-based features
features_df['domain'] = urls.apply(_domain_suffix)  # Domain suffix (e.g., 'com', 'org')
# Mean of `url_length` over all URLs sharing the same domain suffix
features_df['feat1'] = features_df.groupby('domain')['url_length'].transform('mean')
# Deviation of each URL's length from its suffix-group mean
features_df['feat2'] = features_df['url_length'] - features_df['feat1']

# Prepare final feature matrix (the helper 'domain' column is deliberately left out)
X = features_df[['https', 'url_length', 'special_char_length', 'num_dots', 'num_slashes',
                 'total_special_chars', 'common_domain', 'is_other_domain', 'feat1', 'feat2']]

# Standardize every feature column; the scaler is fitted on the full matrix X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Wrap features and targets as float32 PyTorch tensors.
y_tensor = torch.tensor(y, dtype=torch.float32)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# Per-run metric accumulators, one entry appended per train/test split.
accuracies, roc_auc_scores, f1_scores = [], [], []

# Define MLP model in PyTorch
class MLPModel(nn.Module):
    """Feed-forward binary classifier: input -> 10 -> 10 -> 1 with ReLU and a sigmoid output.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 10
        # Layers are created in input-to-output order.
        self.fc1 = nn.Linear(input_dim, hidden_units)     # First hidden layer
        self.fc2 = nn.Linear(hidden_units, hidden_units)  # Second hidden layer
        self.fc3 = nn.Linear(hidden_units, 1)             # Output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs in (0, 1); the last dimension of the result has size 1."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        logits = self.fc3(hidden)
        return self.sigmoid(logits)

# Train and evaluate MLP neural network
# Each of the 15 runs draws a new random train/test split and trains a fresh
# model, so the collected metrics show split-to-split variability.
for i in range(15):
    # Split data into training and testing sets (80/20). The split seed comes
    # from `random`, so results are not reproducible unless `random` is seeded.
    X_train, X_test, y_train, y_test = train_test_split(
        X_tensor, y_tensor, test_size=0.2, random_state=random.randint(0, 1000)
    )

    # Prepare data loaders
    train_data = TensorDataset(X_train, y_train)
    test_data = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

    # Initialize model, loss function, and optimizer
    model = MLPModel(input_dim=X_train.shape[1])
    criterion = nn.BCELoss()  # Binary cross-entropy loss on sigmoid outputs
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train the model
    for epoch in range(5):  # Train for 5 epochs
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            # squeeze(1) removes only the output-feature axis. A bare squeeze()
            # would also collapse a final batch of size 1 to a 0-d tensor,
            # whose shape no longer matches the 1-d target and BCELoss rejects.
            y_pred = model(batch_X).squeeze(1)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    y_pred_proba = []
    y_true = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Keep the batch axis so tolist() always returns a list, even for
            # a final batch holding a single sample.
            y_pred = model(batch_X).squeeze(1)
            y_pred_proba.extend(y_pred.tolist())
            y_true.extend(batch_y.tolist())

    # Convert predictions to binary using a 0.5 probability threshold
    y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred_proba]

    # Calculate metrics (ROC AUC uses the raw probabilities, not the thresholded labels)
    accuracy = accuracy_score(y_true, y_pred_binary)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    f1 = f1_score(y_true, y_pred_binary)

    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")

# Prepare results DataFrame for MLP: one row per train/test split.
results_df = pd.DataFrame({
    'mlp_accuracy': accuracies,
    'mlp_roc_auc': roc_auc_scores,
    'mlp_f1_score': f1_scores
})

# Define the shared results file path
shared_results_file = '../results.csv'

# Save to shared results file
if os.path.exists(shared_results_file):
    # If the file exists, load it and add this model's columns side by side.
    existing_results = pd.read_csv(shared_results_file)
    # Drop any columns left by an earlier run of this cell; otherwise every
    # re-run would append another duplicate set of the same column names.
    existing_results = existing_results.drop(columns=results_df.columns, errors='ignore')
    combined_results = pd.concat([existing_results, results_df.reset_index(drop=True)], axis=1)
else:
    # If the file doesn't exist, create it with the current results
    combined_results = results_df

# Save the combined results
combined_results.to_csv(shared_results_file, index=False)

print(f"\nMetrics saved to shared results file: {shared_results_file}")


Iteration 1: Accuracy = 0.8129, ROC AUC = 0.8484, F1 = 0.5998
Iteration 2: Accuracy = 0.8039, ROC AUC = 0.8451, F1 = 0.5561
Iteration 3: Accuracy = 0.8065, ROC AUC = 0.8346, F1 = 0.5513
Iteration 4: Accuracy = 0.8067, ROC AUC = 0.8492, F1 = 0.6406
Iteration 5: Accuracy = 0.8135, ROC AUC = 0.8417, F1 = 0.6099
