### Single Layer Perceptron Neural Network

### Import libraries and load the dataset

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import random

# Read the phishing-URL CSV and pull out the two columns used downstream:
# the raw URL strings and their class labels.
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

### Preprocess data

In [None]:
# Loading the phishing dataset
df = pd.read_csv('phishing_site_urls.csv')
urls = df['URL']
labels = df['Label']

# Binary target encoding: 'good' -> 0, 'bad' -> 1
label_mapping = {'good': 0, 'bad': 1}
y = labels.map(label_mapping)

# Series.map leaves NaN for every label that is not a key of label_mapping.
# Fail fast with a clear message instead of letting NaN targets flow into
# the loss and metrics, where the failure is much harder to diagnose.
unmapped = y.isna()
if unmapped.any():
    unknown = sorted(labels[unmapped].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Label' column (expected 'good' or 'bad'): {unknown}"
    )
y = y.values

# Extracting selected URL features
# Characters counted as "special" and the suffixes treated as "common"
# domains. Hoisted to module level so they are built once, not per call.
_SPECIAL_CHARS = frozenset('!@#$%^&*()-=')
_COMMON_DOMAINS = ('.com', '.org', '.net', '.me')


def extract_features(url):
    """Build the hand-crafted feature dict for a single URL string.

    Args:
        url: The URL text to analyse.

    Returns:
        A dict of integer features, in this key order:
            https                -- 1 if the substring 'https' occurs anywhere
                                    in the URL (not only as the scheme), else 0
            url_length           -- number of characters in the URL
            special_char_length  -- count of characters from '!@#$%^&*()-='
            num_dots             -- count of '.'
            num_slashes          -- count of '/'
            total_special_chars  -- same value as special_char_length (kept as
                                    a separate key because callers select it)
            common_domain        -- 1 if '.com', '.org', '.net' or '.me' occurs
                                    anywhere in the URL (substring match), else 0
            is_other_domain      -- the complement of common_domain
    """
    # Both special-character features count exactly the same thing, so scan
    # the URL once and reuse the result for both keys.
    special_count = sum(1 for char in url if char in _SPECIAL_CHARS)
    has_common_domain = any(domain in url for domain in _COMMON_DOMAINS)

    return {
        'https': int('https' in url),
        'url_length': len(url),
        'special_char_length': special_count,
        'num_dots': url.count('.'),
        'num_slashes': url.count('/'),
        'total_special_chars': special_count,
        'common_domain': int(has_common_domain),
        'is_other_domain': int(not has_common_domain),
    }

# One row of hand-crafted features per URL
features_df = pd.DataFrame(list(map(extract_features, urls)))

# Grouping key: whatever text follows the final '.' in the URL.
# NOTE(review): for URLs with a path or query this is not strictly the
# top-level domain (e.g. it can include '/...' segments) -- confirm intent.
features_df['domain'] = urls.apply(lambda u: u.rsplit('.', 1)[-1])

# feat1: mean URL length within the URL's 'domain' group
# feat2: how far this URL's length deviates from that group mean
features_df['feat1'] = features_df.groupby('domain')['url_length'].transform('mean')
features_df['feat2'] = features_df['url_length'].sub(features_df['feat1'])

# Model inputs: every numeric feature; the string 'domain' key is left out
X = features_df.loc[:, [
    'https', 'url_length', 'special_char_length', 'num_dots', 'num_slashes',
    'total_special_chars', 'common_domain', 'is_other_domain', 'feat1', 'feat2',
]]

# Scaling features
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Converting to pytorch tensors (float32 for both inputs and targets)
X_tensor = torch.tensor(data=X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(data=y, dtype=torch.float32)

### Split, train, and test

In [None]:
# Per-model metric accumulators; one value is appended per trained model
accuracies, roc_auc_scores, f1_scores = [], [], []

# Define SLP architecture
class SLPModel(nn.Module):
    """Single-layer perceptron: one linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # A single output unit fed by all input features
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and squash the result with a sigmoid.

        The output keeps the trailing size-1 axis produced by the linear layer.
        """
        logits = self.fc(x)
        return logits.sigmoid()

# Train 15 independent SLP models, each on its own random 80/20 split, and
# record accuracy / ROC AUC / F1 on the held-out 20% for every model.
for i in range(15):
    # New train/test split for each model. The seed is drawn from the
    # (unseeded) `random` module, so splits differ between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X_tensor, y_tensor, test_size=0.2, random_state=random.randint(0, 1000)
    )

    # PyTorch data loaders: shuffle the training batches, keep test order fixed
    train_data = TensorDataset(X_train, y_train)
    test_data = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

    # Create a fresh model, loss and optimizer so no state leaks between runs
    model = SLPModel(input_dim=X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train for 5 epochs
    for epoch in range(5):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            # squeeze(1) removes only the size-1 output axis. A bare squeeze()
            # would also collapse a final batch of size 1 to a 0-d tensor,
            # which no longer matches batch_y's shape (1,) and makes BCELoss
            # raise.
            y_pred = model(batch_X).squeeze(1)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluating model and calculating scores
    model.eval()
    y_pred_proba = []
    y_true = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Same reasoning as above: keep the batch axis so .tolist() always
            # returns a list (a 0-d tensor would yield a bare float, which
            # list.extend cannot consume).
            y_pred = model(batch_X).squeeze(1)
            y_pred_proba.extend(y_pred.tolist())
            y_true.extend(batch_y.tolist())

    # Hard class decisions at the 0.5 probability threshold
    y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred_proba]

    # ROC AUC is computed from the probabilities; accuracy and F1 from the
    # thresholded predictions
    accuracy = accuracy_score(y_true, y_pred_binary)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    f1 = f1_score(y_true, y_pred_binary)

    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")




In [None]:
# One row per trained model, one column per metric
results_df = pd.DataFrame({
    'slp_accuracy': accuracies,
    'slp_roc_auc': roc_auc_scores,
    'slp_f1_score': f1_scores
})

# Save to final results .csv file.
# results.csv may already hold columns written earlier (presumably by other
# model notebooks -- TODO confirm); new columns are placed side by side with
# them, aligned by row position.
if os.path.exists('results.csv'):
    existing_results = pd.read_csv('results.csv')
    # Drop stale copies of the columns written here so that re-running this
    # cell replaces them instead of appending duplicate column names.
    existing_results = existing_results.drop(columns=results_df.columns, errors='ignore')
    combined_results = pd.concat([existing_results, results_df.reset_index(drop=True)], axis=1)
else:
    combined_results = results_df

combined_results.to_csv('results.csv', index=False)