### Single Layer Perceptron Neural Network

### Import libraries and load the dataset

In [1]:
import random

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Load the labelled URL dataset and pull out the two columns used downstream:
# the raw URL strings and their 'Label' values.
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

### Preprocess data

In [5]:
# Map labels to binary values ('good' is the positive class, so F1 below is
# computed with respect to legitimate URLs).
label_mapping = {'good': 1, 'bad': 0}
mapped_labels = labels.map(label_mapping)

# Series.map yields NaN for any value missing from the mapping. Fail loudly here
# rather than letting NaN targets silently corrupt the loss and the metrics.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected_labels = sorted(labels[unmapped_mask].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected one of {sorted(label_mapping)}): {unexpected_labels}"
    )
y = mapped_labels.astype('int64').values

# Tokenize URLs: keep runs of word characters and re-join them with spaces so
# CountVectorizer can treat every URL as a whitespace-separated document.
tokenizer = RegexpTokenizer(r'\w+')
url_tokens = [' '.join(tokenizer.tokenize(url)) for url in urls]

# Vectorize with reduced feature space: keep at most the 5000 most frequent
# tokens, and only tokens that appear in at least 5 URLs (min_df is a document
# frequency, not a raw occurrence count).
# float32 counts halve the size of the dense matrix compared with the default
# int64; Keras converts its inputs to float32 anyway.
vectorizer = CountVectorizer(max_features=5000, min_df=5, dtype=np.float32)
X = vectorizer.fit_transform(url_tokens).toarray()


### Split, train, and test

In [6]:
# Per-run metric accumulators; the training loop appends one value per iteration.
accuracies, roc_auc_scores, f1_scores = [], [], []

# Define the SLP model structure
def create_slp_model(input_dim, learning_rate=0.02):
    """Build and compile a single-layer perceptron for binary classification.

    The network is one sigmoid unit connected directly to the input features.

    Args:
        input_dim: Number of input features (columns of the feature matrix).
        learning_rate: Step size for the Adam optimizer. Defaults to 0.02.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss
        and reporting accuracy.
    """
    # Declare the input shape with an explicit Input layer; passing `input_dim`
    # to Dense is deprecated in current Keras and triggers a warning.
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Train and evaluate 15 SLP models, each on its own random train/test split.
N_RUNS = 15
BASE_SEED = 42  # fixed so that Restart & Run All reproduces the same splits and metrics

# One distinct seed per run, drawn from a privately seeded generator over the
# same 0..1000 range as before. The unseeded global random.randint used
# previously gave different (and possibly repeated) splits on every execution.
split_seeds = random.Random(BASE_SEED).sample(range(1001), N_RUNS)

for i, split_seed in enumerate(split_seeds):
    # Keras keeps global state for every model created; without clearing it,
    # memory use grows with each model built in this loop.
    K.clear_session()
    # Seed Python, NumPy and the backend so weight initialisation and batch
    # shuffling are repeatable for this run.
    set_random_seed(split_seed)

    # Split data into training and testing sets; stratify so every run keeps
    # the same good/bad ratio in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=split_seed, stratify=y
    )

    # Create and train the SLP model
    model = create_slp_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=3, batch_size=16, verbose=0)

    # Predict on the test set (verbose=0 keeps progress bars out of the output)
    y_pred_proba = model.predict(X_test, verbose=0).flatten()  # Flatten for ROC AUC
    y_pred = (y_pred_proba > 0.5).astype(int)  # Threshold to convert probabilities to binary

    # Calculate metrics: ROC AUC uses the raw probabilities, the others the thresholded labels
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")


Iteration 1: Accuracy = 0.9336, ROC AUC = 0.9798, F1 = 0.9541
Iteration 2: Accuracy = 0.9359, ROC AUC = 0.9802, F1 = 0.9561
Iteration 3: Accuracy = 0.9345, ROC AUC = 0.9790, F1 = 0.9553
Iteration 4: Accuracy = 0.9364, ROC AUC = 0.9800, F1 = 0.9568
Iteration 5: Accuracy = 0.9361, ROC AUC = 0.9797, F1 = 0.9561
Iteration 6: Accuracy = 0.9340, ROC AUC = 0.9787, F1 = 0.9548


: 

: 

### Saving metrics

In [None]:
# Persist each metric series to its own single-column CSV file.
metric_exports = [
    ('accuracy', accuracies, 'SLP_accuracies.csv'),
    ('roc_auc', roc_auc_scores, 'SLP_roc_auc.csv'),
    ('f1_score', f1_scores, 'SLP_f1_scores.csv'),
]
for column_name, metric_values, csv_path in metric_exports:
    pd.DataFrame({column_name: metric_values}).to_csv(csv_path, index=False)

print("\nMetrics saved to 'SLP_accuracies.csv', 'SLP_roc_auc.csv', and 'SLP_f1_scores.csv'")