### Single Layer Perceptron Neural Network

### Import libraries and load the dataset

In [1]:
import gc
import random

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Load the phishing URL dataset and pull out the URL and label columns
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']

### Preprocess data

In [2]:
# Map labels to binary values (good -> 1, bad -> 0)
label_mapping = {'good': 1, 'bad': 0}
y_mapped = labels.map(label_mapping)

# Series.map turns every label that is missing from label_mapping into NaN,
# which would otherwise flow silently into the training targets. Fail loudly.
unmapped = y_mapped.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values: {sorted(labels[unmapped].astype(str).unique())}"
    )
y = y_mapped.values

# Tokenize URLs: keep runs of word characters and re-join them with spaces so
# that each URL becomes one whitespace-separated string for the vectorizer
tokenizer = RegexpTokenizer(r'\w+')
url_tokens = [' '.join(tokenizer.tokenize(url)) for url in urls]

# Vectorize with a reduced feature space: keep at most 5000 tokens and drop
# tokens that occur in fewer than 5 URLs.
# NOTE(review): the vocabulary is fitted on every URL, i.e. before any
# train/test split, so token selection also sees the evaluation data.
vectorizer = CountVectorizer(max_features=5000, min_df=5)
X = vectorizer.fit_transform(url_tokens)  # Keep X as a sparse matrix

# Check feature space and sparsity
print(f"Number of samples: {X.shape[0]}")  # Rows (URLs)
print(f"Number of features: {X.shape[1]}")  # Columns (tokens)
# The printed value is the fraction of non-zero entries (the matrix density)
print(f"Sparsity: {X.nnz / (X.shape[0] * X.shape[1]):.4f}")



Number of samples: 549346
Number of features: 5000
Sparsity: 0.0009


### Split, train, and test

In [3]:
# Per-run metric accumulators (accuracy, ROC AUC, F1), each starting empty
accuracies, roc_auc_scores, f1_scores = [], [], []

# Define the SLP model structure
def create_slp_model(input_dim):
    """Build and compile a single-layer perceptron for binary classification.

    The network consists of one Dense unit with a sigmoid activation. It is
    compiled with the Adam optimizer (learning rate 0.02), binary
    cross-entropy loss and accuracy as the tracked metric.

    Args:
        input_dim: Number of input features fed to the Dense unit.

    Returns:
        The compiled Sequential model.
    """
    slp = Sequential([
        Dense(1, input_dim=input_dim, activation='sigmoid'),
    ])
    slp.compile(
        optimizer=Adam(learning_rate=0.02),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return slp

# Train and evaluate SLP models with sparse-to-dense conversion

# Draw the split seeds from a fixed-seed generator so that every
# Restart & Run All evaluates the same 15 train/test partitions.
# NOTE: Keras weight initialisation and batch shuffling are not seeded here,
# so the metrics can still differ slightly between runs.
split_seed_rng = random.Random(42)

for i in range(15):
    # Drop the global Keras state left behind by the previous iteration's
    # model; it otherwise keeps growing when models are created in a loop.
    K.clear_session()

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=split_seed_rng.randint(0, 1000)
    )

    # Convert sparse matrices to dense arrays. Casting to float32 first halves
    # the dense footprint compared with CountVectorizer's default int64 counts;
    # Keras computes in float32 by default, so the values it sees are the same.
    X_train = X_train.astype(np.float32).toarray()
    X_test = X_test.astype(np.float32).toarray()

    # Create and train the SLP model
    model = create_slp_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=3, batch_size=16, verbose=0)

    # Predict on the test set (silenced like fit, to keep the cell output short)
    y_pred_proba = model.predict(X_test, verbose=0).flatten()  # Flatten for ROC AUC
    y_pred = (y_pred_proba > 0.5).astype(int)  # Threshold to convert probabilities to binary

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")

    # Save intermediate results after every run so a crash part-way through
    # still leaves the metrics collected so far on disk
    results = pd.DataFrame({'accuracy': accuracies, 'roc_auc': roc_auc_scores, 'f1_score': f1_scores})
    results.to_csv('SLP_intermediate_results.csv', index=False)

    # Free up memory: the dense splits are large and are rebuilt next iteration
    del X_train, X_test, y_train, y_test, model
    gc.collect()

print("\nFinal Metrics saved to 'SLP_intermediate_results.csv'")

Iteration 1: Accuracy = 0.9355, ROC AUC = 0.9796, F1 = 0.9559
Iteration 2: Accuracy = 0.9350, ROC AUC = 0.9793, F1 = 0.9555
Iteration 3: Accuracy = 0.9363, ROC AUC = 0.9793, F1 = 0.9566
Iteration 4: Accuracy = 0.9350, ROC AUC = 0.9799, F1 = 0.9556
Iteration 5: Accuracy = 0.9345, ROC AUC = 0.9795, F1 = 0.9552
Iteration 6: Accuracy = 0.9338, ROC AUC = 0.9789, F1 = 0.9544
Iteration 7: Accuracy = 0.9347, ROC AUC = 0.9794, F1 = 0.9554
Iteration 8: Accuracy = 0.9336, ROC AUC = 0.9791, F1 = 0.9543
Iteration 9: Accuracy = 0.9355, ROC AUC = 0.9799, F1 = 0.9555
Iteration 10: Accuracy = 0.9337, ROC AUC = 0.9794, F1 = 0.9544
Iteration 11: Accuracy = 0.9353, ROC AUC = 0.9797, F1 = 0.9556
Iteration 12: Accuracy = 0.9352, ROC AUC = 0.9793, F1 = 0.9557
Iteration 13: Accuracy = 0.9352, ROC AUC = 0.9794, F1 = 0.9557
Iteration 14: Accuracy = 0.9352, ROC AUC = 0.9793, F1 = 0.9558
Iteration 15: Accuracy = 0.9358, ROC AUC = 0.9797, F1 = 0.9562

Final Metrics saved to 'SLP_intermediate_results.csv'
