<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074003_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the Dataset: The dataset is loaded into a pandas DataFrame, and the features (X) and the target variable (y) are extracted. The features appear to include columns 3 to the second-to-last column, and the target variable is the last column.

In [16]:
# Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Importing the dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')


display(dataset)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,url,type
0,http://kitegacc.net/,phishing
1,https://www.electronichouse.com/article/ps3_ad...,legitimate
2,https://www.linkedin.com/in/larrymartinkimpel,legitimate
3,https://www.kansascity.com/2011/03/05/2700249/...,legitimate
4,https://www.en.wikipedia.org/wiki/Dem_Bones,legitimate
...,...,...
208871,http://www.apsweb.co.jp/wordpress/ihup/nD/inde...,phishing
208872,https://www.theruckus.wordpress.com/,legitimate
208873,http://jambidaily.com/34g3f3g/68k7jh65g.exe,phishing
208874,http://ejanla.co/43543r34r/843tf.exe,phishing


In [34]:
!pip install minisom


# ================================
# Step 1: Import libraries & dataset
# ================================
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from minisom import MiniSom
import re
from urllib.parse import urlparse


# Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Inspect
print(dataset.head())
print(dataset.info())

# Feature Extraction Function
def extract_url_features(url):
    """Extracts various features from a URL."""
    features = {}
    try:
        parsed_url = urlparse(url)

        features['url_length'] = len(url)
        features['num_dots'] = url.count('.')
        features['has_at_symbol'] = '@' in url
        features['has_double_slash'] = '//' in url
        features['num_dashes'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_equals'] = url.count('=')
        features['num_semicolons'] = url.count(';')
        features['num_commas'] = url.count(',')
        features['num_quotes'] = url.count("'") + url.count('"')
        features['num_less_than'] = url.count('<')
        features['num_greater_than'] = url.count('>')
        features['num_braces'] = url.count('{') + url.count('}')
        features['num_brackets'] = url.count('[') + url.count(']')
        features['num_parentheses'] = url.count('(') + url.count(')')
        features['num_hashes'] = url.count('#')
        features['num_exclamations'] = url.count('!')
        features['num_dollars'] = url.count('$')
        features['num_spaces'] = url.count(' ')
        features['num_slashes'] = url.count('/')
        features['num_questions'] = url.count('?')
        features['num_and'] = url.count('&')
        features['num_or'] = url.count('|')
        features['num_tilde'] = url.count('~')
        features['num_http'] = url.count('http') + url.count('https')
        features['num_www'] = url.count('www')
        features['num_subdomains'] = len(parsed_url.hostname.split('.')) - 2 if parsed_url.hostname else 0 # Subtract 2 for domain and TLD
        features['path_length'] = len(parsed_url.path)
        features['query_length'] = len(parsed_url.query)
        features['fragment_length'] = len(parsed_url.fragment)
        features['has_ip_address'] = bool(re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', parsed_url.hostname)) if parsed_url.hostname else False

    except ValueError:
        # Return a series of zeros if the URL is invalid
        return pd.Series({
            'url_length': 0, 'num_dots': 0, 'has_at_symbol': False, 'has_double_slash': False,
            'num_dashes': 0, 'num_underscores': 0, 'num_equals': 0, 'num_semicolons': 0,
            'num_commas': 0, 'num_quotes': 0, 'num_less_than': 0, 'num_greater_than': 0,
            'num_braces': 0, 'num_brackets': 0, 'num_parentheses': 0, 'num_hashes': 0,
            'num_exclamations': 0, 'num_dollars': 0, 'num_spaces': 0, 'num_slashes': 0,
            'num_questions': 0, 'num_and': 0, 'num_or': 0, 'num_tilde': 0, 'num_http': 0,
            'num_www': 0, 'num_subdomains': 0, 'path_length': 0, 'query_length': 0,
            'fragment_length': 0, 'has_ip_address': False
        })

    return pd.Series(features)

# Apply the feature extraction function to the 'url' column
features_df = dataset['url'].apply(extract_url_features)

# Convert boolean columns to numeric (0s and 1s) for RMO/SOM
for col in features_df.columns:
    if features_df[col].dtype == 'bool':
        features_df[col] = features_df[col].astype(int)


# Split into features & labels
X = features_df.values
y = dataset['type'].values


# Normalize the features before RMO-SOM
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


# ================================
# Step 2: Feature extraction with SOM-RMO
# ================================

def rmo_som_feature_extraction(features, num_neurons, num_rmo_iterations, learning_rate_start, neighborhood_radius_start, rmo_beta=0.5, rmo_alpha=0.8):
    """
    Performs feature extraction using SOM optimized with RMO principles.
    This is a simulated implementation of RMO influencing SOM weight updates.

    Args:
        features (np.ndarray): The input features (scaled).
        num_neurons (int): The number of neurons in the SOM.
        num_rmo_iterations (int): The number of RMO iterations (outer loop).
        learning_rate_start (float): The initial learning rate for SOM updates (inner loop).
        neighborhood_radius_start (float): The initial neighborhood radius for SOM updates (inner loop).
        rmo_beta (float): RMO parameter influencing movement towards best.
        rmo_alpha (float): RMO parameter influencing random movement.


    Returns:
        np.ndarray: The extracted features after SOM-RMO.
    """
    # Ensure features are float type
    features = features.astype(float)

    num_features = features.shape[1]
    som_weights = np.random.rand(num_neurons, num_features).astype(float) # Ensure float type

    # Initialize best weights found so far
    best_som_weights = som_weights.copy()
    # In a real RMO-SOM, you would need a fitness function to evaluate SOM quality
    # and determine the 'best'. For this simulation, we'll conceptually update
    # 'best_som_weights' based on the RMO movement idea, but without a true fitness.


    for rmo_iter in range(num_rmo_iterations):
        # Adjust SOM learning rate and neighborhood radius over RMO iterations (optional, but common)
        current_learning_rate = learning_rate_start * np.exp(-rmo_iter / num_rmo_iterations)
        current_neighborhood_radius = neighborhood_radius_start * np.exp(-rmo_iter / num_rmo_iterations)

        # --- Simulate SOM training step influenced by RMO ---
        # Instead of full SOM epochs here, we'll update weights based on a random data point
        # and influence from the 'best' weights found so far (RMO principle).

        # Select a random data point
        random_data_point_index = np.random.randint(0, features.shape[0])
        random_data_point = np.array(features[random_data_point_index]).astype(float) # Ensure float type

        # Find BMU for the current SOM weights
        bmu_index = np.argmin(np.linalg.norm(som_weights - random_data_point, axis=1))

        # Update weights of the current SOM based on BMU and neighborhood (SOM principle)
        for i in range(num_neurons):
            weight_vector = np.array(som_weights[i]).astype(float) # Ensure float type
            bmu_weight_vector = np.array(som_weights[bmu_index]).astype(float) # Ensure float type
            data_point_vector = np.array(random_data_point).astype(float) # Ensure float type


            distance_to_bmu = np.linalg.norm(weight_vector - bmu_weight_vector)
            influence = np.exp(-distance_to_bmu**2 / (2 * current_neighborhood_radius**2))

            # --- RMO-like Update Rule ---
            # This simulates movement towards the 'best_som_weights' and incorporates a random component.
            # The specific RMO update rule can vary. This is a simplified version.
            rmo_movement = (
                rmo_beta * np.random.rand() * (best_som_weights[i] - weight_vector) + # Movement towards best
                rmo_alpha * np.random.rand() * (data_point_vector - weight_vector)    # Movement towards data point (SOM-like)
            )

            # Combine SOM influence with RMO movement
            som_weights[i] += current_learning_rate * influence * rmo_movement

        # --- Update best_som_weights (Conceptual) ---
        # In a real RMO, you would evaluate the fitness of 'som_weights' here
        # and update 'best_som_weights' if it's better. Without a fitness function,
        # we'll make a simplifying assumption or skip a true 'best' update.
        # For a simulation, we could periodically update best_som_weights
        # based on the current som_weights, though this isn't true RMO.
        # Let's assume a mechanism exists to update best_som_weights (e.g., based on error).
        # For this code, we'll just let the RMO movement guide the single set of weights.
        best_som_weights = som_weights.copy() # Simplified: best is always current

    # After RMO iterations, use the final som_weights for feature extraction.
    final_som_weights = som_weights


    # Extract features by mapping data points to the trained SOM weights
    extracted_features = np.zeros((features.shape[0], num_features))
    for i in range(features.shape[0]):
        feature_vector = np.array(features[i]).astype(float) # Ensure float type
        bmu_index = np.argmin(np.linalg.norm(final_som_weights - feature_vector, axis=1))
        extracted_features[i] = final_som_weights[bmu_index]

    return extracted_features


# Define SOM-RMO parameters based on Table 8
num_som_neurons = 10 * 10  # Grid Size SOM 10x10 = 100 neurons
num_rmo_iterations = 1000  # Number of Iterations 1000
initial_learning_rate = 0.5 # Learning Rate 0.5
initial_neighborhood_radius = 5.0 # Radius 5.0
# rmo_beta and rmo_alpha parameters are not explicitly in Table 8, keeping previous values


# Apply SOM-RMO for feature extraction
# X_scaled is the normalized features from the previous cell
som_features = rmo_som_feature_extraction(X_scaled, num_som_neurons, num_rmo_iterations, initial_learning_rate, initial_neighborhood_radius)

# Display the shape of the extracted features
print("Shape of SOM-RMO extracted features:", som_features.shape)


# ================================
# Step 3: Classification with RBFN
# ================================
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(som_features, y, test_size=0.2, random_state=42)

# Initial RBF transformation parameters based on Table 8
# Number of Centers 100 -> n_components=100
# Gamma is not explicitly in the RBFN section of Table 8, keeping default or previous value
rbf_feature = RBFSampler(gamma=1, n_components=100, random_state=42)
X_train_rbf = rbf_feature.fit_transform(X_train)
X_test_rbf = rbf_feature.transform(X_test)

# Initial classifier (Logistic Regression) parameters based on Table 8
# Epochs 500 -> max_iter=500
# Learning Rate and Momentum are for RBFN training, not directly applicable to Logistic Regression
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_rbf, y_train)
y_pred = clf.predict(X_test_rbf)
print("Baseline Accuracy (after SOM-RMO feature extraction):", accuracy_score(y_test, y_pred))


# ================================
# Step 4: Tabu Search Optimization for RBFN
# ================================
import random

def evaluate_solution(gamma, n_components):
    """Train RBFN with given hyperparameters and return accuracy"""
    rbf_feature = RBFSampler(gamma=gamma, n_components=n_components, random_state=42)
    X_train_rbf = rbf_feature.fit_transform(X_train)
    X_test_rbf = rbf_feature.transform(X_test)

    # Use Logistic Regression as the classifier with max_iter from Table 8
    clf = LogisticRegression(max_iter=500)
    clf.fit(X_train_rbf, y_train)
    y_pred = clf.predict(X_test_rbf)
    return accuracy_score(y_test, y_pred)

# Tabu Search Parameters based on Table 8
tabu_list = []
tabu_size = 50 # List Size Tabu 50
num_iterations = 100 # Search Iterations 100

# Initial solution
best_gamma = 1
best_n_components = 100 # Initial n_components based on Table 8
best_acc = evaluate_solution(best_gamma, best_n_components)

print(f"Initial RBFN solution (before Tabu Search): gamma={best_gamma}, n_components={best_n_components}, acc={best_acc:.4f}")

for iteration in range(num_iterations):
    # Generate neighbor solutions
    # Using smaller steps for generating neighbors based on previous successful run
    candidate_gamma = best_gamma + random.choice([-0.1, 0.1])
    candidate_n_components = best_n_components + random.choice([-10, 10])

    # Bounds (keeping reasonable bounds)
    candidate_gamma = max(0.01, min(10, candidate_gamma))
    candidate_n_components = max(10, min(500, candidate_n_components)) # Adjusted max bound


    # Check tabu list
    if (candidate_gamma, candidate_n_components) in tabu_list:
        continue

    # Evaluate candidate
    acc = evaluate_solution(candidate_gamma, candidate_n_components)
    # print(f"Iteration {iteration+1}: gamma={candidate_gamma}, n_components={candidate_n_components}, acc={acc:.4f}") # Suppress verbose output

    # Update best
    if acc > best_acc:
        best_acc = acc
        best_gamma = candidate_gamma
        best_n_components = candidate_n_components

    # Update tabu list
    tabu_list.append((candidate_gamma, candidate_n_components))
    if len(tabu_list) > tabu_size:
        tabu_list.pop(0)

print(f"Optimized RBFN solution (after Tabu Search): gamma={best_gamma}, n_components={best_n_components}, acc={best_acc:.4f}")


# ================================
# Step 5: Final Evaluation
# ================================
rbf_feature = RBFSampler(gamma=best_gamma, n_components=best_n_components, random_state=42)
X_train_rbf = rbf_feature.fit_transform(X_train)
X_test_rbf = rbf_feature.transform(X_test)

clf = LogisticRegression(max_iter=500) # Using max_iter from Table 8
clf.fit(X_train_rbf, y_train)
y_pred = clf.predict(X_test_rbf)

print("\nFinal Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

                                                 url        type
0                               http://kitegacc.net/    phishing
1  https://www.electronichouse.com/article/ps3_ad...  legitimate
2      https://www.linkedin.com/in/larrymartinkimpel  legitimate
3  https://www.kansascity.com/2011/03/05/2700249/...  legitimate
4        https://www.en.wikipedia.org/wiki/Dem_Bones  legitimate
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208876 entries, 0 to 208875
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     208876 non-null  object
 1   type    208876 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB
None
Shape of SOM-RMO extracted features: (208876, 31)
Baseline Accuracy (after SOM-RMO feature extraction): 0.9364707008808886
Initial RBFN solution (before Tabu Search): gamma=1, n_components=100, acc=0.9365
Optimized RBFN solution (after Tabu Search): gamma=1.1, n_components=110, acc=0.9386

Final Model Perf

## Load dataset and initial feature extraction

### Subtask:
Load the dataset and perform initial feature extraction from the URLs.


**Reasoning**:
The first step is to load the dataset and perform the initial feature extraction as outlined in the instructions. This involves importing necessary libraries, loading the data, defining the feature extraction function, and applying it to the dataset.



In [31]:
!pip install minisom


# ================================
# Step 1: Import libraries & dataset
# ================================
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from minisom import MiniSom
import re
from urllib.parse import urlparse


# Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Inspect
print(dataset.head())
print(dataset.info())

# Feature Extraction Function
def extract_url_features(url):
    """Extracts various features from a URL."""
    features = {}
    try:
        parsed_url = urlparse(url)

        features['url_length'] = len(url)
        features['num_dots'] = url.count('.')
        features['has_at_symbol'] = '@' in url
        features['has_double_slash'] = '//' in url
        features['num_dashes'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_equals'] = url.count('=')
        features['num_semicolons'] = url.count(';')
        features['num_commas'] = url.count(',')
        features['num_quotes'] = url.count("'") + url.count('"')
        features['num_less_than'] = url.count('<')
        features['num_greater_than'] = url.count('>')
        features['num_braces'] = url.count('{') + url.count('}')
        features['num_brackets'] = url.count('[') + url.count(']')
        features['num_parentheses'] = url.count('(') + url.count(')')
        features['num_hashes'] = url.count('#')
        features['num_exclamations'] = url.count('!')
        features['num_dollars'] = url.count('$')
        features['num_spaces'] = url.count(' ')
        features['num_slashes'] = url.count('/')
        features['num_questions'] = url.count('?')
        features['num_and'] = url.count('&')
        features['num_or'] = url.count('|')
        features['num_tilde'] = url.count('~')
        features['num_http'] = url.count('http') + url.count('https')
        features['num_www'] = url.count('www')
        features['num_subdomains'] = len(parsed_url.hostname.split('.')) - 2 if parsed_url.hostname else 0 # Subtract 2 for domain and TLD
        features['path_length'] = len(parsed_url.path)
        features['query_length'] = len(parsed_url.query)
        features['fragment_length'] = len(parsed_url.fragment)
        features['has_ip_address'] = bool(re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', parsed_url.hostname)) if parsed_url.hostname else False

    except ValueError:
        # Return a series of zeros if the URL is invalid
        return pd.Series({
            'url_length': 0, 'num_dots': 0, 'has_at_symbol': False, 'has_double_slash': False,
            'num_dashes': 0, 'num_underscores': 0, 'num_equals': 0, 'num_semicolons': 0,
            'num_commas': 0, 'num_quotes': 0, 'num_less_than': 0, 'num_greater_than': 0,
            'num_braces': 0, 'num_brackets': 0, 'num_parentheses': 0, 'num_hashes': 0,
            'num_exclamations': 0, 'num_dollars': 0, 'num_spaces': 0, 'num_slashes': 0,
            'num_questions': 0, 'num_and': 0, 'num_or': 0, 'num_tilde': 0, 'num_http': 0,
            'num_www': 0, 'num_subdomains': 0, 'path_length': 0, 'query_length': 0,
            'fragment_length': 0, 'has_ip_address': False
        })

    return pd.Series(features)

# Apply the feature extraction function to the 'url' column
features_df = dataset['url'].apply(extract_url_features)


# Split into features & labels
X = features_df.values
y = dataset['type'].values


# Normalize
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


# ================================
# Step 2: Feature extraction with SOM (simulating SOM-RMO)
# ================================
som = MiniSom(x=10, y=10, input_len=X_scaled.shape[1], sigma=1.0, learning_rate=0.5)
som.random_weights_init(X_scaled)
som.train_random(X_scaled, num_iteration=100)

# Map inputs to BMU (Best Matching Unit)
som_features = np.array([som.winner(x) for x in X_scaled])

# Flatten 2D coordinates (x,y) → single feature vector
som_features = np.array([list(f) for f in som_features])


# ================================
# Step 3: Classification with RBFN
# ================================
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(som_features, y, test_size=0.2, random_state=42)

# Initial RBF transformation
rbf_feature = RBFSampler(gamma=1, n_components=100, random_state=42)
X_train_rbf = rbf_feature.fit_transform(X_train)
X_test_rbf = rbf_feature.transform(X_test)

# Initial classifier
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_rbf, y_train)
y_pred = clf.predict(X_test_rbf)
print("Baseline Accuracy:", accuracy_score(y_test, y_pred))


# ================================
# Step 4: Tabu Search Optimization for RBFN
# ================================
import random

def evaluate_solution(gamma, n_components):
    """Train RBFN with given hyperparameters and return accuracy"""
    rbf_feature = RBFSampler(gamma=gamma, n_components=n_components, random_state=42)
    X_train_rbf = rbf_feature.fit_transform(X_train)
    X_test_rbf = rbf_feature.transform(X_test)

    clf = LogisticRegression(max_iter=500)
    clf.fit(X_train_rbf, y_train)
    y_pred = clf.predict(X_test_rbf)
    return accuracy_score(y_test, y_pred)

# Tabu Search Parameters
tabu_list = []
tabu_size = 5
num_iterations = 20

# Initial solution
best_gamma = 1
best_n_components = 100
best_acc = evaluate_solution(best_gamma, best_n_components)

print(f"Initial solution: gamma={best_gamma}, n_components={best_n_components}, acc={best_acc:.4f}")

for iteration in range(num_iterations):
    # Generate neighbor solutions
    candidate_gamma = best_gamma + random.choice([-0.5, 0.5])
    candidate_n_components = best_n_components + random.choice([-50, 50])

    # Bounds
    candidate_gamma = max(0.1, min(5, candidate_gamma))
    candidate_n_components = max(50, min(500, candidate_n_components))

    # Check tabu list
    if (candidate_gamma, candidate_n_components) in tabu_list:
        continue

    # Evaluate candidate
    acc = evaluate_solution(candidate_gamma, candidate_n_components)
    print(f"Iteration {iteration+1}: gamma={candidate_gamma}, n_components={candidate_n_components}, acc={acc:.4f}")

    # Update best
    if acc > best_acc:
        best_acc = acc
        best_gamma = candidate_gamma
        best_n_components = candidate_n_components

    # Update tabu list
    tabu_list.append((candidate_gamma, candidate_n_components))
    if len(tabu_list) > tabu_size:
        tabu_list.pop(0)

print(f"Optimized solution: gamma={best_gamma}, n_components={best_n_components}, acc={best_acc:.4f}")


# ================================
# Step 5: Final Evaluation
# ================================
rbf_feature = RBFSampler(gamma=best_gamma, n_components=best_n_components, random_state=42)
X_train_rbf = rbf_feature.fit_transform(X_train)
X_test_rbf = rbf_feature.transform(X_test)

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_rbf, y_train)
y_pred = clf.predict(X_test_rbf)

print("\nFinal Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

                                                 url        type
0                               http://kitegacc.net/    phishing
1  https://www.electronichouse.com/article/ps3_ad...  legitimate
2      https://www.linkedin.com/in/larrymartinkimpel  legitimate
3  https://www.kansascity.com/2011/03/05/2700249/...  legitimate
4        https://www.en.wikipedia.org/wiki/Dem_Bones  legitimate
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208876 entries, 0 to 208875
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     208876 non-null  object
 1   type    208876 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB
None
Baseline Accuracy: 0.9795815779394867
Initial solution: gamma=1, n_components=100, acc=0.9796
Iteration 1: gamma=1.5, n_components=150, acc=0.9803
Iteration 2: gamma=2.0, n_components=200, acc=0.9800
Iteration 3: gamma=1.0, n_components=100, acc=0.9796
Iteration 5: gamma=1.0, n_components=200, acc=0.9800
I

In [36]:
# ================================
# Setup
# ================================
!pip install minisom -q

import os, re, math, random, numpy as np, pandas as pd, tensorflow as tf
from urllib.parse import urlparse
from minisom import MiniSom
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

tf.get_logger().setLevel("ERROR")
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# ================================
# Load dataset
# ================================
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')
print(dataset.head())
print(dataset.info())

# ================================
# URL → feature engineering
# ================================
def extract_url_features(url):
    features = {}
    try:
        parsed_url = urlparse(url)
        features['url_length'] = len(url)
        features['num_dots'] = url.count('.')
        features['has_at_symbol'] = 1 if '@' in url else 0
        features['has_double_slash'] = 1 if '//' in url else 0
        features['num_dashes'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_equals'] = url.count('=')
        features['num_semicolons'] = url.count(';')
        features['num_commas'] = url.count(',')
        features['num_quotes'] = url.count("'") + url.count('"')
        features['num_less_than'] = url.count('<')
        features['num_greater_than'] = url.count('>')
        features['num_braces'] = url.count('{') + url.count('}')
        features['num_brackets'] = url.count('[') + url.count(']')
        features['num_parentheses'] = url.count('(') + url.count(')')
        features['num_hashes'] = url.count('#')
        features['num_exclamations'] = url.count('!')
        features['num_dollars'] = url.count('$')
        features['num_spaces'] = url.count(' ')
        features['num_slashes'] = url.count('/')
        features['num_questions'] = url.count('?')
        features['num_and'] = url.count('&')
        features['num_or'] = url.count('|')
        features['num_tilde'] = url.count('~')
        features['num_http'] = url.count('http') + url.count('https')
        features['num_www'] = url.count('www')
        host = parsed_url.hostname
        features['num_subdomains'] = len(host.split('.')) - 2 if host else 0
        features['path_length'] = len(parsed_url.path)
        features['query_length'] = len(parsed_url.query)
        features['fragment_length'] = len(parsed_url.fragment)
        ip_re = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
        features['has_ip_address'] = 1 if (host and re.match(ip_re, host)) else 0
    except Exception:
        # robust fallback
        features = {k:0 for k in [
            'url_length','num_dots','has_at_symbol','has_double_slash','num_dashes','num_underscores',
            'num_equals','num_semicolons','num_commas','num_quotes','num_less_than','num_greater_than',
            'num_braces','num_brackets','num_parentheses','num_hashes','num_exclamations','num_dollars',
            'num_spaces','num_slashes','num_questions','num_and','num_or','num_tilde','num_http',
            'num_www','num_subdomains','path_length','query_length','fragment_length','has_ip_address'
        ]}
    return pd.Series(features)

features_df = dataset['url'].apply(extract_url_features)

# Labels → 0/1
le = LabelEncoder()
y = le.fit_transform(dataset['type'])  # e.g., phishing=1, legitimate=0 (depends on order)

# Scale
scaler = MinMaxScaler()
X = scaler.fit_transform(features_df.values.astype(np.float32))

# ================================
# Train/Test split (then fit unsup. models on train to avoid leakage)
# ================================
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ================================
# SOM-RMO (Table 8 settings)
# SOM grid 10x10 (100 neurons), lr=0.5, radius=5, iters=1000, Gaussian
# We'll train SOM on TRAIN ONLY; then map train/test using the learned weights.
# RMO: we influence SOM updates toward a "best" template and add stochastic exploration.
# ================================
class RMOSOM:
    def __init__(self, n_neurons=100, lr=0.5, radius=5.0, rmo_beta=0.5, rmo_alpha=0.8, rmo_iters=1000):
        self.n_neurons = n_neurons
        self.lr0 = lr
        self.radius0 = radius
        self.rmo_beta = rmo_beta  # toward best
        self.rmo_alpha = rmo_alpha  # exploration / toward data
        self.rmo_iters = rmo_iters
        self.weights_ = None

    def fit(self, X):
        X = X.astype(np.float32)
        n_features = X.shape[1]
        # Random init
        w = np.random.rand(self.n_neurons, n_features).astype(np.float32)
        best_w = w.copy()

        for t in range(self.rmo_iters):
            # exp decay (common in SOM)
            lr_t = self.lr0 * math.exp(-t / self.rmo_iters)
            rad_t = self.radius0 * math.exp(-t / self.rmo_iters) + 1e-6

            # random sample
            idx = np.random.randint(0, X.shape[0])
            x = X[idx]

            # BMU
            dists = np.linalg.norm(w - x, axis=1)
            bmu = np.argmin(dists)
            bmu_w = w[bmu]

            # Update all neurons (Gaussian neighborhood)
            for i in range(self.n_neurons):
                dist_i = np.linalg.norm(w[i] - bmu_w)
                h = math.exp(-(dist_i**2) / (2 * (rad_t**2)))

                # RMO-like movement: toward "best" template and toward data point
                rmo_move = (
                    self.rmo_beta * np.random.rand() * (best_w[i] - w[i]) +
                    self.rmo_alpha * np.random.rand() * (x - w[i])
                )
                w[i] = w[i] + lr_t * h * rmo_move

            # Simple "best" template refresh (proxy for fitness-improving template)
            # In a full RMO, you'd evaluate a fitness and keep the best. Here we periodically snapshot.
            if (t+1) % 25 == 0:
                best_w = w.copy()

        self.weights_ = w
        return self

    def transform(self, X):
        """Map each vector to its BMU weight vector (same dimensionality)."""
        X = X.astype(np.float32)
        out = np.zeros_like(X)
        for i, x in enumerate(X):
            dists = np.linalg.norm(self.weights_ - x, axis=1)
            bmu = np.argmin(dists)
            out[i] = self.weights_[bmu]
        return out

# Fit SOM-RMO on training, then transform train/test
som = RMOSOM(n_neurons=10*10, lr=0.5, radius=5.0, rmo_beta=0.5, rmo_alpha=0.8, rmo_iters=1000)
som.fit(X_train_raw)
X_train_som = som.transform(X_train_raw)
X_test_som  = som.transform(X_test_raw)
print("SOM-RMO features:", X_train_som.shape, X_test_som.shape)

# ================================
# RBFN (TensorFlow) with K-means centers (Table 8)
# Centers=100, Gaussian RBF, SGD lr=0.01, momentum=0.9, Epochs=500
# ================================
n_centers = 100
kmeans = KMeans(n_clusters=n_centers, random_state=42, n_init='auto')
kmeans.fit(X_train_som)
centers = kmeans.cluster_centers_.astype(np.float32)

# Beta init from inter-center spacing
def init_beta_from_centers(centers):
    # median pairwise distance heuristic → sigma
    from sklearn.metrics import pairwise_distances
    D = pairwise_distances(centers, centers)
    sigma = np.median(D[D>0])  # avoid zeros on diagonal
    sigma = float(max(sigma, 1e-3))
    beta = 1.0 / (2.0 * sigma * sigma)
    return beta

beta_init = init_beta_from_centers(centers)

class RBFLayer(tf.keras.layers.Layer):
    def __init__(self, centers, beta):
        super().__init__()
        self.centers = tf.constant(centers, dtype=tf.float32)  # fixed centers
        # beta is trainable per Table 8? We'll keep global beta trainable in final model.
        self.beta = tf.Variable(initial_value=beta, trainable=True, dtype=tf.float32, name="beta")

    def call(self, inputs):
        # inputs: [batch, d]
        # centers: [C, d]
        # compute ||x - c||^2
        x_exp = tf.expand_dims(inputs, axis=1)      # [batch, 1, d]
        c_exp = tf.expand_dims(self.centers, axis=0)# [1, C, d]
        diff = x_exp - c_exp                        # [batch, C, d]
        l2 = tf.reduce_sum(tf.square(diff), axis=-1)  # [batch, C]
        return tf.exp(-self.beta * l2)              # [batch, C]

def build_rbfn_model(input_dim, centers, beta, l2=0.0):
    inputs = tf.keras.Input(shape=(input_dim,), name="rbf_input")
    rbf = RBFLayer(centers, beta)(inputs)  # [batch, C]
    outputs = tf.keras.layers.Dense(1, activation="sigmoid",
                                   kernel_regularizer=tf.keras.regularizers.l2(l2))(rbf)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])
    return model

input_dim = X_train_som.shape[1]

# ================================
# Tabu Search (Table 8)
# - list size=50
# - iterations=100 (stop early after 10 non-improving)
# - aspiration=True
# - initial temperature=100, exponential cooling
# - mutation rate=0.1, crossover rate=0.7
# We optimize: beta_scale (multiplies beta_init), and l2 (output layer L2).
# ================================
tabu_list = []
TABU_SIZE = 50
MAX_ITERS = 10 #----------------------------------------------------change dy from 100-------------------------------------------
NO_IMPROVE_LIMIT = 10
ASPIRATION = True
TEMP = 100.0
COOL = 0.95
MUT_RATE = 0.1
CROSS_RATE = 0.7

# Search spaces
BETA_SCALE_MIN, BETA_SCALE_MAX = 0.1, 10.0
L2_MIN, L2_MAX = 0.0, 1e-2

def clamp(v, lo, hi): return max(lo, min(hi, v))

def proxy_eval(beta_scale, l2):
    """Quick proxy evaluation: train a small model for few epochs and return accuracy."""
    beta = float(beta_init * beta_scale)
    model = build_rbfn_model(input_dim, centers, beta, l2=l2)
    # fewer epochs for speed during search
    history = model.fit(X_train_som, y_train, epochs=15, batch_size=256, verbose=0,
                        validation_split=0.1, shuffle=True)
    # Evaluate on held-out test
    preds = (model.predict(X_test_som, verbose=0) > 0.5).astype("int32").ravel()
    acc = accuracy_score(y_test, preds)
    return acc

# Initialize two random candidates to enable crossover
def random_candidate():
    return (
        random.uniform(BETA_SCALE_MIN, BETA_SCALE_MAX),
        random.uniform(L2_MIN, L2_MAX),
    )

cand_a = (1.0, 0.0)  # start from table defaults (beta_scale=1, l2=0)
best_beta_scale, best_l2 = cand_a
best_acc = proxy_eval(*cand_a)
tabu_list.append(cand_a)
no_improve = 0

# second candidate
cand_b = random_candidate()
acc_b = proxy_eval(*cand_b)
if acc_b > best_acc:
    best_acc, best_beta_scale, best_l2 = acc_b, cand_b[0], cand_b[1]
tabu_list.append(cand_b)

print(f"[TS] init: beta_scale={best_beta_scale:.4f}, l2={best_l2:.6f}, acc={best_acc:.4f}")

for it in range(1, MAX_ITERS+1):
    # Crossover
    if random.random() < CROSS_RATE:
        parent1, parent2 = random.choice(tabu_list), random.choice(tabu_list)
        child_beta = 0.5*(parent1[0] + parent2[0])
        child_l2   = 0.5*(parent1[1] + parent2[1])
    else:
        # local neighbor around best
        child_beta = np.random.normal(loc=best_beta_scale, scale=0.2)
        child_l2   = np.random.normal(loc=best_l2, scale=1e-3)

    # Mutation
    if random.random() < MUT_RATE:
        child_beta += np.random.normal(scale=0.1)
    if random.random() < MUT_RATE:
        child_l2   += np.random.normal(scale=5e-4)

    # clamp
    child_beta = clamp(child_beta, BETA_SCALE_MIN, BETA_SCALE_MAX)
    child_l2   = clamp(child_l2, L2_MIN, L2_MAX)

    # Tabu check (rounded key to reduce duplicates)
    key = (round(child_beta, 4), round(child_l2, 6))
    if key in [(round(b,4), round(l,6)) for (b,l) in tabu_list] and not ASPIRATION:
        # skip if tabu and no aspiration
        TEMP *= COOL
        continue

    # Evaluate
    acc = proxy_eval(child_beta, child_l2)

    # Aspiration: allow tabu if better than global best
    tabu_violation = key in [(round(b,4), round(l,6)) for (b,l) in tabu_list]
    if (acc > best_acc) or (tabu_violation and ASPIRATION):
        best_acc = acc
        best_beta_scale, best_l2 = child_beta, child_l2
        no_improve = 0
    else:
        # SA-like acceptance with temperature
        delta = best_acc - acc
        accept_prob = math.exp(-max(0.0, delta) / max(1e-6, TEMP))
        if random.random() < accept_prob:
            # accept move (doesn't change best, but explores region)
            no_improve += 1
        else:
            no_improve += 1

    # Update tabu
    tabu_list.append((child_beta, child_l2))
    if len(tabu_list) > TABU_SIZE:
        tabu_list.pop(0)

    print(f"[TS] iter {it:03d}: beta_scale={child_beta:.4f}, l2={child_l2:.6f}, acc={acc:.4f}, best={best_acc:.4f}, T={TEMP:.2f}")
    TEMP *= COOL
    if no_improve >= NO_IMPROVE_LIMIT:
        print(f"[TS] Stopping early after {NO_IMPROVE_LIMIT} non-improving iterations.")
        break

print(f"[TS] best: beta_scale={best_beta_scale:.4f}, l2={best_l2:.6f}, proxy_acc={best_acc:.4f}")

# ================================
# Final Training (Table 8 epochs=500) with best TS params
# ================================
beta_final = float(beta_init * best_beta_scale)
final_model = build_rbfn_model(input_dim, centers, beta_final, l2=best_l2)
final_model.summary()

history = final_model.fit(
    X_train_som, y_train,
    epochs=50, batch_size=256, verbose=0,  #-------------------------------------------------epoches change from 500 -----------------------
    validation_split=0.1, shuffle=True
)

y_pred_prob = final_model.predict(X_test_som, verbose=0).ravel()
y_pred = (y_pred_prob > 0.5).astype("int32")

print("\nFinal Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))


                                                 url        type
0                               http://kitegacc.net/    phishing
1  https://www.electronichouse.com/article/ps3_ad...  legitimate
2      https://www.linkedin.com/in/larrymartinkimpel  legitimate
3  https://www.kansascity.com/2011/03/05/2700249/...  legitimate
4        https://www.en.wikipedia.org/wiki/Dem_Bones  legitimate
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208876 entries, 0 to 208875
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     208876 non-null  object
 1   type    208876 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB
None
SOM-RMO features: (167100, 31) (41776, 31)


  return fit_method(estimator, *args, **kwargs)


[TS] init: beta_scale=1.0000, l2=0.000000, acc=0.9257
[TS] iter 001: beta_scale=1.0000, l2=0.000000, acc=0.9257, best=0.9257, T=100.00
[TS] iter 002: beta_scale=1.0898, l2=0.000000, acc=0.9257, best=0.9257, T=95.00
[TS] iter 003: beta_scale=1.0449, l2=0.000062, acc=0.9257, best=0.9257, T=90.25
[TS] iter 004: beta_scale=1.1011, l2=0.003708, acc=0.9259, best=0.9259, T=85.74
[TS] iter 005: beta_scale=1.0000, l2=0.000000, acc=0.9257, best=0.9257, T=81.45
[TS] iter 006: beta_scale=1.0673, l2=0.000031, acc=0.9257, best=0.9257, T=77.38
[TS] iter 007: beta_scale=0.6798, l2=0.000000, acc=0.9257, best=0.9257, T=73.51
[TS] iter 008: beta_scale=1.0673, l2=0.000031, acc=0.9257, best=0.9257, T=69.83
[TS] iter 009: beta_scale=1.0561, l2=0.000047, acc=0.9257, best=0.9257, T=66.34
[TS] iter 010: beta_scale=1.0483, l2=0.000047, acc=0.9257, best=0.9257, T=63.02
[TS] best: beta_scale=1.0673, l2=0.000031, proxy_acc=0.9257



Final Model Performance:
Accuracy: 0.9326646878590579
              precision    recall  f1-score   support

  legitimate       0.89      0.99      0.94     20888
    phishing       0.99      0.88      0.93     20888

    accuracy                           0.93     41776
   macro avg       0.94      0.93      0.93     41776
weighted avg       0.94      0.93      0.93     41776

