In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier

# Load dataset
df = pd.read_csv("dataset/All.csv")

# Replace inf and -inf with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN with 0
df.fillna(0, inplace=True)

# Encode labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["URL_Type_obf_Type"])
df = df.drop(columns=["URL_Type_obf_Type"])

X = df.drop(columns=["label"]).values
y = df["label"].values

# ----------------------------------------
# MOA Fitness Function
# ----------------------------------------
def fitness_function(feature_mask):
    selected_features = np.where(feature_mask == 1)[0]
    
    if len(selected_features) == 0:
        return 0

    X_selected = X[:, selected_features]
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )

    model = AdaBoostClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    return accuracy_score(y_test, preds)

# ----------------------------------------
# Mayfly Optimization Algorithm (MOA)
# ----------------------------------------
def MOA(num_mayflies=8, max_iter=15, num_features=X.shape[1]):
    population = np.random.randint(0, 2, (num_mayflies, num_features))
    best_solution = population[0].copy()
    best_fitness = fitness_function(best_solution)

    for it in range(max_iter):
        for i in range(num_mayflies):
            r = np.random.rand()
            step = np.random.randint(0, 2, num_features)

            if r < 0.5:
                population[i] = np.logical_xor(population[i], step).astype(int)
            else:
                population[i] = np.logical_or(population[i], best_solution).astype(int)

            current_fitness = fitness_function(population[i])

            if current_fitness > best_fitness:
                best_fitness = current_fitness
                best_solution = population[i].copy()

        print(f"Iteration {it+1}/{max_iter} | Best Fitness: {best_fitness}")

    return best_solution, best_fitness

# Run MOA
best_features, best_accuracy = MOA()

print("\nBest MOA Accuracy:", best_accuracy)
print("Selected Feature Count:", best_features.sum())
print("Selected Feature Indexes:", np.where(best_features == 1)[0])


Iteration 1/15 | Best Fitness: 0.761781530918006
Iteration 2/15 | Best Fitness: 0.761781530918006
Iteration 3/15 | Best Fitness: 0.7722691364750749
Iteration 4/15 | Best Fitness: 0.7722691364750749
Iteration 5/15 | Best Fitness: 0.7722691364750749
Iteration 6/15 | Best Fitness: 0.7722691364750749
Iteration 7/15 | Best Fitness: 0.7722691364750749
Iteration 8/15 | Best Fitness: 0.7722691364750749
Iteration 9/15 | Best Fitness: 0.7722691364750749
Iteration 10/15 | Best Fitness: 0.7722691364750749
Iteration 11/15 | Best Fitness: 0.7722691364750749
Iteration 12/15 | Best Fitness: 0.7722691364750749
Iteration 13/15 | Best Fitness: 0.7722691364750749
Iteration 14/15 | Best Fitness: 0.7722691364750749
Iteration 15/15 | Best Fitness: 0.7722691364750749

Best MOA Accuracy: 0.7722691364750749
Selected Feature Count: 66
Selected Feature Indexes: [ 0  2  3  4  5  9 10 11 12 13 14 16 18 19 20 21 23 24 25 26 27 28 30 31
 32 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
 58 59 6

In [2]:
from sklearn.metrics import confusion_matrix, classification_report

# Rebuild FULL dataset again
df = pd.read_csv("dataset/All.csv")

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

le = LabelEncoder()
df["label"] = le.fit_transform(df["URL_Type_obf_Type"])
df = df.drop(columns=["URL_Type_obf_Type"])

X_full = df.drop(columns=["label"]).values
y = df["label"].values

# Convert MOA mask → indexes
selected_indexes = np.where(best_features == 1)[0]

print("Selected feature count:", len(selected_indexes))
print("Max index:", selected_indexes.max())
print("Total features:", X_full.shape[1])

# Apply feature selection correctly
X_selected = X_full[:, selected_indexes]

# Train-test split (80–20)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Train AdaBoost baseline
model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
preds = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, preds)
cm = confusion_matrix(y_test, preds)
cr = classification_report(y_test, preds)

print("=== AdaBoost (MOA Selected Features - Baseline) ===")
print("Accuracy:", acc)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", cr)


Selected feature count: 66
Max index: 78
Total features: 79
=== AdaBoost (MOA Selected Features - Baseline) ===
Accuracy: 0.7722691364750749

Confusion Matrix:
 [[1306   37   49  158   78]
 [  48 1297   77   89   15]
 [ 107  258  799  148   20]
 [  99   75  148 1140   35]
 [ 123    7   25   76 1128]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.80      0.79      1628
           1       0.77      0.85      0.81      1526
           2       0.73      0.60      0.66      1332
           3       0.71      0.76      0.73      1497
           4       0.88      0.83      0.86      1359

    accuracy                           0.77      7342
   macro avg       0.77      0.77      0.77      7342
weighted avg       0.77      0.77      0.77      7342



In [3]:
import numpy as np
from sklearn.metrics import accuracy_score

# Use the same X_selected, y from previous step

# ---------------------------
# Fitness Function for BAT
# ---------------------------
def bat_fitness(params):
    n_estimators = int(params[0])
    learning_rate = float(params[1])

    model = AdaBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    return accuracy_score(y_test, preds)

# ---------------------------
# BAT Algorithm
# ---------------------------
def BAT(pop_size=8, max_iter=15):
    # Bounds: [n_estimators, learning_rate]
    bounds = np.array([
        [50, 300],     # n_estimators
        [0.01, 1.0]    # learning_rate
    ])

    dim = bounds.shape[0]
    
    # Initialize population
    population = np.random.uniform(bounds[:,0], bounds[:,1], (pop_size, dim))
    
    fitness_vals = np.array([bat_fitness(ind) for ind in population])

    best_idx = np.argmax(fitness_vals)
    best_solution = population[best_idx].copy()
    best_fitness = fitness_vals[best_idx]

    for it in range(max_iter):
        for i in range(pop_size):
            # Random walk
            eps = np.random.uniform(-1, 1, dim)
            new_solution = population[i] + eps * (best_solution - population[i])
            new_solution = np.clip(new_solution, bounds[:,0], bounds[:,1])

            new_fitness = bat_fitness(new_solution)

            if new_fitness > fitness_vals[i]:
                population[i] = new_solution
                fitness_vals[i] = new_fitness

                if new_fitness > best_fitness:
                    best_fitness = new_fitness
                    best_solution = new_solution.copy()

        print(f"Iteration {it+1}/{max_iter} | Best Fitness: {best_fitness}")

    return best_solution, best_fitness

# Run BAT
best_params, best_bat_acc = BAT()

print("\nBest BAT Parameters:")
print("n_estimators:", int(best_params[0]))
print("learning_rate:", best_params[1])
print("Best BAT Accuracy:", best_bat_acc)


Iteration 1/15 | Best Fitness: 0.7877962408063198
Iteration 2/15 | Best Fitness: 0.7931081449196404
Iteration 3/15 | Best Fitness: 0.7931081449196404
Iteration 4/15 | Best Fitness: 0.7931081449196404
Iteration 5/15 | Best Fitness: 0.7973304276763824
Iteration 6/15 | Best Fitness: 0.7973304276763824
Iteration 7/15 | Best Fitness: 0.7973304276763824
Iteration 8/15 | Best Fitness: 0.7973304276763824
Iteration 9/15 | Best Fitness: 0.7973304276763824
Iteration 10/15 | Best Fitness: 0.7973304276763824
Iteration 11/15 | Best Fitness: 0.7973304276763824
Iteration 12/15 | Best Fitness: 0.800463089076546
Iteration 13/15 | Best Fitness: 0.800463089076546
Iteration 14/15 | Best Fitness: 0.8023699264505584
Iteration 15/15 | Best Fitness: 0.8023699264505584

Best BAT Parameters:
n_estimators: 290
learning_rate: 0.8896174277790695
Best BAT Accuracy: 0.8023699264505584


In [4]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Train AdaBoost with BAT-optimized parameters
optimized_model = AdaBoostClassifier(
    n_estimators=178,
    learning_rate=0.7574395794737445,
    random_state=42
)

optimized_model.fit(X_train, y_train)

# Predictions
opt_preds = optimized_model.predict(X_test)

# Metrics
opt_acc = accuracy_score(y_test, opt_preds)
opt_cm = confusion_matrix(y_test, opt_preds)
opt_cr = classification_report(y_test, opt_preds)

print("=== AdaBoost (MOA + BAT Optimized) ===")
print("Accuracy:", opt_acc)
print("\nConfusion Matrix:\n", opt_cm)
print("\nClassification Report:\n", opt_cr)


=== AdaBoost (MOA + BAT Optimized) ===
Accuracy: 0.7823481340234268

Confusion Matrix:
 [[1306   41   52  142   87]
 [  12 1299  149   50   16]
 [  74  248  813  130   67]
 [  82   63  161 1147   44]
 [  84   13   17   66 1179]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.80      0.82      1628
           1       0.78      0.85      0.81      1526
           2       0.68      0.61      0.64      1332
           3       0.75      0.77      0.76      1497
           4       0.85      0.87      0.86      1359

    accuracy                           0.78      7342
   macro avg       0.78      0.78      0.78      7342
weighted avg       0.78      0.78      0.78      7342

