In [9]:
import pandas as pd
import numpy as np
import re
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [10]:
df = pd.read_csv(r"C:\Users\91738\OneDrive\Desktop\DESKTOP\PROJECTS\major project\dataset\malicious_phish.csv")
print(df.shape)
df.head()


(651191, 2)


Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [11]:
def extract_features(url):
    features = []
    
    features.append(len(url))                          # URL length
    features.append(len(re.findall(r'\d', url)))       # digits
    features.append(len(re.findall(r'[^\w]', url)))    # special chars
    features.append(1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0)  # IP
    features.append(url.count('http'))
    features.append(url.count('https'))
    features.append(url.count('.'))                     # subdomains
    
    return features


In [12]:
X = np.array(df['url'].apply(extract_features).to_list())
y = df['type']


In [13]:
le = LabelEncoder()
y = le.fit_transform(y)

print("Classes:", le.classes_)


Classes: ['benign' 'defacement' 'malware' 'phishing']


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [15]:
def fho_feature_selection(X, y, n_hawks=10, iterations=10):
    n_features = X.shape[1]
    best_mask = np.ones(n_features)
    best_score = 0

    for _ in range(iterations):
        for _ in range(n_hawks):
            mask = np.random.randint(0, 2, n_features)
            if mask.sum() == 0:
                continue

            X_sel = X[:, mask == 1]
            model = AdaBoostClassifier(n_estimators=50)
            model.fit(X_sel, y)
            score = model.score(X_sel, y)

            if score > best_score:
                best_score = score
                best_mask = mask.copy()

    return best_mask


In [16]:
best_features = fho_feature_selection(X_train, y_train)

X_train_fs = X_train[:, best_features == 1]
X_test_fs  = X_test[:, best_features == 1]

print("Selected features:", best_features)
print("Feature count:", X_train_fs.shape[1])




Selected features: [0 1 1 1 1 0 1]
Feature count: 5


In [20]:

def adaboost_fitness(params):
    n_estimators = int(params[0])
    lr = params[1]
    depth = int(params[2])

    model = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_estimators,
        learning_rate=lr,
        algorithm="SAMME"
    )

    model.fit(X_train_fs, y_train)
    preds = model.predict(X_test_fs)

    return accuracy_score(y_test, preds)

In [21]:
def cuckoo_search(n_nests=10, iterations=10):
    nests = np.array([
        [random.randint(50, 200), random.uniform(0.01, 1), random.randint(1, 5)]
        for _ in range(n_nests)
    ])

    best_nest = nests[0]
    best_score = adaboost_fitness(best_nest)

    for _ in range(iterations):
        for i in range(n_nests):
            step = np.random.normal(0, 1, 3)
            new_nest = nests[i] + step

            new_nest[0] = int(np.clip(new_nest[0], 50, 200))
            new_nest[1] = np.clip(new_nest[1], 0.01, 1)
            new_nest[2] = int(np.clip(new_nest[2], 1, 5))

            score = adaboost_fitness(new_nest)

            if score > best_score:
                best_score = score
                best_nest = new_nest

    return best_nest, best_score


In [23]:
best_params, best_acc = cuckoo_search()

print("Best Parameters:", best_params)
print("Best Accuracy:", best_acc) 


Best Parameters: [59.          0.56160099  5.        ]
Best Accuracy: 0.8565867366917743
