<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074003_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the Dataset: The dataset is loaded into a pandas DataFrame, and the features (X) and the target variable (y) are extracted. The features appear to include columns 3 to the second-to-last column, and the target variable is the last column.

In [8]:
# Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Importing the dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')


display(dataset)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,url,type
0,http://kitegacc.net/,phishing
1,https://www.electronichouse.com/article/ps3_ad...,legitimate
2,https://www.linkedin.com/in/larrymartinkimpel,legitimate
3,https://www.kansascity.com/2011/03/05/2700249/...,legitimate
4,https://www.en.wikipedia.org/wiki/Dem_Bones,legitimate
...,...,...
208871,http://www.apsweb.co.jp/wordpress/ihup/nD/inde...,phishing
208872,https://www.theruckus.wordpress.com/,legitimate
208873,http://jambidaily.com/34g3f3g/68k7jh65g.exe,phishing
208874,http://ejanla.co/43543r34r/843tf.exe,phishing


**Reasoning**:
The first step is to load the dataset and perform the initial feature extraction as outlined in the instructions. This involves importing necessary libraries, loading the data, defining the feature extraction function, and applying it to the dataset.



# XGBoost Version 1

In [10]:
# STEP 1: Install required libraries
!pip install xgboost optuna

# STEP 2: Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import optuna
from sklearn.preprocessing import LabelEncoder

# STEP 3: Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Assuming first column is the url and last column is the label
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# STEP 4: Define Optuna objective function for tuning
def objective(trial):
    param = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "booster": "gbtree",
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0)
    }

    model = xgb.XGBClassifier(**param, use_label_encoder=False, verbosity=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

# STEP 5: Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)  # you can increase n_trials for better tuning

print("Best trial:", study.best_trial.params)

# STEP 6: Train final model with best parameters
best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params, use_label_encoder=False, verbosity=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# STEP 7: Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



[I 2025-08-21 14:15:48,522] A new study created in memory with name: no-name-e720975d-4634-40cb-abf3-9f1d716130a0
[W 2025-08-21 14:15:48,553] Trial 0 failed with parameters: {'max_depth': 12, 'learning_rate': 0.14747962426967176, 'n_estimators': 248, 'subsample': 0.5909997595708336, 'colsample_bytree': 0.5316142161977183, 'gamma': 3.321054049879391, 'min_child_weight': 8, 'reg_alpha': 0.22244227781713355, 'reg_lambda': 0.3355710024140415} because of the following error: ValueError("could not convert string to float: 'https://www.langloiskronstromdesjardins.com/english/maniatis-dimitri/voir-details.html'").
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-3477087624.py", line 46, in objective
    model.fit(X_train, y_train)
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 729, in inner_f

ValueError: could not convert string to float: 'https://www.langloiskronstromdesjardins.com/english/maniatis-dimitri/voir-details.html'

In [11]:
print(X[:5])

[['http://kitegacc.net/']
 ['https://www.electronichouse.com/article/ps3_adding_dts_hd_master_audio/C133']
 ['https://www.linkedin.com/in/larrymartinkimpel']
 ['https://www.kansascity.com/2011/03/05/2700249/review-kansas-city-symphony-with.html']
 ['https://www.en.wikipedia.org/wiki/Dem_Bones']]
