<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074003_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the Dataset: The dataset is loaded into a pandas DataFrame, and the features (X) and the target variable (y) are extracted. The features appear to include columns 3 to the second-to-last column, and the target variable is the last column.

In [8]:
# Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Importing the dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')


display(dataset)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,url,type
0,http://kitegacc.net/,phishing
1,https://www.electronichouse.com/article/ps3_ad...,legitimate
2,https://www.linkedin.com/in/larrymartinkimpel,legitimate
3,https://www.kansascity.com/2011/03/05/2700249/...,legitimate
4,https://www.en.wikipedia.org/wiki/Dem_Bones,legitimate
...,...,...
208871,http://www.apsweb.co.jp/wordpress/ihup/nD/inde...,phishing
208872,https://www.theruckus.wordpress.com/,legitimate
208873,http://jambidaily.com/34g3f3g/68k7jh65g.exe,phishing
208874,http://ejanla.co/43543r34r/843tf.exe,phishing


**Reasoning**:
The first step is to load the dataset and perform the initial feature extraction as outlined in the instructions. This involves importing necessary libraries, loading the data, defining the feature extraction function, and applying it to the dataset.



# XGBoost Version 1

In [19]:
# STEP 1: Install libraries
#!pip install xgboost optuna tensorflow

# STEP 2: Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import xgboost as xgb
import optuna

# STEP 3: Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# STEP 4: Autoencoder for feature extraction
input_dim = X_train.shape[1]

autoencoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu', name="bottleneck"),  # compressed feature space
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=50, batch_size=64, shuffle=True, validation_data=(X_test, X_test), verbose=1)

# Extract compressed features
encoder = models.Model(inputs=autoencoder.input, outputs=autoencoder.get_layer("bottleneck").output)
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

print("Original shape:", X_train.shape, "Encoded shape:", X_train_encoded.shape)

# STEP 5: Train XGBoost with Optuna on extracted features
def objective(trial):
    param = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "booster": "gbtree",
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0)
    }
    model = xgb.XGBClassifier(**param, use_label_encoder=False, verbosity=0)
    model.fit(X_train_encoded, y_train)
    preds = model.predict(X_test_encoded)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best trial:", study.best_trial.params)

# STEP 6: Train final model
best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params, use_label_encoder=False, verbosity=0)
model.fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

# STEP 7: Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


ValueError: Invalid dtype: object

# Autoencoder + XGBoost

In [20]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models
import xgboost as xgb

# Step 2: Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Step 3: Encode categorical columns
df = dataset.copy()
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Step 4: Split features & labels
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Step 5: Normalize
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Autoencoder for feature extraction
input_dim = X_train.shape[1]
encoding_dim = 32  # compressed dimension

input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(128, activation='relu')(input_layer)
encoded = layers.Dense(64, activation='relu')(encoded)
bottleneck = layers.Dense(encoding_dim, activation='relu')(encoded)
decoded = layers.Dense(64, activation='relu')(bottleneck)
decoded = layers.Dense(128, activation='relu')(decoded)
output_layer = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

print("Training autoencoder...")
autoencoder.fit(
    X_train, X_train,
    epochs=50, batch_size=64,
    shuffle=True,
    validation_data=(X_test, X_test),
    verbose=1
)

# Step 8: Extract compressed features
encoder = models.Model(inputs=input_layer, outputs=bottleneck)
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

print("Original feature shape:", X_train.shape)
print("Compressed feature shape:", X_train_encoded.shape)

# Step 9: Train XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

print("Training XGBoost on compressed features...")
xgb_clf.fit(X_train_encoded, y_train)

# Step 10: Evaluate
y_pred = xgb_clf.predict(X_test_encoded)
acc = accuracy_score(y_test, y_pred)

print("\n✅ Model Evaluation")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))


Training autoencoder...
Epoch 1/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0292 - val_loss: 1.4240e-06
Epoch 2/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 1.1609e-05 - val_loss: 1.4891e-07
Epoch 3/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 1.6536e-05 - val_loss: 1.3926e-08
Epoch 4/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 2.7597e-05 - val_loss: 5.1165e-08
Epoch 5/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 1.9766e-05 - val_loss: 3.6596e-05
Epoch 6/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 2.4092e-05 - val_loss: 8.6017e-06
Epoch 7/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 1.2814e-05 - val_loss: 3.4115e-08
Epoch 8/50
[1m2611/2611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Model Evaluation
Accuracy: 0.9947577556491766
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     20888
           1       1.00      0.99      0.99     20888

    accuracy                           0.99     41776
   macro avg       0.99      0.99      0.99     41776
weighted avg       0.99      0.99      0.99     41776



# PCA + XGBoost

In [21]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Step 2: Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Step 3: Encode categorical columns
df = dataset.copy()
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Step 4: Split features & labels
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Step 5: Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: PCA for feature extraction
pca = PCA(n_components=0.95)  # keep 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("Original feature shape:", X_train.shape)
print("Reduced feature shape:", X_train_pca.shape)

# Step 8: Train XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

print("Training XGBoost on PCA features...")
xgb_clf.fit(X_train_pca, y_train)

# Step 9: Evaluate
y_pred = xgb_clf.predict(X_test_pca)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\n✅ Model Evaluation (PCA + XGB)")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")


Original feature shape: (167100, 1)
Reduced feature shape: (167100, 1)
Training XGBoost on PCA features...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Model Evaluation (PCA + XGB)
Accuracy  : 0.9947
Precision : 0.9948
Recall    : 0.9947
F1-score  : 0.9947
