<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074003_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the Dataset: The dataset is loaded into a pandas DataFrame, and the features (X) and the target variable (y) are extracted. The features appear to include columns 3 to the second-to-last column, and the target variable is the last column.

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Importing the dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')


display(dataset)


# PCA + XGBoost

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Step 1: Load dataset
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# Step 2: Encode categorical columns with LabelEncoder (lightweight)
df = dataset.copy()
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Step 3: Split features & labels
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Step 4: Normalize
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Dimensionality reduction (Truncated SVD instead of PCA)
svd = TruncatedSVD(n_components=100, random_state=42)  # reduce to 100 features
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

print("✅ Original shape:", X_train.shape)
print("✅ Reduced shape :", X_train_svd.shape)

# Step 7: Train XGBoost
xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

xgb_clf.fit(X_train_svd, y_train)
y_pred = xgb_clf.predict(X_test_svd)

# Step 8: Evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\n📊 Model Evaluation (SVD + XGB)")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")


ValueError: Found array with 1 feature(s) (shape=(167100, 1)) while a minimum of 2 is required by TruncatedSVD.

# V2 autoencoder

In [2]:
# ================================
# Autoencoder (FE) + XGBoost (CLS) for large URL dataset
# ================================
import numpy as np
import pandas as pd
import re
from urllib.parse import urlparse

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import xgboost as xgb
import tensorflow as tf
tf.get_logger().setLevel("ERROR")

# ---------- Load dataset ----------
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Assignment/URL_dataset_clean_balanced.csv')

# ---------- Feature engineering from URL (compact, numeric) ----------
def extract_url_features(url: str):
    try:
        p = urlparse(url)
        host = p.hostname or ""
        path = p.path or ""
        query = p.query or ""
        s = url.lower()

        feats = {
            "url_len": len(url),
            "host_len": len(host),
            "path_len": len(path),
            "query_len": len(query),
            "n_dots": s.count('.'),
            "n_hyphens": s.count('-'),
            "n_slash": s.count('/'),
            "n_qmark": s.count('?'),
            "n_eq": s.count('='),
            "n_and": s.count('&'),
            "n_at": s.count('@'),
            "n_pct": s.count('%'),
            "n_digits": sum(c.isdigit() for c in s),
            "has_ip": 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', host) else 0,
            "https": 1 if s.startswith('https') else 0,
            "has_www": 1 if 'www' in host else 0,
            "n_params": (query.count('&') + (1 if '=' in query else 0)),
            "subdomains": max(0, len(host.split('.')) - 2) if host else 0,
        }
        return pd.Series(feats)
    except Exception:
        return pd.Series({
            "url_len":0,"host_len":0,"path_len":0,"query_len":0,"n_dots":0,"n_hyphens":0,
            "n_slash":0,"n_qmark":0,"n_eq":0,"n_and":0,"n_at":0,"n_pct":0,"n_digits":0,
            "has_ip":0,"https":0,"has_www":0,"n_params":0,"subdomains":0
        })

X_feats = df['url'].astype(str).apply(extract_url_features)
y_raw = df['type'].astype(str)

# label-encode target (binary or multi-class)
le = LabelEncoder()
y = le.fit_transform(y_raw)

# ---------- Train/test split ----------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_feats.values, y, test_size=0.2, random_state=42, stratify=y
)

# ---------- Scale (fit on train only) ----------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

# ---------- Autoencoder (dimensionality reduction) ----------
input_dim = X_train.shape[1]        # ~18 features above
latent_dim = 16                     # bottleneck; try 8/16/32

inputs = tf.keras.Input(shape=(input_dim,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(32, activation='relu')(x)
z = tf.keras.layers.Dense(latent_dim, activation='linear', name='latent')(x)   # bottleneck
x = tf.keras.layers.Dense(32, activation='relu')(z)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(input_dim, activation='linear')(x)

autoencoder = tf.keras.Model(inputs, outputs)
autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='mse')

early = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss')
autoencoder.fit(
    X_train, X_train,
    epochs=50, batch_size=512, shuffle=True,
    validation_data=(X_test, X_test),
    callbacks=[early], verbose=1
)

# Extract latent features
encoder = tf.keras.Model(inputs, autoencoder.get_layer('latent').output)
Z_train = encoder.predict(X_train, batch_size=4096, verbose=0)
Z_test  = encoder.predict(X_test,  batch_size=4096, verbose=0)

print("Latent shape:", Z_train.shape)

# ---------- XGBoost on latent ----------
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    tree_method='hist'   # fast & memory-efficient
)

xgb_clf.fit(Z_train, y_train)
y_pred = xgb_clf.predict(Z_test)

# ---------- Evaluation ----------
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\n🏁 Autoencoder (FE) + XGBoost (CLS) results")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))


Epoch 1/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.4454 - val_loss: 0.0590
Epoch 2/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0585 - val_loss: 0.0340
Epoch 3/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0304 - val_loss: 0.0235
Epoch 4/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0218 - val_loss: 0.0300
Epoch 5/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0475 - val_loss: 0.0182
Epoch 6/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 0.0159 - val_loss: 0.0216
Epoch 7/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.0342 - val_loss: 0.0119
Epoch 8/50
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0215 - val_loss: 0.0123
Epoch 9/50
[1m327/327[0m [32m━━━━━━━━