<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/Ivan_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Attach Google Drive to the Colab VM, then work from the user's "MyDrive" folder
import os
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

# Relative file names used later in the notebook resolve against this directory
os.chdir('/content/gdrive/MyDrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
# STEP 1: Install dependencies
!pip install lightgbm scikit-learn pandas

# STEP 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import lightgbm as lgb
import re

# STEP 3: Download & Load dataset
DATA_URL = "https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv"
!wget -O URL_dataset_clean_balanced.csv "$DATA_URL"

df = pd.read_csv("URL_dataset_clean_balanced.csv")
print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

# STEP 4: Feature Engineering
def extract_features(df):
    """Return a copy of ``df`` with lexical features derived from its ``url`` column.

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data.

    Added columns:
        url_length  -- number of characters in the URL
        num_digits  -- number of digit characters
        num_special -- number of non-word characters (punctuation, separators)
        num_subdirs -- number of ``/`` characters
        num_dots    -- number of ``.`` characters
        has_https   -- 1 if the URL starts with the ``https://`` scheme, else 0
        has_ip      -- 1 if the URL contains a dotted-quad (IPv4-like) pattern, else 0

    Parameters:
        df: DataFrame with a ``url`` column. Missing URLs are treated as
            empty strings for feature purposes.

    Returns:
        A new DataFrame containing the original columns plus the features above.
    """
    out = df.copy()
    # Normalise to strings once so every feature sees the same, NaN-free input.
    urls = out['url'].fillna('').astype(str)

    out['url_length'] = urls.str.len()
    out['num_digits'] = urls.str.count(r'\d')
    out['num_special'] = urls.str.count(r'[\W]')
    out['num_subdirs'] = urls.str.count('/')
    out['num_dots'] = urls.str.count(r'\.')
    # Check the scheme only: a plain substring test would also flag
    # "http://evil.example/https-login" as HTTPS.
    out['has_https'] = urls.str.lower().str.startswith('https://').astype(int)
    # Non-capturing group: same match as before, without pandas' warning about
    # capture groups in str.contains.
    out['has_ip'] = urls.str.contains(r'(?:\d{1,3}\.){3}\d{1,3}', regex=True).astype(int)
    return out

# Add the engineered feature columns
df = extract_features(df)

# STEP 5: Drop raw url, prepare features and labels
X = df.drop(columns=["url", "type"])

# Convert to binary (0=legit, 1=phishing). Series.map turns any label missing
# from the mapping into NaN, so fail loudly instead of training on NaN targets.
LABEL_TO_ID = {"legitimate": 0, "phishing": 1}
y = df["type"].map(LABEL_TO_ID)
unexpected_labels = df.loc[y.isna(), "type"].unique().tolist()
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'type' column: {unexpected_labels!r}")
y = y.astype("int64")

# STEP 6: Hold out 20% of the rows for testing, keeping the class ratio
# identical in both partitions (stratified on y) with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# STEP 7: Define LightGBM model
# Gradient-boosted trees for binary classification. n_estimators is an upper
# bound; the training step uses early stopping to pick the actual number.
lgbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=-1,          # no depth limit; tree size is governed by num_leaves
    num_leaves=64,
    colsample_bytree=0.8,  # sample 80% of the features for each tree
    subsample=0.8,         # sample 80% of the rows ...
    subsample_freq=1,      # ... at every iteration; LightGBM ignores subsample while this is 0 (the default)
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1
)

# STEP 8: Train model (with early stopping)
# Early stopping must not monitor the test set: choosing the number of trees on
# the same rows that are later scored makes the reported metrics optimistic.
# Carve a validation fold out of the training data instead and keep
# (X_test, y_test) untouched until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)


def lgb_f1_metric(y_true, y_prob):
    """Custom LightGBM evaluation metric: F1 of the positive class.

    Parameters:
        y_true: true binary labels of the evaluation set.
        y_prob: predicted probability of the positive class.

    Returns:
        (name, value, is_higher_better) as expected by LightGBM's sklearn API.
    """
    # Strictly greater than 0.5 mirrors how LGBMClassifier.predict breaks ties.
    return "f1", f1_score(y_true, (y_prob > 0.5).astype(int)), True


# 'f1' is not one of LightGBM's built-in metric names, so it is supplied as a
# callable. metric="None" switches off the objective's default metric so that
# F1 is the only criterion watched by early stopping.
lgbm.set_params(metric="None")
lgbm.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    eval_metric=lgb_f1_metric,
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

# STEP 9: Predict hard class labels for the held-out test rows
y_pred = lgbm.predict(X_test)

# STEP 10: Score the predictions (F1 of the positive class plus a per-class report)
f1 = f1_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("\n🎯 F1 Score:", f1)
print("\nClassification Report:\n", report_text)


--2025-08-19 11:10:38--  https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15312637 (15M) [text/plain]
Saving to: ‘URL_dataset_clean_balanced.csv’


2025-08-19 11:10:39 (51.0 MB/s) - ‘URL_dataset_clean_balanced.csv’ saved [15312637/15312637]

✅ Dataset loaded successfully!
Shape: (208876, 2)
Columns: ['url', 'type']
                                                 url        type
0                               http://kitegacc.net/    phishing
1  https://www.electronichouse.com/article/ps3_ad...  legitimate
2      https://www.linkedin.com/in/larrymartinkimpel  legitimate
3  https://www.kansascity.com/2011/03/05/2700249/...  legitimate
4        https://www.en.wikipedia.org