<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/DLI_Malicious_URL__2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem (requires the Colab runtime).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Set paths

In [27]:
import os

# Input: the raw URL dataset CSV on the mounted Drive
SRC = '/content/drive/MyDrive/DLI Group B/url_dataset/URL dataset.csv'

# Outputs: folder for the cleaned/balanced artifacts and the final CSV inside it
OUT_DIR = '/content/drive/MyDrive/DLI Group B/url_dataset/clean_balanced'
FINAL_CSV = OUT_DIR + '/URL_dataset_clean_balanced.csv'

# Make sure the output folder exists (no error if it is already there)
os.makedirs(OUT_DIR, exist_ok=True)


# 1) Load, clean, deduplicate, and balance

In [28]:
import pandas as pd

# Load the raw dataset from the configured source path
df = pd.read_csv(SRC)

# Clean: drop rows missing a URL or a label, then drop exact duplicate rows.
# NOTE(review): drop_duplicates() compares whole rows, so a URL that appears
# with two different 'type' values would survive — TODO confirm none exist.
df = df.dropna(subset=['url','type']).drop_duplicates().reset_index(drop=True)

# Show cleaned stats
class_counts = df['type'].value_counts()
print('After cleaning:', df.shape)
print('Class counts (cleaned):\n', class_counts)

# Size of the smallest class; every class is undersampled down to this count
minority_count = class_counts.min()

# Undersample each class to the minority count (no synthetic data).
# Sampling each group explicitly and concatenating selects the same rows as
# groupby(...).apply(lambda x: x.sample(...)) but avoids the pandas
# deprecation warning about apply operating on the grouping column.
df_balanced = (
    pd.concat(
        [
            class_rows.sample(n=minority_count, random_state=42)
            for _, class_rows in df.groupby('type')
        ]
    )
    .sample(frac=1, random_state=42)   # shuffle so classes are interleaved
    .reset_index(drop=True)
)

# Sanity checks: fail loudly if the balanced frame violates its invariants
balanced_counts = df_balanced['type'].value_counts()
assert (balanced_counts == minority_count).all(), 'classes are not balanced'
assert not df_balanced.isnull().any().any(), 'unexpected missing values'

print('\nBalanced counts:\n', balanced_counts)
print('Missing values:\n', df_balanced.isnull().sum())
print('Duplicate rows:', df_balanced.duplicated().sum())

# Save final balanced & clean file
df_balanced.to_csv(FINAL_CSV, index=False)
print('\nSaved ->', FINAL_CSV)


After cleaning: (450176, 2)
Class counts (cleaned):
 type
legitimate    345738
phishing      104438
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=minority_count, random_state=42))



Balanced counts:
 type
phishing      104438
legitimate    104438
Name: count, dtype: int64
Missing values:
 url     0
type    0
dtype: int64
Duplicate rows: 0

Saved -> /content/drive/MyDrive/DLI Group B/url_dataset/clean_balanced/URL_dataset_clean_balanced.csv


# 2) Stratified train/val/test splits (70/15/15)



In [29]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the balanced data, stratified on the label.
train_df, temp_df = train_test_split(
    df_balanced,
    test_size=0.30,
    stratify=df_balanced['type'],
    random_state=42,
)
# Stage 2: split the hold-out in half into validation and test sets.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['type'],
    random_state=42,
)

# Report split sizes and per-class counts
print('Split sizes -> train/val/test:', len(train_df), len(val_df), len(test_df))
for heading, part in (
    ('Train counts:\n', train_df),
    ('Val counts:\n', val_df),
    ('Test counts:\n', test_df),
):
    print(heading, part['type'].value_counts())

# Persist each split next to the balanced dataset
for part, filename in (
    (train_df, 'URL_dataset_balanced_train.csv'),
    (val_df, 'URL_dataset_balanced_val.csv'),
    (test_df, 'URL_dataset_balanced_test.csv'),
):
    part.to_csv(f'{OUT_DIR}/{filename}', index=False)
print('\nSaved splits to:', OUT_DIR)


Split sizes -> train/val/test: 146213 31331 31332
Train counts:
 type
phishing      73107
legitimate    73106
Name: count, dtype: int64
Val counts:
 type
legitimate    15666
phishing      15665
Name: count, dtype: int64
Test counts:
 type
legitimate    15666
phishing      15666
Name: count, dtype: int64

Saved splits to: /content/drive/MyDrive/DLI Group B/url_dataset/clean_balanced
