<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/DLI_Malicious_URL__2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
# Attach Google Drive to the Colab runtime so files under it can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Import libraries

In [3]:
!pip install -q imbalanced-learn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter

# 1) Load dataset & Encode target labels

In [7]:
# Read the raw URL dataset (CSV on the mounted Drive) into a DataFrame.
dataset_path = '/content/drive/MyDrive/DLI Group B/url_dataset/URL dataset.csv'
df = pd.read_csv(dataset_path)

# Quick look at the size and the class balance before any resampling.
print("Initial dataset shape:", df.shape)
print("Initial class distribution:\n", df['type'].value_counts())

# Turn the string labels in `type` into integer codes.
# LabelEncoder orders classes alphabetically, so legitimate=0, phishing=1.
le = LabelEncoder()
le.fit(df['type'])
y_encoded = le.transform(df['type'])


Initial dataset shape: (450176, 2)
Initial class distribution:
 type
legitimate    345738
phishing      104438
Name: count, dtype: int64


# 2) Prepare Features & Apply SMOTE

In [8]:
# Placeholder feature: the character count of each URL, shaped as a single
# column (n_samples, 1). Replace with better features later.
url_lengths = df['url'].map(len)
X_numeric = url_lengths.to_numpy().reshape(-1, 1)

# Balance the classes with SMOTE.
# NOTE: SMOTE synthesises new *feature values* (here, URL lengths); the extra
# rows in X_res are numbers only and have no corresponding URL string.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_numeric, y_encoded)

# Convert the integer codes back to the original string labels and report counts.
y_res_labels = le.inverse_transform(y_res)
label_counts = Counter(y_res_labels)
print("\nBalanced class distribution:\n", label_counts)



Balanced class distribution:
 Counter({'legitimate': 345738, 'phishing': 345738})


# 3) Create Balanced DataFrame & Save

In [9]:
# Create balanced DataFrame
#
# Balance by randomly oversampling *whole rows* of the smaller class(es) until
# every class has as many rows as the largest one. Sampling complete rows keeps
# each URL attached to its own label. URLs must not be drawn independently of
# the labels (e.g. paired positionally with SMOTE output): SMOTE only produces
# synthetic feature values, not URLs, so such a pairing assigns labels at random.
target_count = df['type'].value_counts().max()

balanced_parts = []
for _, class_rows in df.groupby('type'):
    shortfall = target_count - len(class_rows)
    if shortfall > 0:
        # Duplicate randomly chosen rows of this class to close the gap.
        extra_rows = class_rows.sample(n=shortfall, replace=True, random_state=42)
        class_rows = pd.concat([class_rows, extra_rows])
    balanced_parts.append(class_rows[['url', 'type']])

# Shuffle so the classes are interleaved rather than stored in contiguous runs.
balanced_df = (
    pd.concat(balanced_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: every class must now have the same number of rows.
assert balanced_df['type'].value_counts().nunique() == 1, "classes are not balanced"

print("Balanced dataset shape:", balanced_df.shape)

# Save balanced dataset
save_path = '/content/drive/MyDrive/DLI Group B/url_dataset/URL_dataset_balanced.csv'
balanced_df.to_csv(save_path, index=False)
print(f"Balanced dataset saved to: {save_path}")


Balanced dataset shape: (691476, 2)
Balanced dataset saved to: /content/drive/MyDrive/DLI Group B/url_dataset/URL_dataset_balanced.csv
