In [1]:
# Cell 1: Import Libraries and Load Data
import pandas as pd
from urllib.parse import urlparse
import re

# Load the Kaggle dataset
kaggle_path = 'data/phishing_site_urls.csv'
df = pd.read_csv(kaggle_path)

# --- 1. Initial Exploration ---
print("--- Initial Data Exploration ---")
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 Rows:")
print(df.head())
print("\nData Info:")
df.info()

# --- 2. Check for Missing Values ---
print("\nMissing Values:")
print(df.isnull().sum())

# --- 3. Analyze Label Distribution ---
print("\nLabel Distribution:")
print(df['Label'].value_counts())

--- Initial Data Exploration ---
Dataset Shape: (549346, 2)

First 5 Rows:
                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
4  thewhiskeydregs.com/wp-content/themes/widescre...   bad

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB

Missing Values:
URL      0
Label    0
dtype: int64

Label Distribution:
Label
good    392924
bad     156422
Name: count, dtype: int64


In [2]:
# Cell 2: Clean and Standardize
# --- 1. Remove Duplicates ---
print(f"Shape before dropping duplicates: {df.shape}")
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicates: {df.shape}")

# --- 2. Standardize Labels ---
# We will map 'good' to 0 and 'bad' to 1
df['Label'] = df['Label'].map({'good': 0, 'bad': 1})

print("\nDataset after cleaning and standardizing labels:")
print(df.head())

Shape before dropping duplicates: (549346, 2)
Shape after dropping duplicates: (507196, 2)

Dataset after cleaning and standardizing labels:
                                                 URL  Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...      1
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...      1
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....      1
3  mail.printakid.com/www.online.americanexpress....      1
4  thewhiskeydregs.com/wp-content/themes/widescre...      1


In [3]:
# Cell 3: Feature Engineering Functions (UPGRADED)
print("Defining feature extraction functions...")

# --- All your previous functions are still here ---
def get_url_length(url):
    return len(url)
def get_hostname_length(url):
    try: return len(urlparse(url).netloc)
    except: return 0
def get_dot_count(url):
    return url.count('.')
def get_slash_count(url):
    return url.count('/')
def has_ip_address(url):
    try:
        if re.search(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', urlparse(url).netloc): return 1
        return 0
    except: return 0
def has_special_chars(url):
    if re.search(r'[@_-]', url): return 1
    return 0
def get_subdomain_count(url):
    try:
        hostname = urlparse(url).netloc
        return len(hostname.split('.')) - 2
    except: return 0
def has_https(url):
    try:
        if urlparse(url).scheme == 'https': return 1
        return 0
    except: return 0
def has_sensitive_keywords(url):
    keywords = ['login', 'secure', 'account', 'verify', 'password', 'signin', 'banking']
    for keyword in keywords:
        if keyword in url.lower():
            return 1
    return 0
    
# --- NEW, SMARTER FUNCTIONS ---
# 10. Count of Directories in Path
def count_directories(url):
    try:
        path = urlparse(url).path
        # Count non-empty segments
        return len([segment for segment in path.split('/') if segment])
    except:
        return 0

# 11. Count of Query Parameters
def count_query_params(url):
    try:
        query = urlparse(url).query
        if not query:
            return 0
        return len(query.split('&'))
    except:
        return 0

# 12. Check for common URL shortening services
def is_shortened(url):
    shorteners = ['bit.ly', 't.co', 'goo.gl', 'tinyurl', 'ow.ly']
    try:
        hostname = urlparse(url).netloc
        for shortener in shorteners:
            if shortener in hostname:
                return 1
        return 0
    except:
        return 0

print("Functions defined. Now applying them to the dataset...")

# --- Apply ALL functions ---
# (Old features)
df['url_length'] = df['URL'].apply(get_url_length)
df['hostname_length'] = df['URL'].apply(get_hostname_length)
df['dot_count'] = df['URL'].apply(get_dot_count)
df['slash_count'] = df['URL'].apply(get_slash_count)
df['has_ip'] = df['URL'].apply(has_ip_address)
df['has_special_chars'] = df['URL'].apply(has_special_chars)
df['subdomain_count'] = df['URL'].apply(get_subdomain_count)
df['has_https'] = df['URL'].apply(has_https)
df['has_sensitive_words'] = df['URL'].apply(has_sensitive_keywords)
# (New features)
df['directory_count'] = df['URL'].apply(count_directories)
df['query_param_count'] = df['URL'].apply(count_query_params)
df['is_shortened'] = df['URL'].apply(is_shortened)

print("\nDataset with all 12 engineered features:")
print(df.head())

Defining feature extraction functions...
Functions defined. Now applying them to the dataset...

Dataset with all 12 engineered features:
                                                 URL  Label  url_length  \
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...      1         225   
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...      1          81   
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....      1         177   
3  mail.printakid.com/www.online.americanexpress....      1          60   
4  thewhiskeydregs.com/wp-content/themes/widescre...      1         116   

   hostname_length  dot_count  slash_count  has_ip  has_special_chars  \
0                0          6           10       0                  1   
1                0          5            4       0                  1   
2                0          7           11       0                  1   
3                0          6            2       0                  0   
4                0          1           10    

In [4]:
# Cell 4: Create and Save Final Dataset (UPGRADED)
# Select all our engineered features and the label
features = [
    'url_length', 'hostname_length', 'dot_count', 'slash_count',
    'has_ip', 'has_special_chars', 'subdomain_count', 'has_https',
    'has_sensitive_words', 'directory_count', 'query_param_count', 'is_shortened' # <-- Added new features
]
X = df[features]
y = df['Label']

# Combine features and label into a final dataframe
final_df = pd.concat([X, y], axis=1)

# Save to a new CSV file, overwriting the old one
output_path = 'processed_features.csv'
final_df.to_csv(output_path, index=False)

print(f"\nFinal feature-rich dataset saved to '{output_path}'")
print("Final dataset head:")
print(final_df.head())


Final feature-rich dataset saved to 'processed_features.csv'
Final dataset head:
   url_length  hostname_length  dot_count  slash_count  has_ip  \
0         225                0          6           10       0   
1          81                0          5            4       0   
2         177                0          7           11       0   
3          60                0          6            2       0   
4         116                0          1           10       0   

   has_special_chars  subdomain_count  has_https  has_sensitive_words  \
0                  1               -1          0                    1   
1                  1               -1          0                    0   
2                  1               -1          0                    1   
3                  0               -1          0                    0   
4                  1               -1          0                    0   

   directory_count  query_param_count  is_shortened  Label  
0                9   