In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("taruntiwarihp/phishing-site-urls")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/taruntiwarihp/phishing-site-urls?dataset_version_number=1...


100%|██████████| 9.03M/9.03M [00:00<00:00, 177MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/taruntiwarihp/phishing-site-urls/versions/1


In [3]:
import os

# List files in dataset folder
os.listdir(path)

['phishing_site_urls.csv']

In [4]:
import pandas as pd

csv_path = os.path.join(path, 'phishing_site_urls.csv')  # Replace with actual file name if needed
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


Now, to visualize the dataset, .shape and .value_counts() will display the number of rows, columns, and URLS we have

In [5]:
print("Dataset shape:", df.shape)
print("Class distribution:")
print(df['Label'].value_counts())


Dataset shape: (549346, 2)
Class distribution:
Label
good    392924
bad     156422
Name: count, dtype: int64


Now, i will use dropna() the remove any rows that have missing values , and drop_duplicates() to remove duplicate rows

In [6]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

print("Cleaned dataset shape:", df.shape)


Cleaned dataset shape: (507196, 2)


In machine learning, there is a step called preprocessing that comes after the cleaning of the dataset. sklearn provides LabelEncoder() which transforms the database values into something the model can undestand. Phishing will correspond with 1, and benign with 0.

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label']) #replaces string labels with numeric ones automatically

print(df['Label'].value_counts())  # Now 0s and 1s


Now, to write a funtion that creates some variables based on characteristics of suspicious URLS

url_length - longer URLS can be more
suspicious


num_digits - digits are sometimes used to mimic characters, more digits can be more suspicious


num_spec_char - special chars are often used to obfuscate URLs


has_ip - IP addresses are rare in legit domains


has_https - lack of HTTPS can be a red flag


num_subdomains - phishing URLs often bury malicious domains under many subdomains

In [7]:
#importing the regular expressions module to use for pattern matching
import re
#define a function that takes the URL and returns a dictionary of extracted features
def extract_features(url):
    return {
        'url_length': len(url),
        'num_digits': sum(c.isdigit() for c in url),
         # re.findall() returns a list of matches for the given pattern. [^\w] matches anything NOT a-z, A-Z, 0–9, or _
        'num_special_chars': len(re.findall(r'[^\w]', url)),
        # re.search() finds a pattern in the string. The regex looks for an IPv4 address format.
        'has_ip': int(bool(re.search(r'\d{1,3}(\.\d{1,3}){3}', url))),
        # 'https' in url.lower() checks if the lowercase version contains 'https'
        'has_https': int('https' in url.lower()),   # int(True) = 1, int(False) = 0
        'num_subdomains': url.count('.') - 1,
    }
# df['URL'] is a pandas Series of strings (not a list or array, but similar in functionality)
# .apply() applies a function to every element in a pandas Series
features = df['URL'].apply(extract_features)
# Convert the Series of dictionaries into a new DataFrame
# .tolist() turns the Series into a list of dictionaries
features_df = pd.DataFrame(features.tolist()) # Each dictionary becomes a row

features_df['Label'] = df['Label']
features_df.head()


Unnamed: 0,url_length,num_digits,num_special_chars,has_ip,has_https,num_subdomains,Label
0,225,58,28,0,0,5,bad
1,81,1,14,0,0,4,bad
2,177,47,19,0,0,6,bad
3,60,0,8,0,0,5,bad
4,116,21,13,0,0,0,bad
