In [1]:
import pandas as pd

# The domain ranking is expected locally as "tranco.csv": no header row,
# two columns that are named rank and domain right after loading.
df = pd.read_csv("tranco.csv", header=None)
df.columns = ['rank', 'domain']

# Turn every bare domain into an https URL and tag the row as legitimate
# (label 0).
df = df.assign(url='https://' + df['domain'], label=0)

# Only the url/label pair is written out for the later cells.
df.loc[:, ['url', 'label']].to_csv("legit_urls.csv", index=False)


In [14]:
import requests
import csv

# OpenPhish free feed: plain text, one URL per line
# (you can visit it to see how the data looks).
URL = "https://openphish.com/feed.txt"

# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being saved as if it were a list of URLs.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# splitlines() handles both "\n" and "\r\n" endings; blank lines are dropped so
# an empty response cannot produce a row with an empty URL.
phishing_urls = [line.strip() for line in response.text.splitlines() if line.strip()]

# Save to CSV (explicit UTF-8 so the result does not depend on the platform's
# default encoding).
with open("phishing_urls.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["url", "label"])  # label = 1 means phishing
    for url in phishing_urls:
        writer.writerow([url, 1])


In [15]:
import requests
import csv

# URLhaus (abuse.ch) plain-text URL dump -- a different source from the
# OpenPhish feed used for phishing_urls.csv.
URL = "https://urlhaus.abuse.ch/downloads/text/"

# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being saved as if it were a list of URLs.
response = requests.get(URL, timeout=60)
response.raise_for_status()

# splitlines() handles both "\n" and "\r\n" endings. Blank lines and lines
# starting with '#' (feed commentary, not URLs) are skipped so they are not
# written out as labelled samples.
phishing_urls = [
    line.strip()
    for line in response.text.splitlines()
    if line.strip() and not line.lstrip().startswith('#')
]

# Save to CSV (explicit UTF-8 so the result does not depend on the platform's
# default encoding).
with open("phishing_urls2.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["url", "label"])  # label = 1 means malicious
    for url in phishing_urls:
        writer.writerow([url, 1])


In [23]:

# Load both CSV files (with label column already present)
df_legit = pd.read_csv('legit_urls.csv')
df_phish = pd.read_csv('phishing_urls.csv')

# Concatenate them
df_combined = pd.concat([df_legit, df_phish], ignore_index=True)

# Shuffle the combined dataset. The seed is fixed (same value the sampling
# cells use) so that re-running the notebook reproduces the same row order.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to a new CSV file
df_combined.to_csv('urls.csv', index=False)

print("Combined dataset saved to urls.csv")


Combined dataset saved to urls.csv


In [33]:

# Read the merged URL table back from disk.
df = pd.read_csv('urls.csv')

# Tally how many rows carry each label value.
label_counts = df.loc[:, 'label'].value_counts()

print(label_counts)


label
0.0    1000000
1.0     110854
Name: count, dtype: int64


In [25]:
# Load the CSV file
df = pd.read_csv("urls.csv")

# Keep the first occurrence of every URL. Rebinding `df` (instead of
# inplace=True) keeps the step a plain expression that is safe to re-run.
df = df.drop_duplicates(subset='url')

# Save the cleaned DataFrame back to the same CSV it was read from
df.to_csv("urls.csv", index=False)

# The message names the file that is actually written above.
print(f"Removed duplicates. {len(df)} unique URLs saved to urls.csv")


Removed duplicates. 1110855 unique URLs saved to urls_cleaned.csv


In [39]:
import math
import tldextract
import re
from urllib.parse import urlparse

# Load the table whose 'url' column the extractors below are applied to.
# NOTE(review): this cell later writes its result back to this same path, so
# the file must already exist (with a 'url' column) before the cell is run.
df = pd.read_csv("phishing_dataset_features.csv")  # presumably the combined phishing + legit rows -- verify

# Feature extraction functions
def has_ip(url):
    """Return 1 if the host of an http(s) URL is a dotted-quad IPv4 literal, else 0.

    The four number groups must make up the *whole* host: they have to be
    followed by a port (':'), a path ('/'), a query ('?'), a fragment ('#') or
    the end of the string. Without that check a domain name that merely starts
    with digits, such as ``1.2.3.4.example.com``, was reported as an IP.

    An optional ``user@`` prefix before the host is skipped and the scheme is
    matched case-insensitively. Octet ranges are not validated (``999.1.1.1``
    still counts).
    """
    pattern = r"https?://(?:[^/?#@]*@)?\d{1,3}(?:\.\d{1,3}){3}(?=[:/?#]|$)"
    return 1 if re.match(pattern, url, re.IGNORECASE) else 0

def url_length(url):
    """Return the number of characters in ``url``."""
    char_total = len(url)
    return char_total

def count_dots(url):
    """Return how many '.' characters appear anywhere in ``url``."""
    # Splitting on '.' yields one more piece than there are dots.
    pieces = url.split('.')
    return len(pieces) - 1

def has_at_symbol(url):
    """Return 1 when ``url`` contains an '@' character, otherwise 0."""
    if '@' in url:
        return 1
    return 0

def uses_https(url):
    """Return 1 if ``url`` has the ``https`` scheme, else 0.

    The complete ``https://`` prefix is required, so scheme-less strings that
    merely begin with the letters "https" (e.g. ``https-login.example.com``)
    are not counted. The comparison is case-insensitive because URL schemes
    are.
    """
    return 1 if url.lower().startswith("https://") else 0

def get_domain(url):
    """Return the ``domain`` field that ``tldextract.extract`` reports for ``url``."""
    return tldextract.extract(url).domain
# Helper: Shannon entropy of a string
def shannon_entropy(string):
    """Return the Shannon entropy of ``string`` in bits per character.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is its
    relative frequency; the negated total is returned. An empty string has no
    characters to iterate over and yields 0.
    """
    length = len(string)
    total = 0
    for symbol in set(string):
        p = float(string.count(symbol)) / length
        total += p * math.log(p) / math.log(2.0)
    return -total

# Host names of URL-shortening services; is_short_url() tests every URL
# against the entries of this list.
shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co', 'is.gd', 'buff.ly']

def count_hyphens(url):
    """Return how many '-' characters ``url`` contains."""
    hyphen_total = url.count('-')
    return hyphen_total

def count_subdomains(url):
    """Return a dot-based estimate of the number of subdomain labels in ``url``.

    The estimate is "dots in the hostname minus one", i.e. the final two labels
    are treated as the registered domain. It is clamped at 0 so that hosts with
    no dot (``localhost``) and strings for which ``urlparse`` finds no hostname
    (for example a URL without a scheme) give 0 instead of -1.

    Multi-label public suffixes (``example.co.uk``) and IP literals are still
    counted purely by their dots.
    """
    hostname = urlparse(url).hostname or ''
    return max(hostname.count('.') - 1, 0)

def has_suspicious_words(url):
    """Return True if ``url`` contains one of a fixed set of keywords.

    The check is a case-insensitive substring search anywhere in the URL.
    """
    lowered = url.lower()
    for keyword in ('login', 'verify', 'secure', 'update', 'account', 'banking'):
        if keyword in lowered:
            return True
    return False

def get_url_entropy(url):
    """Return ``shannon_entropy`` evaluated on the complete URL string."""
    entropy = shannon_entropy(url)
    return entropy

def is_short_url(url, known_shorteners=None):
    """Return True if the host of ``url`` belongs to a URL-shortening service.

    The comparison is made against the URL's *hostname* (exact match, or a
    subdomain of a listed service). A plain substring test over the whole URL
    gives false positives: ``'t.co'`` is a substring of ``microsoft.com`` and
    ``reddit.com``, and a shortener name inside a query string would also hit.

    Args:
        url: URL string, with or without a scheme.
        known_shorteners: optional iterable of shortener host names; when
            omitted, the module-level ``shorteners`` list is used.

    Returns:
        bool: True when the host matches a listed shortener, False otherwise
        (including when the URL cannot be parsed).
    """
    if known_shorteners is None:
        known_shorteners = shorteners
    try:
        # Without a scheme urlparse() reports no host; retry with a '//' prefix
        # so that the text before the first '/' is treated as the host.
        hostname = urlparse(url).hostname or urlparse('//' + url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed hosts; such a URL is not a shortener.
        return False
    hostname = hostname.lower()
    return any(
        hostname == short or hostname.endswith('.' + short)
        for short in known_shorteners
    )

def get_path_len(url):
    """Return the length of the path component that ``urlparse`` finds in ``url``."""
    parsed = urlparse(url)
    return len(parsed.path)

def get_query_len(url):
    """Return the length of the query string (the part after '?') of ``url``."""
    parsed = urlparse(url)
    return len(parsed.query)

def count_special_chars(url):
    """Return the combined number of '@', '&', '%', '=', '?' and '_' characters in ``url``."""
    total = 0
    for symbol in ('@', '&', '%', '=', '?', '_'):
        total += url.count(symbol)
    return total

def has_http_token(url):
    """Return 1 if the text 'http' occurs inside the URL's path component, else 0."""
    if 'http' in urlparse(url).path:
        return 1
    return 0


def has_port(url):
    """Return 1 if ``url`` carries an explicit, valid port number, else 0.

    Reading ``.port`` raises ``ValueError`` when the port is not a number or
    is out of range (and ``urlparse`` itself rejects some malformed hosts).
    Such URLs are reported as 0 -- no usable port -- so a single malformed row
    cannot abort a whole ``DataFrame.apply`` run.

    The result is decided with ``is not None`` so that an explicit port ``0``
    is still recognised as a port.
    """
    try:
        port = urlparse(url).port
    except ValueError:
        return 0
    return 1 if port is not None else 0

def domain_length(url):
    """Return the length of the ``domain`` field that ``tldextract.extract`` reports."""
    return len(tldextract.extract(url).domain)

# Suffixes that has_suspicious_tld() flags.
suspicious_tlds = ['tk', 'ga', 'ml', 'cf', 'gq']
def has_suspicious_tld(url):
    """Return 1 if the ``suffix`` reported by ``tldextract`` is listed in ``suspicious_tlds``, else 0."""
    suffix = tldextract.extract(url).suffix
    if suffix in suspicious_tlds:
        return 1
    return 0

def https_in_domain(url):
    """Return 1 if 'https' appears in the host name rebuilt from tldextract's parts, else 0.

    The subdomain, domain and suffix fields are joined with dots before the
    substring test.
    """
    parts = tldextract.extract(url)
    rebuilt_host = ".".join([parts.subdomain, parts.domain, parts.suffix])
    if 'https' in rebuilt_host:
        return 1
    return 0

def hostname_length(url):
    """Return the length of the hostname ``urlparse`` finds in ``url`` (0 when it finds none)."""
    host = urlparse(url).hostname
    if not host:
        return 0
    return len(host)
def count_digits(url):
    """Return how many characters of ``url`` satisfy ``str.isdigit``."""
    digit_chars = [ch for ch in url if ch.isdigit()]
    return len(digit_chars)


# Extract features: every (column, extractor) pair adds one column computed
# from the raw URL string. Pairs are processed in the listed order, which is
# also the order in which the columns are appended to the frame.
feature_extractors = [
    ('url_len', url_length),
    ('num_dots', count_dots),
    ('has_ip', has_ip),
    ('has_at', has_at_symbol),
    ('uses_https', uses_https),
    ('domain', get_domain),
    ('num_hyphens', count_hyphens),
    ('num_subdomains', count_subdomains),
    ('has_suspicious_words', has_suspicious_words),
    ('url_entropy', get_url_entropy),
    ('is_shortened', is_short_url),
    ('path_length', get_path_len),
    ('query_length', get_query_len),
    ('num_special_chars', count_special_chars),
    ('has_http_token', has_http_token),
    ('has_port', has_port),
    ('domain_len', domain_length),
    ('suspicious_tld', has_suspicious_tld),
    ('https_in_domain', https_in_domain),
    ('hostname_len', hostname_length),
    ('digit_count', count_digits),
]
for column_name, extractor in feature_extractors:
    df[column_name] = df['url'].apply(extractor)


# Save feature-rich dataset
df.to_csv("phishing_dataset_features.csv", index=False)


In [None]:
import pandas as pd

# Load the CSV file, discard rows whose 'url' is missing (NaN) or contains
# only whitespace, renumber the index, and keep the result as `df`.
# NOTE(review): the earlier cells of this notebook write "urls.csv"; confirm
# that "url.csv" is really the intended input here.
df = (
    pd.read_csv("url.csv")
    .loc[lambda frame: frame['url'].notna()]
    .loc[lambda frame: frame['url'].str.strip() != ""]
    .reset_index(drop=True)
)

# Save cleaned file
df.to_csv("phishing_dataset_features.csv", index=False)


In [36]:
# Read the feature table back from disk.
df = pd.read_csv('phishing_dataset_features.csv')

# Tally how many rows carry each label value.
label_counts = df.loc[:, 'label'].value_counts()

print(label_counts)

label
0.0    1000000
1.0     110854
Name: count, dtype: int64


In [1]:
import pandas as pd

# Read the feature table.
df = pd.read_csv("phishing_dataset_features.csv")

# Show how many rows each label has before balancing.
print("Original class distribution:")
print(df['label'].value_counts())

# Split the rows by label value.
malicious = df.loc[df['label'].eq(1.0)]
benign = df.loc[df['label'].eq(0.0)]

# Draw as many benign rows as there are malicious ones (fixed seed).
benign_sampled = benign.sample(n=malicious.shape[0], random_state=42)

# Stack both groups, then shuffle the rows and renumber the index.
balanced_df = pd.concat([malicious, benign_sampled])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the balanced table to its own CSV.
balanced_df.to_csv("phishing_dataset_balanced.csv", index=False)

print("\nBalanced class distribution:")
print(balanced_df['label'].value_counts())


Original class distribution:
label
0.0    1000000
1.0     110854
Name: count, dtype: int64

Balanced class distribution:
label
1.0    110854
0.0    110854
Name: count, dtype: int64


In [4]:
# Number of rows to draw from each class (any value up to the class size).
sample_size = 5000

# NOTE: `df` is not loaded in this cell; it must still be defined by a cell
# that was run earlier.
# Draw the same number of rows from each label, with a fixed seed.
phishing_df = df.loc[df['label'].eq(1.0)].sample(n=sample_size, random_state=42)
legit_df = df.loc[df['label'].eq(0.0)].sample(n=sample_size, random_state=42)

# Stack the two samples, then shuffle the rows and renumber the index.
balanced_df = pd.concat([phishing_df, legit_df])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# With sample_size = 5000 this is 5000 phishing + 5000 legitimate rows.
print(balanced_df['label'].value_counts())
balanced_df.to_csv("balanced_phishing_dataset.csv", index=False)


label
0.0    5000
1.0    5000
Name: count, dtype: int64


In [None]:
import logging
import os
import re
import socket
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from functools import lru_cache
from urllib.parse import urlparse

import ipinfo
import pandas as pd
import requests
import tldextract
import whois

# Suppress whois logger
logging.getLogger('whois').setLevel(logging.CRITICAL)

# Load dataset
df = pd.read_csv("balanced_phishing_dataset.csv")

# Initialize IPinfo handler.
# SECURITY: the API token is read from the IPINFO_TOKEN environment variable
# instead of being stored in the notebook. The token that used to be
# hard-coded here should be treated as exposed and rotated.
# When the variable is not set, None is passed to getHandler().
access_token = os.environ.get("IPINFO_TOKEN")
ip_handler = ipinfo.getHandler(access_token)

# ---------------------- WHOIS DOMAIN AGE ---------------------- #

@lru_cache(maxsize=None)
def get_domain_age_cached(domain):
    """Return the age of ``domain`` in days from its WHOIS creation date, or -1.

    Results are memoised per domain. -1 is returned when the domain is empty,
    does not resolve, the WHOIS lookup fails, or no usable creation date is
    found.

    Compared with a plain ``datetime.now() - creation``:
      * timezone-aware creation dates are compared against "now" in the same
        timezone instead of raising ``TypeError`` (which used to become -1);
      * when WHOIS returns several dates, unusable entries (``None``,
        unparseable strings) are skipped and the oldest usable one wins,
        even if aware and naive values are mixed;
      * an empty domain returns immediately without any network traffic.
    """
    if not domain:
        return -1
    try:
        # Only query WHOIS for names that currently resolve.
        socket.gethostbyname(domain)
        creation = whois.whois(domain).creation_date
    except Exception:
        return -1

    candidates = creation if isinstance(creation, list) else [creation]
    ages = []
    for value in candidates:
        if isinstance(value, str):
            try:
                value = datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                continue
        if isinstance(value, datetime):
            # datetime.now(None) is the naive local time, which keeps the
            # previous behaviour for naive creation dates.
            ages.append((datetime.now(value.tzinfo) - value).days)
    # The oldest creation date corresponds to the largest age.
    return max(ages) if ages else -1

def extract_domain(url):
    """Return "<domain>.<suffix>" as split by tldextract, or "" when no suffix is found."""
    parts = tldextract.extract(url)
    if not parts.suffix:
        return ""
    return f"{parts.domain}.{parts.suffix}"

def get_domain_age_from_url(url):
    """Return the cached domain age for the domain that ``extract_domain`` derives from ``url``."""
    return get_domain_age_cached(extract_domain(url))

# ---------------------- HTTP HEADER INFO ---------------------- #

def get_headers_info(url):
    """Issue a HEAD request for ``url`` and summarise the response headers.

    Returns a dict with the keys ``header_status_code``, ``header_server``,
    ``header_powered_by`` and ``header_has_csp``. The request follows
    redirects, times out after 5 seconds and sends a browser-like User-Agent.

    The lookup is best-effort: any ordinary error yields the placeholder
    values (status -1, empty strings, ``False``). Only ``Exception`` is caught,
    so ``KeyboardInterrupt``/``SystemExit`` still stop a long batch run instead
    of being swallowed.
    """
    try:
        response = requests.head(
            url,
            timeout=5,
            allow_redirects=True,
            headers={'User-Agent': 'Mozilla/5.0'}
        )
        response_headers = response.headers
        return {
            "header_status_code": response.status_code,
            "header_server": response_headers.get("Server", ""),
            "header_powered_by": response_headers.get("X-Powered-By", ""),
            "header_has_csp": "Content-Security-Policy" in response_headers
        }
    except Exception:
        return {
            "header_status_code": -1,
            "header_server": "",
            "header_powered_by": "",
            "header_has_csp": False
        }

# ---------------------- IP GEOLOCATION ---------------------- #

@lru_cache(maxsize=None)
def get_geo_info_cached(hostname):
    """Resolve ``hostname`` and return IPinfo geolocation fields for its IP.

    Returns a dict with ``geo_country``, ``geo_org`` and ``geo_asn``; every
    field that is unavailable is reported as "NA". Results are memoised per
    hostname, so the same dict object is handed to every caller -- treat it as
    read-only.

    Each field is read independently, so one missing attribute (or an "asn"
    entry that is not a mapping) no longer discards the fields that were
    returned. Only ``Exception`` is caught, so ``KeyboardInterrupt`` can still
    stop a long batch run.
    """
    try:
        ip = socket.gethostbyname(hostname)
        details = ip_handler.getDetails(ip)
        asn_block = details.all.get("asn")
        return {
            "geo_country": getattr(details, "country", None) or "NA",
            "geo_org": getattr(details, "org", None) or "NA",
            "geo_asn": asn_block.get("asn", "NA") if isinstance(asn_block, dict) else "NA"
        }
    except Exception:
        return {"geo_country": "NA", "geo_org": "NA", "geo_asn": "NA"}

def get_ip_geo_from_url(url):
    """Return geolocation fields for the host of ``url`` (all "NA" when unavailable).

    The hostname must look like a DNS name: dot-separated labels of 1-63
    letters, digits or hyphens (not starting or ending with a hyphen), ending
    in a top-level label. Hosts that fail the check, such as IP literals,
    yield the "NA" placeholders without a lookup.

    The top-level label may be 2-63 letters or a punycode label ("xn--...").
    A limit of 6 letters would silently skip hosts under longer TLDs such as
    ".technology" or ".photography".

    Only ``Exception`` is caught, so ``KeyboardInterrupt`` can still stop a
    long batch run.
    """
    hostname_pattern = (
        r"^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+"
        r"(?:[A-Za-z]{2,63}|xn--[A-Za-z0-9-]{0,58}[A-Za-z0-9])$"
    )
    try:
        hostname = urlparse(url).hostname
        if not hostname or not re.match(hostname_pattern, hostname):
            return {"geo_country": "NA", "geo_org": "NA", "geo_asn": "NA"}
        return get_geo_info_cached(hostname)
    except Exception:
        return {"geo_country": "NA", "geo_org": "NA", "geo_asn": "NA"}

# ---------------------- APPLY IN PARALLEL ---------------------- #
# Each stage below runs one lookup function over every URL on a pool of 20
# threads. executor.map() returns results in input order, so the collected
# lists line up with the rows of df.

# Apply domain age in parallel
with ThreadPoolExecutor(max_workers=20) as executor:
    df['domain_age_days'] = list(executor.map(get_domain_age_from_url, df['url']))

# Apply header info in parallel
with ThreadPoolExecutor(max_workers=20) as executor:
    headers = list(executor.map(get_headers_info, df['url']))
# One dict per URL becomes one row; join() matches rows on the index, so this
# relies on df still having the default 0..n-1 index from read_csv.
df = df.join(pd.json_normalize(headers))

# Apply geo info in parallel
with ThreadPoolExecutor(max_workers=20) as executor:
    geo_info = list(executor.map(get_ip_geo_from_url, df['url']))
# Same index-based join as for the header columns.
df = df.join(pd.json_normalize(geo_info))

# ---------------------- SAVE OUTPUT ---------------------- #

# Write the enriched table (original columns plus domain age, header and geo
# columns) without the index column.
df.to_csv("phishing_dataset_enriched.csv", index=False)
# The check-mark emoji had been stored as mis-decoded bytes; it is restored here.
print("✅ Enrichment complete and saved to 'phishing_dataset_enriched.csv'")
