### **INSTALL AND IMPORT LIBRARIES**

In [1]:
import json
import zipfile
import math
import pandas as pd
pd.set_option("display.max_columns", None)

import warnings
warnings.filterwarnings('ignore')

import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import tldextract

from sklearn.model_selection import train_test_split
from huggingface_hub import login, upload_file

### **EXTRACT HTML CONTENT**

In [2]:
# Load dataset CSV file
df = pd.read_csv("../data/processed/dataset_full.csv")

# Load the file mapping
with open('../data/raw/file_mapping.json', 'r') as f:
    file_mapping = json.load(f)

# Function to calculate the max cutoff
def calculate_cutoff(df, phishing_label=1):
    # Calculate separate percentiles
    phishing_percentiles = df[df['result'] == phishing_label]['file_size_kb'].dropna().quantile([0.5, 0.9, 0.95, 0.99])
    legit_percentiles = df[df['result'] != phishing_label]['file_size_kb'].dropna().quantile([0.5, 0.9, 0.95, 0.99])

    print("Phishing File Size Distribution (KB):")
    print(phishing_percentiles)
    print("\nLegitimate File Size Distribution (KB):")
    print(legit_percentiles)

    # Recommended cutoff = phishing 95th percentile rounded to the nearest hundred
    max_size_kb = phishing_percentiles.loc[0.95]
    rounded_cutoff = math.ceil(max_size_kb / 100) * 100
    return rounded_cutoff

# Function to fetch HTML with cutoff
def read_file_with_cutoff(filename, max_kb):
    if filename not in file_mapping:
        return None  # Missing file

    zip_path = "../data/raw/" + file_mapping[filename]["zip"].replace("\\", "/")
    file_in_zip = file_mapping[filename]["path"]

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        info = zip_ref.getinfo(file_in_zip)
        size_kb = info.file_size / 1024  # Original file size
        with zip_ref.open(file_in_zip) as file:
            # Read only up to max_kb
            html_content = file.read(int(max_kb * 1024)).decode('utf-8', errors='ignore')
        return html_content, size_kb

# Run the steps
def get_file_size_only(filename):
    if filename not in file_mapping:
        return None
    zip_path = "../data/raw/" + file_mapping[filename]["zip"].replace("\\", "/")
    file_in_zip = file_mapping[filename]["path"]
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        info = zip_ref.getinfo(file_in_zip)
        return info.file_size / 1024  # KB

# Add file sizes to DataFrame without reading content
df['file_size_kb'] = df['website'].apply(get_file_size_only)

# Calculate cutoff based on phishing 95th percentile
MAX_SIZE_KB = calculate_cutoff(df)

Phishing File Size Distribution (KB):
0.50     11.735352
0.90    116.350879
0.95    211.342969
0.99    648.738281
Name: file_size_kb, dtype: float64

Legitimate File Size Distribution (KB):
0.50     79.779785
0.90    338.799414
0.95    501.793408
0.99    995.964180
Name: file_size_kb, dtype: float64


In [3]:
print("\nApplied max cutoff:", MAX_SIZE_KB, "KB")

# Fetch HTML content with cutoff
df['html_content'] = df['website'].apply(lambda x: read_file_with_cutoff(x, MAX_SIZE_KB)[0])
df


Applied max cutoff: 300 KB


Unnamed: 0,url,website,result,file_size_kb,html_content
0,https://docs.embotics.com/Service-Portal/vm_sn...,1613531218856905.html,0,20.871094,"<!DOCTYPE html>\n<html class=""_Skins_HTML5___S..."
1,https://csgocyber.ru/freeskin,1621888236174848.html,1,21.800781,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ..."
2,https://1stglobalcapitalinvestors.com/linkedin...,1613575467981084.html,1,28.218750,"<html>\n <head>\n <script src=""js/login.js"">\..."
3,https://help.foxnews.com/hc/en-us/sections/206...,1607254628655677.html,0,12.146484,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en-US"">..."
4,https://www.calculatorsoup.com/calculators/mat...,163570766663381.html,0,39.761719,"<!DOCTYPE html>\n<html itemscope="""" itemtype=""..."
...,...,...,...,...,...
59995,Https://viabcp.moaqt.com,161357270863021.html,1,0.209961,<!DOCTYPE html>\n<html>\n <head>\n <meta cont...
59996,http://bonprixsklep.com.pl/glosowanie,1607278209044228.html,1,20.604492,<!DOCTYPE html>\n<!-- saved from url=(0141)\n ...
59997,https://www.sbs.com.au/,1613568353531711.html,0,353.902344,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...
59998,https://www.analystforum.com/t/type-i-vs-type-...,1635713509174732.html,0,57.950195,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ..."


In [4]:
df.to_parquet("../data/processed/dataset_extracted.parquet", engine="pyarrow", index=False)

### **EXTRACT TEXTUAL FEATURES**

In [5]:
def extract_texts(html):
    if html is None:
        return ""

    soup = BeautifulSoup(html, "lxml")
    for script in soup(["script", "style"]):
        script.decompose()

    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text).strip().lower()
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text

# Apply text extraction
df['visible_text'] = df['html_content'].apply(extract_texts)

### **EXTRACT HEURISTIC FEATURES**

In [6]:
def extract_heuristics(row):
    def safe_urlparse(url):
        try:
            return urlparse(url)
        except ValueError:
            return None

    url = row['url']
    html = row['html_content']

    parsed = safe_urlparse(url)
    extracted = tldextract.extract(url)
    domain = parsed.netloc if parsed else ''
    tld = extracted.suffix

    # 1. URL-based features
    features = {
        'url_length': len(url),
        'num_dots': url.count('.'),
        'has_at_symbol': '@' in url,
        'uses_https': parsed.scheme == 'https' if parsed else False,
        'suspicious_words': int(any(word in url.lower() for word in ['login', 'verify', 'secure', 'update'])),
        'has_ip_address': bool(re.search(r'http[s]?://\d{1,3}(?:\.\d{1,3}){3}', url)),
        'num_subdomains': max(0, len(domain.split('.')) - 2) if domain else 0,
        'is_suspicious_tld': tld in ['tk', 'ml', 'ga', 'cf', 'gq'],
        'has_hyphen': '-' in domain,
        'url_has_encoding': '%' in url or bool(re.search(r'%[0-9a-fA-F]{2}', url)),
        'url_has_long_query': len(parsed.query) > 100 if parsed else False,
        'url_ends_with_exe': url.lower().endswith('.exe'),
    }

    # 2. HTML-based features
    if html:
        soup = BeautifulSoup(html, "lxml")

        # External domains for later network features
        external_domains = set()
        for tag in soup.find_all(['script', 'link', 'img', 'iframe']):
            src = tag.get('src') or tag.get('href')
            if src:
                parsed_src = safe_urlparse(src)
                if parsed_src and parsed_src.netloc and parsed_src.netloc != domain:
                    external_domains.add(parsed_src.netloc)

        # HTML tag-based features
        features.update({
            'num_forms': len(soup.find_all('form')),
            'num_inputs': len(soup.find_all('input')),
            'num_links': len(soup.find_all('a')),
            'num_password_inputs': len(soup.find_all('input', {'type': 'password'})),
            'num_hidden_inputs': len(soup.find_all('input', {'type': 'hidden'})),
            'num_onclick_events': len(soup.find_all(onclick=True)),
            'num_hidden_elements': len(soup.select('[style*="display:none"], [style*="visibility:hidden"]')),
            'has_iframe': bool(soup.find('iframe')),
            'has_zero_sized_iframe': any(
                iframe.get('width') in ['0', '1'] or iframe.get('height') in ['0', '1']
                for iframe in soup.find_all('iframe')
            ),
            'suspicious_form_action': any(
                form.get('action') and (safe_urlparse(form.get('action')) and safe_urlparse(form.get('action')).netloc not in ['', domain])
                for form in soup.find_all('form')
            ),
        })

        # 3. JavaScript-based features
        script_tags = soup.find_all('script')
        inline_scripts = [s for s in script_tags if not s.get('src')]
        base64_matches = re.findall(r'base64,[A-Za-z0-9+/=]+', html)

        features.update({
            'has_script_eval': 'eval(' in html,
            'has_base64_in_js': len(base64_matches) > 0,
            'num_inline_scripts': len(inline_scripts),
        })

        # 4. Network-based features
        js_keywords = ['fetch(', 'XMLHttpRequest', 'navigator.sendBeacon', 'new WebSocket']
        has_network_js = any(keyword in html for keyword in js_keywords)

        features.update({
            'has_network_js': has_network_js,
            'external_js_count': sum(
                1 for tag in script_tags
                if tag.get('src') and (safe_urlparse(tag.get('src')) and safe_urlparse(tag.get('src')).netloc != domain)
            ),
            'external_iframe_count': sum(
                1 for tag in soup.find_all('iframe')
                if tag.get('src') and (safe_urlparse(tag.get('src')) and safe_urlparse(tag.get('src')).netloc != domain)
            ),
            'num_external_domains': len(external_domains),
        })

    else:
        # Defaults when HTML is missing
        features.update({
            # HTML-based
            'num_forms': 0,
            'num_inputs': 0,
            'num_links': 0,
            'num_password_inputs': 0,
            'num_hidden_inputs': 0,
            'num_onclick_events': 0,
            'num_hidden_elements': 0,
            'has_iframe': False,
            'has_zero_sized_iframe': False,
            'suspicious_form_action': False,
            # JavaScript-based
            'has_script_eval': False,
            'has_base64_in_js': False,
            'num_inline_scripts': 0,
            # Network-based
            'has_network_js': False,
            'external_js_count': 0,
            'external_iframe_count': 0,
            'num_external_domains': 0,
        })

    return pd.Series(features)

# Apply heuristic feature extraction
heuristic_features = df.apply(extract_heuristics, axis=1)
df = pd.concat([df, heuristic_features], axis=1)
df

Unnamed: 0,url,website,result,file_size_kb,html_content,visible_text,url_length,num_dots,has_at_symbol,uses_https,suspicious_words,has_ip_address,num_subdomains,is_suspicious_tld,has_hyphen,url_has_encoding,url_has_long_query,url_ends_with_exe,num_forms,num_inputs,num_links,num_password_inputs,num_hidden_inputs,num_onclick_events,num_hidden_elements,has_iframe,has_zero_sized_iframe,suspicious_form_action,has_script_eval,has_base64_in_js,num_inline_scripts,has_network_js,external_js_count,external_iframe_count,num_external_domains
0,https://docs.embotics.com/Service-Portal/vm_sn...,1613531218856905.html,0,20.871094,"<!DOCTYPE html>\n<html class=""_Skins_HTML5___S...",managing vm snapshots skip to main content acc...,63,3,False,True,0,False,1,False,False,False,False,False,1,1,21,0,0,1,4,False,False,False,False,False,3,False,10,0,1
1,https://csgocyber.ru/freeskin,1621888236174848.html,1,21.800781,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ...",navi - giveaway players shop astralis group na...,29,1,False,True,0,False,0,False,False,False,False,False,0,0,7,0,0,1,2,False,False,False,False,False,4,False,3,0,2
2,https://1stglobalcapitalinvestors.com/linkedin...,1613575467981084.html,1,28.218750,"<html>\n <head>\n <script src=""js/login.js"">\...",,58,1,False,True,0,False,0,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,1,False,1,0,0
3,https://help.foxnews.com/hc/en-us/sections/206...,1607254628655677.html,0,12.146484,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en-US"">...",community and registration fox news fox news ...,79,2,False,True,0,False,1,False,False,False,False,False,1,2,19,0,1,0,0,False,False,False,False,False,2,False,7,0,4
4,https://www.calculatorsoup.com/calculators/mat...,163570766663381.html,0,39.761719,"<!DOCTYPE html>\n<html itemscope="""" itemtype=""...",long division calculator with remainders you m...,64,3,False,True,0,False,1,False,False,False,False,False,1,7,31,0,2,1,0,False,False,False,False,False,5,False,2,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Https://viabcp.moaqt.com,161357270863021.html,1,0.209961,<!DOCTYPE html>\n<html>\n <head>\n <meta cont...,,24,2,False,True,0,False,1,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,0,False,0,0,0
59996,http://bonprixsklep.com.pl/glosowanie,1607278209044228.html,1,20.604492,<!DOCTYPE html>\n<!-- saved from url=(0141)\n ...,welcome to facebook facebook facebook you must...,37,2,False,False,0,False,1,False,False,False,False,False,1,12,6,1,10,0,3,False,False,False,False,False,17,False,28,0,2
59997,https://www.sbs.com.au/,1613568353531711.html,0,353.902344,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"sbs tv sbs radio sbs on demand, news, sport,...",23,3,False,True,0,False,2,False,False,False,False,False,1,2,146,0,0,0,15,False,False,False,False,False,7,True,2,0,2
59998,https://www.analystforum.com/t/type-i-vs-type-...,1635713509174732.html,0,57.950195,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ...",type i vs. type ii errors - quant - analystfor...,61,2,False,True,0,False,1,False,False,False,False,False,0,0,30,0,0,0,0,False,False,False,False,False,1,False,3,0,2


### **EXPORT THE FEATURES**

In [7]:
df_final = df.drop(columns=['website', 'html_content', 'file_size_kb'])
df_final = df_final.dropna()
df_final

Unnamed: 0,url,result,visible_text,url_length,num_dots,has_at_symbol,uses_https,suspicious_words,has_ip_address,num_subdomains,is_suspicious_tld,has_hyphen,url_has_encoding,url_has_long_query,url_ends_with_exe,num_forms,num_inputs,num_links,num_password_inputs,num_hidden_inputs,num_onclick_events,num_hidden_elements,has_iframe,has_zero_sized_iframe,suspicious_form_action,has_script_eval,has_base64_in_js,num_inline_scripts,has_network_js,external_js_count,external_iframe_count,num_external_domains
0,https://docs.embotics.com/Service-Portal/vm_sn...,0,managing vm snapshots skip to main content acc...,63,3,False,True,0,False,1,False,False,False,False,False,1,1,21,0,0,1,4,False,False,False,False,False,3,False,10,0,1
1,https://csgocyber.ru/freeskin,1,navi - giveaway players shop astralis group na...,29,1,False,True,0,False,0,False,False,False,False,False,0,0,7,0,0,1,2,False,False,False,False,False,4,False,3,0,2
2,https://1stglobalcapitalinvestors.com/linkedin...,1,,58,1,False,True,0,False,0,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,1,False,1,0,0
3,https://help.foxnews.com/hc/en-us/sections/206...,0,community and registration fox news fox news ...,79,2,False,True,0,False,1,False,False,False,False,False,1,2,19,0,1,0,0,False,False,False,False,False,2,False,7,0,4
4,https://www.calculatorsoup.com/calculators/mat...,0,long division calculator with remainders you m...,64,3,False,True,0,False,1,False,False,False,False,False,1,7,31,0,2,1,0,False,False,False,False,False,5,False,2,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Https://viabcp.moaqt.com,1,,24,2,False,True,0,False,1,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,0,False,0,0,0
59996,http://bonprixsklep.com.pl/glosowanie,1,welcome to facebook facebook facebook you must...,37,2,False,False,0,False,1,False,False,False,False,False,1,12,6,1,10,0,3,False,False,False,False,False,17,False,28,0,2
59997,https://www.sbs.com.au/,0,"sbs tv sbs radio sbs on demand, news, sport,...",23,3,False,True,0,False,2,False,False,False,False,False,1,2,146,0,0,0,15,False,False,False,False,False,7,True,2,0,2
59998,https://www.analystforum.com/t/type-i-vs-type-...,0,type i vs. type ii errors - quant - analystfor...,61,2,False,True,0,False,1,False,False,False,False,False,0,0,30,0,0,0,0,False,False,False,False,False,1,False,3,0,2


In [8]:
df_final.to_parquet("../data/processed/dataset_features.parquet", engine="pyarrow", index=False)

### **SPLIT THE DATASET**

In [9]:
# Stratified split: 50k for training, 10k for testing
df_train, df_test = train_test_split(
    df_final,
    test_size=10000,
    stratify=df_final['result'],
    random_state=42
)

print(f"Train set size: {len(df_train)} rows")
print(f"Test set size: {len(df_test)} rows")

Train set size: 50000 rows
Test set size: 10000 rows


In [10]:
# Save splits
df_train.to_parquet("../data/processed/dataset_train.parquet", engine="pyarrow", index=False)
df_test.to_parquet("../data/processed/dataset_test.parquet", engine="pyarrow", index=False)

### **UPLOAD DATASET TO HUGGING FACE HUB**

In [None]:
# Login to Hugging Face
login(token="TOKEN")

# Dataset repository
repo_id = "REPO_ID"

# Upload train set
upload_file(
    path_or_fileobj="../data/processed/dataset_train.parquet",
    path_in_repo="train.parquet",
    repo_id=repo_id,
    repo_type="dataset"
)

# Upload test set
upload_file(
    path_or_fileobj="../data/processed/dataset_test.parquet",
    path_in_repo="test.parquet",
    repo_id=repo_id,
    repo_type="dataset"
)

print("Files uploaded successfully!")

dataset_train.parquet: 100%|████████████████████████████████████████████████████████| 121M/121M [00:15<00:00, 7.78MB/s]
dataset_test.parquet: 100%|███████████████████████████████████████████████████████| 23.7M/23.7M [00:01<00:00, 21.6MB/s]


Files uploaded successfully!


### **PREPROCESS EXTRA DATASET**

In [15]:
# Load extra dataset CSV file
df_extra = pd.read_csv("../data/processed/dataset_extra.csv")

# Fetch HTML content with cutoff
df_extra['html_content'] = df_extra['website'].apply(lambda x: read_file_with_cutoff(x, MAX_SIZE_KB)[0])
df_extra

Unnamed: 0,url,website,result,html_content
0,https://dynasty-scans.com/tags/yuri,1635705426133003.html,0,<!DOCTYPE html>\n<html>\n <head>\n <meta cont...
1,http://www.miningartifacts.org/mnopenpitmining...,1635711528385481.html,0,<!DOCTYPE html>\n<html>\n <head>\n <!-- <hs:m...
2,https://forms.office.com/Pages/ResponsePage.as...,1613575821342321.html,1,"<!DOCTYPE html>\n<html lang=""en-US"" xmlns=""htt..."
3,https://searchsqlserver.techtarget.com/definit...,1635712907882954.html,0,<!DOCTYPE html>\n<!--[if gt IE 8]><!-->\n<html...
4,http://getfreecodashop.se.ke/,10002217.html,1,"<html lang=""en""><head>\n <script type=""te..."
...,...,...,...,...
395,http://halifax-device-review.com/,1607708521772792.html,1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML Basic..."
396,http://hamt.jp/program/wp-content/themes/twent...,161357638612848.html,1,"<!--?php\n$arquivo = ""doct/ff/ad/contador.txt""..."
397,https://robertsspaceindustries.com/connect,163570278868618.html,0,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ..."
398,https://www.amybd.com/nsite/index?v=987654359,162614281521864.html,0,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or..."


In [16]:
df_extra.to_parquet("../data/processed/dataset_extra_extracted.parquet", engine="pyarrow", index=False)

In [17]:
# Apply text extraction
df_extra['visible_text'] = df_extra['html_content'].apply(extract_texts)

# Apply heuristic feature extraction
heuristic_features = df_extra.apply(extract_heuristics, axis=1)
df_extra = pd.concat([df_extra, heuristic_features], axis=1)

# Export as CSV
df_extra = df_extra.drop(columns=['website', 'html_content'])
df_extra = df_extra.dropna()
df_extra.to_parquet("../data/processed/dataset_extra_feature.parquet", engine="pyarrow", index=False)
df_extra

Unnamed: 0,url,result,visible_text,url_length,num_dots,has_at_symbol,uses_https,suspicious_words,has_ip_address,num_subdomains,is_suspicious_tld,has_hyphen,url_has_encoding,url_has_long_query,url_ends_with_exe,num_forms,num_inputs,num_links,num_password_inputs,num_hidden_inputs,num_onclick_events,num_hidden_elements,has_iframe,has_zero_sized_iframe,suspicious_form_action,has_script_eval,has_base64_in_js,num_inline_scripts,has_network_js,external_js_count,external_iframe_count,num_external_domains
0,https://dynasty-scans.com/tags/yuri,0,dynasty reader tag yuri login recently added...,35,1,False,True,0,False,0,False,True,False,False,False,1,1,241,0,0,0,0,False,False,False,False,False,0,False,1,0,0
1,http://www.miningartifacts.org/mnopenpitmining...,0,open pit mining open pit mining open-pit minin...,51,3,False,False,0,False,1,False,False,False,False,False,0,0,3,0,0,0,0,False,False,False,False,False,3,False,3,0,0
2,https://forms.office.com/Pages/ResponsePage.as...,1,microsoft forms loading,132,3,False,True,0,False,1,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,8,True,5,0,2
3,https://searchsqlserver.techtarget.com/definit...,0,what is a database administrator dba? searchsq...,72,2,False,True,0,False,1,False,False,False,False,False,1,1,166,0,0,1,2,True,True,True,False,False,20,True,6,2,5
4,http://getfreecodashop.se.ke/,1,free fire indonesia - codashop log in to your ...,29,2,False,False,0,False,1,False,False,False,False,False,3,26,33,2,14,3,3,True,True,False,True,False,9,False,14,2,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,http://halifax-device-review.com/,1,halifax - mobile banking - login cookie policy...,33,1,False,False,0,False,0,False,True,False,False,False,1,13,18,1,9,3,0,False,False,False,False,False,1,False,1,0,0
396,http://hamt.jp/program/wp-content/themes/twent...,1,itaucard,123,2,False,False,0,False,0,False,False,False,False,False,0,0,0,0,0,0,0,False,False,False,False,False,1,False,1,0,0
397,https://robertsspaceindustries.com/connect,0,roberts space industries follow the developme...,42,1,False,True,0,False,0,False,False,False,False,False,0,0,58,0,0,0,1,True,True,False,False,False,24,True,11,1,6
398,https://www.amybd.com/nsite/index?v=987654359,0,online travel agent of bangladesh for cheap ai...,45,2,False,True,0,False,1,False,False,False,False,False,0,20,20,1,5,20,4,True,True,False,False,False,10,False,15,1,7
