Webpage collection is done in a completely separate repo that I'm calling "out of scope" for this project. At a high level, that crawler kept a queue of top web pages to crawl and also monitored the OpenPhish feed. In both cases, it rendered the page in Selenium, took a screenshot, and saved the screenshot and the HTML to S3. It saved the metadata in a Postgres table, which this notebook reads to build the HDF5 dataset we use.

In [1]:
import psycopg2
import boto3
import h5py
import os
import numpy as np
from PIL import Image
from bs4 import BeautifulSoup
from io import BytesIO


In [1]:
# Database and AWS configuration
# (the database connection string itself is read from the RDS_CONNECTION
# environment variable in get_db_connection)
# S3 bucket the screenshots and HTML files are downloaded from
AWS_S3_BUCKET = 'stingray-phishing-dataset'
# Local HDF5 file the collected dataset is appended to
HDF5_FILE_PATH = 'phishing.h5.nosync'

In [None]:
# Connect to the database
def get_db_connection():
    """Open a new database connection from the RDS_CONNECTION env variable.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.

    Raises:
        RuntimeError: If the RDS_CONNECTION environment variable is not set.
    """
    db_string = os.getenv('RDS_CONNECTION')
    if db_string is None:
        # Fail early with an actionable message instead of handing None to
        # psycopg2, whose resulting error does not mention the variable.
        raise RuntimeError("RDS_CONNECTION environment variable is not set")
    return psycopg2.connect(db_string)

In [None]:
# Fetch entries from the database
def fetch_entries():
    """Return every row of the ``urls`` table that qualifies for the dataset.

    A row qualifies when it has an HTML key in S3, a ``phash_distance``
    greater than 10, and a ``source`` of 1, 3, 4 or 5.
    NOTE(review): the phash_distance threshold presumably filters out
    near-duplicate screenshots -- confirm against the crawler repo.

    Returns:
        A list of ``(id, full_url, s3_html_key, s3_screenshot_key, source)``
        tuples, fully loaded into memory.
    """
    conn = get_db_connection()
    try:
        # psycopg2's `with conn:` only commits or rolls back the transaction;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT id, full_url, s3_html_key, s3_screenshot_key, source
                    FROM urls
                    WHERE s3_html_key IS NOT NULL AND phash_distance > 10 AND source IN (1, 3, 4, 5)
                """)
                return cur.fetchall()
    finally:
        conn.close()

In [None]:
# Open the HDF5 file in append mode (it is created when it does not exist)
h5_file = h5py.File(HDF5_FILE_PATH, 'a')

# A missing 'urls' dataset is treated as "file not initialised yet": create the
# four datasets empty and growable along their first axis.
# source codes: { internal: 0, open_phish: 1, api: 2, alexa: 3, product_hunt: 4, expansion: 5 }
if 'urls' not in h5_file:
    for ds_name, row_shape, ds_dtype in (
        ('urls', (), h5py.string_dtype()),
        ('screenshots', (340, 680, 3), 'uint8'),
        ('html_content', (), h5py.string_dtype()),
        ('source', (), 'i'),
    ):
        h5_file.create_dataset(
            ds_name,
            (0, *row_shape),
            maxshape=(None, *row_shape),
            dtype=ds_dtype,
        )


In [None]:
import redis
from collections import Counter

# Prefix for every key this notebook stores in Redis
namespace = "phish_dataset"
# Lifetime of each stored key: 30 days, expressed in seconds
expiry_time = 60 * 60 * 24 * 30
# Client for the local Redis server (database 0)
r = redis.Redis(host='localhost', port=6379, db=0)


In [46]:
import time
from urllib.parse import urlparse

# Hosting platforms whose pages are excluded from the dataset.
# Defined once here instead of being rebuilt on every loop iteration.
BLOCKED_DOMAINS = ("pages.dev", "github.io", "weebly.com", "vercel.app", "weeblysite.com", "gitbook.io")


def _is_blocked_host(hostname):
    """Return True when *hostname* is a blocked domain or a subdomain of one.

    Matching on the domain suffix (rather than on a substring anywhere in the
    host) keeps unrelated hosts such as ``github.io.example.com`` in the
    dataset.
    """
    return any(
        hostname == blocked or hostname.endswith("." + blocked)
        for blocked in BLOCKED_DOMAINS
    )


s3_client = boto3.client('s3')

urls_dataset = h5_file['urls']
screenshots_dataset = h5_file['screenshots']
html_content_dataset = h5_file['html_content']
source_dataset = h5_file['source']

start_time = time.time()
count = 0
try:
    # Process entries
    for entry in fetch_entries():
        url_id, full_url, s3_html_key, s3_screenshot_key, source = entry
        # Counts every fetched row, including rows that are skipped below
        count += 1

        if count % 1000 == 0:
            # Persist progress and report the time taken by the last 1000 rows
            h5_file.flush()
            execution_time = time.time() - start_time
            print("")
            print(f"Count {count}; {execution_time} seconds")
            # NOTE(review): purpose of this pause is not documented --
            # presumably throttling; confirm before removing.
            time.sleep(5)
            start_time = time.time()

        parsed_url = urlparse(full_url)

        # Construct the URL without query parameters
        target_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        redis_key = f"{namespace}:{target_url}"

        # Host name only: lower-cased, without port or credentials
        hostname = parsed_url.hostname or ""

        # Skip URLs already recorded in Redis as handled.
        # NOTE(review): keys expire after `expiry_time`, so a URL can be
        # appended to the HDF5 file again on a run after its key has expired.
        if r.exists(redis_key):
            print("-", end ="")
            continue

        if _is_blocked_host(hostname):
            r.setex(redis_key, expiry_time, target_url)
            print("x", end ="")
            continue

        # Download screenshot from S3
        try:
            screenshot_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_screenshot_key)
            screenshot_image = Image.open(BytesIO(screenshot_obj['Body'].read()))
            # Resize to 680x340, then force 3-channel RGB so grayscale,
            # palette and RGBA images all produce a (340, 680, 3) array that
            # fits the screenshots dataset.
            screenshot_image = screenshot_image.resize((680, 340)).convert('RGB')
            screenshot_array = np.array(screenshot_image)
        except Exception as e:
            # Best effort: a broken screenshot skips this entry only
            print(f"Error fetching screenshot for URL {target_url}: {e}")
            continue

        # Download and parse HTML from S3
        try:
            html_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_html_key)
            html_content = html_obj['Body'].read().decode('utf-8')
            soup = BeautifulSoup(html_content, 'html.parser')
            # h5py variable-length strings reject embedded NUL characters
            main_html_content = soup.prettify().replace('\x00', '')
        except Exception as e:
            # Best effort: unreadable HTML skips this entry only
            print(f"Error fetching HTML for URL {target_url}: {e}")
            continue

        # All data for this row is ready and validated. Grow the four
        # datasets together before writing so their lengths stay aligned.
        new_index = urls_dataset.shape[0]
        urls_dataset.resize((new_index + 1,))
        source_dataset.resize((new_index + 1,))
        screenshots_dataset.resize((new_index + 1, 340, 680, 3))
        html_content_dataset.resize((new_index + 1,))

        urls_dataset[new_index] = target_url
        source_dataset[new_index] = source
        screenshots_dataset[new_index] = screenshot_array
        html_content_dataset[new_index] = main_html_content

        # Mark the URL as handled only after the row was written
        r.setex(redis_key, expiry_time, target_url)
        print(".", end ="")
finally:
    # Close HDF5 file even when the loop raises, so buffered rows are written
    h5_file.close()


.....xx............................x......x.......................x..

In [50]:
from collections import Counter

# Open read-only: this cell only inspects the file, and the context manager
# guarantees the handle is closed again afterwards.
with h5py.File(HDF5_FILE_PATH, 'r') as h5_file:
    # Print the shape of each dataset; the first dimension is the row count
    # and should be the same for all four.
    for dataset_name in ('source', 'urls', 'screenshots', 'html_content'):
        print(h5_file[dataset_name].shape)

    # Load every source code into memory before the file is closed
    source_dataset = h5_file['source'][:]

# Use Counter to count the frequency of each unique value
frequencies = Counter(source_dataset)

# Print the frequencies
for value, count in frequencies.items():
    print(f"Value: {value}, Count: {count}")

(72083,)
(72083,)
(72083, 340, 680, 3)
(72083,)
Value: 1, Count: 26670
Value: 5, Count: 22728
Value: 3, Count: 21500
Value: 4, Count: 1185


Now let's make use of this dataset to build a final dataset we can use for testing, etc.

In [4]:
import h5py
import numpy as np
from IPython.display import clear_output

def resize_dataset(dataset, new_size):
    """Resize *dataset* to *new_size* entries along its first axis.

    All remaining dimensions keep their current extent.
    """
    trailing_dims = tuple(dataset.shape[1:])
    dataset.resize((new_size,) + trailing_dims)

# File paths
HDF5_FILE_PATH = 'phishing.h5.nosync'
NEW_HDF5_FILE_PATH = '/tmp/phishing_output.h5'

# Chunk size for reading data in batches.
# One full chunk of screenshots is 2048 * 340 * 680 * 3 bytes (about 1.4 GB).
CHUNK_SIZE = 1024 * 2

# Fixed seed so the random train/dev/test assignment is reproducible
SPLIT_SEED = 42
# Cumulative thresholds: 80% train, 10% dev, remaining 10% test
TRAIN_THRESHOLD = 0.8
DEV_THRESHOLD = 0.9

rng = np.random.default_rng(SPLIT_SEED)

# Open original file and create new one
with h5py.File(HDF5_FILE_PATH, 'r') as h5_file:
    total_entries = h5_file['source'].shape[0]

    with h5py.File(NEW_HDF5_FILE_PATH, 'w') as new_h5_file:
        # Create new datasets for "producthunt", "train", "dev", "test" and
        # keep their handles, so the per-row loop below does not have to look
        # each dataset up by name again.
        splits = {}
        for name in ['producthunt', 'train', 'dev', 'test']:
            splits[name] = {
                'urls': new_h5_file.create_dataset(f'{name}/urls', (0,), maxshape=(None,), dtype=h5py.string_dtype()),
                'screenshots': new_h5_file.create_dataset(f'{name}/screenshots', (0, 340, 680, 3), maxshape=(None, 340, 680, 3), dtype='uint8'),
                'html_content': new_h5_file.create_dataset(f'{name}/html_content', (0,), maxshape=(None,), dtype=h5py.string_dtype()),
                'labels': new_h5_file.create_dataset(f'{name}/labels', (0,), maxshape=(None,), dtype='i'),
            }

        # Next free row in each split
        current_indices = {'producthunt': 0, 'train': 0, 'dev': 0, 'test': 0}

        # Iterate over the original dataset in chunks
        for i in range(0, total_entries, CHUNK_SIZE):

            # Load chunk data
            urls_chunk = h5_file['urls'][i:i + CHUNK_SIZE]
            screenshots_chunk = h5_file['screenshots'][i:i + CHUNK_SIZE]
            html_chunk = h5_file['html_content'][i:i + CHUNK_SIZE]
            source_chunk = h5_file['source'][i:i + CHUNK_SIZE]

            # Process each entry in the chunk
            for j, source in enumerate(source_chunk):
                # Source 1 is labelled 1; every other source is labelled 0
                label = 1 if source == 1 else 0

                if source == 4:
                    # Source 4 is never mixed into train/dev/test; it gets
                    # its own 'producthunt' split.
                    dataset_name = 'producthunt'
                    print('-', end ="")
                else:
                    rand_val = rng.random()
                    print('.', end ="")
                    if rand_val < TRAIN_THRESHOLD:
                        dataset_name = 'train'
                    elif rand_val < DEV_THRESHOLD:
                        dataset_name = 'dev'
                    else:
                        dataset_name = 'test'

                idx = current_indices[dataset_name]
                target = splits[dataset_name]

                # Resize dataset to accommodate new entry
                for dataset in target.values():
                    resize_dataset(dataset, idx + 1)

                # Write data to new dataset
                target['urls'][idx] = urls_chunk[j]
                target['screenshots'][idx] = screenshots_chunk[j]
                target['html_content'][idx] = html_chunk[j]
                target['labels'][idx] = label

                # Update index for the dataset
                current_indices[dataset_name] += 1

            # Replace the per-row progress marks with a one-line summary
            clear_output(wait=True)
            print(f"Chunk: {i}/{total_entries}")

print("Dataset splitting completed.")


Dataset splitting completed.
