In [1]:
import os
import time
from io import BytesIO
from urllib.parse import urlparse

import boto3
import h5py
import numpy as np
import psycopg2
from bs4 import BeautifulSoup
from PIL import Image


In [2]:
# Storage configuration used by the cells below.
# S3 bucket that the screenshot and HTML objects are downloaded from.
AWS_S3_BUCKET = 'stingray-phishing-dataset'
# Path of the HDF5 file that the datasets are created in and appended to.
# NOTE(review): the ".nosync" suffix presumably keeps macOS iCloud from syncing the file — confirm.
HDF5_FILE_PATH = 'phishing.h5.nosync'

In [3]:
# Connect to the database
def get_db_connection():
    """Open a new psycopg2 connection using the DSN in ``RDS_CONNECTION``.

    Returns:
        A new ``psycopg2`` connection. The caller owns it and is responsible
        for closing it.

    Raises:
        RuntimeError: if the ``RDS_CONNECTION`` environment variable is not set.
    """
    db_string = os.getenv('RDS_CONNECTION')
    # Fail early with an actionable message; passing None to psycopg2.connect()
    # only produces an opaque "missing dsn" error.
    if db_string is None:
        raise RuntimeError(
            "RDS_CONNECTION environment variable is not set; "
            "export the database connection string before running this notebook"
        )
    return psycopg2.connect(db_string)

In [4]:
# Fetch entries from the database
def fetch_entries():
    """Return every row of ``urls`` that has stored HTML and ``phash_distance > 10``.

    Returns:
        A list of tuples in SELECT order:
        ``(id, full_url, s3_html_key, s3_screenshot_key, source, status)``.
    """
    conn = get_db_connection()
    try:
        # `with conn` only scopes the transaction (commit on success, rollback
        # on error); it does not close the connection, hence the finally below.
        with conn, conn.cursor() as cur:
            cur.execute("""
                SELECT id, full_url, s3_html_key, s3_screenshot_key, source, status
                FROM urls
                WHERE s3_html_key IS NOT NULL AND phash_distance > 10
            """)
            return cur.fetchall()
    finally:
        conn.close()

In [5]:
# Open the HDF5 store in append mode, then make sure every dataset exists.
h5_file = h5py.File(HDF5_FILE_PATH, 'a')

# One entry per dataset: (name, initial shape, maximum shape, dtype).
# Every dataset starts empty and is unbounded along its first axis so rows
# can be appended with resize().
_dataset_layout = [
    ('urls', (0,), (None,), h5py.string_dtype()),
    ('screenshots', (0, 340, 680, 3), (None, 340, 680, 3), 'uint8'),
    ('html_content', (0,), (None,), h5py.string_dtype()),
    ('status', (0,), (None,), 'i'),
    ('source', (0,), (None,), 'i'),
]

for _name, _shape, _maxshape, _dtype in _dataset_layout:
    if _name in h5_file:
        # Already present from an earlier run; leave its contents untouched.
        continue
    h5_file.create_dataset(_name, _shape, maxshape=_maxshape, dtype=_dtype)
        


In [None]:
# Copy each qualifying database entry (URL, screenshot, HTML, status, source)
# from S3 into the parallel HDF5 datasets, one row per unique URL.
s3_client = boto3.client('s3')

urls_dataset = h5_file['urls']
screenshots_dataset = h5_file['screenshots']
html_content_dataset = h5_file['html_content']
status_dataset = h5_file['status']
source_dataset = h5_file['source']

# Entries whose netloc contains any of these strings are skipped.
blocked_domains = ["pages.dev", "github.io", "weebly.com", "vercel.app", "weeblysite.com", "gitbook.io"]

# URLs already stored in the file. h5py may hand variable-length strings back
# as bytes, so decode them once here; a set keeps the duplicate check O(1)
# instead of re-reading the whole dataset for every entry.
seen_urls = {
    stored.decode('utf-8') if isinstance(stored, bytes) else stored
    for stored in urls_dataset[:]
}

count = 0
try:
    # Process entries
    for entry in fetch_entries():
        # Unpack in the same column order as the SELECT in fetch_entries():
        # id, full_url, s3_html_key, s3_screenshot_key, source, status.
        _url_id, full_url, s3_html_key, s3_screenshot_key, source, status = entry
        count += 1

        # Every 1000 entries: persist progress, report, and pause briefly.
        if count % 1000 == 0:
            h5_file.flush()
            print("")
            print(f"Count {count}")
            time.sleep(5)

        parsed_url = urlparse(full_url)

        # Construct the URL without query parameters
        target_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

        # Get the domain
        domain = parsed_url.netloc

        # Skip URLs that are already in the HDF5 file
        if target_url in seen_urls:
            continue

        if any(blocked_domain in domain for blocked_domain in blocked_domains):
            print("x", end="")
            continue

        # Download screenshot from S3
        try:
            screenshot_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_screenshot_key)
            screenshot_image = Image.open(BytesIO(screenshot_obj['Body'].read()))
            # Force 3-channel RGB so RGBA / palette / grayscale images fit the
            # (340, 680, 3) dataset; PIL sizes are (width, height).
            screenshot_image = screenshot_image.convert('RGB').resize((680, 340))
            screenshot_array = np.array(screenshot_image)
        except Exception as e:
            print(f"Error fetching screenshot for URL {target_url}: {e}")
            continue

        # Download and parse HTML from S3
        try:
            html_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_html_key)
            html_content = html_obj['Body'].read().decode('utf-8')
            soup = BeautifulSoup(html_content, 'html.parser')
            main_html_content = soup.prettify()
        except Exception as e:
            print(f"Error fetching HTML for URL {target_url}: {e}")
            continue

        # Append one row to every dataset at the same index.
        row_values = (
            (urls_dataset, target_url),
            (screenshots_dataset, screenshot_array),
            (html_content_dataset, main_html_content),
            (status_dataset, status),
            (source_dataset, source),
        )
        new_index = urls_dataset.shape[0]
        try:
            for dataset, value in row_values:
                dataset.resize((new_index + 1,) + dataset.shape[1:])
                dataset[new_index] = value
        except Exception:
            # Undo the partial row so the parallel datasets stay the same
            # length, then surface the error.
            for dataset, _ in row_values:
                dataset.resize((new_index,) + dataset.shape[1:])
            raise

        seen_urls.add(target_url)
        print(".", end="")
finally:
    # Close HDF5 file, even if the loop was interrupted or raised
    h5_file.close()


......x...x..xx....x......x...x.x.xx.......x.....x.