In [1]:
import psycopg2
import boto3
import h5py
import os
import numpy as np
from PIL import Image
from bs4 import BeautifulSoup
from io import BytesIO


In [2]:
# AWS and output-file configuration.
# (The database connection string is not set here; get_db_connection reads it
# from the RDS_CONNECTION environment variable.)

# S3 bucket the processing loop downloads screenshots and HTML documents from,
# using the s3_screenshot_key / s3_html_key values of each database row.
AWS_S3_BUCKET = 'stingray-phishing-dataset'
# Path of the HDF5 file that is opened in append mode and extended row by row.
# NOTE(review): the ".nosync" suffix presumably keeps a file-sync tool from
# uploading the file — confirm.
HDF5_FILE_PATH = 'phishing.h5.nosync'

In [3]:
# Connect to the database
def get_db_connection():
    """Open a new PostgreSQL connection described by ``RDS_CONNECTION``.

    Returns:
        A new psycopg2 connection. The caller owns it and is responsible for
        closing it.

    Raises:
        RuntimeError: if the ``RDS_CONNECTION`` environment variable is not
            set. Checking up front gives an actionable message instead of the
            opaque error raised when ``None`` is handed to the driver.
        psycopg2.Error: if the connection attempt itself fails.
    """
    db_string = os.getenv('RDS_CONNECTION')
    if db_string is None:
        raise RuntimeError(
            "The RDS_CONNECTION environment variable is not set; "
            "it must contain the PostgreSQL connection string."
        )
    return psycopg2.connect(db_string)

In [4]:
# Fetch entries from the database
def fetch_entries():
    """Return every ``urls`` row that qualifies for the dataset.

    A row qualifies when it has an HTML key, a ``phash_distance`` above 10 and
    a ``source`` of 1, 3, 4 or 5.

    Returns:
        list[tuple]: ``(id, full_url, s3_html_key, s3_screenshot_key, source)``
        tuples, fully materialised in memory.

    Raises:
        Whatever ``get_db_connection`` or the query raises; the connection is
        closed in either case.
    """
    conn = get_db_connection()
    try:
        # `with conn` only ends the transaction (commit/rollback); psycopg2
        # does NOT close the connection on exit, hence the explicit finally.
        with conn:
            with conn.cursor() as cur:
                cur.execute("""
                SELECT id, full_url, s3_html_key, s3_screenshot_key, source
                FROM urls
                WHERE s3_html_key IS NOT NULL AND phash_distance > 10 AND source IN (1, 3, 4, 5)
            """)
                return cur.fetchall()
    finally:
        conn.close()

In [5]:
# Open the HDF5 store in append mode: an existing file is reused, a missing
# one is created.
h5_file = h5py.File(HDF5_FILE_PATH, 'a')

# First run only: lay out the four parallel, row-extendable datasets.
# Values stored in 'source':
# { internal: 0, open_phish: 1, api: 2, alexa: 3, product_hunt: 4, expansion: 5 }
if 'urls' not in h5_file:
    # (name, initial shape, maximum shape, dtype) — axis 0 is unlimited so
    # rows can be appended one at a time.
    for _ds_name, _ds_shape, _ds_maxshape, _ds_dtype in (
        ('urls', (0,), (None,), h5py.string_dtype()),
        ('screenshots', (0, 340, 680, 3), (None, 340, 680, 3), 'uint8'),
        ('html_content', (0,), (None,), h5py.string_dtype()),
        ('source', (0,), (None,), 'i'),
    ):
        h5_file.create_dataset(_ds_name, _ds_shape, maxshape=_ds_maxshape, dtype=_ds_dtype)



In [6]:
import redis
from collections import Counter

# Redis client (localhost:6379, database 0). The processing loop uses it to
# record which URLs it has already stored or rejected, so they are skipped on
# later runs.
r = redis.Redis(host='localhost', port=6379, db=0)
# Lifetime, in seconds, given to every marker key written with SETEX.
expiry_time = 30 * 24 * 60 * 60  # 30 days
# Prefix placed in front of the URL to form each Redis key.
namespace = "phish_dataset"


In [8]:
import time 
from urllib.parse import urlparse
s3_client = boto3.client('s3')

urls_dataset = h5_file['urls']
screenshots_dataset = h5_file['screenshots']
html_content_dataset = h5_file['html_content']
source_dataset = h5_file['source']

# Per-image size is read from the dataset itself so the resize below cannot
# drift from the shape the HDF5 file was created with.
image_height, image_width = screenshots_dataset.shape[1:3]

# Hosting platforms whose pages are excluded from the dataset. Built once
# rather than on every loop iteration.
blocked_domains = ["pages.dev", "github.io", "weebly.com", "vercel.app", "weeblysite.com", "gitbook.io"]


def _is_blocked_domain(hostname):
    """Return True if `hostname` is a blocked domain or a subdomain of one.

    Matching on a domain-label boundary (instead of a plain substring test)
    keeps unrelated hosts such as "weebly.com.example.org" from being dropped.
    """
    return any(
        hostname == blocked_domain or hostname.endswith("." + blocked_domain)
        for blocked_domain in blocked_domains
    )


start_time = time.time()
count = 0
try:
    # Process entries
    for entry in fetch_entries():
        url_id, full_url, s3_html_key, s3_screenshot_key, source = entry
        count += 1

        # Every 1000 rows: flush to disk, report the batch time and pause.
        if count % 1000 == 0:
            h5_file.flush()
            execution_time = time.time() - start_time
            print("")
            print(f"Count {count}; {execution_time} seconds")
            time.sleep(5)
            start_time = time.time()

        parsed_url = urlparse(full_url)

        # Construct the URL without query parameters
        target_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        redis_key = f"{namespace}:{target_url}"

        # Lower-cased host without port or credentials; "" if the URL has none.
        domain = parsed_url.hostname or ""

        # Skip URLs that already have a (non-expired) marker in Redis, i.e.
        # that were stored or rejected by an earlier iteration or run.
        if r.exists(redis_key):
            print("-", end ="")
            continue

        if _is_blocked_domain(domain):
            r.setex(redis_key, expiry_time, target_url)
            print("x", end ="")
            continue

        # Download screenshot from S3
        try:
            screenshot_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_screenshot_key)
            screenshot_image = Image.open(BytesIO(screenshot_obj['Body'].read()))
            screenshot_image = screenshot_image.resize((image_width, image_height))
            # Normalise to exactly three channels. Palette, grayscale, LA and
            # CMYK images would otherwise produce an array that cannot be
            # stored in the (H, W, 3) slot; RGBA simply loses its alpha band.
            screenshot_array = np.array(screenshot_image.convert('RGB'))
        except Exception as e:
            print(f"Error fetching screenshot for URL {target_url}: {e}")
            continue

        # Download and parse HTML from S3
        try:
            html_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_html_key)
            html_content = html_obj['Body'].read().decode('utf-8')
            soup = BeautifulSoup(html_content, 'html.parser')
            main_html_content = soup.prettify()
        except Exception as e:
            print(f"Error fetching HTML for URL {target_url}: {e}")
            continue

        # Everything that can fail per-entry has succeeded; append one row to
        # each of the four parallel datasets at the same index.
        new_index = urls_dataset.shape[0]
        urls_dataset.resize((new_index + 1,))
        urls_dataset[new_index] = target_url

        screenshots_dataset.resize((new_index + 1, image_height, image_width, 3))
        screenshots_dataset[new_index] = screenshot_array

        html_content_dataset.resize((new_index + 1,))
        html_content_dataset[new_index] = main_html_content

        source_dataset.resize((new_index + 1,))
        source_dataset[new_index] = source

        # Mark the URL as handled only after all four writes went through.
        r.setex(redis_key, expiry_time, target_url)
        print(".", end ="")
finally:
    # Close HDF5 file even when the run is interrupted or an entry raises, so
    # rows written so far are flushed and the file handle is released.
    h5_file.close()


------x---x--xx----x------x---x-x-xx-------x-----x--x-xx--x----x-x--xxxxxxxxxxxxxxxx--x-xx-x--xx----x---xxxxxxxx.x...x..x.............xx...x.xxx..x.....x...x......x..x....xx

KeyboardInterrupt: 