In [7]:
import psycopg2
import boto3
import h5py
import os
import numpy as np
from PIL import Image
import requests
from bs4 import BeautifulSoup
from io import BytesIO
import matplotlib.pyplot as plt


In [8]:
# AWS and output-file configuration
# Name of the S3 bucket that this notebook's S3 requests are issued against.
AWS_S3_BUCKET = 'stingray-phishing-dataset'
# Path of the HDF5 file the dataset is stored in (relative to the working directory).
# NOTE(review): the `.nosync` suffix presumably keeps a cloud-sync client from
# uploading this file — confirm.
HDF5_FILE_PATH = 'phishing.h5.nosync'

In [9]:
# Connect to the database
def get_db_connection():
    """Open a new psycopg2 connection using the ``RDS_CONNECTION`` env variable.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller owns
        it and is responsible for closing it.

    Raises:
        RuntimeError: if ``RDS_CONNECTION`` is not set. Without this check
            ``os.getenv`` yields ``None`` and the failure surfaces as an
            unrelated-looking error from inside psycopg2.
    """
    db_string = os.getenv('RDS_CONNECTION')
    if db_string is None:
        raise RuntimeError(
            "RDS_CONNECTION environment variable is not set; "
            "it must contain the PostgreSQL connection string."
        )
    return psycopg2.connect(db_string)

In [10]:
# Helper function to check if an image is mostly white
def is_mostly_white(image, white_level=230, min_fraction=0.9):
    """Report whether near-white pixels dominate an image.

    The image is converted to grayscale (mode ``'L'``) and its histogram is
    used to count pixels whose brightness is at or above ``white_level``.

    Args:
        image: Object exposing PIL's ``convert(mode)`` / ``histogram()`` API.
        white_level: Lowest grayscale value that counts as white (default 230).
        min_fraction: The white share must be strictly greater than this value
            for the image to be considered mostly white (default 0.9).

    Returns:
        True if more than ``min_fraction`` of the pixels are white, else False.
        An image with no pixels returns False instead of dividing by zero.
    """
    histogram = image.convert('L').histogram()
    total_pixels = sum(histogram)
    if total_pixels == 0:
        # Nothing to measure; avoid ZeroDivisionError.
        return False
    white_pixels = sum(histogram[white_level:])
    return (white_pixels / total_pixels) > min_fraction

In [11]:
# Fetch entries from the database
def fetch_entries():
    """Return every ``urls`` row that has both an HTML key and a screenshot key.

    Returns:
        The list produced by ``cursor.fetchall()``; each row holds, in this
        order: ``id, full_url, s3_html_key, s3_screenshot_key, source, status``.
    """
    conn = get_db_connection()
    try:
        # psycopg2's connection context manager only commits/rolls back the
        # transaction; it does not close the connection, hence the finally.
        with conn, conn.cursor() as cur:
            cur.execute("""
                SELECT id, full_url, s3_html_key, s3_screenshot_key, source, status
                FROM urls
                WHERE s3_html_key IS NOT NULL AND s3_screenshot_key IS NOT NULL
            """)
            return cur.fetchall()
    finally:
        conn.close()

In [12]:
# Open the HDF5 file in append mode and make sure every dataset exists
h5_file = h5py.File(HDF5_FILE_PATH, 'a')

# Dataset name -> keyword arguments for create_dataset. Every dataset starts
# with zero rows and is unbounded along its first axis so rows can be appended.
_DATASET_SPECS = {
    'urls': dict(shape=(0,), maxshape=(None,), dtype=h5py.string_dtype()),
    'screenshots': dict(shape=(0, 340, 680, 3), maxshape=(None, 340, 680, 3), dtype='uint8'),
    'html_content': dict(shape=(0,), maxshape=(None,), dtype=h5py.string_dtype()),
    'status': dict(shape=(0,), maxshape=(None,), dtype='i'),
    'source': dict(shape=(0,), maxshape=(None,), dtype='i'),
}

for _dataset_name, _create_kwargs in _DATASET_SPECS.items():
    # Datasets already present in the file are left as they are.
    if _dataset_name not in h5_file:
        h5_file.create_dataset(_dataset_name, **_create_kwargs)
        


In [13]:
# Module-level S3 client. No credentials or region are passed here, so boto3
# resolves them from its own default configuration.
s3_client = boto3.client('s3')

def generate_presigned_url(object_key, expiration=3600):
    """Build a presigned ``get_object`` URL for a key in ``AWS_S3_BUCKET``.

    Args:
        object_key: Key of the object inside the bucket.
        expiration: Value forwarded as ``ExpiresIn`` (default 3600).

    Returns:
        Whatever ``s3_client.generate_presigned_url`` returns for the request.
    """
    request_params = {'Bucket': AWS_S3_BUCKET, 'Key': object_key}
    return s3_client.generate_presigned_url(
        'get_object',
        Params=request_params,
        ExpiresIn=expiration,
    )

In [None]:
import time

urls_dataset = h5_file['urls']
screenshots_dataset = h5_file['screenshots']
html_content_dataset = h5_file['html_content']
status_dataset = h5_file['status']
source_dataset = h5_file['source']

# All datasets grow in lock-step: row i of each one describes the same URL.
all_datasets = (
    urls_dataset,
    screenshots_dataset,
    html_content_dataset,
    status_dataset,
    source_dataset,
)

# Read the stored URLs once and keep them in a set. Testing membership against
# the Dataset itself re-reads it from disk for every entry, and h5py 3.x hands
# back variable-length strings as bytes, which never compare equal to a str.
seen_urls = {
    stored.decode('utf-8') if isinstance(stored, bytes) else stored
    for stored in urls_dataset[:]
}

good = 0   # entries appended during this run
count = 0  # entries examined during this run

try:
    # Process entries
    for entry in fetch_entries():
        # Order must match the SELECT in fetch_entries(): source precedes status.
        url_id, full_url, s3_html_key, s3_screenshot_key, source, status = entry
        count += 1
        if count % 100 == 0:
            # Periodically persist progress, report the acceptance ratio, and pause.
            h5_file.flush()
            print(f"{good}/{count} = {good/count}")
            time.sleep(5)

        # Skip URLs already stored (from an earlier run or earlier in this one)
        if full_url in seen_urls:
            continue

        # Download screenshot from S3
        try:
            screenshot_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_screenshot_key)
            screenshot_image = Image.open(BytesIO(screenshot_obj['Body'].read()))
            # Force three channels so RGBA / grayscale / palette images still
            # fit the (340, 680, 3) slot; PIL sizes are (width, height).
            screenshot_image = screenshot_image.convert('RGB').resize((680, 340))
            screenshot_array = np.array(screenshot_image)
        except Exception as e:
            print(f"Error fetching screenshot for URL {full_url}: {e}")
            continue

        # Blank (mostly white) screenshots are not stored. To inspect one,
        # print generate_presigned_url(s3_screenshot_key) here.
        if is_mostly_white(screenshot_image):
            continue

        # Download and parse HTML from S3
        try:
            html_obj = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=s3_html_key)
            html_content = html_obj['Body'].read().decode('utf-8')
            soup = BeautifulSoup(html_content, 'html.parser')
            main_html_content = soup.prettify()
        except Exception as e:
            print(f"Error fetching HTML for URL {full_url}: {e}")
            continue

        # Append one row to every dataset. If anything fails part-way, shrink
        # the datasets back so they stay aligned, then let the error surface.
        new_index = urls_dataset.shape[0]
        try:
            for dataset in all_datasets:
                dataset.resize(new_index + 1, axis=0)
            urls_dataset[new_index] = full_url
            screenshots_dataset[new_index] = screenshot_array
            html_content_dataset[new_index] = main_html_content
            status_dataset[new_index] = status
            source_dataset[new_index] = source
        except BaseException:
            for dataset in all_datasets:
                if dataset.shape[0] > new_index:
                    dataset.resize(new_index, axis=0)
            raise

        seen_urls.add(full_url)
        good += 1
finally:
    # Close HDF5 file even when the run is interrupted or fails, so that
    # everything appended so far is written out.
    h5_file.close()


Error fetching screenshot for URL https://bafybeicxz3kv7i5gquunkseoaygbe3chazinh42at7drfuwkcpnromc4o4.ipfs.dweb.link/: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.
