In [None]:
from pathlib import Path
from typing import List
import json

# Search-results URL to scrape. The URL-encoded query is `NOT type:(Email)` on
# the "food" subsite, with the hideDuplicates / hideFolders / hideRestricted
# flags all set to true.
src_url = 'https://www.industrydocuments.ucsf.edu/food/results/#q=NOT%20type%3A(Email)&h=%7B%22hideDuplicates%22%3Atrue%2C%22hideFolders%22%3Atrue%2C%22hideRestricted%22%3Atrue%7D&subsite=food&cache=true&count=131964'

# Number of result pages to scrape; each page consists of 20 samples.
num_pages = 25

# Root directory of the generated dataset (relative to the working directory).
res_path = "../dataset/ucsf-idl-resized-without_emails"

# Make sure the image output directory exists before any page is saved.
Path(res_path, "images").mkdir(parents=True, exist_ok=True)

In [None]:
from scraper import scrape_url

# Collect the sample records for the configured search URL.
# NOTE(review): `scrape_url` is a project-local helper; presumably it walks
# `num_pages` result pages of `src_url` -- confirm against scraper.py.
# The download cell iterates `samples`, reads sample["id"] and
# sample["download_url"], and finally serialises the whole object with json.dump.
samples = scrape_url(src_url, num_pages=num_pages)


In [None]:
from PIL import Image

def resize_image(img: Image.Image, width=None, height=None):
    """Resize a PIL image to the requested width and/or height.

    If only one of ``width`` / ``height`` is given, the other dimension is
    derived from the original aspect ratio (truncated to an integer, but never
    smaller than 1 pixel). If both are given they are used as-is, so the
    aspect ratio is NOT preserved in that case. If neither is given, the input
    image is returned unchanged (the same object, not a copy).

    Args:
        img: The source image. Only its ``size`` attribute and ``resize``
            method are used.
        width (int or None): Target width in pixels. If None, the width is
            derived from ``height`` and the aspect ratio.
        height (int or None): Target height in pixels. If None, the height is
            derived from ``width`` and the aspect ratio.

    Returns:
        PIL.Image: The resized image, or ``img`` itself when neither ``width``
        nor ``height`` is specified.
    """
    # Get the original size of the image
    orig_width, orig_height = img.size

    if width is None and height is None:
        # Nothing requested: hand the original image back untouched.
        return img

    if width is not None and height is not None:
        # Both dimensions are explicit; use them directly (may distort).
        new_size = (width, height)
    elif width is not None:
        # Only width given: scale the height by the same factor.
        # Truncation can yield 0 for extremely wide images, and a zero
        # dimension is rejected by Image.resize, so clamp to 1 pixel.
        new_size = (width, max(1, int(orig_height * (width / orig_width))))
    else:
        # Only height given: scale the width by the same factor (same clamp).
        new_size = (max(1, int(orig_width * (height / orig_height))), height)

    return img.resize(new_size)


In [None]:
from tqdm import tqdm
import requests
from pdf2image import convert_from_bytes

# Download every scraped document, render its pages to images, shrink them to
# a width of 1000 px and store them under <res_path>/images/<sample id>/.
for sample in tqdm(samples):
    # Named `sample_id` / `pdf_bytes` so the builtins `id` and `bytes` are not
    # shadowed for the rest of the notebook session.
    sample_id = sample["id"]
    download_url = sample["download_url"]

    img_folder = Path("images") / str(sample_id)
    img_path = Path(res_path) / img_folder
    img_path.mkdir(exist_ok=True, parents=True)

    # Stored relative to `res_path`, so the dataset directory stays relocatable.
    sample["image_folder"] = str(img_folder)

    # A timeout keeps one stalled connection from hanging the whole run, and
    # raise_for_status() fails loudly on an HTTP error instead of passing an
    # error page on to the PDF converter.
    r = requests.get(download_url, allow_redirects=True, timeout=60)
    r.raise_for_status()
    pdf_bytes = r.content

    # Convert each page to an image and save it
    images = convert_from_bytes(pdf_bytes)
    for i, img in enumerate(images):
        img = resize_image(img, width=1000)
        img.save(img_path / f"page_{i}.jpg", "JPEG")

# Persist the sample metadata (now including "image_folder"); the context
# manager guarantees the file is flushed and closed.
with open(Path(res_path) / "samples.json", "w", encoding="utf-8") as f:
    json.dump(samples, f)