# Generate thumbnails of every GRIT issue
Save PNG thumbnails (600x400 px) in the same folder structure as the PDFs, each showing the first page of the PDF centered on a white background.

In [1]:
import os
import fitz  # PyMuPDF
from PIL import Image, ImageOps

def create_folder_structure(src, dst):
    """Recreate the directory tree found under ``src`` inside ``dst``.

    Only directories are created; no files are copied. Directories that
    already exist under ``dst`` are left untouched, so the function can be
    called repeatedly.

    Args:
        src: Root of the directory tree to mirror.
        dst: Root under which the mirrored directories are created.
    """
    for dirpath, _, _ in os.walk(src):
        mirrored = os.path.join(dst, os.path.relpath(dirpath, src))
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() first, and makes reruns a no-op.
        os.makedirs(mirrored, exist_ok=True)

def save_first_page_image(pdf_path, output_path, size=(600, 400)):
    """Save a thumbnail of a PDF's first page, centered on a white canvas.

    The rendered page is scaled, preserving its aspect ratio, so that the
    whole page fits inside ``size``; it is then pasted in the middle of a
    white image of exactly ``size`` pixels, which is saved to ``output_path``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path the thumbnail image is saved to.
        size: ``(width, height)`` of the thumbnail canvas in pixels.
    """
    canvas_width, canvas_height = size

    # Use the document as a context manager so its file handle is released
    # even when rendering fails; otherwise one handle leaks per PDF in a batch.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(0)  # First page
        # alpha=False keeps the samples as plain 3-byte RGB, which is what
        # Image.frombytes("RGB", ...) below expects.
        pix = page.get_pixmap(alpha=False)
        # Convert pixmap to PIL Image
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # NOTE: an alternative is to crop instead of scale, keeping only the top
    # of the page: img.crop((0, 0, canvas_width, canvas_height)).

    # Pick the limiting dimension with integer arithmetic so the page always
    # fits inside the canvas: tall pages fill the height, wide pages fill the
    # width instead of being cropped at the sides.
    if img.width * canvas_height <= img.height * canvas_width:
        new_height = canvas_height
        new_width = max(1, img.width * canvas_height // img.height)
    else:
        new_width = canvas_width
        new_height = max(1, img.height * canvas_width // img.width)

    resized_img = img.resize((new_width, new_height))

    # White background of the requested size
    background = Image.new('RGB', (canvas_width, canvas_height), (255, 255, 255))

    # Center the resized page both horizontally and vertically
    offset = ((canvas_width - new_width) // 2, (canvas_height - new_height) // 2)
    background.paste(resized_img, offset)

    # Save the final image
    background.save(output_path)

   

def process_pdfs_in_folders(src, dst):
    """Create a PNG thumbnail under ``dst`` for every PDF found under ``src``.

    The directory tree of ``src`` is first mirrored into ``dst``; each PDF is
    then rendered to a file with the same base name and a ``.png`` extension
    in the corresponding mirrored directory.

    Args:
        src: Root directory that is searched recursively for PDF files.
        dst: Root directory that receives the thumbnails.
    """
    create_folder_structure(src, dst)
    for dirpath, _, filenames in os.walk(src):
        # The output directory only depends on the directory being walked.
        output_dir = os.path.join(dst, os.path.relpath(dirpath, src))
        for filename in filenames:
            # Compare case-insensitively so files named "*.PDF" are not
            # silently skipped.
            if not filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(dirpath, filename)
            output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".png")
            save_first_page_image(pdf_path, output_file)

In [5]:
# Thumbnails are written to a separate tree that mirrors the PDF archive.
destination_folder = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_thumbnails_fullpage/'
source_folder = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_PDFs/'

# Render one thumbnail per PDF found under the source folder.
process_pdfs_in_folders(src=source_folder, dst=destination_folder)


In [6]:
# Count the files below the thumbnail folder whose name ends with the
# given extension (searched recursively).

file_extension = 'png'
folder_to_count_files = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_thumbnails_fullpage/'

# Each matching name contributes True (== 1) to the total.
sum(
    name.endswith(file_extension)
    for _, _, names in os.walk(folder_to_count_files)
    for name in names
)

349