## Creating a second dataset on Hugging Face containing all slide images

In [None]:
import os
import pandas as pd
from pdf2image import convert_from_path
from datasets import Dataset, Features, Image, Value
from natsort import natsorted

# Build an image dataset from the slide PDFs: every row of `df` names one page
# of one PDF; that page is rendered to a PNG and registered under a per-slide
# key. `df` must already exist (it is not created in this cell) and provide
# the columns "zenodo_record_id", "zenodo_filename" and "page_number".

# Paths
PDF_DIR = "zenodo_pdfs"
IMG_OUTPUT_DIR = "zenodo_images"
os.makedirs(IMG_OUTPUT_DIR, exist_ok=True)

# List to store dataset entries
dataset_entries = []

# PDFs that have already been announced in the progress output
pdf_names = set()

# Number the PDFs of each record (1, 2, ...) in order of first appearance in
# `df`. Computing this once up front avoids re-filtering the whole DataFrame
# for every single row.
pdf_indices = {}
pdfs_per_record = {}
for rec_id, fname in zip(df["zenodo_record_id"], df["zenodo_filename"]):
    if (rec_id, fname) not in pdf_indices:
        pdfs_per_record[rec_id] = pdfs_per_record.get(rec_id, 0) + 1
        pdf_indices[(rec_id, fname)] = pdfs_per_record[rec_id]

# Iterate over all (PDF, page) rows
for _, row in df.iterrows():

    record_id = row["zenodo_record_id"]
    pdf_filename = row["zenodo_filename"]
    page_number = row["page_number"]

    # Key format: record10008464_pdf1_slide1
    pdf_index = pdf_indices[(record_id, pdf_filename)]
    slide_key = f"record{record_id}_pdf{pdf_index}_slide{page_number}"

    # Path to PDF file
    pdf_path = os.path.join(PDF_DIR, pdf_filename)

    if not os.path.exists(pdf_path):
        print(f"Skipping {pdf_filename} (not found)")
        continue  # Skip missing PDFs

    try:
        # Render only the requested page; first_page == last_page, so at most
        # one image is expected back.
        images = convert_from_path(pdf_path, first_page=page_number, last_page=page_number, dpi=300)

        for img in images:
            # Save the rendered page next to the other slide images
            img_path = os.path.join(IMG_OUTPUT_DIR, f"{slide_key}.png")
            img.save(img_path, "PNG")

            # Add entry to dataset; the image is referenced by its path
            dataset_entries.append({
                "key": slide_key,
                "image": img_path,
            })

    except Exception as e:
        # Best effort: one unreadable page must not abort the whole run.
        print(f"Error processing {pdf_filename}: {e}")

    # Progress output, printed once per PDF (after its first page was handled)
    if pdf_filename not in pdf_names:
        print(f'Saving {pdf_filename}')
        pdf_names.add(pdf_filename)

# Ensure images are sorted naturally (slide2 before slide10)
dataset_entries = natsorted(dataset_entries, key=lambda x: x["image"])

# Define dataset features
features = Features({
    "key": Value(dtype='string', id=None),
    "image": Image(),  # Hugging Face will load images from paths
})

# Create Hugging Face dataset
hf_dataset = Dataset.from_list(dataset_entries, features=features)

# Optional: Push dataset to HF Hub
# hf_dataset.push_to_hub("lea-33/Slide_Insight_Images")

print("Dataset creation completed!")


In [None]:
from PIL import Image as PILImage

# Optional sanity checks before uploading (uncomment to use):
#   hf_dataset[:3]                 # peek at the first three entries
#   hf_dataset[0]["image"].show()  # open the first image in a viewer

# Upload the dataset built in the previous cell to the Hugging Face Hub.
hf_dataset.push_to_hub(repo_id="lea-33/Slide_Insight_Images")