# PDF Processor
This notebook is part of a small set of tools I've developed for extracting data from, and categorizing/indexing, test PDFs for Mu Alpha Theta.


In [3]:
!pip install -U -q "google-generativeai"
!pip install -U -q "PyPDF2"
!pip install -U -q "pdfplumber"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import requests
import pathlib
from google import genai
from google.colab import userdata
from PyPDF2 import PdfReader, PdfWriter
import time
import os
import json
import csv
import re


# Gemini API client shared by the cells below; the API key is looked up
# under the name GOOGLE_API_KEY via Colab's `userdata` helper.
client = genai.Client(
    api_key=userdata.get('GOOGLE_API_KEY'),
)

def extract_drive_id(url):
    """Return the Google Drive file ID embedded in *url*, or None.

    Two link shapes are recognised:

    * path form:  ``.../d/<id>/...``  (e.g. ``/file/d/<id>/view``)
    * query form: ``...?id=<id>`` or ``...&id=<id>``
      (e.g. ``/uc?export=download&id=<id>`` or ``/open?id=<id>``)

    The path form is tried first, so URLs that matched before still
    return the same ID. Empty or ``None`` input returns ``None``.
    """
    if not url:
        return None
    match = (
        re.search(r"/d/([a-zA-Z0-9_-]+)", url)
        # Anchor on ? or & so parameters merely ending in "id" (e.g. "uid=") don't match.
        or re.search(r"[?&]id=([a-zA-Z0-9_-]+)", url)
    )
    return match.group(1) if match else None

# Download the source test PDF from Google Drive and open it with PyPDF2.
file_id = '1WZzH1C8l7hUkp58jKrSMvJ62IouYU_rF'
pdf_path = '2025_L_D_States (Test).pdf'
response = requests.get(
    f'https://drive.google.com/uc?export=download&id={file_id}',
    timeout=60,  # never hang the notebook on a stalled connection
)
# Fail loudly on HTTP errors instead of saving an error page as a ".pdf".
response.raise_for_status()
# Drive can answer 200 with an HTML page (permission prompt, virus-scan
# interstitial) instead of the file; check for the PDF signature near the
# start of the payload before writing it to disk.
if b"%PDF-" not in response.content[:1024]:
    raise ValueError(
        f"Download for file id {file_id!r} is not a PDF "
        f"(Content-Type: {response.headers.get('Content-Type')!r})"
    )
pathlib.Path(pdf_path).write_bytes(response.content)
pdf_reader = PdfReader(pdf_path)

In [None]:
def _response_text(response):
    """Return the text of a generate_content response as a plain string.

    Prefers ``response.text``; when that is missing or empty, joins the
    ``text`` of every part of the first candidate. Returns "" when no
    text can be found, so callers always get a ``str``.
    """
    text = getattr(response, "text", None)
    if text:
        return text
    candidates = getattr(response, "candidates", None) or []
    if not candidates:
        return ""
    content = getattr(candidates[0], "content", None)
    parts = getattr(content, "parts", None) or []
    return "".join(getattr(part, "text", None) or "" for part in parts)


# One entry per successfully processed page: the JSON text returned by the model.
latex_blocks = []
output_csv = "questions_output.csv"

total_pages = len(pdf_reader.pages)

# Index-based on purpose: the page lookup stays inside the try block so a
# single unreadable page is reported and skipped instead of ending the loop.
for page_num in range(total_pages):
    page_filename = f"temp_page_{page_num + 1}.pdf"
    try:
        print(f"Processing page {page_num + 1}/{total_pages}")

        # Write the page out as its own single-page PDF for upload.
        writer = PdfWriter()
        writer.add_page(pdf_reader.pages[page_num])

        with open(page_filename, "wb") as out_file:
            writer.write(out_file)

        uploaded_file = client.files.upload(file=page_filename)

        # Send the page for question extraction
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                "Convert this math competition question into structured JSON format. Extract: question_text (LaTeX formatted), answer_choices",
                uploaded_file
            ]
        )

        # Extract JSON block (fenced ```json ... ``` if present, else the whole reply)
        response_text = _response_text(response)
        if not response_text.strip():
            raise ValueError("model returned no text")
        match = re.search(r"```json\s*(.*?)\s*```", response_text, re.DOTALL)
        clean_json = match.group(1).strip() if match else response_text.strip()
        latex_blocks.append(clean_json)

        print(f" Successfully processed page {page_num + 1}")

    except Exception as e:
        # Per-page boundary: report the failure and carry on with the next page.
        print(f" Error processing page {page_num + 1}: {str(e)}")

    finally:
        # Remove the temp file on failure too, so errors don't leave files behind.
        if os.path.exists(page_filename):
            os.remove(page_filename)
        # Pause between pages after failures as well, so an API error is
        # not followed by an immediate request for the next page.
        time.sleep(1.2)


# Turn each page's JSON text into CSV rows of
# [question_text, JSON-encoded answer choices].
all_questions = []
for idx, block in enumerate(latex_blocks):
    # Only json.loads can raise JSONDecodeError, so the try covers just that call.
    try:
        parsed = json.loads(block)
    except json.JSONDecodeError:
        print(f"JSON decode error in block {idx+1}: {block[:100]}")
        continue

    # The extraction prompt asks about a single question, so the reply may be
    # one question object rather than an array; treat it as a one-item list.
    if isinstance(parsed, dict) and "question_text" in parsed:
        parsed = [parsed]

    if not isinstance(parsed, list):
        print(f" Skipping block {idx+1} (not a list)")
        continue

    for q in parsed:
        if not isinstance(q, dict):
            print(f" Skipping malformed question in block {idx+1}: {q}")
            continue

        # question_text may be JSON null or a non-string; coerce rather than
        # let .strip() raise and abort the run before the CSV is written.
        raw_text = q.get("question_text")
        question_text = "" if raw_text is None else str(raw_text).strip()
        choices_raw = q.get("answer_choices", {})

        if isinstance(choices_raw, dict):
            # Fixed A-E order; a missing letter becomes an empty string.
            choices = [choices_raw.get(opt, "") for opt in ["A", "B", "C", "D", "E"]]
        elif isinstance(choices_raw, list):
            choices = choices_raw
        else:
            print(f" Skipping malformed choices in block {idx+1}: {choices_raw}")
            continue

        all_questions.append([question_text, json.dumps(choices)])

# Write the header row followed by every collected question to the output CSV.
header = ["question_text", "answer_choices"]
with open(output_csv, mode="w", encoding="utf-8", newline="") as csvfile:
    # newline="" leaves line-ending handling to the csv module.
    writer = csv.writer(csvfile)
    writer.writerows([header, *all_questions])

print(f"\n Finished! Saved {len(all_questions)} questions to {output_csv}")