In [1]:
!pip install -q -U google-generativeai PyMuPDF easyocr pillow

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.49.1 requires pillow<12.0,>=8.0, but you have pillow 12.0.0 which is incompatible.
mediapipe 0.10.21 requires numpy<2, but you have numpy 2.2.6 which is incompatible.
mediapipe 0.10.21 requires protobuf<5,>=4.25.3, but you have protobuf 5.29.5 which is incompatible.


In [None]:
import os
import fitz  # PyMuPDF
import easyocr
import json
import google.generativeai as genai
from PIL import Image
from datetime import datetime
from fpdf import FPDF
import io

# 1. Setup Gemini with JSON Mode
# The API key is a secret and must never be written into the notebook. It is read
# from the GEMINI_API_KEY environment variable, which has to be set before this
# cell runs (shell export, or the hosting platform's secrets manager).
# NOTE(review): the key that was previously hardcoded on this line has been exposed
# and should be revoked and replaced.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Provide it through the environment "
        "instead of hardcoding the key in the notebook."
    )
genai.configure(api_key=gemini_api_key)

# Initialize the model with JSON output configuration
model = genai.GenerativeModel(
    model_name='gemini-2.5-flash', # Recommended for high-volume marking
    generation_config={"response_mime_type": "application/json"}
)

# Initialize OCR reader (English only); used by extract_text_from_upload for image files
reader = easyocr.Reader(['en'])

# --- Helper Functions ---

def get_ielts_grade(mark_str):
    """Convert a mark such as '23/40' into an approximate IELTS band score.

    Args:
        mark_str: The mark to convert. Normally a string of the form
            '<score>/<total>'; a bare score ('23') or a plain integer (23) is
            accepted as well. Only the part before the first '/' is used.

    Returns:
        The band as a string ('9.0' down to '4.5'), '4.0 or below' for any
        score under 13, or 'N/A' when no integer score can be read from the
        input.
    """
    # (minimum raw score, band), ordered from the highest band down so that the
    # first threshold the score reaches is the answer.
    band_thresholds = (
        (39, "9.0"),
        (37, "8.5"),
        (35, "8.0"),
        (32, "7.5"),
        (30, "7.0"),
        (26, "6.5"),
        (23, "6.0"),
        (18, "5.5"),
        (16, "5.0"),
        (13, "4.5"),
    )

    try:
        # str() lets numeric marks through the same parsing path as 'X/40' strings;
        # int() tolerates surrounding whitespace.
        score = int(str(mark_str).split('/')[0])
    except (ValueError, TypeError):
        # Only parsing failures mean "no grade"; anything else should surface.
        return "N/A"

    for minimum_score, band in band_thresholds:
        if score >= minimum_score:
            return band
    return "4.0 or below"

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated.

    Best effort: if opening or reading the document fails, the error is
    printed and whatever text was collected up to that point (possibly an
    empty string) is returned instead of raising.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for page in document:
                page_texts.append(page.get_text())
    except Exception as err:
        print(f"Error reading PDF {pdf_path}: {err}")
    return "".join(page_texts)

def extract_text_from_upload(file_path):
    """Extract text from a student upload, choosing the method by file extension.

    PDFs are delegated to extract_text_from_pdf. PNG/JPG/JPEG images are run
    through the module-level EasyOCR `reader` and the recognised fragments are
    joined with single spaces. The extension check is case-insensitive; any
    other file type yields an empty string.
    """
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)

    if lowered_path.endswith(('.png', '.jpg', '.jpeg')):
        # detail=0 is passed so the result can be joined directly as strings
        fragments = reader.readtext(file_path, detail=0)
        return " ".join(fragments)

    return ""

def mark_batch_answers(official_key_text, student_data_list):
    """Ask the module-level Gemini `model` to mark a batch of student submissions.

    Args:
        official_key_text: Text of the official question/answer set, embedded in
            the prompt as the source of truth.
        student_data_list: One extracted-text string per student.

    Returns:
        The `.text` of the model's response, unparsed. The prompt requests a
        JSON array with one object per student; the caller is responsible for
        parsing and validating it.
    """
    # Each submission gets a numbered banner (starting at 1) so the model can tell students apart.
    numbered_submissions = "".join(
        f"\n--- STUDENT {position} ---\n{submission}\n"
        for position, submission in enumerate(student_data_list, start=1)
    )

    examiner_prompt = f"""
    You are an IELTS Examiner. Use the provided Official Question Set as the absolute source of truth.
    
    OFFICIAL SET (Questions and Answers):
    {official_key_text}
    
    TASK:
    Mark the following {len(student_data_list)} students.
    1. Identify each candidate's name.
    2. Mark their answers (1 to 40).
    3. Calculate total marks.
    
    STUDENT INPUTS:
    {numbered_submissions}
    
    OUTPUT FORMAT (Strict JSON Array of Objects):
    [
      {{
        "candidate_name": "Full Name",
        "total_marks": "X/40",
        "correct_answers": {{ "1": "val" }},
        "incorrect_answers": {{ "3": {{ "student_answer": "val", "correct_answer": "val" }} }}
      }}
    ]
    """
    reply = model.generate_content(examiner_prompt)
    return reply.text

def _latin1_safe(value):
    """Return `value` as text containing only Latin-1 characters.

    The report uses a built-in PDF font ("Arial"), which FPDF encodes as
    Latin-1; characters outside that range would abort PDF generation, so
    they are replaced with '?'.
    """
    return str(value).encode("latin-1", "replace").decode("latin-1")


def export_results_to_pdf(results, official_file_path, output_filename="Marking_Summary.pdf"):
    """Write a one-table PDF summary (name, mark, band) of the marking results.

    Args:
        results: Iterable of per-candidate dicts as returned by the model, using
            the keys "candidate_name" and "total_marks". Entries that are not
            dicts are reported and skipped.
        official_file_path: Path of the official set. The name of its parent
            folder becomes the report title (e.g. "set1" -> "Set 1").
        output_filename: Path of the PDF to create.
    """
    folder_dir = os.path.dirname(official_file_path)
    folder_name = os.path.basename(folder_dir.rstrip('/'))
    set_label = folder_name.replace("set", "Set ")
    current_time = datetime.now().strftime("%d %B %Y, %I:%M %p")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Header: set title followed by the generation timestamp
    pdf.set_font("Arial", 'B', 20)
    pdf.cell(0, 15, txt=_latin1_safe(set_label), ln=True, align='C')
    pdf.set_font("Arial", 'I', 11)
    pdf.cell(0, 5, txt=current_time, ln=True, align='C')
    pdf.ln(10)

    # Table Header: white bold text on a red fill
    pdf.set_font("Arial", 'B', 11)
    pdf.set_fill_color(231, 16, 16)
    pdf.set_text_color(255, 255, 255)
    pdf.cell(90, 12, txt=" Candidate Name", border=1, ln=0, fill=True)
    pdf.cell(50, 12, txt=" Mark", border=1, ln=0, fill=True, align='C')
    pdf.cell(50, 12, txt=" Grade (Band)", border=1, ln=1, fill=True, align='C')

    # Table Rows
    pdf.set_font("Arial", '', 11)
    pdf.set_text_color(0, 0, 0)
    for entry in results:
        if not isinstance(entry, dict):
            # Model output is not guaranteed to follow the schema; one stray
            # item must not prevent the rest of the report from being written.
            print(f"Skipping malformed result entry: {entry!r}")
            continue

        # `or` also covers keys that are present but null/empty in the JSON.
        name = entry.get("candidate_name") or "Unknown"
        mark = entry.get("total_marks") or "0/40"
        grade = get_ielts_grade(mark)
        pdf.cell(90, 10, txt=_latin1_safe(f" {name}"), border=1, ln=0)
        pdf.cell(50, 10, txt=_latin1_safe(mark), border=1, ln=0, align='C')
        pdf.cell(50, 10, txt=str(grade), border=1, ln=1, align='C')

    pdf.output(output_filename)
    print(f"\n--- PDF Report Generated: {output_filename} ---")

# --- EXECUTION ---

official_file = "sets/set1/full_set.pdf"
student_uploads = [
    "marking_data/set1_1.pdf",
    "marking_data/set1_2.jpg"
    # ... add more uploads here; they are sent to the model in groups of `batch_size`
]

# 1. Extract official key
print("--- Step 1: Reading official marking key ---")
official_text = extract_text_from_pdf(official_file)
if not official_text.strip():
    # extract_text_from_pdf is best-effort and returns "" on failure. Marking
    # against an empty key would make the model invent the answers, so stop here.
    raise ValueError(f"No text could be extracted from the official key: {official_file}")

# 2. Extract all student texts first
print("--- Step 2: Extracting student texts ---")
batch_texts = []
for upload in student_uploads:
    if not os.path.exists(upload):
        print(f"Skipping missing file: {upload}")
        continue
    print(f"Reading: {upload}...")
    upload_text = extract_text_from_upload(upload)
    if not upload_text.strip():
        # Unsupported type, unreadable file, or nothing recognised: there is nothing to mark.
        print(f"Skipping {upload}: no text could be extracted")
        continue
    batch_texts.append(upload_text)

# 3. Process in batches (up to 10 students per prompt)
all_results = []
batch_size = 10

print(f"--- Step 3: Marking {len(batch_texts)} students in batches ---")
for i in range(0, len(batch_texts), batch_size):
    current_batch = batch_texts[i : i + batch_size]
    print(f"Marking batch {i//batch_size + 1}...")

    try:
        raw_response = mark_batch_answers(official_text, current_batch)
        batch_json = json.loads(raw_response.strip())
    except json.JSONDecodeError:
        print(f"Error parsing batch starting at index {i}")
        continue
    except Exception as e:
        # A failed request (network, quota, blocked response) must not discard
        # the batches that were already marked successfully.
        print(f"Error marking batch starting at index {i}: {e}")
        continue

    if isinstance(batch_json, list):
        if len(batch_json) != len(current_batch):
            # The model is asked for one object per student; flag any mismatch for manual review.
            print(
                f"Warning: batch starting at index {i} returned {len(batch_json)} "
                f"results for {len(current_batch)} students"
            )
        all_results.extend(batch_json)
    else:
        all_results.append(batch_json)

# 4. Final Output
if all_results:
    export_results_to_pdf(all_results, official_file)
else:
    print("No results processed.")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


--- Step 1: Reading official marking key ---
--- Step 2: Extracting student texts ---
Reading: marking_data/set1_1.pdf...
Reading: marking_data/set1_2.jpg...




--- Step 3: Marking 2 students in batches ---
Marking batch 1...

--- PDF Report Generated: Marking_Summary.pdf ---
