In [1]:
import fitz  # PyMuPDF
import json
import os
from openai import OpenAI

# --- SETUP ---
# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in the notebook. The client is created exactly once; the
# functions below receive it as an explicit argument.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
import json
import re
from openai import OpenAI

# Assume 'client = OpenAI()' is initialized

def extract_concepts_from_chunk(chunk: str, client) -> list[str]:
    """Uses gpt-3.5-turbo to extract key concepts from a text chunk.

    Args:
        chunk: The text excerpt to analyse.
        client: An OpenAI-style client exposing ``chat.completions.create``.

    Returns:
        One string per concept, with the leading bullet marker and surrounding
        whitespace removed. Blank lines in the model reply are dropped.
        Returns an empty list (without calling the API) when ``chunk`` is
        empty or whitespace-only, and when the reply carries no text.
    """
    if not chunk or not chunk.strip():
        # Nothing to analyse, so skip the API call entirely.
        return []

    prompt = f"""
    Given the following text excerpt, please extract the most critical main ideas and concepts.
    Respond with a simple bulleted list.

    Excerpt:
    \"\"\"
    {chunk}
    \"\"\"

    Concepts:
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
    # The reply text is optional in the SDK's response type; treat None as empty.
    content = response.choices[0].message.content or ""

    concepts = []
    for line in content.splitlines():
        # Remove only a LEADING bullet marker ("-", "•", or "* ") so hyphens and
        # markdown emphasis inside or at the end of a concept are preserved.
        concept = re.sub(r'^(?:[-•]+|\*(?=\s))\s*', '', line.strip()).strip()
        if concept:
            concepts.append(concept)
    return concepts



In [3]:
def synthesize_concepts(all_concepts: list[str], client) -> list[str]:
    """Merges and deduplicates a list of concepts using gpt-3.5-turbo.

    Args:
        all_concepts: Concept strings gathered from the document's chunks.
        client: An OpenAI-style client exposing ``chat.completions.create``.

    Returns:
        The consolidated concepts, one string per line of the model reply,
        with the leading bullet marker and surrounding whitespace removed and
        blank lines dropped. Returns an empty list (without calling the API)
        when there are no non-blank input concepts, and when the reply
        carries no text.
    """
    # Blank entries would only add empty "- " bullets to the prompt.
    usable_concepts = [c for c in all_concepts if c and c.strip()]
    if not usable_concepts:
        # Nothing to merge, so skip the API call entirely.
        return []

    concepts_str = "\n".join([f"- {c}" for c in usable_concepts])
    prompt = f"""
    I have a list of concepts from a document. Merge and deduplicate this into a final, clean list.

    Extracted Concepts:
    \"\"\"
    {concepts_str}
    \"\"\"

    Consolidated Concepts (bulleted list):
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
    # The reply text is optional in the SDK's response type; treat None as empty.
    content = response.choices[0].message.content or ""

    final_concepts = []
    for line in content.splitlines():
        # Remove only a LEADING bullet marker ("-", "•", or "* ") so hyphens and
        # markdown emphasis inside or at the end of a concept are preserved.
        concept = re.sub(r'^(?:[-•]+|\*(?=\s))\s*', '', line.strip()).strip()
        if concept:
            final_concepts.append(concept)
    return final_concepts



In [4]:
def parse_json_from_response(response_text: str) -> dict | None:
    """Safely extracts a JSON object from a string, even with surrounding text."""
    # Use regex to find the JSON block, which handles leading/trailing text
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not json_match:
        print("Error: No JSON object found in the response.")
        return None
    try:
        return json.loads(json_match.group(0))
    except json.JSONDecodeError:
        print("Error: Failed to decode JSON from the extracted string.")
        return None



In [5]:
def generate_question(concept: str, passages: list[str], client) -> dict | None:
    """Generates a multiple-choice question using gpt-3.5-turbo with robust parsing.

    Args:
        concept: The main idea the question should test.
        passages: Text passages relevant to the concept; they are joined with
            a ``---`` separator and embedded in the prompt.
        client: An OpenAI-style client exposing ``chat.completions.create``.

    Returns:
        A dict with the keys ``question``, ``choices`` (a list of 4 items),
        ``correct_answer`` and ``explanation``. Returns ``None`` (after
        printing the reason) when the API call fails, the reply contains no
        decodable JSON object, or the object does not have that shape.
    """
    passages_str = "\n\n---\n\n".join(passages)

    # We've made this prompt more forceful to guide gpt-3.5-turbo
    prompt = f"""
    Based on the following main idea and relevant text, create one multiple-choice question.
    **IMPORTANT**: The question MUST test conceptual understanding. AVOID simple factual recall questions. It should require some reasoning.

    Main Idea: "{concept}"

    Relevant Text:
    \"\"\"
    {passages_str}
    \"\"\"

    Your response MUST be ONLY a single JSON object with these exact keys:
    - "question": The question text.
    - "choices": A list of 4 string options.
    - "correct_answer": The correct answer string.
    - "explanation": A brief explanation for the correct answer.

    The incorrect choices must be plausible distractors. Do not add any text before or after the JSON object.
    """
    # Only the network call lives inside the try block, so parsing problems are
    # not misreported as API failures.
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3 # A bit of temperature can sometimes help with creativity
        )
        # The reply text is optional in the SDK's response type; treat None as empty.
        response_content = response.choices[0].message.content or ""
    except Exception as e:
        print(f"API call failed for concept '{concept}': {e}")
        return None

    question = parse_json_from_response(response_content)
    if question is None:
        return None

    # Enforce the schema requested in the prompt so callers never receive a
    # half-formed question.
    required_keys = ("question", "choices", "correct_answer", "explanation")
    missing_keys = [key for key in required_keys if key not in question]
    if missing_keys:
        print(f"Error: Question for concept '{concept}' is missing keys: {missing_keys}")
        return None

    choices = question["choices"]
    if not isinstance(choices, list) or len(choices) != 4:
        print(f"Error: Question for concept '{concept}' does not have exactly 4 choices.")
        return None

    return question