In [1]:
# Cell 2: Imports and API Key Configuration
import json
import re
import os
from openai import OpenAI

# For Google Colab:
# from google.colab import userdata
# client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# --- OpenAI client ---
# Build the client exactly once. The key is read from the OPENAI_API_KEY
# environment variable; never hardcode it in the notebook.
# (On Google Colab, use the commented `userdata` snippet above instead.)
try:
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    print("OpenAI client initialized successfully.")
except Exception as e:
    # Keep `client` defined so later cells do not fail with a NameError.
    client = None
    print(f"Error initializing OpenAI client: {e}")
    print("Please ensure your API key is configured correctly.")

# --- Configuration ---
# Update this path to point to the PDF you want to process
PDF_FILE_PATH = "Documents/The 100 Page Machine Learning Book Part2.pdf"
OUTPUT_JSON_PATH = "quiz_output.json"

OpenAI client initialized successfully.


In [2]:
# Cell 3: Document Processing and Parsing Functions
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts the text content of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated text of all pages, or "" when the file cannot be
        read (including when PyMuPDF is not installed). The error is printed
        rather than raised.
    """
    try:
        import fitz  # PyMuPDF

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as doc:
            # Join once instead of growing a string page by page.
            full_text = "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"❌ Error reading PDF {pdf_path}: {e}")
        return ""

    print(f"✅ Successfully extracted {len(full_text.split())} words from {pdf_path}.")
    return full_text

def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Splits text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start `chunk_size - overlap` words apart, so each chunk
    repeats the last `overlap` words of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        The list of chunks (empty when `text` contains no words).

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end: a further chunk would only
        # repeat words that this one already contains.
        if start + chunk_size >= len(words):
            break
    print(f"Text split into {len(chunks)} chunks.")
    return chunks

def parse_json_from_response(response_text: str) -> dict | None:
    """Safely extracts a JSON object from a string, even with surrounding text."""
    # Use regex to find the JSON block, which handles leading/trailing text
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not json_match:
        print("Parser Error: No JSON object found in the response.")
        return None
    try:
        return json.loads(json_match.group(0))
    except json.JSONDecodeError:
        print("Parser Error: Failed to decode JSON from the extracted string.")
        return None

In [3]:
# Cell 4: Core LLM Functions

from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model once, at
# module level, so the semantic retrieval function and the pipeline below
# share this single `embedding_model` instance instead of reloading it.
# (The original author chose it as a small, fast, and effective model.)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_relevant_passages_semantic(concept: str, text_chunks: list[str], chunk_embeddings, top_k: int = 2) -> list[str]:
    """Retrieves the text chunks most similar to a concept using semantic search.

    Args:
        concept: The query text.
        text_chunks: Candidate chunks, in the same order as `chunk_embeddings`.
        chunk_embeddings: Embeddings of `text_chunks`, produced by the
            module-level `embedding_model`.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to `top_k` chunks ordered from most to least similar; an empty list
        when there are no chunks or `top_k` is not positive.
    """
    # Nothing to rank; also avoids a negative top_k slicing from the wrong end.
    if not text_chunks or top_k <= 0:
        return []

    # 1. Embed the concept (the query).
    concept_embedding = embedding_model.encode(concept, convert_to_tensor=True)

    # 2. Cosine similarity between the concept and every chunk (row 0 = our single query).
    cosine_scores = util.cos_sim(concept_embedding, chunk_embeddings)[0]

    # 3. Rank on a plain NumPy array so the indices are ordinary Python ints.
    scores = cosine_scores.detach().cpu().numpy()
    ranked_indices = np.argsort(-scores)[:top_k].tolist()

    # 4. Map the indices back to the chunk texts.
    return [text_chunks[i] for i in ranked_indices]

def extract_concepts_from_chunk(chunk: str, client) -> list[str]:
    """Uses gpt-3.5-turbo to extract key concepts from a text chunk."""
    prompt = f"""
    Given the following text excerpt, please extract the most critical main ideas and concepts.
    Respond with a simple bulleted list.

    Excerpt:
    \"\"\"
    {chunk}
    \"\"\"

    Concepts:
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        content = response.choices[0].message.content
        return [line.strip('- ') for line in content.strip().split('\n') if line.strip()]
    except Exception as e:
        print(f"API Error during concept extraction: {e}")
        return []

def synthesize_concepts(all_concepts: list[str], client) -> list[str]:
    """Merges and deduplicates a list of concepts using gpt-3.5-turbo."""
    concepts_str = "\n".join([f"- {c}" for c in all_concepts])
    prompt = f"""
    I have a list of concepts from a document. Merge and deduplicate this into a final, clean list.

    Extracted Concepts:
    \"\"\"
    {concepts_str}
    \"\"\"

    Consolidated Concepts (bulleted list):
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
    content = response.choices[0].message.content
    return [line.strip('- ') for line in content.strip().split('\n') if line.strip()]

def retrieve_relevant_passages(concept: str, text_chunks: list[str], top_k: int = 2) -> list[str]:
    """Ranks text chunks by how many distinct lowercase words they share with a concept.

    Chunks sharing no word with the concept are dropped; ties keep the
    chunks' original relative order. At most `top_k` chunks are returned.
    """
    query_terms = set(concept.lower().split())

    # Pair every chunk with its overlap count, then keep only real matches.
    overlaps = [(len(query_terms & set(chunk.lower().split())), chunk) for chunk in text_chunks]
    matching = [pair for pair in overlaps if pair[0] > 0]

    # Highest overlap first.
    ranked = sorted(matching, key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in ranked[:top_k]]

def generate_question_with_difficulty(concept: str, passages: list[str], difficulty: str, client) -> dict | None:
    """Generates a question with a specific difficulty level (CORRECTED VERSION)."""

    difficulty_instructions = {
        "easy": "The question should test basic recall or understanding of a key definition from the text (Bloom's Taxonomy: Remembering/Understanding).",
        "medium": "The question should require applying a concept to a new context or analyzing the relationship between ideas from the text (Bloom's Taxonomy: Applying/Analyzing).",
        "hard": "The question should require evaluating the strengths/weaknesses of an argument or synthesizing information from multiple passages to form a conclusion (Bloom's Taxonomy: Evaluating/Creating)."
    }
    
    instruction = difficulty_instructions.get(difficulty, difficulty_instructions['medium'])
    passages_str = "\\n\\n---\\n\\n".join(passages)
    
    prompt = f"""
    Based on the following main idea and relevant text, create one multiple-choice question.

    **Difficulty Instruction**: {instruction}
    **Main Idea**: "{concept}"
    **Relevant Text**:
    \"\"\"
    {passages_str}
    \"\"\"

    Your response MUST be ONLY a single JSON object with the exact keys: "question", "choices", "correct_answer", "explanation".
    IMPORTANT: The value for "correct_answer" MUST be the full text of the correct option, NOT the letter (e.g., "A" or "B").
    """
    
    try:
        # 1. Make the API call
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        
        # 2. DEFINE response_content from the result
        response_content = response.choices[0].message.content
        
        # 3. NOW use response_content to parse the JSON
        parsed_json = parse_json_from_response(response_content)
        
        if parsed_json:
            parsed_json['difficulty'] = difficulty
        
        return parsed_json

    except Exception as e:
        print(f"API call failed for concept '{concept}': {e}")
        return None

def score_question_quality(question_data: dict, client) -> int:
    """Uses an AI judge to score the quality of a generated question (1-5).

    Args:
        question_data: Question dict; must contain "question" and "choices".
        client: An OpenAI-style client exposing `chat.completions.create`.

    Returns:
        The first standalone integer from 1 to 5 found in the judge's reply,
        or 0 if the call fails or the reply contains no such score.

    Raises:
        KeyError: If `question_data` lacks "question" or "choices".
    """
    prompt = f"""
    Please evaluate the quality of the following multiple-choice question on a scale of 1 to 5,
    where 1 is poor and 5 is excellent. Consider its clarity, conceptual depth, and the plausibility of its distractors.

    Question: {question_data['question']}
    Choices: {question_data['choices']}

    Return ONLY a single integer score between 1 and 5.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo", # The cheap model is fine for this simple task
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        reply = response.choices[0].message.content or ""
        # Tolerate replies such as "Score: 4" or "4/5" instead of failing on int().
        score_match = re.search(r'\b([1-5])\b', reply)
        if score_match is None:
            raise ValueError(f"no score between 1 and 5 found in reply {reply!r}")
        return int(score_match.group(1))
    except Exception as e:
        print(f"  -> Quality scoring failed: {e}")
        return 0 # Return a default low score on failure
    
def verify_question(question_data: dict, client) -> bool:
    """Uses an AI agent (gpt-4o) to verify the correctness of a question against its source.

    VERY EXPENSIVE FUNCTION - use sparingly and consider the semantic
    alternative (`verify_question_semantic`) first.

    Args:
        question_data: Question dict; must contain "question",
            "correct_answer" and "source_passages".
        client: An OpenAI-style client exposing `chat.completions.create`.

    Returns:
        True only if the model's reply starts with "true" (case-insensitive,
        ignoring leading quotes or punctuation); False otherwise, including
        when the API call fails.
    """
    question = question_data['question']
    correct_answer = question_data['correct_answer']
    # Real newlines: the previous double-escaped literal inserted the
    # characters backslash + "n" between passages instead of line breaks.
    passages = "\n---\n".join(question_data['source_passages'])

    prompt = f"""
    You are a meticulous fact-checker. Your task is to determine if the provided "Correct Answer" is factually supported by the "Source Text" for the given "Question".

    Analyze the following:
    - Source Text: "{passages}"
    - Question: "{question}"
    - Proposed Correct Answer: "{correct_answer}"

    Respond with a single word: "true" if the source text fully supports the answer, and "false" otherwise.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o", # A powerful model is needed for reliable fact-checking
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        decision = (response.choices[0].message.content or "").lower()
        # Accept "true", "True." or '"true"' rather than only the bare word.
        return re.match(r'\W*true\b', decision) is not None
    except Exception as e:
        print(f"  -> Verification call failed: {e}")
        return False

def verify_question_semantic(question_data: dict, embedding_model, threshold=0.45) -> bool:
    """Verifies a question by checking the semantic similarity of the answer to its source.

    The correct answer is first resolved to its full text when the model
    returned only an option label (e.g. "B"), then embedded and compared with
    the embedding of the joined source passages.

    Args:
        question_data: Question dict; must contain "correct_answer",
            "choices" (list or dict) and "source_passages".
        embedding_model: Object exposing `encode(text, convert_to_tensor=True)`.
        threshold: Minimum cosine similarity for the question to pass.

    Returns:
        True if the similarity is at least `threshold`, as a plain Python bool.
    """
    raw_answer = question_data['correct_answer']
    # Models occasionally return a non-string answer (e.g. a number).
    answer_text = raw_answer if isinstance(raw_answer, str) else str(raw_answer)
    choices = question_data['choices']
    label = answer_text.strip()

    if isinstance(choices, dict):
        # The answer may be a key such as "A" / "a" rather than the option text.
        for key in (label, label.upper(), label.lower()):
            if key in choices:
                answer_text = str(choices[key])
                break
    elif isinstance(choices, list) and answer_text not in choices:
        # Only treat the answer as a label when it is not itself one of the
        # options; map A=0, B=1, ... for any label that fits the list.
        letter = label.upper()
        if len(letter) == 1 and "A" <= letter <= "Z":
            idx = ord(letter) - ord("A")
            if idx < len(choices):
                answer_text = str(choices[idx])

    source_context = " ".join(question_data['source_passages'])

    answer_embedding = embedding_model.encode(answer_text, convert_to_tensor=True)
    context_embedding = embedding_model.encode(source_context, convert_to_tensor=True)

    # Convert the single similarity value to a float so the result is a real bool.
    similarity_score = float(util.cos_sim(answer_embedding, context_embedding)[0][0])

    print(f"  -> Verification Score: {similarity_score:.4f} (Threshold: {threshold})")

    return similarity_score >= threshold



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Cell 5: Run the Full Pipeline
def run_full_pipeline(pdf_path, client, desired_difficulties=("easy", "medium", "hard"), min_quality_score=3):
    """Runs the whole quiz-generation pipeline for one PDF.

    Stages: (1) read and chunk the document and embed the chunks,
    (2) extract and consolidate concepts, (3) for each concept retrieve
    passages, generate a question, score it and verify it semantically.

    Args:
        pdf_path: Path of the PDF to process.
        client: OpenAI-style client passed on to the LLM helper functions.
        desired_difficulties: Difficulty labels assigned to concepts in
            round-robin order; an empty or None value uses the default trio.
        min_quality_score: Minimum judge score a question needs to be kept.

    Returns:
        The list of question dicts that passed both the quality and the
        verification checks (empty if the document cannot be read).
    """
    # An empty sequence would make the round-robin modulo divide by zero.
    difficulties = list(desired_difficulties) if desired_difficulties else ["easy", "medium", "hard"]

    # Stage 1: Ingest and Chunk Document
    document_text = extract_text_from_pdf(pdf_path)
    if not document_text:
        print("Pipeline stopped: Could not read document.")
        return []
    text_chunks = chunk_text(document_text)
    if not text_chunks:
        print("Pipeline stopped: Document contains no text to chunk.")
        return []
    print("Creating embeddings for document chunks...")
    chunk_embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
    print("Embeddings created.")

    # Stage 2: Extract and Synthesize Concepts
    print("\n--- Starting Stage 2: Concept Extraction ---")
    all_chunk_concepts = []
    for i, chunk in enumerate(text_chunks):
        print(f"Extracting concepts from chunk {i+1}/{len(text_chunks)}...")
        all_chunk_concepts.extend(extract_concepts_from_chunk(chunk, client))

    print(f"\nExtracted {len(all_chunk_concepts)} raw concepts. Now synthesizing...")
    final_concepts = synthesize_concepts(all_chunk_concepts, client)
    print(f"Synthesized down to {len(final_concepts)} final concepts.")

    # Stage 3: Retrieve and Generate Questions
    print("\n--- Starting Stage 3: Question Generation ---")
    generated_questions = []

    for i, concept in enumerate(final_concepts):
        # Cycle through the requested difficulties, one per concept.
        current_difficulty = difficulties[i % len(difficulties)]

        print(f"Processing concept {i+1}/{len(final_concepts)} (Difficulty: {current_difficulty}): '{concept[:50]}...'")
        passages = retrieve_relevant_passages_semantic(concept, text_chunks, chunk_embeddings)
        if not passages:
            print("  -> No relevant passages found, skipping.")
            continue

        question_data = generate_question_with_difficulty(concept, passages, current_difficulty, client)
        if not question_data:
            print("  -> Failed to generate question.")
            continue

        quality_score = score_question_quality(question_data, client)
        question_data['quality_score'] = quality_score
        if quality_score < min_quality_score:
            # Previously dropped silently; report it like the other outcomes.
            print(f"  -> Discarded question with low quality score: {quality_score}")
            continue

        # Verification needs the passages the question was generated from.
        question_data['source_passages'] = passages
        if verify_question_semantic(question_data, embedding_model):
            generated_questions.append(question_data)
            print(f"  -> ✅ Verified and Kept question with score: {quality_score}")
        else:
            print("  -> ❌ Discarded question that failed verification.")

    return generated_questions

# --- EXECUTE ---
# Guard first: without the input PDF there is nothing to run.
if not os.path.exists(PDF_FILE_PATH):
    print(f"❌ Error: The file '{PDF_FILE_PATH}' was not found. Please update the path in Cell 2.")
else:
    final_quiz = run_full_pipeline(PDF_FILE_PATH, client)

    # Persist the kept questions as pretty-printed JSON.
    with open(OUTPUT_JSON_PATH, 'w') as quiz_file:
        json.dump(final_quiz, quiz_file, indent=4)

    question_count = len(final_quiz)
    print(f"\n✅ Pipeline complete! Generated {question_count} questions.")
    print(f"Output saved to {OUTPUT_JSON_PATH}")

✅ Successfully extracted 6490 words from Documents/The 100 Page Machine Learning Book Part2.pdf.
Text split into 4 chunks.
Creating embeddings for document chunks...
Embeddings created.

--- Starting Stage 2: Concept Extraction ---
Extracting concepts from chunk 1/4...
Extracting concepts from chunk 2/4...
Extracting concepts from chunk 3/4...
Extracting concepts from chunk 4/4...

Extracted 33 raw concepts. Now synthesizing...
Synthesized down to 26 final concepts.

--- Starting Stage 3: Question Generation ---
Processing concept 1/26 (Difficulty: easy): 'Derivative and Gradient...'
  -> Verification Score: 0.4535 (Threshold: 0.45)
  -> ✅ Verified and Kept question with score: 4
Processing concept 2/26 (Difficulty: medium): 'Random Variable...'
  -> Verification Score: 0.1448 (Threshold: 0.45)
  -> ❌ Discarded question that failed verification.
Processing concept 3/26 (Difficulty: hard): 'Probability Mass Function and Probability Density ...'
  -> Verification Score: 0.2199 (Threshold

KeyboardInterrupt: 