In [10]:
# Cell 2: Imports and API Key Configuration
import json
import re
import os
from openai import OpenAI

# Initialize the OpenAI client exactly once, inside the guard, so that a
# missing or invalid key is reported with the friendly message below instead of
# raising before the try block is ever reached.
# Local environment: make sure OPENAI_API_KEY is set before running this cell.
try:
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    print("OpenAI client initialized successfully.")
except Exception as e:
    # Keep `client` defined so later cells fail on an obvious None rather than
    # on a NameError.
    client = None
    print(f"Error initializing OpenAI client: {e}")
    print("Please ensure your API key is configured correctly.")

# --- Configuration ---
# Path of the PDF you want to process
PDF_FILE_PATH = "Documents/The 100 Page Machine Learning Book Part2.pdf"
# File the generated quiz is written to as JSON
OUTPUT_JSON_PATH = "quiz_output.json"

OpenAI client initialized successfully.


In [11]:
# Cell 3: Document Processing and Parsing Functions

import ftfy

def clean_and_normalize_text(text: str) -> str:
    """Return `text` after running it through ftfy's text fixer.

    All clean-up is delegated to a single ``ftfy.fix_text`` call; no separate
    encode/decode pass is performed here.
    """
    repaired = ftfy.fix_text(text)
    return repaired


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract and clean the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages concatenated in page order, or an empty
        string if anything goes wrong (the error is printed, not raised).
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            # Join once instead of growing a string page by page.
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

        cleaned_text = clean_and_normalize_text(full_text)

        print(f"✅ Successfully extracted {len(cleaned_text.split())} words from {pdf_path}.")
        return cleaned_text
    except Exception as e:
        print(f"❌ Error reading PDF {pdf_path}: {e}")
        return ""

def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        The chunks in document order. Empty text yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not positive, or `overlap` is negative
            or not smaller than `chunk_size` (the window could not advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    print(f"Text split into {len(chunks)} chunks.")
    return chunks

def parse_json_from_response(response_text: str) -> dict | None:
    """Safely extracts a JSON object from a string, even with surrounding text."""
    # Use regex to find the JSON block, which handles leading/trailing text
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not json_match:
        print("Parser Error: No JSON object found in the response.")
        return None
    try:
        return json.loads(json_match.group(0))
    except json.JSONDecodeError:
        print("Parser Error: Failed to decode JSON from the extracted string.")
        return None

In [12]:
# Cell 4: Core LLM Functions

from sentence_transformers import SentenceTransformer, util
import numpy as np




# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model once, at
# module level, so the functions defined below can share this single instance
# instead of reloading it on every call.
# The original author chose it as a small, fast, and effective model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_relevant_passages_semantic(concept: str, text_chunks: list[str], chunk_embeddings, top_k: int = 2) -> list[str]:
    """Return the `top_k` chunks most similar to `concept` by cosine similarity.

    The concept is embedded with the module-level `embedding_model` and scored
    against `chunk_embeddings`; chunks come back ordered from most to least
    similar.
    """
    query_vector = embedding_model.encode(concept, convert_to_tensor=True)

    # Row 0 holds the score of the single query against every chunk.
    similarities = util.cos_sim(query_vector, chunk_embeddings)[0]

    # Negating the scores makes the ascending argsort yield best matches first.
    best_first = np.argsort(-similarities.cpu())[:top_k]

    selected = []
    for position in best_first:
        selected.append(text_chunks[position])
    return selected

def extract_concepts_from_chunk(chunk: str, client) -> list[str]:
    """Use gpt-3.5-turbo to extract key concepts from a text chunk.

    Args:
        chunk: The text excerpt to analyse.
        client: Object exposing ``chat.completions.create`` (the OpenAI client).

    Returns:
        One concept per non-empty line of the reply, with bullet dashes and
        surrounding spaces removed. Lines that contain nothing but bullet
        characters are dropped. Returns an empty list if the API call fails
        (the error is printed).
    """
    prompt = f"""
    Given the following text excerpt, please extract the most critical main ideas and concepts.
    Respond with a simple bulleted list.

    Excerpt:
    \"\"\"
    {chunk}
    \"\"\"

    Concepts:
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        # Guard against a reply without text content.
        content = response.choices[0].message.content or ""
        concepts = []
        for line in content.strip().split('\n'):
            concept = line.strip().strip('- ').strip()
            # Filter AFTER cleaning so bare "-" / "---" lines do not become
            # empty-string concepts.
            if concept:
                concepts.append(concept)
        return concepts
    except Exception as e:
        print(f"API Error during concept extraction: {e}")
        return []

def synthesize_concepts(all_concepts: list[str], client) -> list[str]:
    """Merge and deduplicate a list of concepts using gpt-3.5-turbo.

    Args:
        all_concepts: Raw, possibly repetitive concepts.
        client: Object exposing ``chat.completions.create`` (the OpenAI client).

    Returns:
        The consolidated concepts, one per non-empty reply line with bullet
        dashes removed. An empty input returns an empty list without calling
        the API. If the API call fails, the error is printed and the input is
        returned with exact duplicates removed (order preserved), so work done
        by earlier stages is not lost.
    """
    if not all_concepts:
        # Nothing to merge; an empty prompt would only invite made-up concepts.
        return []

    concepts_str = "\n".join([f"- {c}" for c in all_concepts])
    prompt = f"""
                You are given a long, potentially repetitive list of concepts extracted from a document.
                Your task is to synthesize this into a final, consolidated list of unique concepts.
                Carefully merge concepts that refer to the same idea (e.g., "Linear Regression Model" and "Linear Regression Algorithm").
                Eliminate any concepts that are too generic or not technical.

                Extracted Concepts:
                \"\"\"
                {concepts_str}
                \"\"\"

                Return ONLY a single, clean, bulleted list of the final consolidated concepts.
            """

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        content = response.choices[0].message.content or ""
    except Exception as e:
        print(f"API Error during concept synthesis: {e}")
        # Best-effort fallback: drop exact duplicates, keep first-seen order.
        return list(dict.fromkeys(all_concepts))

    cleaned = (line.strip().strip('- ').strip() for line in content.strip().split('\n'))
    # Filter after cleaning so bare bullet lines do not yield empty concepts.
    return [concept for concept in cleaned if concept]

def retrieve_relevant_passages(concept: str, text_chunks: list[str], top_k: int = 2) -> list[str]:
    """Rank chunks by how many distinct concept words they contain.

    Matching is case-insensitive on whitespace-separated tokens. Chunks that
    share no word with the concept are excluded; ties keep document order.
    At most `top_k` chunks are returned, best match first.
    """
    query_terms = set(concept.lower().split())

    def shared_terms(passage: str) -> int:
        return len(query_terms & set(passage.lower().split()))

    scored = [(shared_terms(passage), passage) for passage in text_chunks]
    matching = [pair for pair in scored if pair[0] > 0]
    # sorted() is stable, so equal scores stay in their original order.
    ranked = sorted(matching, key=lambda pair: pair[0], reverse=True)
    return [passage for _, passage in ranked[:top_k]]

def generate_question_with_difficulty(concept: str, passages: list[str], difficulty: str, client) -> dict | None:
    """Generates a question with a specific difficulty level (CORRECTED VERSION)."""

    difficulty_instructions = {
        "easy": "The question should test basic recall or understanding of a key definition from the text (Bloom's Taxonomy: Remembering/Understanding).",
        "medium": "The question should require applying a concept to a new context or analyzing the relationship between ideas from the text (Bloom's Taxonomy: Applying/Analyzing).",
        "hard": "The question should require evaluating the strengths/weaknesses of an argument or synthesizing information from multiple passages to form a conclusion (Bloom's Taxonomy: Evaluating/Creating)."
    }
    
    instruction = difficulty_instructions.get(difficulty, difficulty_instructions['medium'])
    passages_str = "\\n\\n---\\n\\n".join(passages)
    
    prompt = f"""
            Act as an expert educator. Your task is to create one high-quality, multiple-choice question based on the provided main idea and source text.

            **Main Idea**: "{concept}"
            **Source Text**:
            \"\"\"
            {passages_str}
            \"\"\"

            **Difficulty Instruction**: {instruction}

            Follow these steps:
            1.  **Reasoning Step**: First, silently think step-by-step. What is the core conceptual knowledge being tested? What would be common misconceptions for the distractors? How can I phrase the question to test for deep understanding rather than simple recall?
            2.  **Generation Step**: Based on your reasoning, generate a single JSON object with the exact keys: "question", "choices", "correct_answer", "explanation".

            **CRITICAL INSTRUCTIONS**:
            -   The "choices" must be a list of 4 strings.
            -   The value for "correct_answer" MUST be the full text of the correct option, NOT the letter (e.g., "A" or "B").
            -   All choices, including distractors, must be plausible and directly related to the main idea.
            -   The explanation must be concise (2-3 sentences) and clearly state why the correct answer is right based on the source text.

            Your response MUST be ONLY the single JSON object.

            Example of a perfect response:
            {{
                "question": "What is the primary trade-off controlled by the hyperparameter C in a soft-margin SVM?",
                "choices": [
                    "Margin size vs. model complexity", 
                    "Margin size vs. classification accuracy on training data", 
                    "Kernel type vs. training speed", 
                    "Number of support vectors vs. feature dimensions"
                ],
                "correct_answer": "Margin size vs. classification accuracy on training data",
                "explanation": "The hyperparameter C controls the trade-off between maximizing the margin for better generalization and minimizing the misclassification of training examples for a better fit to the data."
            }}
            """

    
    try:
        # 1. Make the API call
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        
        # 2. DEFINE response_content from the result
        response_content = response.choices[0].message.content
        
        # 3. NOW use response_content to parse the JSON
        parsed_json = parse_json_from_response(response_content)
        
        if parsed_json:
            parsed_json['difficulty'] = difficulty
        
        return parsed_json

    except Exception as e:
        print(f"API call failed for concept '{concept}': {e}")
        return None

def score_question_quality(question_data: dict, client) -> int:
    """Use an AI judge to score the quality of a generated question (1-5).

    Args:
        question_data: Question dict; its 'question' and 'choices' entries are
            shown to the judge.
        client: Object exposing ``chat.completions.create`` (the OpenAI client).

    Returns:
        The first standalone digit from 1 to 5 found in the judge's reply, so
        replies such as "4", "Score: 4" or "4/5" all yield 4. Returns 0 when
        no such digit is present, when `question_data` lacks a required key,
        or when the API call fails (the reason is printed).
    """
    try:
        # Built inside the guard so a malformed question dict results in the
        # default low score instead of an uncaught KeyError.
        prompt = f"""
    Please evaluate the quality of the following multiple-choice question on a scale of 1 to 5,
    where 1 is poor and 5 is excellent. Consider its clarity, conceptual depth, and the plausibility of its distractors.

    Question: {question_data['question']}
    Choices: {question_data['choices']}

    Return ONLY a single integer score between 1 and 5.
    """
        response = client.chat.completions.create(
            model="gpt-3.5-turbo", # The cheap model is fine for this simple task
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        reply = response.choices[0].message.content or ""
        # Tolerate extra wording around the number; reject anything outside 1-5.
        match = re.search(r"\b([1-5])\b", reply)
        if match is None:
            raise ValueError(f"no score between 1 and 5 found in reply: {reply!r}")
        return int(match.group(1))
    except Exception as e:
        print(f"  -> Quality scoring failed: {e}")
        return 0 # Return a default low score on failure

def pre_filter_question(question_data: dict) -> bool:
    """Perform simple, non-LLM sanity checks on a generated question.

    A question passes when all of the following hold:
      * `question_data` is a non-empty dict;
      * "choices" is a non-empty list, or a dict whose values are the choices;
      * every choice has at least two words;
      * "explanation" has at least five words;
      * "question" and "correct_answer" are present and non-empty;
      * the question text does not contain the word "placeholder".

    Args:
        question_data: Parsed question dict, or None when parsing failed.

    Returns:
        True if every check passes, otherwise False (the reason is printed for
        the content checks).
    """
    if not question_data or not isinstance(question_data, dict):
        return False

    # "choices" may arrive as a list of strings or as a letter -> text dict.
    choices_data = question_data.get("choices", [])
    if isinstance(choices_data, dict):
        actual_choices = list(choices_data.values())
    elif isinstance(choices_data, list):
        actual_choices = choices_data
    else:
        actual_choices = []

    if not actual_choices:
        return False

    if any(len(str(c).split()) < 2 for c in actual_choices):
        print("  -> Pre-filter fail: A choice was too short.")
        return False

    # Coerce to str so a null or non-string value fails the check instead of
    # raising AttributeError.
    explanation = str(question_data.get("explanation") or "")
    if len(explanation.split()) < 5:
        print("  -> Pre-filter fail: Explanation was too short.")
        return False

    # Later pipeline stages index these keys directly, so reject early.
    question_text = str(question_data.get("question") or "")
    if not question_text.strip() or not question_data.get("correct_answer"):
        print("  -> Pre-filter fail: Missing question or correct answer.")
        return False

    if "placeholder" in question_text.lower():
        print("  -> Pre-filter fail: Question contained placeholder text.")
        return False

    return True

def extract_concepts_from_batch(combined_prompt: str, client) -> list[str]:
    """Send a batch of excerpts in one prompt and return a flat concept list.

    Args:
        combined_prompt: Instructions followed by the excerpts of the batch.
        client: Object exposing ``chat.completions.create`` (the OpenAI client).

    Returns:
        The text of every reply line that starts with a single '-' bullet,
        in order, across all excerpts. Lines starting with two or more dashes
        (echoed headings such as "--- Excerpt 1 ---" and horizontal rules) are
        ignored. Returns an empty list if the API call fails (the error is
        printed).
    """
    full_prompt = combined_prompt + "\n\nReturn a bulleted list of concepts for each excerpt under its corresponding heading."

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": full_prompt}],
            temperature=0.0
        )

        content = response.choices[0].message.content or ""

        extracted_concepts = []
        for line in content.strip().split('\n'):
            cleaned_line = line.strip()
            # Headings like "--- Excerpt 1 ---" also start with '-', but they
            # are section markers, not bullets.
            if cleaned_line.startswith('--'):
                continue
            if cleaned_line.startswith('-'):
                concept = cleaned_line.lstrip('- ').strip()
                if concept:
                    extracted_concepts.append(concept)

        return extracted_concepts

    except Exception as e:
        print(f"API Error during batch concept extraction: {e}")
        return []


def verify_question_semantic(question_data: dict, embedding_model, threshold=0.40) -> bool:
    """Check that the correct answer is semantically close to its source text.

    The answer text and the space-joined 'source_passages' are embedded
    separately and compared by cosine similarity.

    Args:
        question_data: Question dict with 'correct_answer', 'choices' and
            'source_passages' entries.
        embedding_model: Object whose ``encode(text, convert_to_tensor=True)``
            returns an embedding accepted by ``util.cos_sim``.
        threshold: Minimum similarity required to pass.

    Returns:
        True if the similarity is at least `threshold`, otherwise False.
    """
    # Coerce to str so a numeric or otherwise non-string answer cannot raise
    # on the .upper() calls below.
    answer_text = str(question_data['correct_answer'])
    choices = question_data['choices']

    # The model sometimes answers with a letter instead of the option text;
    # resolve the letter to the option it refers to.
    if isinstance(choices, dict) and answer_text.upper() in choices:
        answer_text = str(choices[answer_text.upper()])
    elif isinstance(choices, list) and answer_text.upper() in ["A", "B", "C", "D"]:
        # Simple mapping for A=0, B=1, etc.
        idx = ord(answer_text.upper()) - ord('A')
        if idx < len(choices):
            answer_text = str(choices[idx])

    source_context = " ".join(question_data['source_passages'])
    # NOTE(review): the embedding model presumably truncates long inputs, so
    # only the start of the joined passages may influence this embedding -
    # TODO confirm against the model's maximum sequence length.

    answer_embedding = embedding_model.encode(answer_text, convert_to_tensor=True)
    context_embedding = embedding_model.encode(source_context, convert_to_tensor=True)

    # Convert to a plain float so the function returns a real bool, as
    # annotated, rather than a one-element tensor.
    similarity_score = float(util.cos_sim(answer_embedding, context_embedding)[0][0])

    print(f"  -> Verification Score: {similarity_score:.4f} (Threshold: {threshold})")

    return similarity_score >= threshold



In [None]:
# Cell 5: Run the Full Pipeline (with Caching and Bug Fixes)
import pickle

def run_full_pipeline(text_chunks, chunk_embeddings, client, desired_difficulties=("easy", "medium", "hard")):
    """Run concept extraction and question generation over a chunked document.

    Stage 1 extracts concepts from the chunks in batches and consolidates
    them. Stage 2 walks the consolidated concepts, cycling through
    `desired_difficulties`, retrieves passages for each concept, generates a
    question, and keeps it only if it passes the pre-filter, scores at least 3
    from the quality judge, and passes semantic verification.

    Args:
        text_chunks: The document's text chunks.
        chunk_embeddings: Embeddings of `text_chunks`, passed through to the
            semantic retriever.
        client: OpenAI client used for all LLM calls.
        desired_difficulties: Non-empty sequence of difficulty labels assigned
            to concepts round-robin.

    Returns:
        The list of kept question dicts, each carrying 'difficulty',
        'quality_score' and 'source_passages' in addition to the generated
        fields.

    Raises:
        ValueError: If `desired_difficulties` is empty.
    """
    # Fail before any API call is made rather than with a ZeroDivisionError
    # after Stage 1 has already been paid for.
    if not desired_difficulties:
        raise ValueError("desired_difficulties must contain at least one difficulty level.")

    # Stage 1: Extract and Synthesize Concepts
    print("\n--- Starting Stage 1: Concept Extraction (in Batches) ---")
    all_chunk_concepts = []
    batch_size = 5 # Process 5 chunks at a time

    # The instructions are identical for every batch, so build them once.
    prompt_header = """For each document excerpt below, act as a subject matter expert and extract the key concepts.
                        A 'concept' should be a core technical term, algorithm, or principle that is essential for understanding the text.
                        Ignore introductory phrases, examples, or simple definitions. Focus on the main nouns or noun phrases.

                        """

    for i in range(0, len(text_chunks), batch_size):
        batch_chunks = text_chunks[i:i + batch_size]

        combined_prompt = prompt_header
        for j, chunk in enumerate(batch_chunks):
            combined_prompt += f"--- Excerpt {j+1} ---\n{chunk}\n\n"

        print(f"Extracting concepts from chunks {i+1}-{min(i+batch_size, len(text_chunks))}...")
        all_chunk_concepts.extend(extract_concepts_from_batch(combined_prompt, client))

    print(f"\nExtracted {len(all_chunk_concepts)} raw concepts. Now synthesizing...")
    final_concepts = synthesize_concepts(all_chunk_concepts, client)
    print(f"Synthesized down to {len(final_concepts)} final concepts.")

    # Stage 2: Retrieve and Generate Questions
    print("\n--- Starting Stage 2: Question Generation ---")
    generated_questions = []

    for i, concept in enumerate(final_concepts):
        # Cycle through the requested difficulties in order.
        current_difficulty = desired_difficulties[i % len(desired_difficulties)]

        print(f"Processing concept {i+1}/{len(final_concepts)} (Difficulty: {current_difficulty}): '{concept[:50]}...'")
        passages = retrieve_relevant_passages_semantic(concept, text_chunks, chunk_embeddings)

        if passages:
            question_data = generate_question_with_difficulty(concept, passages, current_difficulty, client)

            if pre_filter_question(question_data):
                quality_score = score_question_quality(question_data, client)
                question_data['quality_score'] = quality_score

                if quality_score >= 3:
                    # Verification reads the passages from the question dict.
                    question_data['source_passages'] = passages
                    is_verified = verify_question_semantic(question_data, embedding_model)
                    if is_verified:
                        generated_questions.append(question_data)
                        print(f"  -> ✅ Verified and Kept question with score: {quality_score}")
                    else:
                        print("  -> ❌ Discarded question that failed verification.")
                else:
                    print(f"  -> ❌ Discarded question with low quality score: {quality_score}")
            else:
                print("  -> ❌ Discarded question that failed pre-filter.")
        else:
            print("  -> No relevant passages found, skipping.")

    return generated_questions

# --- EXECUTE ---
if os.path.exists(PDF_FILE_PATH):
    # Cache the chunks and their embeddings next to the notebook so re-runs
    # skip PDF parsing and embedding.
    # NOTE: the cache is keyed only by the PDF's base name; delete the .pkl if
    # the PDF content or the chunking parameters change.
    cache_filename = os.path.basename(PDF_FILE_PATH) + ".pkl"

    if os.path.exists(cache_filename):
        print(f"Loading text chunks and embeddings from cache file: {cache_filename}")
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files that this notebook wrote itself.
        with open(cache_filename, "rb") as f:
            text_chunks, chunk_embeddings = pickle.load(f)
    else:
        print("No cache found. Processing PDF from scratch...")
        document_text = extract_text_from_pdf(PDF_FILE_PATH)
        text_chunks = chunk_text(document_text)
        if text_chunks:
            print("Creating embeddings for document chunks...")
            chunk_embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
            print("Embeddings created.")

            print(f"Saving chunks and embeddings to cache file: {cache_filename}")
            with open(cache_filename, "wb") as f:
                pickle.dump((text_chunks, chunk_embeddings), f)
        else:
            # Extraction failed or the PDF has no text: do not embed or cache
            # an empty result.
            chunk_embeddings = None

    if text_chunks:
        # Now, run the pipeline with the processed data
        final_quiz = run_full_pipeline(text_chunks, chunk_embeddings, client)

        # Save the final list of questions to a JSON file
        with open(OUTPUT_JSON_PATH, 'w') as f:
            json.dump(final_quiz, f, indent=4)

        print(f"\n✅ Pipeline complete! Generated {len(final_quiz)} questions.")
        print(f"Output saved to {OUTPUT_JSON_PATH}")
    else:
        print(f"❌ Error: No text could be extracted from '{PDF_FILE_PATH}'. Nothing to process.")
else:
    print(f"❌ Error: The file '{PDF_FILE_PATH}' was not found. Please update the path in Cell 2.")

Loading text chunks and embeddings from cache file: The 100 Page Machine Learning Book Part2.pdf.pkl

--- Starting Stage 1: Concept Extraction (in Batches) ---
Extracting concepts from chunks 1-4...

Extracted 76 raw concepts. Now synthesizing...
Synthesized down to 53 final concepts.

--- Starting Stage 2: Question Generation ---
Processing concept 1/53 (Difficulty: easy): 'Derivative...'
  -> Verification Score: 0.5359 (Threshold: 0.4)
  -> ✅ Verified and Kept question with score: 4
Processing concept 2/53 (Difficulty: medium): 'Gradient...'
  -> Verification Score: 0.2772 (Threshold: 0.4)
  -> ❌ Discarded question that failed verification.
Processing concept 3/53 (Difficulty: hard): 'Function...'
  -> Verification Score: -0.0126 (Threshold: 0.4)
  -> ❌ Discarded question that failed verification.
Processing concept 4/53 (Difficulty: easy): 'Constant value...'
  -> Pre-filter fail: A choice was too short.
  -> ❌ Discarded question that failed pre-filter.
Processing concept 5/53 (Diff