In [11]:
import os
import time
import json
import re
import ast
from typing import List, Optional, Dict, Union

from dotenv import load_dotenv

from pydantic import BaseModel, Field, conlist, validator
from typing import Dict, List
from enum import Enum
from langchain.output_parsers import PydanticOutputParser

from langchain_core.runnables import RunnableLambda
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.documents import Document

from langchain.schema import Document as SchemaDocument  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

from langchain_google_genai import ChatGoogleGenerativeAI



# models

In [4]:
# Load the .env file and read the API keys from the environment
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# NOTE(review): OPENAI_API_KEY is read here but no other visible cell references it.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast: both Gemini clients below are constructed with GOOGLE_API_KEY
if not GOOGLE_API_KEY:
    raise ValueError("❌ GOOGLE_API_KEY not found in .env")

# Initialize Gemini model for generation (first stage of the pipeline)
mcq_generator = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-05-20",   # gemini-2.5-flash-preview-05-20
    google_api_key=GOOGLE_API_KEY
)

# Initialize Gemini model for evaluation (second stage: filters the generated MCQs)
mcq_evaluator = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17-thinking",  # gemini-2.5-flash-preview-04-17-thinking
    google_api_key=GOOGLE_API_KEY
)


In [5]:
# Smoke test: send one trivial prompt to each model and print the raw response objects.
print(mcq_generator.invoke("Hello, how are you?"))
print(mcq_evaluator.invoke("Hello, how are you?"))

content='Hello! As an AI, I don\'t have feelings or a physical body, so I don\'t get "how are you" in the human sense. But I\'m ready and functioning perfectly!\n\nHow are *you* doing today? And what can I help you with?' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run--95b302e8-eeeb-4d94-98e8-db53906cfcf4-0' usage_metadata={'input_tokens': 7, 'output_tokens': 59, 'total_tokens': 102, 'input_token_details': {'cache_read': 0}}
content='Hello!\n\nAs an AI, I don\'t have feelings or a physical state like a human does, so I don\'t experience being "well" or "unwell."\n\nHowever, I\'m here and ready to assist you! How can I help you today?' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run--08f2cce5-654c-4e51-88b1-54232fc8b253-0' usage_metadata={'input_tokens': 7, 'output_tokens

# final function 

In [23]:
# Prompt for MCQ generation.
# Literal JSON braces are doubled ({{ }}) so that only {content} is treated as a
# template variable. The output contract is a single JSON object with "status"
# and "mcqs" keys, and the model is told explicitly to emit nothing else.
mcq_prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You have been given some content. Your task is to generate multiple choice questions (MCQs) that assess deep understanding of the core concepts, reasoning, and applications within that content.

Instructions:

- Use only the information presented in the given content.
- Create questions that require comprehension, interpretation, or application of the main ideas, principles, or reasoning shown in the content.
- Focus your questions on the underlying concepts, methods, or logic discussed, rather than on surface details or specific examples.
- When examples are present, use them as inspiration to create new, similar scenarios or to ask about the general principle they illustrate, rather than about the specific details or outputs of those examples.
- Encourage the learner to think about why or how a concept works, or how it can be applied, rather than recalling isolated facts, names, or figures.
- Ensure that all key ideas are covered. Each question should test meaningful understanding or the ability to apply what was conveyed.
- Make each question unique, covering a distinct aspect of the content.
- Generate as many MCQs as possible from all the key topics in the content.

Avoid:

- Do NOT ask questions that test memory of specific figures, dates, percentages, or logistical details.
- Avoid questions about:
  - Administrative details, deadlines, or metadata
  - Author names, publication info, or references
  - Off-topic content not central to the main ideas

For each MCQ:
  1. Write a clear and concise question.
  2. Provide 4 plausible options labeled A, B, C, and D.
  3. Make the incorrect options plausible but clearly wrong.
  4. Ensure only one option is correct.
  5. Assign a difficulty level: "easy", "medium", or "hard".
  6. Add a topic that best categorizes the question.
  7. Provide an explanation for each option (why it is correct or incorrect).

Maintain this difficulty distribution:
  - 20-30% Hard
  - 30-40% Medium
  - 30-50% Easy

Return your answer as ONE valid JSON object in exactly the following format (all MCQs go inside the "mcqs" array):
  {{
    "status": "success",
    "mcqs": [
              {{
              "question": "Sample question?",
              "options": {{
              "A": "Option A text",
              "B": "Option B text",
              "C": "Option C text",
              "D": "Option D text"
              }},
              "answer": "C",
              "topic": "Relevant topic here",
              "difficulty": "easy",
              "explanation": {{
              "A": "Option A is incorrect because...",
              "B": "Option B is incorrect because...",
              "C": "Option C is correct because...",
              "D": "Option D is incorrect because..."
              }}
              }}
            ]
  }}

- If content has no generatable MCQs then return :
  {{
    "status": "no_valid_mcqs",
    "mcqs": []
  }}

IMPORTANT: Your output must be ONLY the JSON object described above. Do not add introductory text, explanations, or markdown code fences. If you cannot generate MCQs, return the "no_valid_mcqs" JSON object instead of explaining why in prose.

Content:
{content}
"""
)


# Prompt for the review stage: filters the generated MCQs against the source chunk.
# Literal JSON braces are doubled ({{ }}) so that only {chunk} and {mcqs} are
# treated as template variables. The output contract is a single JSON object with
# "status" and "mcqs" keys, matching the format shown at the end of the prompt.
mcq_evaluation_prompt = PromptTemplate(
    input_variables=["chunk", "mcqs"],
    template="""
You are an expert educational content validator. Your role is to rigorously assess and filter a given list of **multiple-choice questions (MCQs)** against a provided **source content chunk**.

Your ONLY task is to return a JSON object whose `mcqs` array contains **ONLY high-quality MCQs** that strictly adhere to ALL the evaluation criteria. If an MCQ violates even a single rule, it MUST be excluded. You are NOT allowed to modify, correct, or infer anything about the MCQs.

---

### FUNDAMENTAL PRINCIPLE FOR ACCEPTANCE:
**Every accepted MCQ must test the learner's understanding, application, or reasoning of the CORE ACADEMIC CONCEPTS, principles, theories, or methods explicitly discussed in the `chunk`. Questions about the *delivery, administration, or structure* of the course itself are strictly forbidden.**

---

### EVALUATION CRITERIA (All Must Be Satisfied for an MCQ to be Accepted):

1.  **Direct Content Basis:** The question and its correct answer MUST be directly derivable and supported by the explicit information within the provided `chunk` of content. No outside knowledge or interpretation allowed.

2.  **Plausible Distractors:** All incorrect options (distractors) must be genuinely plausible and relevant to the subject matter, but unequivocally wrong based on the `chunk`. They should not be obviously incorrect or irrelevant.

3.  **Unambiguous Correct Answer:** There must be *exactly one* correct answer, and it must be unequivocally correct based on the `chunk`.

4.  **Accurate & Specific Topic:** The `topic` field must accurately and precisely reflect the core academic subject or concept the question is testing, as presented in the `chunk`. It should *never* refer to course logistics or structure.

5.  **Comprehensive Explanations:** Every option (A, B, C, D) must have a concise and accurate `explanation` detailing *why* it is correct or incorrect, specifically referencing the content in the `chunk`.

6.  **No Redundancy/Duplicates:** The MCQ must be unique. It should not repeat concepts, question phrasing, or the underlying reasoning tested by any other MCQ in the list. Avoid questions that are mere rephrasing of another.

7.  **Depth of Understanding (Conceptual Focus):** The question must assess understanding, reasoning, analysis, or application of core academic concepts. **STRICTLY REJECT** questions that primarily test:
    * Memorization of isolated facts (e.g., exact numbers, dates, specific names, definitions without context, simple recall of lists).
    * Logistical details (e.g., deadlines, submission methods, formatting rules).
    * Administrative details (e.g., class policies, grading, attendance).

---

### CRITICAL RED FLAGS: IMMEDIATELY REJECT MCQs IF THEY PERTAIN TO ANY OF THE FOLLOWING:

* **Course Logistics/Administration:** Questions about how the course is run, organized, graded, or what students are "required" to do beyond learning the subject matter.
    * *Examples to reject:* "What is the policy for late submissions?", "How many assignments are there?", "What is the recommended study strategy for this course?", "What is the purpose of the final project *proposal*?", "Why are students required to scribe lectures?", "What is the pedagogical reason for X course policy?"

* **Pedagogical Intent/Teaching Methods:** Questions about *why* the instructor chose a certain teaching method, assignment type, or course structure.
    * *Examples to reject:* "What is the aim of using case studies in this module?", "What is the instructor's philosophy behind peer review?", "Why is there a page limit for P-sets?"

* **Meta-Cognition/Learning Process:** Questions asking about the *process of learning* the content within the course context.
    * *Examples to reject:* "What is the best way to prepare for the exam?", "What skills will this course help you develop?"

* **Subjective Opinions or Preferences:** Questions that can have multiple subjective answers or ask for opinions.

* **Referencing External Course Elements:** Questions that refer to "this video," "the lecture notes," "Module 3," "the textbook," etc., rather than directly extracting knowledge from the provided `chunk`.

---

### CONTENT CHUNK:
{chunk}

---

### GENERATED MCQs:
{mcqs}

---

### FINAL OUTPUT INSTRUCTIONS:

-   If one or more valid MCQs remain after filtering, return a **clean JSON object** in the format below whose `mcqs` array holds the valid MCQs only, each copied exactly as given. Do NOT include any markdown, commentary, or extra text.
  {{
    "status": "success",
    "mcqs": [
              {{
              "question": "Sample question?",
              "options": {{
              "A": "Option A text",
              "B": "Option B text",
              "C": "Option C text",
              "D": "Option D text"
              }},
              "answer": "C",
              "topic": "Relevant topic here",
              "difficulty": "easy",
              "explanation": {{
              "A": "Option A is incorrect because...",
              "B": "Option B is incorrect because...",
              "C": "Option C is correct because...",
              "D": "Option D is incorrect because..."
              }}
              }}
            ]
  }}

- If no MCQ remains after filtering then return :
  {{
    "status": "no_valid_mcqs",
    "mcqs": []
  }}


**IMPORTANT:** Your output must be ONLY valid JSON. No pre-text, no post-text, no explanations, no markdown fences.
"""
)



In [None]:

def is_valid_mcq_structure(data):
    """
    Tell whether `data` is a well-formed "success" payload.

    A payload is accepted when it is a dict whose "status" equals "success" and
    whose "mcqs" value is a list in which every entry is a dict that carries all
    six MCQ fields, with "options" and "explanation" themselves being dicts.
    An empty "mcqs" list is accepted.

    The first failed check prints a one-line reason and the function returns False.
    """
    mandatory_fields = ("question", "options", "answer", "topic", "difficulty", "explanation")

    def _reject(reason):
        # Report why the payload was refused, then signal failure to the caller.
        print(reason)
        return False

    if not isinstance(data, dict):
        return _reject("Top-level data is not a dict")

    if data.get("status") != "success":
        return _reject("Status is not 'success'")

    questions = data.get("mcqs")
    if not isinstance(questions, list):
        return _reject("'mcqs' field is not a list")

    for entry in questions:
        if not isinstance(entry, dict):
            return _reject("An MCQ is not a dict")
        if any(field not in entry for field in mandatory_fields):
            return _reject("Required keys missing in MCQ")
        # Both per-option mappings must be dicts (option label -> text)
        both_are_dicts = isinstance(entry["options"], dict) and isinstance(entry["explanation"], dict)
        if not both_are_dicts:
            return _reject("'options' or 'explanation' is not a dict")

    return True


def is_reviewer_rejection(data):
    """
    Return True when `data` is a dict whose "status" is "no_valid_mcqs".

    Any dict passed in has its "status" value printed as a trace line;
    non-dict inputs print nothing and yield False.
    """
    if not isinstance(data, dict):
        return False
    status = data.get("status")
    print(status)
    return status == "no_valid_mcqs"


def _decode_first_json_object(text):
    """
    Decode the first JSON value found at the start of `text`.

    `text` must begin at the opening brace. The text is first decoded exactly as
    received, so well-formed JSON is never altered by the repair regexes (the
    key-quoting regex would otherwise rewrite string values that contain
    ", word:"). Only when that fails are the repairs applied: strip markdown
    fences, drop trailing commas, remove control characters other than newline,
    and quote bare keys. Anything following the first complete JSON value is
    ignored in both passes.

    Raises:
        json.JSONDecodeError: if the text cannot be decoded even after repair.
    """
    try:
        value, _ = json.JSONDecoder().raw_decode(text)
        return value
    except json.JSONDecodeError:
        pass  # fall through to the repair pass

    repaired = re.sub(r"```(?:json)?", "", text).strip("` \n")
    repaired = re.sub(r",\s*([}\]])", r"\1", repaired)
    repaired = re.sub(r'[\x00-\x09\x0B-\x1F]', '', repaired)
    repaired = re.sub(r'(?<={|,)\s*(\w+)\s*:', r'"\1":', repaired)
    # strict=False tolerates the literal newlines kept inside string values
    value, _ = json.JSONDecoder(strict=False).raw_decode(repaired)
    return value


def safe_json_extract(output):
    """
    Turn a model response into MCQ data without ever raising.

    Parameters:
        output: an already-parsed list/dict, a string, or an object exposing a
            `.content` attribute that holds the response text.

    Returns:
        - list/dict input: the same object when it is a valid "success" payload
          or a "no_valid_mcqs" payload.
        - text input holding a valid "success" payload: the list under "mcqs".
        - text input holding a "no_valid_mcqs" payload: the parsed dict.
        - anything else: {"status": "error", "mcqs": []}, which callers treat
          as a signal to retry.

    NOTE(review): an already-parsed valid dict is returned whole, whereas valid
    text yields only its "mcqs" list; kept as-is for backward compatibility.

    Side effects:
        Prints progress/diagnostic lines. When parsing raises, the offending
        text is written to "malformed_output.json" (best effort).
    """
    print("parser is running")

    if isinstance(output, (list, dict)):
        if is_valid_mcq_structure(output) or is_reviewer_rejection(output):
            return output
        print("Returning error in status.")
        print("output was :",output)
        return {"status": "error", "mcqs": []}   # still allows retry

    text = output
    try:
        if hasattr(output, 'content'):
            text = output.content

        # Skip any preamble the model wrote before the JSON object
        first_brace_index = text.find('{')
        if first_brace_index == -1:
            print("⚠️ No opening '{' found in the output string. Cannot parse as JSON.")
            print("output was :",text)
            return {"status": "error", "mcqs": []}
        text = text[first_brace_index:]

        parsed = _decode_first_json_object(text)

        if is_valid_mcq_structure(parsed):
            return parsed["mcqs"]
        if is_reviewer_rejection(parsed):
            return parsed

        print("Returning statues as error and output was as bellow:")
        print(text)
        return {"status": "error", "mcqs": []}   # will trigger retry

    except Exception as e:
        # Keep the offending text for inspection. The dump is best effort:
        # a failure to write it must not replace the error result.
        try:
            with open("malformed_output.json", "w", encoding="utf-8") as f:
                f.write(str(text))
        except OSError as dump_error:
            print(f"⚠️ Could not write malformed_output.json: {dump_error}")
        print(f"⚠️ JSON parsing failed: {e}")
        return {"status": "error", "mcqs": []}



# Wrap safe_json_extract as a LangChain Runnable so it can be piped as the
# last stage of a chain (prompt | model | json_parser).
json_parser = RunnableLambda(safe_json_extract)


def load_and_split_file(file_path: Optional[str] = None,
                        transcript: Optional[str] = None,
                        chunk_size: int = 4000,
                        chunk_overlap: int = 200 ) -> List[Document]:
    """
    Build text chunks from a raw transcript string or from a .txt/.pdf file.

    A non-empty `transcript` takes precedence: when it is given, `file_path` is
    ignored. Otherwise the file is loaded, the contents of all loaded documents
    are joined with newlines into one text, and its character count is printed.
    The text is wrapped in a single Document and split with
    RecursiveCharacterTextSplitter.

    Parameters:
        file_path (str, optional): Path to a .txt or .pdf file.
        transcript (str, optional): Raw text to split directly.
        chunk_size (int): Chunk size passed to the splitter.
        chunk_overlap (int): Overlap passed to the splitter.

    Returns:
        List[Document]: The chunks; each carries metadata {"source": ...} set to
        "transcript_input" or to `file_path`.

    Raises:
        ValueError: if neither input is provided, or the file extension is not
        .txt or .pdf.
    """
    if not transcript and not file_path:
        raise ValueError("Either file_path or transcript must be provided.")

    if transcript:
        text, origin = transcript, "transcript_input"
    else:
        # Pick the loader class from the (lower-cased) file extension
        loader_by_suffix = {".txt": TextLoader, ".pdf": PyPDFLoader}
        suffix = os.path.splitext(file_path)[-1].lower()
        if suffix not in loader_by_suffix:
            raise ValueError("Unsupported file type. Only .txt and .pdf are supported.")

        pages = loader_by_suffix[suffix](file_path).load()

        # Merge every loaded document into one string before chunking
        text = "\n".join(page.page_content for page in pages)
        print("total character :",len(text))
        origin = file_path

    # Split one combined Document rather than each loaded document separately
    whole_document = Document(page_content=text, metadata={"source": origin})
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents([whole_document])


def generate_mcqs_from_file_with_eval(file_path=None, transcript=None, chunk_size=4000, chunk_overlap = 200, max_retries=3):
    """
    Generate and evaluate MCQs from either a file or a raw transcript using two LLM pipelines.

    For every chunk the generation chain is run until it yields MCQs; those MCQs
    are recorded once and then sent to the review chain. A failure in the review
    stage retries the review only, reusing the same generated MCQs, so the raw
    list never contains MCQs from discarded attempts. Generation and review
    share one retry budget of `max_retries` attempts per chunk.

    Parameters:
        file_path (str, optional): Path to input .txt or .pdf file.
        transcript (str, optional): Raw string containing the transcript text.
        chunk_size (int): Maximum chunk size to split the document.
        chunk_overlap (int): Overlap between consecutive chunks.
        max_retries (int): Maximum number of attempts per chunk.

    Returns:
        tuple: (all_mcqs, all_raw_mcqs), where all_mcqs is the list of MCQs the
        reviewer kept and all_raw_mcqs is the list of every MCQ the generator
        produced, both accumulated over all chunks.
    """

    chunks = load_and_split_file(file_path=file_path, transcript=transcript, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    generation_chain = mcq_prompt_template | mcq_generator | json_parser
    review_chain = mcq_evaluation_prompt | mcq_evaluator | json_parser

    all_mcqs = []
    all_raw_mcqs = []

    for i, chunk in enumerate(chunks):
        print(f"\n🔹 Processing chunk {i + 1}/{len(chunks)}...")
        content = chunk.page_content.strip()
        raw_mcqs = None  # filled once generation succeeds; reused by review retries
        attempt = 0

        while attempt < max_retries:
            try:
                print("Attempt : ",attempt + 1)

                # Step 1: Generate raw MCQs (skipped when an earlier attempt already did)
                if raw_mcqs is None:
                    generated = generation_chain.invoke({"content": content})
                    if not generated:
                        raise ValueError("⚠️ No MCQs generated")

                    if isinstance(generated, dict) and generated.get("status") == "error":
                        # Raise so parse failures share the backoff and skip logging below
                        raise ValueError("error in parser (generation output)")

                    if is_reviewer_rejection(generated):
                        print(f"🟡 Chunk {i+1}: chunk has no relevant MCQs (not retrying).")
                        break

                    print("raw mcqs : ",len(generated))
                    raw_mcqs = generated
                    all_raw_mcqs.extend(raw_mcqs)

                # Step 2: Evaluate and filter MCQs.
                # Serialize to real JSON: formatting the list directly would insert
                # its Python repr (single quotes) into the prompt.
                final_mcqs = review_chain.invoke({
                    "chunk": content,
                    "mcqs": json.dumps(raw_mcqs, ensure_ascii=False, indent=2)
                })

                if isinstance(final_mcqs, dict) and final_mcqs.get("status") == "error":
                    raise ValueError("error in parser (review output)")

                if is_reviewer_rejection(final_mcqs):
                    print(f"🟡 Chunk {i+1}: Reviewer rejected all MCQs (not retrying).")
                    break
                if not final_mcqs:
                    print(final_mcqs)
                    raise ValueError("❌ Review returned no valid MCQs")

                all_mcqs.extend(final_mcqs)
                print(f"✅ Chunk {i + 1}: {len(final_mcqs)} valid MCQs. Total so far: {len(all_mcqs)} and raw MCQs are {len(all_raw_mcqs)}.")
                break  # ✅ Exit retry loop

            except Exception as e:
                attempt += 1
                print(f"[Retry {attempt}/{max_retries}] Error in chunk {i + 1}: {str(e)}")
                if attempt < max_retries:
                    wait_time = 2 ** attempt  # Exponential backoff
                    print(f"⏳ Retrying after {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"🚫 Skipping chunk {i + 1} after {max_retries} failed attempts.")
                    break

        # Fixed pause between chunks (presumably to stay under API rate limits; confirm)
        time.sleep(3)
    return all_mcqs,all_raw_mcqs

def deeply_sorted(obj):
    """
    Return a copy of `obj` in which every dict has its keys in sorted order.

    Dicts and lists are rebuilt recursively; any other value (including tuples)
    is returned unchanged.
    """
    if isinstance(obj, list):
        return list(map(deeply_sorted, obj))
    if not isinstance(obj, dict):
        return obj

    ordered = {}
    for key in sorted(obj):
        ordered[key] = deeply_sorted(obj[key])
    return ordered

def get_rejected_mcqs_from_lists(initial_mcqs, evaluated_mcqs):
    """
    List the MCQs in `initial_mcqs` that have no exact match in `evaluated_mcqs`.

    Two MCQs match when their key-sorted compact JSON serializations are equal,
    so key order does not matter but any difference in content does. Each
    distinct unmatched MCQ is reported once, in order of first appearance.
    """
    def _fingerprint(mcq):
        # Canonical text form used for equality and de-duplication
        return json.dumps(deeply_sorted(mcq), separators=(",", ":"))

    accepted = set(map(_fingerprint, evaluated_mcqs))

    already_reported = set()
    rejected_mcqs = []
    for candidate in initial_mcqs:
        key = _fingerprint(candidate)
        if key in accepted or key in already_reported:
            continue
        rejected_mcqs.append(candidate)
        already_reported.add(key)

    return rejected_mcqs


def save_to_json(data, filename):
    """
    Write `data` to `filename` as UTF-8 JSON, indented by two spaces and with
    non-ASCII characters kept as-is.

    Parameters:
        data (list or dict): The data to be saved (e.g., list of MCQs).
        filename (str): The name of the output JSON file.

    Prints a confirmation on success. Any exception is caught and reported with
    a printed message instead of being raised; the function returns None either way.
    """
    try:
        with open(filename, mode="w", encoding="utf-8") as out_file:
            json.dump(data, out_file, ensure_ascii=False, indent=2)
        print(f"Saved to {filename}")
    except Exception as e:
        print("Error saving JSON:", e)


In [27]:
# Run the generate-and-review pipeline on the matrix-multiplication text and
# save the rejected, accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): the backslash-separated relative path only resolves on Windows,
# and only from the directory that contains "experiment".
path = r"experiment\Matrix muliplication\Multiply_Matrices.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
# "rejected" = generated MCQs with no exact match among the accepted ones
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "Multiply_Matrices_rejected_1_p2.json")
save_to_json(mcqs, "Multiply_Matrices_1_p2.json")
save_to_json(raw, "Multiply_Matrices_raw_1_p2.json")


total character : 5800

🔹 Processing chunk 1/2...
Attempt :  1
parser is running
raw mcqs :  6
parser is running
✅ Chunk 1: 6 valid MCQs. Total so far: 6 and raw MCQs are 6.

🔹 Processing chunk 2/2...
Attempt :  1
parser is running
raw mcqs :  5
parser is running
✅ Chunk 2: 5 valid MCQs. Total so far: 11 and raw MCQs are 11.
Saved to Multiply_Matrices_rejected_1_p2.json
Saved to Multiply_Matrices_1_p2.json
Saved to Multiply_Matrices_raw_1_p2.json


In [28]:
# Run the pipeline on the Advanced Algorithms text and save the accepted,
# raw, and rejected MCQ sets to separate JSON files.
path = r"experiment\Advanced Algorithms\Advanced_Algorithms.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
# "rejected" = generated MCQs with no exact match among the accepted ones
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(mcqs, "Advanced_Algorithms_1_p2.json")
save_to_json(raw, "Advanced_Algorithms_raw_1_p2.json")
save_to_json(rejected, "Advanced_Algorithms_rejected_1_p2.json")

total character : 44738

🔹 Processing chunk 1/12...
Attempt :  1
parser is running
raw mcqs :  7
parser is running
Status is not 'success'
no_valid_mcqs
no_valid_mcqs
🟡 Chunk 1: Reviewer rejected all MCQs (not retrying).

🔹 Processing chunk 2/12...
Attempt :  1
parser is running
raw mcqs :  7
parser is running
✅ Chunk 2: 6 valid MCQs. Total so far: 6 and raw MCQs are 14.

🔹 Processing chunk 3/12...
Attempt :  1
parser is running
raw mcqs :  11
parser is running
✅ Chunk 3: 11 valid MCQs. Total so far: 17 and raw MCQs are 25.

🔹 Processing chunk 4/12...
Attempt :  1
parser is running
raw mcqs :  8
parser is running
✅ Chunk 4: 8 valid MCQs. Total so far: 25 and raw MCQs are 33.

🔹 Processing chunk 5/12...
Attempt :  1
parser is running
raw mcqs :  10
parser is running
✅ Chunk 5: 9 valid MCQs. Total so far: 34 and raw MCQs are 43.

🔹 Processing chunk 6/12...
Attempt :  1
parser is running
raw mcqs :  13
parser is running
✅ Chunk 6: 12 valid MCQs. Total so far: 46 and raw MCQs are 56.

🔹 Pr

In [None]:
# NOTE(review): `path` is not assigned in this cell; it reuses whatever value an
# earlier cell left in the kernel, so the result depends on execution order.
# Only the accepted MCQs are saved here.
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
save_to_json(mcqs, "IP_protection_policy_1_p2.json")

total character : 9752

🔹 Processing chunk 1/3...
parser is running
raw mcqs :  10
parser is running
✅ Chunk 1: 10 valid MCQs. Total so far: 10 and raw MCQs are 10.
Saved to IP_protection_policy_1_p2.json


In [None]:
# Run the pipeline on the IP protection policy PDF and save the rejected,
# accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): writes "IP_protection_policy_1_p2.json", the same file name the
# preceding cell writes, so whichever cell runs last wins.
path = r"experiment\IP_Protection_Policy\IP_Protection_Policy.pdf"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "IP_protection_policy_rejected_1_p2.json")
save_to_json(mcqs, "IP_protection_policy_1_p2.json")
save_to_json(raw, "IP_protection_policy_raw_1_p2.json")

total character : 9752

🔹 Processing chunk 1/3...
parser is running
raw mcqs :  12
parser is running
✅ Chunk 1: 12 valid MCQs. Total so far: 12 and raw MCQs are 12.

🔹 Processing chunk 2/3...
parser is running
raw mcqs :  12
parser is running
✅ Chunk 2: 12 valid MCQs. Total so far: 24 and raw MCQs are 24.

🔹 Processing chunk 3/3...
parser is running
raw mcqs :  12
parser is running
✅ Chunk 3: 12 valid MCQs. Total so far: 36 and raw MCQs are 36.
Saved to IP_protection_policy_rejected_1_p2.json
Saved to IP_protection_policy_1_p2.json
Saved to IP_protection_policy_raw_1_p2.json


In [None]:
# Inspect chunking only (no LLM calls): split the leave-policy PDF with the
# default chunk settings and show the character length of the first chunk.
# NOTE(review): absolute, machine-specific path; this cell will not run elsewhere.
path = r"C:\Users\admin\OneDrive\Desktop\coriolis\VS CODE\experiment\employee-leave-policy (1).pdf"
chunks = load_and_split_file(file_path = path )
len(chunks[0].page_content)

total character : 13303


161

In [None]:
# Run the pipeline on the employee leave policy PDF and save the rejected,
# accepted, and raw MCQ sets to separate JSON files.
path = r"experiment\employee-leave-policy\employee-leave-policy.pdf"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "leave_policy_3_rejected_1_p2.json")
save_to_json(mcqs, "leave_policy_3_1_p2.json")
save_to_json(raw, "leave_policy_3_raw_1_p2.json")

total character : 13303

🔹 Processing chunk 1/6...
parser is running
🟡 Chunk 1: chunk has no relevant MCQs (not retrying).

🔹 Processing chunk 2/6...
parser is running
 Raw response : The provided content is a table of contents, listing only headings and sub-headings for different types of leave. It does not contain any substantive information, definitions, principles, processes, or values for eligibility, entitlement, or any other aspect of the leave types.

To generate MCQs that assess "deep understanding of the core concepts, reasoning, and applications," as per the instructions, there must be actual content describing these concepts, principles, or their application. Since the content is purely structural and devoid of any explanatory text, it is not possible to create questions that go beyond simple recall of the document's structure (e.g., "Which section is listed under Earned Leave?"), which would violate the instruction to "avoid questions that test memory of specific figures, 

In [None]:
# Count the rejected MCQs of the most recent run.
# NOTE(review): `raw` and `mcqs` come from whichever pipeline cell ran last in
# the kernel, so this number depends on execution order.
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
len(rejected)


3

In [None]:
# Run the pipeline on the heaps lecture transcript and save the rejected,
# accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): absolute, machine-specific path; this cell will not run elsewhere.
path = r"C:\Users\admin\OneDrive\Desktop\coriolis\VS CODE\experiment\madhvan_heaps_big_trnscpt.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected,"madhvan_heaps_rejected.json")
save_to_json(mcqs,"madhvan_heaps.json")
save_to_json(raw,"madhvan_heaps_raw.json")


total character : 39474

🔹 Processing chunk 1/11...
parser is running
raw mcqs :  11
parser is running
✅ Chunk 1: 11 valid MCQs. Total so far: 11 and raw MCQs are 11.

🔹 Processing chunk 2/11...
parser is running
raw mcqs :  7
parser is running
✅ Chunk 2: 7 valid MCQs. Total so far: 18 and raw MCQs are 18.

🔹 Processing chunk 3/11...
parser is running
raw mcqs :  10
parser is running
✅ Chunk 3: 10 valid MCQs. Total so far: 28 and raw MCQs are 28.

🔹 Processing chunk 4/11...
parser is running
raw mcqs :  8
parser is running
✅ Chunk 4: 8 valid MCQs. Total so far: 36 and raw MCQs are 36.

🔹 Processing chunk 5/11...
parser is running
raw mcqs :  10
parser is running
✅ Chunk 5: 8 valid MCQs. Total so far: 44 and raw MCQs are 46.

🔹 Processing chunk 6/11...
parser is running
raw mcqs :  6
parser is running
✅ Chunk 6: 6 valid MCQs. Total so far: 50 and raw MCQs are 52.

🔹 Processing chunk 7/11...
parser is running
raw mcqs :  9
parser is running
✅ Chunk 7: 8 valid MCQs. Total so far: 58 and r

In [None]:
# Run the pipeline on the Statistics 110 lecture 3 transcript and save the
# rejected, accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): absolute, machine-specific path; this cell will not run elsewhere.
path = r"C:\Users\admin\OneDrive\Desktop\coriolis\VS CODE\experiment\Lecture 3_ Birthday Problem, Properties of Probability _ Statistics 110.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "Birthday_Statistics_110_rejected.json" )
save_to_json(mcqs, "Birthday_Statistics_110.json" )
save_to_json(raw, "Birthday_Statistics_110_raw.json" )

total character : 41771

🔹 Processing chunk 1/11...
parser is running
raw mcqs :  7
parser is running
✅ Chunk 1: 7 valid MCQs. Total so far: 7 and raw MCQs are 7.

🔹 Processing chunk 2/11...
parser is running
raw mcqs :  7
parser is running
✅ Chunk 2: 7 valid MCQs. Total so far: 14 and raw MCQs are 14.

🔹 Processing chunk 3/11...
parser is running
raw mcqs :  7
parser is running
✅ Chunk 3: 7 valid MCQs. Total so far: 21 and raw MCQs are 21.

🔹 Processing chunk 4/11...
parser is running
raw mcqs :  8
parser is running
✅ Chunk 4: 8 valid MCQs. Total so far: 29 and raw MCQs are 29.

🔹 Processing chunk 5/11...
parser is running
raw mcqs :  8
parser is running
✅ Chunk 5: 8 valid MCQs. Total so far: 37 and raw MCQs are 37.

🔹 Processing chunk 6/11...
parser is running
raw mcqs :  9
parser is running
✅ Chunk 6: 9 valid MCQs. Total so far: 46 and raw MCQs are 46.

🔹 Processing chunk 7/11...
parser is running
raw mcqs :  7
parser is running
✅ Chunk 7: 6 valid MCQs. Total so far: 52 and raw MCQs

In [None]:
# Run the pipeline on the loop lecture 3 transcript and save the rejected,
# accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): absolute, machine-specific path; this cell will not run elsewhere.
path = r"C:\Users\admin\OneDrive\Desktop\coriolis\VS CODE\experiment\loop_lecture_3.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "loop_lecture_3_rejected.json" )
save_to_json(mcqs, "loop_lecture_3.json" )
save_to_json(raw, "loop_lecture_3_raw.json" )

total character : 11297

🔹 Processing chunk 1/3...
parser is running
raw mcqs :  11
parser is running
✅ Chunk 1: 11 valid MCQs. Total so far: 11 and raw MCQs are 11.

🔹 Processing chunk 2/3...
parser is running
raw mcqs :  6
parser is running
✅ Chunk 2: 6 valid MCQs. Total so far: 17 and raw MCQs are 17.

🔹 Processing chunk 3/3...
parser is running
raw mcqs :  9
parser is running
✅ Chunk 3: 9 valid MCQs. Total so far: 26 and raw MCQs are 26.
Saved to loop_lecture_3_rejected.json
Saved to loop_lecture_3.json
Saved to loop_lecture_3_raw.json


In [None]:
# Inspect chunking only (no LLM calls): split the Random Forest transcript with
# the default chunk settings and print the text of the first chunk.
path = r"experiment\Random Forest\Random_Forest.txt"
chunks = load_and_split_file(file_path = path )
print(chunks[0].page_content)

total character : 5649
Hello, people from the future welcome to Normalized Nerd! Today we’ll set up our camp in the Random Forest. First, we’ll see why the random forest is better than our good old decision trees, and then I’ll explain how it works with visualizations. If you wanna see more videos like this, please subscribe to my channel and hit the bell icon because I make videos about machine learning and data science regularly. So without further ado let’s get started. To begin our journey, we need a dataset. Here I’m taking a small dataset with only 6 instances and 5 features.
          
          As you can see the target variable y takes 2 values 0 and 1 hence it’s a binary classification problem. First of all, we need to understand why do we even need the random forest when we already have decision trees. Let’s draw the decision tree for this dataset. Now if you don’t know what a decision tree really is or how it is trained then I’d highly recommend you to watch my previous vid

In [None]:
# Scratch run on the Random Forest transcript: only the accepted MCQs are
# saved, to a throwaway file ("temp.json").
path = r"experiment\Random Forest\Random_Forest.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
save_to_json(mcqs, "temp.json" )

total character : 5649

🔹 Processing chunk 1/2...
parser is running
raw mcqs :  10
parser is running
✅ Chunk 1: 10 valid MCQs. Total so far: 10 and raw MCQs are 10.
Saved to temp.json


In [None]:
# Run the pipeline on the Random Forest transcript and save the rejected,
# accepted, and raw MCQ sets to separate JSON files.
# NOTE(review): absolute, machine-specific path, and a different location from
# the relative "experiment\Random Forest\" path the two preceding cells use.
path = r"C:\Users\admin\OneDrive\Desktop\coriolis\VS CODE\experiment\Random_Forest.txt"
mcqs,raw = generate_mcqs_from_file_with_eval(file_path = path,chunk_size=4000)
rejected = get_rejected_mcqs_from_lists(raw,mcqs)
save_to_json(rejected, "Random_Forest_rejected.json" )
save_to_json(mcqs, "Random_Forest.json" )
save_to_json(raw, "Random_Forest_raw.json" )

total character : 5649

🔹 Processing chunk 1/2...
parser is running
raw mcqs :  9
parser is running
✅ Chunk 1: 8 valid MCQs. Total so far: 8 and raw MCQs are 9.

🔹 Processing chunk 2/2...
parser is running
raw mcqs :  8
parser is running
✅ Chunk 2: 4 valid MCQs. Total so far: 12 and raw MCQs are 17.
Saved to Random_Forest_rejected.json
Saved to Random_Forest.json
Saved to Random_Forest_raw.json
