<a href="https://colab.research.google.com/github/keduog/LLM/blob/main/all_biology_Exam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in this cell. The two reads below expect these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Build one difficulty-rating prompt covering every MCQ row of ``df``.

    Questions are numbered ``Q<index label + 1>`` and followed by their four
    options (columns ``option_1`` .. ``option_4``) lettered A-D. The header now
    states the expected ``Qn: [Level]`` answer format, matching the Bloom
    prompt, because the caller parses answers by splitting each line on ":".

    Args:
        df: DataFrame with ``question_text`` and ``option_1`` .. ``option_4``.

    Returns:
        The prompt text (instructions followed by the questions).
    """
    header = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond strictly in the format:
Q1: [Level]
Q2: [Level]
...
"""
    questions = []
    for idx, row in df.iterrows():
        questions.append(
            f"\nQ{idx+1}: {row['question_text']}\n"
            f"A. {row['option_1']}\nB. {row['option_2']}\nC. {row['option_3']}\nD. {row['option_4']}\n"
        )
    return header + "".join(questions)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label + 1>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are an educational evaluator. Classify each biology multiple-choice question (MCQ) using Bloom's Revised Taxonomy.

Use only one of the following levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond strictly in the format:
Q1: [Level]
Q2: [Level]
...
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label+1}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

# === Difficulty Classification (Batched) ===
# Labels are collected per batch and forced to exactly len(batch_df) entries,
# so a short, long or failed response can never shift (or discard) the labels
# of later batches.
difficulty_levels = []
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        lines = response.text.strip().splitlines()
    except Exception as e:
        print(f"Difficulty API Error: {e}")
        difficulty_levels += ["API Error"] * len(batch_df)
        continue

    batch_levels = []
    for line in lines:
        if not line.strip():
            continue  # blank separator lines carry no answer
        parts = line.split(":")
        diff = parts[1].strip() if len(parts) > 1 else ""
        batch_levels.append(diff if diff in ["Easy", "Medium", "Difficult"] else "Unclear")
    # Truncate extras, then pad what is missing for THIS batch only.
    batch_levels = batch_levels[:len(batch_df)]
    batch_levels += ["Unclear"] * (len(batch_df) - len(batch_levels))
    difficulty_levels += batch_levels

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Batched) ===
# Labels are collected per batch and forced to exactly len(batch_df) entries,
# so a short, long or failed response can never shift (or discard) the labels
# of later batches.
bloom_levels = []
valid_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        lines = response.text.strip().splitlines()
    except Exception as e:
        print(f"Bloom API Error: {e}")
        bloom_levels += ["API Error"] * len(batch_df)
        continue

    batch_levels = []
    for line in lines:
        if not line.strip():
            continue  # blank separator lines carry no answer
        parts = line.split(":")
        bloom = parts[1].strip() if len(parts) > 1 else ""
        batch_levels.append(bloom if bloom in valid_levels else "Unclear")
    # Truncate extras, then pad what is missing for THIS batch only.
    batch_levels = batch_levels[:len(batch_df)]
    batch_levels += ["Unclear"] * (len(batch_df) - len(batch_levels))
    bloom_levels += batch_levels

exam_df["bloom_level"] = bloom_levels

# === Summary ===
# One value-count table per derived column, printed in a fixed order.
for heading, column in (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
):
    print(heading)
    print(exam_df[column].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mask = ~exam_df["option_count_valid"]
print(len(exam_df[invalid_mask]))

# === Save Result ===
# Write the annotated exam next to the notebook, then hand it to the browser.
output_name = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_name, index=False)
files.download(output_name)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in this cell. The two reads below expect these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model response.

    Args:
        text: Raw response text, one answer per line.
        valid_labels: Accepted label spellings.

    Returns:
        dict mapping the integer question id to the matching entry of
        ``valid_labels`` (compared case-insensitively), or to ``"Unclear"``
        when the word after the id is not an accepted label. Lines without
        both an id and a label word are skipped.
    """
    canonical = {str(label).lower(): label for label in valid_labels}
    # (?!\d) keeps the id from backtracking: without it "Q12:" would be read as
    # question 1 with label "2" and clobber Q1's real answer. Markdown emphasis
    # and common separators around the id or the label are tolerated.
    pattern = re.compile(r"[\s*_#>-]*Q(\d+)(?!\d)[\s:.*_)-]*([A-Za-z]+)")
    results = {}
    for line in text.strip().splitlines():
        match = pattern.match(line)
        if match:
            results[int(match.group(1))] = canonical.get(match.group(2).lower(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

# === Difficulty Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        print(f"Difficulty API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            difficulty_levels[idx] = label

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"])
    except Exception as e:
        print(f"Bloom API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            bloom_levels[idx] = label

exam_df["bloom_level"] = bloom_levels

# === Summary ===
# One value-count table per derived column, printed in a fixed order.
for heading, column in (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
):
    print(heading)
    print(exam_df[column].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mask = ~exam_df["option_count_valid"]
print(len(exam_df[invalid_mask]))

# === Save Result ===
# Write the annotated exam next to the notebook, then hand it to the browser.
output_name = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_name, index=False)
files.download(output_name)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in this cell. The two reads below expect these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model response.

    Args:
        text: Raw response text, one answer per line.
        valid_labels: Accepted label spellings.

    Returns:
        dict mapping the integer question id to the matching entry of
        ``valid_labels`` (compared case-insensitively), or to ``"Unclear"``
        when the word after the id is not an accepted label. Lines without
        both an id and a label word are skipped.
    """
    canonical = {str(label).lower(): label for label in valid_labels}
    # (?!\d) keeps the id from backtracking: without it "Q12:" would be read as
    # question 1 with label "2" and clobber Q1's real answer. Markdown emphasis
    # and common separators around the id or the label are tolerated.
    pattern = re.compile(r"[\s*_#>-]*Q(\d+)(?!\d)[\s:.*_)-]*([A-Za-z]+)")
    results = {}
    for line in text.strip().splitlines():
        match = pattern.match(line)
        if match:
            results[int(match.group(1))] = canonical.get(match.group(2).lower(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

# === Difficulty Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        print(f"Difficulty API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            difficulty_levels[idx] = label

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"])
    except Exception as e:
        print(f"Bloom API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            bloom_levels[idx] = label

exam_df["bloom_level"] = bloom_levels

# === Summary ===
# One value-count table per derived column, printed in a fixed order.
for heading, column in (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
):
    print(heading)
    print(exam_df[column].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mask = ~exam_df["option_count_valid"]
print(len(exam_df[invalid_mask]))

# === Save Result ===
# Write the annotated exam next to the notebook, then hand it to the browser.
output_name = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_name, index=False)
files.download(output_name)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in this cell. The two reads below expect these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model response.

    Args:
        text: Raw response text, one answer per line.
        valid_labels: Accepted label spellings.

    Returns:
        dict mapping the integer question id to the matching entry of
        ``valid_labels`` (compared case-insensitively), or to ``"Unclear"``
        when the word after the id is not an accepted label. Lines without
        both an id and a label word are skipped.
    """
    canonical = {str(label).lower(): label for label in valid_labels}
    # (?!\d) keeps the id from backtracking: without it "Q12:" would be read as
    # question 1 with label "2" and clobber Q1's real answer. Markdown emphasis
    # and common separators around the id or the label are tolerated.
    pattern = re.compile(r"[\s*_#>-]*Q(\d+)(?!\d)[\s:.*_)-]*([A-Za-z]+)")
    results = {}
    for line in text.strip().splitlines():
        match = pattern.match(line)
        if match:
            results[int(match.group(1))] = canonical.get(match.group(2).lower(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

# === Difficulty Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        print(f"Difficulty API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            difficulty_levels[idx] = label

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Batched + Retry on 429 Errors) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):  # Smaller batch
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    retries = 3
    wait_seconds = 15
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
        except Exception as e:
            if "429" in str(e):  # Rate limit
                print("Quota limit reached. Retrying after delay...")
                time.sleep(wait_seconds)
                retries -= 1
                continue
            print(f"Bloom API Error: {e}")
            break
        # The ids come from the model: an echoed prompt example ("Q23") or an
        # invented id must neither overwrite another question nor raise.
        for idx, label in parsed.items():
            if idx in batch_ids:
                bloom_levels[idx] = label
        break
    else:
        # Reached only when every attempt was rate-limited.
        print("Bloom API Error: rate limit persisted after all retries; batch left as 'Unclear'")

exam_df["bloom_level"] = bloom_levels

# === Summary ===
# One value-count table per derived column, printed in a fixed order.
for heading, column in (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
):
    print(heading)
    print(exam_df[column].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mask = ~exam_df["option_count_valid"]
print(len(exam_df[invalid_mask]))

# === Save Result ===
# Write the annotated exam next to the notebook, then hand it to the browser.
output_name = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_name, index=False)
files.download(output_name)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in this cell. The two reads below expect these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model response.

    Args:
        text: Raw response text, one answer per line.
        valid_labels: Accepted label spellings.

    Returns:
        dict mapping the integer question id to the matching entry of
        ``valid_labels`` (compared case-insensitively), or to ``"Unclear"``
        when the word after the id is not an accepted label. Lines without
        both an id and a label word are skipped.
    """
    canonical = {str(label).lower(): label for label in valid_labels}
    # (?!\d) keeps the id from backtracking: without it "Q12:" would be read as
    # question 1 with label "2" and clobber Q1's real answer. Markdown emphasis
    # and common separators around the id or the label are tolerated.
    pattern = re.compile(r"[\s*_#>-]*Q(\d+)(?!\d)[\s:.*_)-]*([A-Za-z]+)")
    results = {}
    for line in text.strip().splitlines():
        match = pattern.match(line)
        if match:
            results[int(match.group(1))] = canonical.get(match.group(2).lower(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

# === Difficulty Classification (Batched + Robust Parsing) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        print(f"Difficulty API Error: {e}")
        continue

    # The ids come from the model: an echoed prompt example ("Q23") or an
    # invented id must neither overwrite another question nor raise IndexError.
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        if idx in batch_ids:
            difficulty_levels[idx] = label

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Batched + Retry on 429 Errors) ===
# Every question starts as "Unclear" and is overwritten only by an answer whose
# id belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):  # Smaller batch
    # Index labels double as list positions (default index from read_csv).
    batch_ids = set(batch_df.index)
    retries = 3
    wait_seconds = 15
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
        except Exception as e:
            if "429" in str(e):  # Rate limit
                print("Quota limit reached. Retrying after delay...")
                time.sleep(wait_seconds)
                retries -= 1
                continue
            print(f"Bloom API Error: {e}")
            break
        # The ids come from the model: an echoed prompt example ("Q23") or an
        # invented id must neither overwrite another question nor raise.
        for idx, label in parsed.items():
            if idx in batch_ids:
                bloom_levels[idx] = label
        break
    else:
        # Reached only when every attempt was rate-limited.
        print("Bloom API Error: rate limit persisted after all retries; batch left as 'Unclear'")

exam_df["bloom_level"] = bloom_levels

# === Summary ===
# One value-count table per derived column, printed in a fixed order.
for heading, column in (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
):
    print(heading)
    print(exam_df[column].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mask = ~exam_df["option_count_valid"]
print(len(exam_df[invalid_mask]))

# === Save Result ===
# Write the annotated exam next to the notebook, then hand it to the browser.
output_name = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_name, index=False)
files.download(output_name)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# Prompts for files through google.colab's upload helper; `uploaded` is not
# read again in the visible part of this cell. The two reads below expect
# these exact file names.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Passed as generation_config to the generate_content calls further down.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Encode every textbook "Text" entry once, before looping over the questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

# For each exam question keep the Unit/Section of the single best-scoring
# textbook entry, or "No Match" when that score is below `threshold`.
matched_units, matched_sections = [], []
threshold = 0.6
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()  # position of the highest score
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only for rows where option_1 .. option_4 are all non-null.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Never commit the API key with the notebook: it is read from the
# GEMINI_API_KEY environment variable, or asked for interactively when unset.
# NOTE: the key that used to be hard-coded here is public and should be revoked.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Every slice holds at most ``batch_size`` rows and keeps the original index
    labels; the last slice may be shorter.
    """
    total_rows = len(df)
    starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model response.

    Args:
        text: Raw response text, one answer per line.
        valid_labels: Accepted label spellings.

    Returns:
        dict mapping the integer question id to the matching entry of
        ``valid_labels`` (compared case-insensitively), or to ``"Unclear"``
        when the word after the id is not an accepted label. Lines without
        both an id and a label word are skipped.
    """
    canonical = {str(label).lower(): label for label in valid_labels}
    # (?!\d) keeps the id from backtracking: without it "Q12:" would be read as
    # question 1 with label "2" and clobber Q1's real answer. Markdown emphasis
    # and common separators around the id or the label are tolerated.
    pattern = re.compile(r"[\s*_#>-]*Q(\d+)(?!\d)[\s:.*_)-]*([A-Za-z]+)")
    results = {}
    for line in text.strip().splitlines():
        match = pattern.match(line)
        if match:
            results[int(match.group(1))] = canonical.get(match.group(2).lower(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every MCQ row of ``df``.

    Each question is rendered as ``Q<index label>`` followed by its four
    options lettered A-D, appended after the fixed instruction header.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    mcq_blocks = []
    for row_label, mcq in df.iterrows():
        stem = f"\nQ{row_label}: {mcq['question_text']}\n"
        choices = f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        mcq_blocks.append(stem + choices)
    return instructions + "".join(mcq_blocks)

def build_blooms_batch_prompt(df):
    """Build the prompt asking for a Bloom's Revised Taxonomy level for each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Difficulty Classification ===
# One slot per exam row, pre-filled with "Unclear" so rows Gemini never answers
# keep that value. Labels are stored by position; this assumes exam_df still has
# the default 0..n-1 index from pd.read_csv, so index label == position.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
        for idx, label in parsed.items():
            # Ignore IDs that were not part of this batch: an invented ID would
            # otherwise overwrite another row or raise IndexError and discard
            # the remaining labels of the batch.
            if idx in batch_df.index:
                difficulty_levels[idx] = label
    except Exception as e:
        # Best effort: report the failure and leave this batch as "Unclear".
        print(f"Difficulty API Error: {e}")

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Initial Pass) ===
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):  # Smaller batch to reduce 429 errors
    success = False
    retries = 3
    wait_seconds = 15
    # Re-send the batch only after rate-limit (429) errors, at most `retries` times;
    # any other error is reported and the batch is abandoned.
    while not success and retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            for idx, label in parsed.items():
                if idx in batch_df.index:  # drop IDs that were not sent in this batch
                    bloom_levels[idx] = label
            success = True
        except Exception as e:
            if "429" in str(e):  # Rate limit
                print("Quota limit reached. Retrying after delay...")
                time.sleep(wait_seconds)
                retries -= 1
            else:
                print(f"Bloom API Error: {e}")
                break

exam_df["bloom_level"] = bloom_levels

# === Retry Bloom Classification for Unclear ===
# Second pass over the rows still marked "Unclear" after the initial pass.
unclear_bloom_df = exam_df[exam_df["bloom_level"] == "Unclear"]
if not unclear_bloom_df.empty:
    print(f"\nRetrying Bloom classification for {len(unclear_bloom_df)} unclear items...")
    for batch_df in split_df_into_batches(unclear_bloom_df, 5):
        success = False
        retries = 2
        while not success and retries > 0:
            try:
                prompt = build_blooms_batch_prompt(batch_df)
                response = gemini_model.generate_content(prompt, generation_config=config)
                parsed = extract_labels(response.text, valid_bloom)
                for idx, label in parsed.items():
                    # .at with an unknown label would append a new row, and a
                    # label outside this batch would overwrite a row that was
                    # already classified, so only batch members are updated.
                    if idx in batch_df.index:
                        exam_df.at[idx, "bloom_level"] = label
                success = True
            except Exception as e:
                if "429" in str(e):
                    print("Quota limit reached during retry. Waiting...")
                    time.sleep(15)
                    retries -= 1
                else:
                    print(f"Retry Bloom API Error: {e}")
                    break

# === Summary Output ===
print("\n=== Coverage by Section ===")
print(exam_df["matched_section"].value_counts())
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
print(len(exam_df[~exam_df["option_count_valid"]]))

# === Save Final Output ===
# Write the annotated exam to CSV and hand it to the browser via Colab.
exam_df.to_csv("evaluated_exam_gemini_revised.csv", index=False)
files.download("evaluated_exam_gemini_revised.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# The two CSVs read below are expected to be among the uploaded files; they
# are opened by name relative to the working directory.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Generation settings for Gemini requests; temperature 0 with top_k=1
# presumably aims for repeatable answers.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Embed every textbook chunk once, then tag each exam question with the
# Unit/Section of its most similar chunk (cosine similarity).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

matched_units, matched_sections = [], []
threshold = 0.6  # minimum cosine similarity for a chunk to count as a match
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    # Position of the highest score, which is also the row position in textbook_df.
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only when all four columns option_1..option_4 are non-null for the row.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Read the key from the GEMINI_API_KEY environment variable, or ask for it
# interactively, so the secret is never stored in the notebook itself.
# NOTE(review): any key that was ever committed in a notebook should be revoked.
import os
from getpass import getpass

api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive row slices of ``df`` with at most ``batch_size`` rows each."""
    starts = range(0, len(df), batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model reply into ``{id: label}``.

    Args:
        text: Raw reply text; every line is inspected independently.
        valid_labels: Accepted label spellings. Matching is case-insensitive
            and the spelling given here is the one returned.

    Returns:
        dict mapping the integer question id to its label, or to "Unclear"
        when the word after the id is not one of ``valid_labels``. Lines with
        no alphabetic label after the id are ignored; if an id appears on
        several lines, the last one wins.
    """
    canonical = {label.casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # Drop Markdown emphasis and list bullets the model sometimes adds
        # ("**Q3:** Medium", "- Q3: Medium") before matching.
        cleaned = line.replace("*", "").strip().lstrip("-•").strip()
        # The label must start with a letter: with a bare \w+ a label-less
        # line such as "Q12:" backtracks to id 1 / label "2" and overwrites
        # question 1 with "Unclear".
        match = re.match(r"Q(\d+)\s*:?\s*([A-Za-z]\w*)", cleaned, re.IGNORECASE)
        if match:
            idx, label = int(match.group(1)), match.group(2)
            results[idx] = canonical.get(label.casefold(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Build the prompt asking for an Easy/Medium/Difficult rating of each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Build the prompt asking for a Bloom's Revised Taxonomy level for each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Difficulty Classification ===
# One slot per exam row, pre-filled with "Unclear" so rows Gemini never answers
# keep that value. Labels are stored by position; exam_df keeps the default
# 0..n-1 index from pd.read_csv, so index label == position.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
        for idx, label in parsed.items():
            # Ignore IDs that were not part of this batch: an invented ID would
            # otherwise overwrite another row or raise IndexError and discard
            # the remaining labels of the batch.
            if idx in batch_df.index:
                difficulty_levels[idx] = label
    except Exception as e:
        # Best effort: report the failure and leave this batch as "Unclear".
        print(f"Difficulty API Error: {e}")

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Initial Pass) ===
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):  # Smaller batch to reduce 429 errors
    success = False
    retries = 3
    wait_seconds = 15
    # Re-send the batch only after rate-limit (429) errors, at most `retries` times;
    # any other error is reported and the batch is abandoned.
    while not success and retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            for idx, label in parsed.items():
                if idx in batch_df.index:  # drop IDs that were not sent in this batch
                    bloom_levels[idx] = label
            success = True
        except Exception as e:
            if "429" in str(e):  # Rate limit
                print("Quota limit reached. Retrying after delay...")
                time.sleep(wait_seconds)
                retries -= 1
            else:
                print(f"Bloom API Error: {e}")
                break

exam_df["bloom_level"] = bloom_levels

# === Retry Bloom Classification for Unclear ===
# Second pass over the rows still marked "Unclear" after the initial pass.
unclear_bloom_df = exam_df[exam_df["bloom_level"] == "Unclear"]
if not unclear_bloom_df.empty:
    print(f"\nRetrying Bloom classification for {len(unclear_bloom_df)} unclear items...")
    for batch_df in split_df_into_batches(unclear_bloom_df, 5):
        success = False
        retries = 2
        while not success and retries > 0:
            try:
                prompt = build_blooms_batch_prompt(batch_df)
                response = gemini_model.generate_content(prompt, generation_config=config)
                parsed = extract_labels(response.text, valid_bloom)
                for idx, label in parsed.items():
                    # .at with an unknown label would append a new row, and a
                    # label outside this batch would overwrite a row that was
                    # already classified, so only batch members are updated.
                    if idx in batch_df.index:
                        exam_df.at[idx, "bloom_level"] = label
                success = True
            except Exception as e:
                if "429" in str(e):
                    print("Quota limit reached during retry. Waiting...")
                    time.sleep(15)
                    retries -= 1
                else:
                    print(f"Retry Bloom API Error: {e}")
                    break

# === Summary Output ===
print("\n=== Coverage by Section ===")
print(exam_df["matched_section"].value_counts())
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
print(len(exam_df[~exam_df["option_count_valid"]]))

# === Save Final Output ===
# Write the annotated exam to CSV and hand it to the browser via Colab.
exam_df.to_csv("evaluated_exam_gemini_revised.csv", index=False)
files.download("evaluated_exam_gemini_revised.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# The two CSVs read below are expected to be among the uploaded files; they
# are opened by name relative to the working directory.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Generation settings for Gemini requests; temperature 0 with top_k=1
# presumably aims for repeatable answers.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Embed every textbook chunk once, then tag each exam question with the
# Unit/Section of its most similar chunk (cosine similarity).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

matched_units, matched_sections = [], []
threshold = 0.6  # minimum cosine similarity for a chunk to count as a match
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    # Position of the highest score, which is also the row position in textbook_df.
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only when all four columns option_1..option_4 are non-null for the row.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Read the key from the GEMINI_API_KEY environment variable, or ask for it
# interactively, so the secret is never stored in the notebook itself.
# NOTE(review): any key that was ever committed in a notebook should be revoked.
import os
from getpass import getpass

api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive row slices of ``df`` with at most ``batch_size`` rows each."""
    starts = range(0, len(df), batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model reply into ``{id: label}``.

    Args:
        text: Raw reply text; every line is inspected independently.
        valid_labels: Accepted label spellings. Matching is case-insensitive
            and the spelling given here is the one returned.

    Returns:
        dict mapping the integer question id to its label, or to "Unclear"
        when the word after the id is not one of ``valid_labels``. Lines with
        no alphabetic label after the id are ignored; if an id appears on
        several lines, the last one wins.
    """
    canonical = {label.casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # Drop Markdown emphasis and list bullets the model sometimes adds
        # ("**Q3:** Medium", "- Q3: Medium") before matching.
        cleaned = line.replace("*", "").strip().lstrip("-•").strip()
        # The label must start with a letter: with a bare \w+ a label-less
        # line such as "Q12:" backtracks to id 1 / label "2" and overwrites
        # question 1 with "Unclear".
        match = re.match(r"Q(\d+)\s*:?\s*([A-Za-z]\w*)", cleaned, re.IGNORECASE)
        if match:
            idx, label = int(match.group(1)), match.group(2)
            results[idx] = canonical.get(label.casefold(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Build the prompt asking for an Easy/Medium/Difficult rating of each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Build the prompt asking for a Bloom's Revised Taxonomy level for each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Difficulty Classification ===
# One slot per exam row, pre-filled with "Unclear" so rows Gemini never answers
# keep that value. Labels are stored by position; exam_df keeps the default
# 0..n-1 index from pd.read_csv, so index label == position.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
        for idx, label in parsed.items():
            # Ignore IDs that were not part of this batch: an invented ID would
            # otherwise overwrite another row or raise IndexError and discard
            # the remaining labels of the batch.
            if idx in batch_df.index:
                difficulty_levels[idx] = label
    except Exception as e:
        # Best effort: report the failure and leave this batch as "Unclear".
        print(f"Difficulty API Error: {e}")

exam_df["difficulty"] = difficulty_levels

# === Bloom’s Taxonomy Classification (Initial Pass) ===
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):  # Smaller batch to reduce 429 errors
    success = False
    retries = 3
    wait_seconds = 15
    # Re-send the batch only after rate-limit (429) errors, at most `retries` times;
    # any other error is reported and the batch is abandoned.
    while not success and retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            for idx, label in parsed.items():
                if idx in batch_df.index:  # drop IDs that were not sent in this batch
                    bloom_levels[idx] = label
            success = True
        except Exception as e:
            if "429" in str(e):  # Rate limit
                print("Quota limit reached. Retrying after delay...")
                time.sleep(wait_seconds)
                retries -= 1
            else:
                print(f"Bloom API Error: {e}")
                break

exam_df["bloom_level"] = bloom_levels

# === Retry Bloom Classification for Unclear ===
# Second pass over the rows still marked "Unclear" after the initial pass.
unclear_bloom_df = exam_df[exam_df["bloom_level"] == "Unclear"]
if not unclear_bloom_df.empty:
    print(f"\nRetrying Bloom classification for {len(unclear_bloom_df)} unclear items...")
    for batch_df in split_df_into_batches(unclear_bloom_df, 5):
        success = False
        retries = 2
        while not success and retries > 0:
            try:
                prompt = build_blooms_batch_prompt(batch_df)
                response = gemini_model.generate_content(prompt, generation_config=config)
                parsed = extract_labels(response.text, valid_bloom)
                for idx, label in parsed.items():
                    # .at with an unknown label would append a new row, and a
                    # label outside this batch would overwrite a row that was
                    # already classified, so only batch members are updated.
                    if idx in batch_df.index:
                        exam_df.at[idx, "bloom_level"] = label
                success = True
            except Exception as e:
                if "429" in str(e):
                    print("Quota limit reached during retry. Waiting...")
                    time.sleep(15)
                    retries -= 1
                else:
                    print(f"Retry Bloom API Error: {e}")
                    break

# === Summary Output ===
print("\n=== Coverage by Section ===")
print(exam_df["matched_section"].value_counts())
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
print(len(exam_df[~exam_df["option_count_valid"]]))

# === Save Final Output ===
# Write the annotated exam to CSV and hand it to the browser via Colab.
exam_df.to_csv("evaluated_exam_gemini_revised.csv", index=False)
files.download("evaluated_exam_gemini_revised.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# The two CSVs read below are expected to be among the uploaded files; they
# are opened by name relative to the working directory.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Generation settings for Gemini requests; temperature 0 with top_k=1
# presumably aims for repeatable answers.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Embed every textbook chunk once, then tag each exam question with the
# Unit/Section of its most similar chunk (cosine similarity).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

matched_units, matched_sections = [], []
threshold = 0.6  # minimum cosine similarity for a chunk to count as a match
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    # Position of the highest score, which is also the row position in textbook_df.
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only when all four columns option_1..option_4 are non-null for the row.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Read the key from the GEMINI_API_KEY environment variable, or ask for it
# interactively, so the secret is never stored in the notebook itself.
# NOTE(review): any key that was ever committed in a notebook should be revoked.
import os
from getpass import getpass

api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive row slices of ``df`` with at most ``batch_size`` rows each."""
    starts = range(0, len(df), batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model reply into ``{id: label}``.

    Args:
        text: Raw reply text; every line is inspected independently.
        valid_labels: Accepted label spellings. Matching is case-insensitive
            and the spelling given here is the one returned.

    Returns:
        dict mapping the integer question id to its label, or to "Unclear"
        when the word after the id is not one of ``valid_labels``. Lines with
        no alphabetic label after the id are ignored; if an id appears on
        several lines, the last one wins.
    """
    canonical = {label.casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # Drop Markdown emphasis and list bullets the model sometimes adds
        # ("**Q3:** Medium", "- Q3: Medium") before matching.
        cleaned = line.replace("*", "").strip().lstrip("-•").strip()
        # The label must start with a letter: with a bare \w+ a label-less
        # line such as "Q12:" backtracks to id 1 / label "2" and overwrites
        # question 1 with "Unclear".
        match = re.match(r"Q(\d+)\s*:?\s*([A-Za-z]\w*)", cleaned, re.IGNORECASE)
        if match:
            idx, label = int(match.group(1)), match.group(2)
            results[idx] = canonical.get(label.casefold(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Build the prompt asking for an Easy/Medium/Difficult rating of each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Build the prompt asking for a Bloom's Revised Taxonomy level for each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Difficulty Classification ===
# One slot per exam row, pre-filled with "Unclear" so rows Gemini never answers
# keep that value. Labels are stored by position; exam_df keeps the default
# 0..n-1 index from pd.read_csv, so index label == position.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
        for idx, label in parsed.items():
            # Ignore IDs that were not part of this batch: an invented ID would
            # otherwise overwrite another row or raise IndexError and discard
            # the remaining labels of the batch.
            if idx in batch_df.index:
                difficulty_levels[idx] = label
    except Exception as e:
        # Best effort: report the failure and leave this batch as "Unclear".
        print(f"Difficulty API Error: {e}")

exam_df["difficulty"] = difficulty_levels

# === Delay to Prevent Quota Hit ===
print("\nWaiting for 45 seconds before starting Bloom's Taxonomy classification...")
time.sleep(45)

# === Bloom’s Taxonomy Classification (Initial Pass) ===
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):
    success = False
    retries = 3
    # Re-send the batch only after rate-limit (429) errors, at most `retries` times;
    # any other error is reported and the batch is abandoned.
    while not success and retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            for idx, label in parsed.items():
                if idx in batch_df.index:  # drop IDs that were not sent in this batch
                    bloom_levels[idx] = label
            success = True
        except Exception as e:
            if "429" in str(e):
                print("Quota limit reached. Retrying after delay...")
                time.sleep(15)
                retries -= 1
            else:
                print(f"Bloom API Error: {e}")
                break

exam_df["bloom_level"] = bloom_levels

# === Retry Bloom Classification for Unclear ===
# Second pass over the rows still marked "Unclear" after the initial pass.
unclear_bloom_df = exam_df[exam_df["bloom_level"] == "Unclear"]
if not unclear_bloom_df.empty:
    print(f"\nRetrying Bloom classification for {len(unclear_bloom_df)} unclear items...")
    for batch_df in split_df_into_batches(unclear_bloom_df, 5):
        success = False
        retries = 2
        while not success and retries > 0:
            try:
                prompt = build_blooms_batch_prompt(batch_df)
                response = gemini_model.generate_content(prompt, generation_config=config)
                parsed = extract_labels(response.text, valid_bloom)
                for idx, label in parsed.items():
                    # .at with an unknown label would append a new row, and a
                    # label outside this batch would overwrite a row that was
                    # already classified, so only batch members are updated.
                    if idx in batch_df.index:
                        exam_df.at[idx, "bloom_level"] = label
                success = True
            except Exception as e:
                if "429" in str(e):
                    print("Quota limit reached during retry. Waiting...")
                    time.sleep(15)
                    retries -= 1
                else:
                    print(f"Retry Bloom API Error: {e}")
                    break

# === Summary Output ===
print("\n=== Coverage by Section ===")
print(exam_df["matched_section"].value_counts())
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
print(len(exam_df[~exam_df["option_count_valid"]]))

# === Save Final Output ===
# Write the annotated exam to CSV and hand it to the browser via Colab.
exam_df.to_csv("evaluated_exam_gemini_revised.csv", index=False)
files.download("evaluated_exam_gemini_revised.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# The two CSVs read below are expected to be among the uploaded files; they
# are opened by name relative to the working directory.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Setup Configuration ===
# Generation settings for Gemini requests; temperature 0 with top_k=1
# presumably aims for repeatable answers.
config = GenerationConfig(
    temperature=0.0,
    top_p=1.0,
    top_k=1
)

# === Semantic Search Setup ===
# Embed every textbook chunk once, then tag each exam question with the
# Unit/Section of its most similar chunk (cosine similarity).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

matched_units, matched_sections = [], []
threshold = 0.6  # minimum cosine similarity for a chunk to count as a match
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    # Position of the highest score, which is also the row position in textbook_df.
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        matched_units.append(textbook_df.iloc[best_idx]["Unit"])
        matched_sections.append(textbook_df.iloc[best_idx]["Section"])
    else:
        matched_units.append("No Match")
        matched_sections.append("No Match")

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# True only when all four columns option_1..option_4 are non-null for the row.
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[f"option_{i}"]) for i in range(1, 5)), axis=1
)

# === Gemini Setup ===
# Read the key from the GEMINI_API_KEY environment variable, or ask for it
# interactively, so the secret is never stored in the notebook itself.
# NOTE(review): any key that was ever committed in a notebook should be revoked.
import os
from getpass import getpass

api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive row slices of ``df`` with at most ``batch_size`` rows each."""
    starts = range(0, len(df), batch_size)
    yield from (df.iloc[start:start + batch_size] for start in starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model reply into ``{id: label}``.

    Args:
        text: Raw reply text; every line is inspected independently.
        valid_labels: Accepted label spellings. Matching is case-insensitive
            and the spelling given here is the one returned.

    Returns:
        dict mapping the integer question id to its label, or to "Unclear"
        when the word after the id is not one of ``valid_labels``. Lines with
        no alphabetic label after the id are ignored; if an id appears on
        several lines, the last one wins.
    """
    canonical = {label.casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # Drop Markdown emphasis and list bullets the model sometimes adds
        # ("**Q3:** Medium", "- Q3: Medium") before matching.
        cleaned = line.replace("*", "").strip().lstrip("-•").strip()
        # The label must start with a letter: with a bare \w+ a label-less
        # line such as "Q12:" backtracks to id 1 / label "2" and overwrites
        # question 1 with "Unclear".
        match = re.match(r"Q(\d+)\s*:?\s*([A-Za-z]\w*)", cleaned, re.IGNORECASE)
        if match:
            idx, label = int(match.group(1)), match.group(2)
            results[idx] = canonical.get(label.casefold(), "Unclear")
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Build the prompt asking for an Easy/Medium/Difficult rating of each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Build the prompt asking for a Bloom's Revised Taxonomy level for each MCQ in ``df``.

    Every row is rendered as ``Q<index label>: <question_text>`` followed by
    its ``option_1``..``option_4`` values lettered A-D, appended after the
    fixed instruction text.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    question_blocks = [
        f"\nQ{row_id}: {mcq['question_text']}\n"
        f"A. {mcq['option_1']}\nB. {mcq['option_2']}\nC. {mcq['option_3']}\nD. {mcq['option_4']}\n"
        for row_id, mcq in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Difficulty Classification ===
# One slot per exam row, pre-filled with "Unclear" so rows Gemini never answers
# keep that value. Labels are stored by position; exam_df keeps the default
# 0..n-1 index from pd.read_csv, so index label == position.
difficulty_levels = ["Unclear"] * len(exam_df)
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
        for idx, label in parsed.items():
            # Ignore IDs that were not part of this batch: an invented ID would
            # otherwise overwrite another row or raise IndexError and discard
            # the remaining labels of the batch.
            if idx in batch_df.index:
                difficulty_levels[idx] = label
    except Exception as e:
        # Best effort: report the failure and leave this batch as "Unclear".
        print(f"Difficulty API Error: {e}")

exam_df["difficulty"] = difficulty_levels

# === Delay to Prevent Quota Hit ===
print("\nWaiting for 45 seconds before starting Bloom's Taxonomy classification...")
time.sleep(45)

# === Bloom’s Taxonomy Classification (Initial Pass) ===
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

for batch_df in split_df_into_batches(exam_df, 5):
    success = False
    retries = 3
    # Re-send the batch only after rate-limit (429) errors, at most `retries` times;
    # any other error is reported and the batch is abandoned.
    while not success and retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            for idx, label in parsed.items():
                if idx in batch_df.index:  # drop IDs that were not sent in this batch
                    bloom_levels[idx] = label
            success = True
        except Exception as e:
            if "429" in str(e):
                print("Quota limit reached. Retrying after delay...")
                time.sleep(15)
                retries -= 1
            else:
                print(f"Bloom API Error: {e}")
                break

exam_df["bloom_level"] = bloom_levels

# === Retry Bloom Classification for Unclear ===
# Second pass over the rows still marked "Unclear" after the initial pass.
unclear_bloom_df = exam_df[exam_df["bloom_level"] == "Unclear"]
if not unclear_bloom_df.empty:
    print(f"\nRetrying Bloom classification for {len(unclear_bloom_df)} unclear items...")
    for batch_df in split_df_into_batches(unclear_bloom_df, 5):
        success = False
        retries = 2
        while not success and retries > 0:
            try:
                prompt = build_blooms_batch_prompt(batch_df)
                response = gemini_model.generate_content(prompt, generation_config=config)
                parsed = extract_labels(response.text, valid_bloom)
                for idx, label in parsed.items():
                    # .at with an unknown label would append a new row, and a
                    # label outside this batch would overwrite a row that was
                    # already classified, so only batch members are updated.
                    if idx in batch_df.index:
                        exam_df.at[idx, "bloom_level"] = label
                success = True
            except Exception as e:
                if "429" in str(e):
                    print("Quota limit reached during retry. Waiting...")
                    time.sleep(15)
                    retries -= 1
                else:
                    print(f"Retry Bloom API Error: {e}")
                    break

# === Summary Output ===
print("\n=== Coverage by Section ===")
print(exam_df["matched_section"].value_counts())
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())
print("\n=== Invalid MCQs (≠ 4 options) ===")
print(len(exam_df[~exam_df["option_count_valid"]]))

# === Save Final Output ===
# Write the annotated exam to CSV and hand it to the browser via Colab.
exam_df.to_csv("evaluated_exam_gemini_revised.csv", index=False)
files.download("evaluated_exam_gemini_revised.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# files.upload() opens Colab's upload dialog; the two fixed file names read
# below are expected to be among the uploaded files.
uploaded = files.upload()
csv_encoding = "ISO-8859-1"
exam_df = pd.read_csv("grade9Q1.csv", encoding=csv_encoding)
textbook_df = pd.read_csv("biologyg9.csv", encoding=csv_encoding)

# === Setup Configuration ===
# Generation settings handed to the Gemini calls made later in this cell.
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Semantic Search Setup ===
# Embed every textbook passage once, then compare each exam question to all of them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

threshold = 0.6  # minimum cosine similarity for a passage to count as a match
matched_units = []
matched_sections = []
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        # Take unit and section from the single best-scoring passage.
        best_chunk = textbook_df.iloc[best_idx]
        unit, section = best_chunk["Unit"], best_chunk["Section"]
    else:
        unit = section = "No Match"
    matched_units.append(unit)
    matched_sections.append(section)

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# A row is valid only when option_1 .. option_4 are all non-null.
option_columns = [f"option_{i}" for i in range(1, 5)]
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[column]) for column in option_columns), axis=1
)

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "medium" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Easy**" or "**Q3:** Difficult".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step 1: Difficulty Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Difficulty API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            difficulty_levels[row_positions[idx]] = label

exam_df["difficulty"] = difficulty_levels

# === Step 2: WAIT before Bloom Classification ===
print("\n✅ Difficulty classification complete.")
print("⏳ Waiting 120 seconds before starting Bloom's taxonomy classification...")
time.sleep(120)

# === Step 3: Bloom's Taxonomy Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 5):
    parsed = {}
    retries = 3
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            break
        except Exception as e:
            # Only quota errors (message contains "429") are retried.
            if "429" not in str(e):
                print(f"Bloom API Error: {e}")
                break
            print("⚠️ Quota limit reached. Retrying after 15s...")
            time.sleep(15)
            retries -= 1
    else:
        # Reached only when every attempt hit the quota limit.
        print("Quota retries exhausted; this batch stays Unclear.")
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            bloom_levels[row_positions[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
# Each heading is printed first, then the value counts of its column.
distribution_reports = (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
)
for heading, column in distribution_reports:
    print(heading)
    print(exam_df[column].value_counts())

# Rows whose option_count_valid flag is False are reported as invalid MCQs.
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mcqs = exam_df[~exam_df["option_count_valid"]]
print(len(invalid_mcqs))

# === Save Final Output ===
# Write the annotated frame to disk, then hand the file to the browser.
output_path = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_path, index=False)
files.download(output_path)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload CSVs ===
# files.upload() opens Colab's upload dialog; the two fixed file names read
# below are expected to be among the uploaded files.
uploaded = files.upload()
csv_encoding = "ISO-8859-1"
exam_df = pd.read_csv("grade9Q1.csv", encoding=csv_encoding)
textbook_df = pd.read_csv("biologyg9.csv", encoding=csv_encoding)

# === Setup Configuration ===
# Generation settings handed to the Gemini calls made later in this cell.
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Semantic Search Setup ===
# Embed every textbook passage once, then compare each exam question to all of them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

threshold = 0.6  # minimum cosine similarity for a passage to count as a match
matched_units = []
matched_sections = []
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        # Take unit and section from the single best-scoring passage.
        best_chunk = textbook_df.iloc[best_idx]
        unit, section = best_chunk["Unit"], best_chunk["Section"]
    else:
        unit = section = "No Match"
    matched_units.append(unit)
    matched_sections.append(section)

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# A row is valid only when option_1 .. option_4 are all non-null.
option_columns = [f"option_{i}" for i in range(1, 5)]
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[column]) for column in option_columns), axis=1
)

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

# === Robust Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "medium" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Easy**" or "**Q3:** Difficult".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Prompt Builders ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step 1: Difficulty Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Difficulty API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            difficulty_levels[row_positions[idx]] = label

exam_df["difficulty"] = difficulty_levels

# === Step 2: WAIT before Bloom Classification ===
print("\n✅ Difficulty classification complete.")
print("⏳ Waiting 120 seconds before starting Bloom's taxonomy classification...")
time.sleep(120)

# === Step 3: Bloom's Taxonomy Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 5):
    parsed = {}
    retries = 3
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            break
        except Exception as e:
            # Only quota errors (message contains "429") are retried.
            if "429" not in str(e):
                print(f"Bloom API Error: {e}")
                break
            print("⚠️ Quota limit reached. Retrying after 15s...")
            time.sleep(15)
            retries -= 1
    else:
        # Reached only when every attempt hit the quota limit.
        print("Quota retries exhausted; this batch stays Unclear.")
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            bloom_levels[row_positions[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
# Each heading is printed first, then the value counts of its column.
distribution_reports = (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
    ("\n=== Bloom’s Distribution ===", "bloom_level"),
)
for heading, column in distribution_reports:
    print(heading)
    print(exam_df[column].value_counts())

# Rows whose option_count_valid flag is False are reported as invalid MCQs.
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mcqs = exam_df[~exam_df["option_count_valid"]]
print(len(invalid_mcqs))

# === Save Final Output ===
# Write the annotated frame to disk, then hand the file to the browser.
output_path = "evaluated_exam_gemini_revised.csv"
exam_df.to_csv(output_path, index=False)
files.download(output_path)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload CSV Files ===
# files.upload() opens Colab's upload dialog; the two fixed file names read
# below are expected to be among the uploaded files.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Gemini Configuration ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Semantic Matching ===
# Embed every textbook passage once, then compare each exam question to all of them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

threshold = 0.6  # minimum cosine similarity for a passage to count as a match
matched_units = []
matched_sections = []
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        # Take unit and section from the single best-scoring passage.
        best_chunk = textbook_df.iloc[best_idx]
        unit, section = best_chunk["Unit"], best_chunk["Section"]
    else:
        unit = section = "No Match"
    matched_units.append(unit)
    matched_sections.append(section)

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === Prompt Builder ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy
- Medium
- Difficult

Classify each MCQ in this format:
Q[ID]: [Level]
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Label Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "medium" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Easy**" or "**Q3:** Difficult".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Run Difficulty Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}
for i in range(0, len(exam_df), 10):
    batch_df = exam_df.iloc[i:i+10]
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Difficulty API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs the model returned that were not in this batch. Used as a
        # raw list position they would either raise IndexError or overwrite
        # another question's label.
        if idx in batch_ids:
            difficulty_levels[row_positions[idx]] = label

exam_df["difficulty"] = difficulty_levels

# === Output Summary ===
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())

# === Save Output ===
exam_df.to_csv("difficulty_output.csv", index=False)
files.download("difficulty_output.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload CSV Files ===
# files.upload() opens Colab's upload dialog; the two fixed file names read
# below are expected to be among the uploaded files.
uploaded = files.upload()
exam_df = pd.read_csv("grade9Q1.csv", encoding="ISO-8859-1")
textbook_df = pd.read_csv("biologyg9.csv", encoding="ISO-8859-1")

# === Gemini Configuration ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Semantic Matching ===
# Embed every textbook passage once, then compare each exam question to all of them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

threshold = 0.6  # minimum cosine similarity for a passage to count as a match
matched_units = []
matched_sections = []
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        # Take unit and section from the single best-scoring passage.
        best_chunk = textbook_df.iloc[best_idx]
        unit, section = best_chunk["Unit"], best_chunk["Section"]
    else:
        unit = section = "No Match"
    matched_units.append(unit)
    matched_sections.append(section)

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === Prompt Builder ===
def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy
- Medium
- Difficult

Classify each MCQ in this format:
Q[ID]: [Level]
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Label Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "medium" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Easy**" or "**Q3:** Difficult".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Run Difficulty Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}
for i in range(0, len(exam_df), 10):
    batch_df = exam_df.iloc[i:i+10]
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Difficulty API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs the model returned that were not in this batch. Used as a
        # raw list position they would either raise IndexError or overwrite
        # another question's label.
        if idx in batch_ids:
            difficulty_levels[row_positions[idx]] = label

exam_df["difficulty"] = difficulty_levels

# === Output Summary ===
print("\n=== Difficulty Distribution ===")
print(exam_df["difficulty"].value_counts())

# === Save Output ===
exam_df.to_csv("difficulty_output.csv", index=False)
files.download("difficulty_output.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai sentence-transformers faiss-cpu pandas

# === Imports ===
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload CSVs ===
# files.upload() opens Colab's upload dialog; the two fixed file names read
# below are expected to be among the uploaded files.
uploaded = files.upload()
csv_encoding = "ISO-8859-1"
exam_df = pd.read_csv("grade9Q1.csv", encoding=csv_encoding)
textbook_df = pd.read_csv("biologyg9.csv", encoding=csv_encoding)

# === Setup Configuration ===
# Generation settings handed to the Gemini calls made later in this cell.
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Semantic Search Setup ===
# Embed every textbook passage once, then compare each exam question to all of them.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
text_chunks = textbook_df["Text"].tolist()
chunk_embeddings = embedder.encode(text_chunks, convert_to_tensor=True)

threshold = 0.6  # minimum cosine similarity for a passage to count as a match
matched_units = []
matched_sections = []
for question in exam_df["question_text"]:
    q_embed = embedder.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(q_embed, chunk_embeddings)
    best_idx = similarities.argmax().item()
    best_score = similarities[0][best_idx].item()
    if best_score >= threshold:
        # Take unit and section from the single best-scoring passage.
        best_chunk = textbook_df.iloc[best_idx]
        unit, section = best_chunk["Unit"], best_chunk["Section"]
    else:
        unit = section = "No Match"
    matched_units.append(unit)
    matched_sections.append(section)

exam_df["matched_unit"] = matched_units
exam_df["matched_section"] = matched_sections

# === MCQ Format Validation ===
# A row is valid only when option_1 .. option_4 are all non-null.
option_columns = [f"option_{i}" for i in range(1, 5)]
exam_df["option_count_valid"] = exam_df.apply(
    lambda row: all(pd.notnull(row[column]) for column in option_columns), axis=1
)

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Helper Functions ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "medium" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Easy**" or "**Q3:** Difficult".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

def build_difficulty_batch_prompt(df):
    """Return the difficulty-rating prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are a biology teacher assessing the difficulty of multiple-choice questions.

Difficulty Levels:
- Easy: Factual recall or basic recognition
- Medium: Requires explanation or understanding
- Difficult: Requires analysis, synthesis, or reasoning

Classify each MCQ with one word: Easy, Medium, or Difficult.

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Medium
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step 1: Difficulty Classification ===
# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
difficulty_levels = ["Unclear"] * len(exam_df)
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}
for batch_df in split_df_into_batches(exam_df, 10):
    try:
        prompt = build_difficulty_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, ["Easy", "Medium", "Difficult"])
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Difficulty API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            difficulty_levels[row_positions[idx]] = label

exam_df["difficulty"] = difficulty_levels

# === Final Output Summary ===
# Each heading is printed first, then the value counts of its column.
distribution_reports = (
    ("\n=== Coverage by Section ===", "matched_section"),
    ("\n=== Difficulty Distribution ===", "difficulty"),
)
for heading, column in distribution_reports:
    print(heading)
    print(exam_df[column].value_counts())

# Rows whose option_count_valid flag is False are reported as invalid MCQs.
print("\n=== Invalid MCQs (≠ 4 options) ===")
invalid_mcqs = exam_df[~exam_df["option_count_valid"]]
print(len(invalid_mcqs))

# === Save Final Output ===
# Write the annotated frame to disk, then hand the file to the browser.
output_path = "difficulty_only.csv"
exam_df.to_csv(output_path, index=False)
files.download(output_path)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai pandas

# === Imports ===
import pandas as pd
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload Difficulty-Classified File ===
uploaded = files.upload()
# pandas' to_csv writes UTF-8 by default, so a difficulty_only.csv saved by
# this notebook is decoded as UTF-8 first. ISO-8859-1 remains the fallback for
# files that are not valid UTF-8.
try:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="utf-8")
except UnicodeDecodeError:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="ISO-8859-1")

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Helper Functions ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "apply" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Apply**" or "**Q3:** Analyze".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step: Bloom's Taxonomy Classification ===
print("\n⏳ Starting Bloom’s taxonomy classification after delay...")
time.sleep(60)

# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 5):
    parsed = {}
    retries = 3
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            break
        except Exception as e:
            # Only quota errors (message contains "429") are retried.
            if "429" not in str(e):
                print(f"Bloom API Error: {e}")
                break
            print("⚠️ Quota limit reached. Retrying after 15s...")
            time.sleep(15)
            retries -= 1
    else:
        # Reached only when every attempt hit the quota limit.
        print("Quota retries exhausted; this batch stays Unclear.")
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            bloom_levels[row_positions[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())

# === Save Final Output ===
exam_df.to_csv("bloom_only.csv", index=False)
files.download("bloom_only.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai pandas

# === Imports ===
import pandas as pd
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re
import time

# === Upload Difficulty-Classified File ===
uploaded = files.upload()
# pandas' to_csv writes UTF-8 by default, so a difficulty_only.csv saved by
# this notebook is decoded as UTF-8 first. ISO-8859-1 remains the fallback for
# files that are not valid UTF-8.
try:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="utf-8")
except UnicodeDecodeError:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="ISO-8859-1")

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")
config = GenerationConfig(temperature=0.0, top_p=1.0, top_k=1)

# === Helper Functions ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "apply" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Apply**" or "**Q3:** Analyze".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in the format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step: Bloom's Taxonomy Classification ===
print("\n⏳ Starting Bloom’s taxonomy classification after delay...")
time.sleep(60)

# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 5):
    parsed = {}
    retries = 3
    while retries > 0:
        try:
            prompt = build_blooms_batch_prompt(batch_df)
            response = gemini_model.generate_content(prompt, generation_config=config)
            parsed = extract_labels(response.text, valid_bloom)
            break
        except Exception as e:
            # Only quota errors (message contains "429") are retried.
            if "429" not in str(e):
                print(f"Bloom API Error: {e}")
                break
            print("⚠️ Quota limit reached. Retrying after 15s...")
            time.sleep(15)
            retries -= 1
    else:
        # Reached only when every attempt hit the quota limit.
        print("Quota retries exhausted; this batch stays Unclear.")
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            bloom_levels[row_positions[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())

# === Save Final Output ===
exam_df.to_csv("bloom_only.csv", index=False)
files.download("bloom_only.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai pandas

# === Imports ===
import pandas as pd
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload Difficulty-Classified File ===
uploaded = files.upload()  # Upload difficulty_only.csv
# pandas' to_csv writes UTF-8 by default, so a difficulty_only.csv saved by
# this notebook is decoded as UTF-8 first. ISO-8859-1 remains the fallback for
# files that are not valid UTF-8.
try:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="utf-8")
except UnicodeDecodeError:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="ISO-8859-1")

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Stable Generation Configuration ===
config = GenerationConfig(
    temperature=0.0,  # No randomness
    top_p=1.0,
    top_k=1
)

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

# === Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "apply" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Apply**" or "**Q3:** Analyze".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Prompt Builder (Bloom) ===
def build_blooms_batch_prompt(df):
    """Return the Bloom's-taxonomy prompt for every row of ``df``.

    The fixed instructions come first, followed by one ``Q<index label>``
    entry per row with its question text and options A-D.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    # The index label doubles as the question ID the model is asked to echo.
    question_blocks = [
        f"\nQ{question_id}: {record['question_text']}\n"
        f"A. {record['option_1']}\nB. {record['option_2']}\nC. {record['option_3']}\nD. {record['option_4']}\n"
        for question_id, record in df.iterrows()
    ]
    return instructions + "".join(question_blocks)

# === Step: Bloom's Taxonomy Classification ===
print("⏳ Starting stable Bloom’s taxonomy classification...")

# Every question starts as "Unclear" and is overwritten only by a label the
# model returns for an ID that belongs to the batch that was just sent.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# Map index labels to list positions so the write below does not rely on
# exam_df having a default 0..n-1 index.
row_positions = {row_id: position for position, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 10):  # consistent batching
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, valid_bloom)
    except Exception as e:
        # Best effort: a failed batch keeps its "Unclear" defaults.
        print(f"Bloom API Error: {e}")
        continue
    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Skip IDs that were not in this batch (invented by the model or
        # echoed from the prompt's "Q23" example). Used as a raw list position
        # they would either raise IndexError or overwrite another question.
        if idx in batch_ids:
            bloom_levels[row_positions[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
print("\n=== Bloom’s Distribution ===")
print(exam_df["bloom_level"].value_counts())

# === Save Final Output ===
exam_df.to_csv("bloom_stable.csv", index=False)
files.download("bloom_stable.csv")


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai pandas

# === Imports ===
import pandas as pd
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload Difficulty-Classified File ===
uploaded = files.upload()  # Upload difficulty_only.csv
# pandas' to_csv writes UTF-8 by default, so a difficulty_only.csv saved by
# this notebook is decoded as UTF-8 first. ISO-8859-1 remains the fallback for
# files that are not valid UTF-8.
try:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="utf-8")
except UnicodeDecodeError:
    exam_df = pd.read_csv("difficulty_only.csv", encoding="ISO-8859-1")

# === Gemini Setup ===
# The API key is taken from the GEMINI_API_KEY environment variable, or typed
# in at a hidden prompt, so the credential is never stored in the notebook.
# Any key that has been committed to a shared notebook should be rotated.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Stable Generation Configuration ===
config = GenerationConfig(
    temperature=0.0,  # No randomness
    top_p=1.0,
    top_k=1
)

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Yield consecutive positional slices of ``df`` with up to ``batch_size`` rows each.

    The last slice may be shorter; an empty ``df`` yields nothing.
    """
    row_count = len(df)
    chunk_starts = range(0, row_count, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in chunk_starts)

# === Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<ID>: <Label>`` lines from a model reply into ``{ID: label}``.

    Args:
        text: Raw reply text; each useful line names a question ID and a label.
        valid_labels: Accepted labels, spelled the way they should be returned.

    Returns:
        dict mapping each integer question ID found to its label. A label is
        matched against ``valid_labels`` ignoring case and returned in the
        ``valid_labels`` spelling; anything else becomes ``"Unclear"``. Lines
        without a ``Q<digits>`` marker are skipped, and when an ID appears
        more than once the last occurrence wins.
    """
    if not text:
        return {}
    # Case-insensitive lookup so replies such as "apply" still count.
    canonical = {str(label).casefold(): label for label in valid_labels}
    results = {}
    for line in text.strip().splitlines():
        # \W* tolerates list bullets and Markdown emphasis around the ID and
        # the label, e.g. "- Q4: **Apply**" or "**Q3:** Analyze".
        match = re.match(r"\W*Q(\d+)\W*(\w+)", line.strip())
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label not in valid_labels:
            label = canonical.get(label.casefold(), "Unclear")
        results[idx] = label
    return results

# === Prompt Builder (Bloom) ===
def build_blooms_batch_prompt(df):
    """Assemble the Bloom's-taxonomy classification prompt for one batch.

    The fixed instructions come first. Each row of ``df`` then contributes a
    ``Q<index label>: <question_text>`` line followed by its four options,
    lettered A-D from the ``option_1`` .. ``option_4`` columns.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    pieces = [instructions]
    for row_id, record in df.iterrows():
        pieces.append(f"\nQ{row_id}: {record['question_text']}\n")
        # Options are lettered in column order: option_1 -> A ... option_4 -> D.
        for position, letter in enumerate("ABCD", start=1):
            option_text = record["option_" + str(position)]
            pieces.append(f"{letter}. {option_text}\n")
    return "".join(pieces)

# === Step: Bloom's Taxonomy Classification ===
print("⏳ Starting stable Bloom’s taxonomy classification...")

# Every question starts as "Unclear"; only rows the model labels are overwritten.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

# Question IDs in the prompt are DataFrame index labels, while `bloom_levels`
# is a plain list addressed by position. Translate explicitly so a label always
# lands on the row it was requested for.
row_position = {row_id: pos for pos, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 10):  # consistent batching
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, valid_bloom)
    except Exception as e:
        # Best-effort: a failed batch keeps its "Unclear" defaults and the run
        # moves on to the next batch.
        print(f"Bloom API Error: {e}")
        continue

    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Only accept IDs that were actually part of this batch. An ID the model
        # invents or echoes from the prompt's "Q23" example would otherwise
        # overwrite an unrelated row or index past the end of the list.
        if idx in batch_ids:
            bloom_levels[row_position[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
# Show how many questions were assigned to each Bloom level.
print("\n=== Bloom’s Distribution ===")
level_counts = exam_df["bloom_level"].value_counts()
print(level_counts)

# === Save Final Output ===
# Write the labelled questions without the index column, then trigger the
# Colab download of that file.
output_csv = "bloom_stable.csv"
exam_df.to_csv(output_csv, index=False)
files.download(output_csv)


In [None]:
# === Install Required Libraries ===
!pip install -q -U google-generativeai pandas

# === Imports ===
import pandas as pd
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.colab import files
import re

# === Upload Difficulty-Classified File ===
# Opens the Colab upload dialog. The read below expects a file named
# difficulty_only.csv, decoded as ISO-8859-1.
uploaded = files.upload()  # Upload difficulty_only.csv
exam_df = pd.read_csv("difficulty_only.csv", encoding="ISO-8859-1")

# === Gemini Setup ===
# The API key is a secret and must never be stored in the notebook itself.
# Read it from the GEMINI_API_KEY environment variable, falling back to a
# hidden interactive prompt when the variable is unset or empty.
import os
from getpass import getpass

gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# === Stable Generation Configuration ===
# Sampling settings chosen to keep repeated runs as repeatable as possible.
config = GenerationConfig(
    temperature=0.0,  # No randomness
    top_p=1.0,
    top_k=1
)

# === Helper: Split DataFrame into Batches ===
def split_df_into_batches(df, batch_size):
    """Lazily yield consecutive row slices of ``df`` of at most ``batch_size`` rows.

    Slices are taken by position with ``iloc``, in order from the first row.
    The last slice holds whatever rows remain; an empty frame yields nothing.
    """
    total_rows = len(df)
    batch_starts = range(0, total_rows, batch_size)
    yield from (df.iloc[start:start + batch_size] for start in batch_starts)

# === Output Parser ===
def extract_labels(text, valid_labels):
    """Parse ``Q<id>: <label>`` lines from a model reply into ``{id: label}``.

    Each line is examined on its own. Punctuation before the ``Q`` and between
    the ID and the label (Markdown bullets, ``**`` markers, brackets, dashes,
    colons) is tolerated, and labels are matched case-insensitively, so
    ``**Q3: apply**`` and ``Q3 - [Apply]`` both produce ``{3: "Apply"}``.

    Args:
        text: Raw reply text. ``None`` or an empty string yields ``{}``.
        valid_labels: Iterable of accepted label strings. A case-insensitive
            match is returned in the spelling given here.

    Returns:
        Dict mapping each integer question ID to its label, or to ``"Unclear"``
        when the word following the ID is not an accepted label. Lines without
        an ID/label pair are skipped; if an ID repeats, the last line wins.
    """
    if not text:
        return {}

    accepted = list(valid_labels)
    canonical = {label.casefold(): label for label in accepted}
    # (?!\d) keeps the ID from backtracking: without it a bare "Q12" would be
    # read as ID 1 followed by the "label" 2.
    line_pattern = re.compile(r"\W*Q(\d+)(?!\d)\W*(\w+)")

    results = {}
    for line in text.splitlines():
        match = line_pattern.match(line)
        if not match:
            continue
        idx, label = int(match.group(1)), match.group(2)
        if label in accepted:
            # Exact spelling is kept as-is, as before.
            results[idx] = label
        else:
            results[idx] = canonical.get(label.casefold(), "Unclear")
    return results

# === Prompt Builder (Bloom) ===
def build_blooms_batch_prompt(df):
    """Assemble the Bloom's-taxonomy classification prompt for one batch.

    The fixed instructions come first. Each row of ``df`` then contributes a
    ``Q<index label>: <question_text>`` line followed by its four options,
    lettered A-D from the ``option_1`` .. ``option_4`` columns.
    """
    instructions = """
You are an educational evaluator. Classify each biology MCQ using Bloom's Revised Taxonomy.

Levels:
- Remember
- Understand
- Apply
- Analyze
- Evaluate
- Create

Respond in this format:
Q[ID]: [Level]

Example:
Q23: Apply
"""
    pieces = [instructions]
    for row_id, record in df.iterrows():
        pieces.append(f"\nQ{row_id}: {record['question_text']}\n")
        # Options are lettered in column order: option_1 -> A ... option_4 -> D.
        for position, letter in enumerate("ABCD", start=1):
            option_text = record["option_" + str(position)]
            pieces.append(f"{letter}. {option_text}\n")
    return "".join(pieces)

# === Step: Bloom's Taxonomy Classification ===
print("⏳ Starting stable Bloom’s taxonomy classification...")

# Every question starts as "Unclear"; only rows the model labels are overwritten.
bloom_levels = ["Unclear"] * len(exam_df)
valid_bloom = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]

# Question IDs in the prompt are DataFrame index labels, while `bloom_levels`
# is a plain list addressed by position. Translate explicitly so a label always
# lands on the row it was requested for.
row_position = {row_id: pos for pos, row_id in enumerate(exam_df.index)}

for batch_df in split_df_into_batches(exam_df, 10):  # consistent batching
    try:
        prompt = build_blooms_batch_prompt(batch_df)
        response = gemini_model.generate_content(prompt, generation_config=config)
        parsed = extract_labels(response.text, valid_bloom)
    except Exception as e:
        # Best-effort: a failed batch keeps its "Unclear" defaults and the run
        # moves on to the next batch.
        print(f"Bloom API Error: {e}")
        continue

    batch_ids = set(batch_df.index)
    for idx, label in parsed.items():
        # Only accept IDs that were actually part of this batch. An ID the model
        # invents or echoes from the prompt's "Q23" example would otherwise
        # overwrite an unrelated row or index past the end of the list.
        if idx in batch_ids:
            bloom_levels[row_position[idx]] = label

exam_df["bloom_level"] = bloom_levels

# === Final Output Summary ===
# Show how many questions were assigned to each Bloom level.
print("\n=== Bloom’s Distribution ===")
level_counts = exam_df["bloom_level"].value_counts()
print(level_counts)

# === Save Final Output ===
# Write the labelled questions without the index column, then trigger the
# Colab download of that file.
output_csv = "bloom_stable.csv"
exam_df.to_csv(output_csv, index=False)
files.download(output_csv)
