In [None]:
# %pip install openai python-dotenv pandas

In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import json
import pandas as pd
from os import getenv

In [2]:
# Load environment variables via python-dotenv; OPENROUTER_API_KEY is read
# with getenv() further down when the API client is created.
load_dotenv()

True

In [3]:
# Load the source material that the MCQs are evaluated against.
context = ""  # remains an empty string if the read below raises
with open("content.txt", mode="r") as context_file:
    context = context_file.read()
    
# print(context)

In [4]:
# Load the learning outcomes used by the alignment evaluation.
assessment_los = ""  # remains an empty string if the read below raises
with open("A1_los.txt", mode="r") as los_file:
    assessment_los = los_file.read()
    
# print(assessment_los)

In [7]:
# mcqs = generate_mcqs(context, assessment_los)

In [5]:
# Read MCQs back from the text file; questions are separated by a blank line.
with open("generated_mcqs.txt", "r") as f:
    mcqs_text = f.read()

# Drop empty AND whitespace-only chunks (e.g. produced by extra trailing
# newlines) so they are never sent to the evaluator as "questions".
# Kept chunks are left untouched, including their trailing spaces.
mcqs_read = [mcq for mcq in mcqs_text.split("\n\n") if mcq.strip()]

In [6]:
# Display the parsed MCQs to check that the file was split as expected.
mcqs_read

['Q1. What is the first step in the six-step problem-solving process?  \nA) Analyse the problem  \nB) Identify the problem  \nC) Implement the solution  \nD) Generate potential solutions  \nCorrect Answer: B  \nDifficulty: Easy  ',
 'Q2. Which of the following best defines wicked problems?  \nA) Problems that have simple solutions.  \nB) Problems that occur regularly and are easy to solve.  \nC) Problems difficult to define, with no clear solutions.  \nD) Problems that always have one correct answer.  \nCorrect Answer: C  \nDifficulty: Medium  ',
 'Q3. Which programming construct is used to repeatedly execute a block of code while a condition is true?  \nA) for loop  \nB) while loop  \nC) if statement  \nD) elif statement  \nCorrect Answer: B  \nDifficulty: Easy  ',
 'Q4. What does the Python function `type()` do?  \nA) Converts a variable into a different type.  \nB) Checks the size of the variable.  \nC) Returns the data type of a variable.  \nD) Assigns a type to the variable.  \nCo

In [7]:
# OpenAI SDK client pointed at the OpenRouter endpoint; the key comes from the
# OPENROUTER_API_KEY environment variable.
client = OpenAI(
    api_key=getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

In [8]:
def evaluate_relevance(mcqs, context, model="deepseek/deepseek-chat:free", llm_client=None):
    """Score each MCQ for relevance to the provided context with an LLM judge.

    One chat-completion request is sent per question. The reply is expected to
    be a JSON object; an optional Markdown code fence around it is removed
    before parsing.

    Args:
        mcqs: Iterable of MCQ strings (each string holds one whole question).
        context: Text the questions are judged against; inserted into the prompt.
        model: Model identifier passed to the chat-completions endpoint.
        llm_client: Optional OpenAI-compatible client. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        pandas.DataFrame with one row per successfully evaluated question and
        the columns 'Question', 'Relevance to Provided Context Score' and
        'Relevance to Provided Context Justification'. Questions whose request
        or JSON parsing failed are reported with print() and left out.
    """
    # Use the injected client when given; otherwise the notebook-level `client`.
    active_client = llm_client if llm_client is not None else client

    # Relevance criterion: binary rubric (0 or 1).
    criteria_config = {
        "name": "Relevance to Provided Context",
        "score_key": "relevance_score",
        "justification_key": "relevance_justification",
        "description": """
        1. Relevance to Provided Context
           - 1 (Good): Directly ties to key concepts, examples, or terminology from the provided context
           - 0 (Poor): Irrelevant or misaligned with the context (e.g., introduces unrelated topics)
        """
    }
    criterion_name = criteria_config["name"]
    score_key = criteria_config["score_key"]
    justification_key = criteria_config["justification_key"]
    rubric = criteria_config["description"]

    # Loop-invariant system message.
    system_message = {
        "role": "system",
        "content": "You are an MCQ evaluation assistant. Always respond with valid JSON in the exact format specified.",
    }

    results = []

    for question in mcqs:
        # Flatten control whitespace, then escape backslashes and quotes so the
        # question can sit inside the JSON template shown in the prompt.
        clean_question = question.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        clean_question = clean_question.replace('\\', '\\\\')
        clean_question = clean_question.replace('"', '\\"')

        # Short summary: text before the first double space, capped at 50 chars.
        # (str.split returns the whole string when the separator is absent.)
        question_summary = clean_question.split('  ')[0]
        if len(question_summary) > 50:
            question_summary = question_summary[:47] + "..."

        # Prompt for the relevance criterion. The score range stated here must
        # match the rubric (0 or 1); it previously said "2 (Good)".
        prompt = f"""
        Evaluate the following MCQ based on the provided Context.
        Your evaluation should follow the Evaluation Rubric below with scores of 0 (Poor) or 1 (Good).
        Return your evaluation in valid JSON format with scores and justifications.
        
        Context: {context}

        MCQ: {clean_question}
        
        Evaluation Rubric:
        {rubric}

        Required JSON format:
        {{
            "question": "{question_summary}",
            "evaluations": {{
                "{score_key}": 0-1,
                "{justification_key}": "justification text"
            }}
        }}
        
        Follow strictly the JSON format and do not add anything extra or markdown.
        Ensure all text is properly escaped for JSON.
        """

        raw_reply = ""
        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[system_message, {"role": "user", "content": prompt}],
            )

            # A missing reply becomes "" so it surfaces as a JSON decode error.
            raw_reply = response.choices[0].message.content or ""

            # Remove an optional Markdown fence. The opening fence (with or
            # without a "json" tag) is cut from the front and the closing fence
            # from the END; replace(..., 1) would remove the first occurrence.
            payload = raw_reply.strip()
            if payload.startswith("```"):
                payload = payload[3:]
                if payload[:4].lower() == "json":
                    payload = payload[4:]
            if payload.endswith("```"):
                payload = payload[:-3]
            payload = payload.strip()

            eval_data = json.loads(payload)
            evaluations = eval_data['evaluations']

            # Store the original (unescaped) question so nothing is lost.
            results.append({
                'Question': question,
                f'{criterion_name} Score': evaluations[score_key],
                f'{criterion_name} Justification': evaluations[justification_key],
            })

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for question: {question[:50]}...\nError: {str(e)}")
            print(f"Raw response: {raw_reply[:100]}...")
            continue
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error evaluating question: {question[:50]}...\nError: {str(e)}")
            continue

    return pd.DataFrame(results)

In [9]:
# Run the relevance evaluation over the loaded MCQs (one LLM request per
# question) and display the resulting DataFrame.
relevance_df = evaluate_relevance(mcqs_read, context)
relevance_df

Unnamed: 0,Question,Relevance to Provided Context Score,Relevance to Provided Context Justification
0,Q1. What is the first step in the six-step pro...,1,The question directly ties to the provided con...
1,Q2. Which of the following best defines wicked...,1,"The MCQ directly ties to the provided context,..."
2,Q3. Which programming construct is used to rep...,1,The question directly ties to the topic of loo...
3,Q4. What does the Python function `type()` do?...,1,The MCQ directly ties to the concept of data t...
4,Q5. Which operator has the highest precedence ...,1,The question directly ties to the Operator Pre...
5,Q6. What does the `input()` function return? ...,1,The question directly ties to key concepts fro...
6,"Q7. In Python, how can a string be enclosed to...",1,"The MCQ directly ties to the context provided,..."
7,Q8. Why is Python considered a strongly typed ...,1,The question directly ties to the provided con...
8,Q9. What will the output of the following code...,1,"The question directly ties to the context, as ..."
9,Q10. Which mode is used to open an existing fi...,1,The question directly ties to the context prov...


In [10]:
# Persist the relevance results; index=False leaves the row index out of the CSV.
relevance_df.to_csv('relevance_evaluation3.csv', index=False)

In [11]:
def evaluate_lo(mcqs, assessment_los, model="deepseek/deepseek-chat:free", llm_client=None):
    """Score each MCQ for alignment with the learning outcomes with an LLM judge.

    One chat-completion request is sent per question. The reply is expected to
    be a JSON object; an optional Markdown code fence around it is removed
    before parsing.

    Args:
        mcqs: Iterable of MCQ strings (each string holds one whole question).
        assessment_los: Learning-outcomes text; inserted into the prompt.
        model: Model identifier passed to the chat-completions endpoint.
        llm_client: Optional OpenAI-compatible client. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        pandas.DataFrame with one row per successfully evaluated question and
        the columns 'Question', 'Alignment with Learning Outcomes Score' and
        'Alignment with Learning Outcomes Justification'. Questions whose
        request or JSON parsing failed are reported with print() and left out.
    """
    # Use the injected client when given; otherwise the notebook-level `client`.
    active_client = llm_client if llm_client is not None else client

    # Learning-outcome alignment criterion: three-level rubric (0, 1 or 2).
    criteria_config = {
        "name": "Alignment with Learning Outcomes",
        "score_key": "learning_outcome_score",
        "justification_key": "learning_outcome_justification",
        "description": """
            2. Alignment with Learning Outcomes
               - 2 (Good): Tests a specific skill/knowledge stated in the learning outcomes
               - 1 (Fair): Vaguely related to outcomes but lacks specificity or depth
               - 0 (Poor): Fails to address any stated learning outcome
            """
    }
    criterion_name = criteria_config["name"]
    score_key = criteria_config["score_key"]
    justification_key = criteria_config["justification_key"]
    rubric = criteria_config["description"]

    # Loop-invariant system message.
    system_message = {
        "role": "system",
        "content": "You are an MCQ evaluation assistant. Always respond with valid JSON in the exact format specified.",
    }

    results = []

    for question in mcqs:
        # Flatten control whitespace, then escape backslashes and quotes so the
        # question can sit inside the JSON template shown in the prompt.
        clean_question = question.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        clean_question = clean_question.replace('\\', '\\\\')
        clean_question = clean_question.replace('"', '\\"')

        # Short summary: text before the first double space, capped at 50 chars.
        # (str.split returns the whole string when the separator is absent.)
        question_summary = clean_question.split('  ')[0]
        if len(question_summary) > 50:
            question_summary = question_summary[:47] + "..."

        # Prompt for the learning-outcome alignment criterion.
        prompt = f"""
        Evaluate the following MCQ based on the Learning Outcomes below.
        Your evaluation should follow the Evaluation Rubric below with scores of 0 (Poor), 1 (Fair), or 2 (Good).
        Return your evaluation in valid JSON format with scores and justifications. 
        In the justification, specify when the MCQ aligns with one or more of the learning outcomes. If so, mention which learning outcomes are relevant.
        
        Learning Outcomes: {assessment_los}

        MCQ: {clean_question}
        
        Evaluation Rubric:
        {rubric}

        Required JSON format:
        {{
            "question": "{question_summary}",
            "evaluations": {{
                "{score_key}": 0-2,
                "{justification_key}": "justification text"
            }}
        }}
        
        Follow strictly the JSON format and do not add anything extra or markdown.
        Ensure all text is properly escaped for JSON.
        """

        raw_reply = ""
        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[system_message, {"role": "user", "content": prompt}],
            )

            # A missing reply becomes "" so it surfaces as a JSON decode error.
            raw_reply = response.choices[0].message.content or ""

            # Remove an optional Markdown fence. The opening fence (with or
            # without a "json" tag) is cut from the front and the closing fence
            # from the END; replace(..., 1) would remove the first occurrence.
            payload = raw_reply.strip()
            if payload.startswith("```"):
                payload = payload[3:]
                if payload[:4].lower() == "json":
                    payload = payload[4:]
            if payload.endswith("```"):
                payload = payload[:-3]
            payload = payload.strip()

            eval_data = json.loads(payload)
            evaluations = eval_data['evaluations']

            # Store the original (unescaped) question so nothing is lost.
            results.append({
                'Question': question,
                f'{criterion_name} Score': evaluations[score_key],
                f'{criterion_name} Justification': evaluations[justification_key],
            })

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for question: {question[:50]}...\nError: {str(e)}")
            print(f"Raw response: {raw_reply[:100]}...")
            continue
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error evaluating question: {question[:50]}...\nError: {str(e)}")
            continue

    return pd.DataFrame(results)

In [12]:
# Run the learning-outcome alignment evaluation (one LLM request per question)
# and display the resulting DataFrame.
lo_df = evaluate_lo(mcqs_read, assessment_los)
lo_df

KeyboardInterrupt: 

In [None]:
# Persist the learning-outcome results without the row index.
# NOTE(review): `lo_df` only exists once the evaluate_lo cell has finished; the
# recorded run of that cell ended in KeyboardInterrupt, so re-run it first.
lo_df.to_csv('lo_evaluation.csv', index=False)

In [None]:
def evaluate_distractor(mcqs, context, model="deepseek/deepseek-chat:free", llm_client=None):
    """Score the distractor quality of each MCQ with an LLM judge.

    One chat-completion request is sent per question. The reply is expected to
    be a JSON object; an optional Markdown code fence around it is removed
    before parsing.

    Args:
        mcqs: Iterable of MCQ strings (each string holds one whole question).
        context: Source text; inserted into the prompt.
        model: Model identifier passed to the chat-completions endpoint.
        llm_client: Optional OpenAI-compatible client. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        pandas.DataFrame with one row per successfully evaluated question and
        the columns 'Question', 'Distractor Quality Score' and
        'Distractor Quality Justification'. Questions whose request or JSON
        parsing failed are reported with print() and left out.
    """
    # Use the injected client when given; otherwise the notebook-level `client`.
    active_client = llm_client if llm_client is not None else client

    # Distractor-quality criterion: three-level rubric (0, 1 or 2).
    criteria_config = {
        "name": "Distractor Quality",
        "score_key": "distractor_score",
        "justification_key": "distractor_justification",
        "description": """
            3. Distractor Quality
               - 2 (Good): Distractors are plausible and reflect common misconceptions/mistakes
               - 1 (Fair): Some distractors are too obvious or lack real-world relevance
               - 0 (Poor): Distractors are illogical, nonsensical, or non-functional
            """
    }
    criterion_name = criteria_config["name"]
    score_key = criteria_config["score_key"]
    justification_key = criteria_config["justification_key"]
    rubric = criteria_config["description"]

    # Loop-invariant system message.
    system_message = {
        "role": "system",
        "content": "You are an MCQ evaluation assistant. Always respond with valid JSON in the exact format specified.",
    }

    results = []

    for question in mcqs:
        # Flatten control whitespace, then escape backslashes and quotes so the
        # question can sit inside the JSON template shown in the prompt.
        clean_question = question.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        clean_question = clean_question.replace('\\', '\\\\')
        clean_question = clean_question.replace('"', '\\"')

        # Short summary: text before the first double space, capped at 50 chars.
        # (str.split returns the whole string when the separator is absent.)
        question_summary = clean_question.split('  ')[0]
        if len(question_summary) > 50:
            question_summary = question_summary[:47] + "..."

        # Prompt for the distractor-quality criterion. The JSON template must
        # advertise the same 0-2 range as the rubric; it previously said 0-1.
        prompt = f"""
        Evaluate the following MCQ based on the following Evaluation Rubric below with scores of 0 (Poor), 1 (Fair), or 2 (Good).
        Return your evaluation in valid JSON format with scores and justifications.
        
        Context: {context}

        MCQ: {clean_question}
        
        Evaluation Rubric:
        {rubric}

        Required JSON format:
        {{
            "question": "{question_summary}",
            "evaluations": {{
                "{score_key}": 0-2,
                "{justification_key}": "justification text"
            }}
        }}
        
        Follow strictly the JSON format and do not add anything extra or markdown.
        Ensure all text is properly escaped for JSON.
        """

        raw_reply = ""
        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[system_message, {"role": "user", "content": prompt}],
            )

            # A missing reply becomes "" so it surfaces as a JSON decode error.
            raw_reply = response.choices[0].message.content or ""

            # Remove an optional Markdown fence. The opening fence (with or
            # without a "json" tag) is cut from the front and the closing fence
            # from the END; replace(..., 1) would remove the first occurrence.
            payload = raw_reply.strip()
            if payload.startswith("```"):
                payload = payload[3:]
                if payload[:4].lower() == "json":
                    payload = payload[4:]
            if payload.endswith("```"):
                payload = payload[:-3]
            payload = payload.strip()

            eval_data = json.loads(payload)
            evaluations = eval_data['evaluations']

            # Store the original (unescaped) question so nothing is lost.
            results.append({
                'Question': question,
                f'{criterion_name} Score': evaluations[score_key],
                f'{criterion_name} Justification': evaluations[justification_key],
            })

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for question: {question[:50]}...\nError: {str(e)}")
            print(f"Raw response: {raw_reply[:100]}...")
            continue
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error evaluating question: {question[:50]}...\nError: {str(e)}")
            continue

    return pd.DataFrame(results)

In [None]:
def evaluate_clarity(mcqs, context, model="deepseek/deepseek-chat:free", llm_client=None):
    """Score the clarity and precision of each MCQ with an LLM judge.

    One chat-completion request is sent per question. The reply is expected to
    be a JSON object; an optional Markdown code fence around it is removed
    before parsing.

    Args:
        mcqs: Iterable of MCQ strings (each string holds one whole question).
        context: Source text; inserted into the prompt.
        model: Model identifier passed to the chat-completions endpoint.
        llm_client: Optional OpenAI-compatible client. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        pandas.DataFrame with one row per successfully evaluated question and
        the columns 'Question', 'Clarity and Precision Score' and
        'Clarity and Precision Justification'. Questions whose request or JSON
        parsing failed are reported with print() and left out.
    """
    # Use the injected client when given; otherwise the notebook-level `client`.
    active_client = llm_client if llm_client is not None else client

    # Clarity criterion: three-level rubric (0, 1 or 2).
    criteria_config = {
        "name": "Clarity and Precision",
        "score_key": "clarity_score",
        "justification_key": "clarity_justification",
        "description": """
        4. Clarity and Precision
            - 2 (Good): Unambiguous wording, code examples are syntactically correct, and only one correct answer
            - 1 (Fair): Minor ambiguities or typos, but answerable
            - 0 (Poor): Confusing wording, code errors, or multiple valid answers
        """
    }
    criterion_name = criteria_config["name"]
    score_key = criteria_config["score_key"]
    justification_key = criteria_config["justification_key"]
    rubric = criteria_config["description"]

    # Loop-invariant system message.
    system_message = {
        "role": "system",
        "content": "You are an MCQ evaluation assistant. Always respond with valid JSON in the exact format specified.",
    }

    results = []

    for question in mcqs:
        # Flatten control whitespace, then escape backslashes and quotes so the
        # question can sit inside the JSON template shown in the prompt.
        clean_question = question.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        clean_question = clean_question.replace('\\', '\\\\')
        clean_question = clean_question.replace('"', '\\"')

        # Short summary: text before the first double space, capped at 50 chars.
        # (str.split returns the whole string when the separator is absent.)
        question_summary = clean_question.split('  ')[0]
        if len(question_summary) > 50:
            question_summary = question_summary[:47] + "..."

        # Prompt for the clarity criterion. The JSON template must advertise
        # the same 0-2 range as the rubric; it previously said 0-1.
        prompt = f"""
        Evaluate the following MCQ based on the following Evaluation Rubric below with scores of 0 (Poor), 1 (Fair), or 2 (Good).
        Return your evaluation in valid JSON format with scores and justifications.
        
        Context: {context}

        MCQ: {clean_question}
        
        Evaluation Rubric:
        {rubric}

        Required JSON format:
        {{
            "question": "{question_summary}",
            "evaluations": {{
                "{score_key}": 0-2,
                "{justification_key}": "justification text"
            }}
        }}
        
        Follow strictly the JSON format and do not add anything extra or markdown.
        Ensure all text is properly escaped for JSON.
        """

        raw_reply = ""
        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[system_message, {"role": "user", "content": prompt}],
            )

            # A missing reply becomes "" so it surfaces as a JSON decode error.
            raw_reply = response.choices[0].message.content or ""

            # Remove an optional Markdown fence. The opening fence (with or
            # without a "json" tag) is cut from the front and the closing fence
            # from the END; replace(..., 1) would remove the first occurrence.
            payload = raw_reply.strip()
            if payload.startswith("```"):
                payload = payload[3:]
                if payload[:4].lower() == "json":
                    payload = payload[4:]
            if payload.endswith("```"):
                payload = payload[:-3]
            payload = payload.strip()

            eval_data = json.loads(payload)
            evaluations = eval_data['evaluations']

            # Store the original (unescaped) question so nothing is lost.
            results.append({
                'Question': question,
                f'{criterion_name} Score': evaluations[score_key],
                f'{criterion_name} Justification': evaluations[justification_key],
            })

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for question: {question[:50]}...\nError: {str(e)}")
            print(f"Raw response: {raw_reply[:100]}...")
            continue
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error evaluating question: {question[:50]}...\nError: {str(e)}")
            continue

    return pd.DataFrame(results)

In [None]:
def evaluate_difficulty(mcqs, context, model="deepseek/deepseek-chat:free", llm_client=None):
    """Grade the difficulty of each MCQ (Easy / Medium / Difficult) with an LLM judge.

    One chat-completion request is sent per question. The reply is expected to
    be a JSON object; an optional Markdown code fence around it is removed
    before parsing.

    Args:
        mcqs: Iterable of MCQ strings (each string holds one whole question).
        context: Source text; inserted into the prompt.
        model: Model identifier passed to the chat-completions endpoint.
        llm_client: Optional OpenAI-compatible client. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        pandas.DataFrame with one row per successfully evaluated question and
        the columns 'Question', 'Difficulty Level Score' (the grade returned
        by the model) and 'Difficulty Level Justification'. Questions whose
        request or JSON parsing failed are reported with print() and left out.
    """
    # Use the injected client when given; otherwise the notebook-level `client`.
    active_client = llm_client if llm_client is not None else client

    # Difficulty criterion: categorical grade rather than a numeric score.
    criteria_config = {
        "name": "Difficulty Level",
        "score_key": "difficulty_level",
        "justification_key": "difficulty_justification",
        "description": """
        5. Difficulty Level (assign one of these categories and explain your reasoning)
            - Difficult: Promotes high-level thinking, problem-solving, or evaluation
            - Medium: Requires application of knowledge in familiar scenarios, may involve debugging simple code or interpreting logic
            - Easy: Tests basic recall or straightforward application
        """
    }
    criterion_name = criteria_config["name"]
    score_key = criteria_config["score_key"]
    justification_key = criteria_config["justification_key"]
    rubric = criteria_config["description"]

    # Loop-invariant system message.
    system_message = {
        "role": "system",
        "content": "You are an MCQ evaluation assistant. Always respond with valid JSON in the exact format specified.",
    }

    results = []

    for question in mcqs:
        # Flatten control whitespace, then escape backslashes and quotes so the
        # question can sit inside the JSON template shown in the prompt.
        clean_question = question.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        clean_question = clean_question.replace('\\', '\\\\')
        clean_question = clean_question.replace('"', '\\"')

        # Short summary: text before the first double space, capped at 50 chars.
        # (str.split returns the whole string when the separator is absent.)
        question_summary = clean_question.split('  ')[0]
        if len(question_summary) > 50:
            question_summary = question_summary[:47] + "..."

        # Prompt for the difficulty criterion. The JSON template must ask for
        # one of the rubric's categories; it previously asked for a numeric
        # 0-1 value, contradicting the Easy/Medium/Difficult instruction.
        prompt = f"""
        Evaluate the following MCQ based on the following Evaluation Rubric below with grades of Easy, Medium, or Difficult.
        Return your evaluation in valid JSON format with grades and justifications.
        
        Context: {context}

        MCQ: {clean_question}
        
        Evaluation Rubric:
        {rubric}

        Required JSON format:
        {{
            "question": "{question_summary}",
            "evaluations": {{
                "{score_key}": "Easy | Medium | Difficult",
                "{justification_key}": "justification text"
            }}
        }}
        
        Follow strictly the JSON format and do not add anything extra or markdown.
        Ensure all text is properly escaped for JSON.
        """

        raw_reply = ""
        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[system_message, {"role": "user", "content": prompt}],
            )

            # A missing reply becomes "" so it surfaces as a JSON decode error.
            raw_reply = response.choices[0].message.content or ""

            # Remove an optional Markdown fence. The opening fence (with or
            # without a "json" tag) is cut from the front and the closing fence
            # from the END; replace(..., 1) would remove the first occurrence.
            payload = raw_reply.strip()
            if payload.startswith("```"):
                payload = payload[3:]
                if payload[:4].lower() == "json":
                    payload = payload[4:]
            if payload.endswith("```"):
                payload = payload[:-3]
            payload = payload.strip()

            eval_data = json.loads(payload)
            evaluations = eval_data['evaluations']

            # Store the original (unescaped) question so nothing is lost.
            results.append({
                'Question': question,
                f'{criterion_name} Score': evaluations[score_key],
                f'{criterion_name} Justification': evaluations[justification_key],
            })

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for question: {question[:50]}...\nError: {str(e)}")
            print(f"Raw response: {raw_reply[:100]}...")
            continue
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error evaluating question: {question[:50]}...\nError: {str(e)}")
            continue

    return pd.DataFrame(results)