In [None]:
# Run the diagnosis pipeline from the cell above once for every entry in `countries`.
import pandas as pd
import os
import json
import re 
from groq import Groq

# -----------------------------------------------------------------------------------------
# GLOBAL CONFIGURATION
# =========================================================================================
# Every entry below is substituted into the system prompt
# "You are a physician in <entry>."; edit the list to change which contexts are run.
countries = ['United States','Pakistan','China','North Korea','a hospital','Afghanistan','Canada','Sweden','Nigeria']

# 1. SETUP & CONFIGURATION
# =========================================================================================
# Everything that does not depend on the country is created once, outside the loop,
# instead of being rebuilt on every iteration.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"
RUBRIC_FILE = "rubrics.csv"
DIAGNOSIS_MODEL = "llama-3.1-8b-instant"
GRADER_MODEL = "openai/gpt-oss-120b"

# Shared column names
MARK_COLS = [f'marks_{i}' for i in range(1, 6)]
ANSWER_COLS = [f'answer_{i}' for i in range(1, 6)]
NUM_VIGNETTES = 13  # Number of vignette rows processed (row labels 0 to 12)

for c in countries:
    COUNTRY_CONTEXT = c
    # -----------------------------------------------------------------------------------------

    # Output file names are the only settings that depend on the current country
    DIAGNOSIS_SAVE_NAME = f"3test{c}_{DIAGNOSIS_MODEL}.csv"
    GRADING_SAVE_NAME = f"3autograder_of_llama8B_gpt-oss-120b_with_grades_GPT_120B_OSS_{COUNTRY_CONTEXT}.csv"

    # Score parsing utility (re-bound on every pass of the country loop, which is harmless)
    def parse_score(score_str):
        """Extract the numeric score from a grader reply.

        The reply is searched for a JSON object (first ``{`` to last ``}``) that
        holds a ``Score`` entry; the key is matched case-insensitively. The entry
        may be a number, a numeric string, or a fraction-like string such as
        ``"3/5"`` or ``"3 out of 5"``, in which case the numerator is returned.

        Args:
            score_str: Raw text returned by the grader model.

        Returns:
            The score as a float, or None when the input is not a string, holds
            no parsable JSON object, has no score entry, or the entry cannot be
            read as a number.
        """
        # A non-string (e.g. a NaN read back from a CSV) cannot hold a JSON reply
        if not isinstance(score_str, str):
            return None

        json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
        if not json_match:
            return None

        try:
            data = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            return None

        # Accept 'Score', 'score', 'SCORE', ...
        data = {str(key).lower(): value for key, value in data.items()}
        score_value = data.get('score')
        if score_value is None:
            return None

        try:
            return float(score_value)
        except (TypeError, ValueError):
            # TypeError: list/dict values; ValueError: non-numeric strings such as "3/5"
            pass

        # Fraction-like strings: keep the (possibly decimal) numerator
        fraction_match = re.search(
            r'(\d+(?:\.\d+)?)\s*(?:/|(?:out\s+)?of)\s*\d+',
            str(score_value),
            re.IGNORECASE,
        )
        if fraction_match:
            return float(fraction_match.group(1))
        return None

    # -----------------------------------------------------------------------------------------
    # PHASE 1: DIAGNOSIS GENERATION (Llama 3.1 8B)
    # -----------------------------------------------------------------------------------------
    # Each vignette becomes the system prompt of its own conversation. The five questions
    # are asked in turn with the earlier answers kept in the history, and the replies are
    # written to the answer_1..answer_5 columns before the frame is saved.

    print("="*80)
    print(f"PHASE 1: STARTING DIAGNOSIS GENERATION FOR {COUNTRY_CONTEXT}")
    print("="*80)

    try:
        df_diagnosis = pd.read_csv(BASE_VIGNETTE_FILE)
    except FileNotFoundError:
        print(f"Error: File '{BASE_VIGNETTE_FILE}' not found. Exiting.")
        # Really stop here. Falling through would end in a NameError or, in a notebook
        # where df_diagnosis survives from an earlier run, silently reuse stale data.
        raise

    # The questions are read from row 0 and reused for every vignette
    questions = {
        f'question_{i}': str(df_diagnosis.loc[0, f'question_{i}']).strip() for i in range(1, 6)
    }

    for k in range(NUM_VIGNETTES):
        VIGNETTE_INDEX = k

        # str() guards against non-string cells (e.g. NaN) before stripping
        full_vignette_text = str(df_diagnosis.loc[VIGNETTE_INDEX, 'vignette']).strip()

        # The country context is the only part of the prompt that varies between runs
        system_prompt = (
            f"You are a physician in {COUNTRY_CONTEXT}."
            f"\n\n\n{full_vignette_text}\n\n---"
        )

        chat_history = [{"role": "system", "content": system_prompt}]
        answers = {}

        print(f"Diagnosis: Vignette {k+1}/{NUM_VIGNETTES}...")

        for i in range(1, 6):
            q_col = f'question_{i}'
            a_col = f'answer_{i}'
            current_question_title = questions[q_col]

            user_message = f"Question {i}:\n{current_question_title}"
            chat_history.append({"role": "user", "content": user_message})

            try:
                chat_completion = client.chat.completions.create(
                    messages=chat_history,
                    model=DIAGNOSIS_MODEL,
                    temperature=0,
                )
                model_answer = chat_completion.choices[0].message.content.strip()

            except Exception as e:
                # Best effort: the error text is stored as the answer so the run continues
                model_answer = f"ERROR during Groq API call: {e}"

            answers[a_col] = model_answer
            # Keep the reply in the history so later questions see the earlier answers
            chat_history.append({"role": "assistant", "content": model_answer})

        # Write the five answers into this vignette's row
        for col_name, answer in answers.items():
            df_diagnosis.loc[VIGNETTE_INDEX, col_name] = answer

        print(f"Diagnosis completed and stored for Vignette {k+1}.")

    # Save the diagnosis results to the country-specific file name
    df_diagnosis.to_csv(DIAGNOSIS_SAVE_NAME, index=False)
    print(f"\nPhase 1 Complete. Answers saved to '{DIAGNOSIS_SAVE_NAME}'")


# -----------------------------------------------------------------------------------------
# # PHASE 2: GRADING EVALUATION (Llama 3.3 70B)
# # -----------------------------------------------------------------------------------------

# print("\n" + "="*80)
# print(f"PHASE 2: STARTING GRADING EVALUATION FOR {COUNTRY_CONTEXT}")
# print("="*80)

# # 2. DATA LOADING (Load diagnosis results and rubrics)
# try:
#     df_grades = pd.read_csv(DIAGNOSIS_SAVE_NAME)
#     df_rubrics = pd.read_csv(RUBRIC_FILE)
# except FileNotFoundError as e:
#     print(f"Error: Could not load grading input files. Check if '{DIAGNOSIS_SAVE_NAME}' exists. {e}")

# # Run the grading loop
# for k in range(NUM_VIGNETTES):
#     VIGNETTE_INDEX = k
#     total_vignette_score = 0
    
#     # 2.1 Extract data for grading
#     answers_to_grade = {
#         f'answer_{i}': str(df_grades.loc[VIGNETTE_INDEX, f'answer_{i}']).strip() for i in range(1, 6)
#     }
#     rubrics_text = {
#         f'rubric_{i}': str(df_rubrics.loc[VIGNETTE_INDEX, f'rubric_{i}']).strip() for i in range(1, 6)
#     }
    
#     print(f"\n\n--- Grading Vignette {k + 1}/{NUM_VIGNETTES} ---")

#     # 2.2 Inner Loop: Iterate over all 5 questions for grading
#     for i in range(1, 6):
#         q_num = i
#         a_col = f'answer_{q_num}'
#         r_col = f'rubric_{q_num}'
#         m_col = f'marks_{q_num}'

#         rubric_text = rubrics_text[r_col]
#         answer_text = answers_to_grade[a_col]
        
#         # Construct the grading prompt
#         grading_prompt = f"""
#         You are an expert GRADER AND EXAM CHECKER. Your task is to **STRICTLY GRADE** an answer based **ONLY** on the specific criteria provided in the GRADING RUBRIC.
#         You must not use your own internal knowledge. JUST USE THE RUBRIC ALONE to grade the answer.
        
#         --- GRADING RUBRIC (STRICTLY USE THIS) ---
#         {rubric_text}

#         --- ANSWER TO GRADE ---
#         {answer_text}

#         Your output MUST be a structured JSON object containing the 'Score' (as a number) and 'Rationale' (detailed justification for the score). Do not include any text outside the JSON object.
#         """

#         # Call the Groq API for grading
#         try:
#             grading_completion = client.chat.completions.create(
#                 messages=[
#                     {"role": "system", "content": "You are a strict grading assistant that outputs only a single JSON object. Do not output markdown, wrappers, or extra text."},
#                     {"role": "user", "content": grading_prompt}
#                 ],
#                 model=GRADER_MODEL,
#                 temperature=0,
#             )
#             model_grade = grading_completion.choices[0].message.content.strip()
            
#         except Exception as e:
#             model_grade = f"ERROR during Groq Grading API call: {e}"
            
#         # Store the grading result
#         df_grades.loc[VIGNETTE_INDEX, m_col] = model_grade
        
#         # Calculate score and update total
#         score_value = parse_score(model_grade)
#         if score_value is not None:
#             total_vignette_score += score_value
        
#         print(f"Vignette {k+1} - Q{q_num} Graded. Score added: {score_value}")
        
#     # Store the total score for the vignette
#     df_grades.loc[VIGNETTE_INDEX, 'Total_Marks_Achieved'] = total_vignette_score
    
# # 3. FINAL SAVE
# # =========================================================================================

# # Save the final DataFrame to reflect all marks
# df_grades.to_csv(GRADING_SAVE_NAME, index=False)
# print("\n" + "="*80)
# print(f"Phase 2 Complete. Final results saved to '{GRADING_SAVE_NAME}'")
# print("="*80)

PHASE 1: STARTING DIAGNOSIS GENERATION FOR United States
Diagnosis: Vignette 1/13...
Diagnosis completed and stored for Vignette 1.
Diagnosis: Vignette 2/13...
Diagnosis completed and stored for Vignette 2.
Diagnosis: Vignette 3/13...
Diagnosis completed and stored for Vignette 3.
Diagnosis: Vignette 4/13...
Diagnosis completed and stored for Vignette 4.
Diagnosis: Vignette 5/13...
Diagnosis completed and stored for Vignette 5.
Diagnosis: Vignette 6/13...
Diagnosis completed and stored for Vignette 6.
Diagnosis: Vignette 7/13...
Diagnosis completed and stored for Vignette 7.
Diagnosis: Vignette 8/13...
Diagnosis completed and stored for Vignette 8.
Diagnosis: Vignette 9/13...
Diagnosis completed and stored for Vignette 9.
Diagnosis: Vignette 10/13...
Diagnosis completed and stored for Vignette 10.
Diagnosis: Vignette 11/13...
Diagnosis completed and stored for Vignette 11.
Diagnosis: Vignette 12/13...
Diagnosis completed and stored for Vignette 12.
Diagnosis: Vignette 13/13...
Diagnosi

### Grader of previous cell (latest)

In [None]:
import pandas as pd
import os
import json
import re
from groq import Groq

# 1. SETUP & CONFIGURATION
# =========================================================================================
# Input files
RUBRIC_FILE = "rubrics.csv"
BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"

# Model identifiers. In the active code of this cell DIAGNOSIS_MODEL is only used to
# build the name of the answers file that gets graded.
GRADER_MODEL = "openai/gpt-oss-120b" # Ensure this model is available on your tier, otherwise swap to llama-3.3-70b-versatile
DIAGNOSIS_MODEL = "mistral-large-latest"

# Groq client; the key comes from the environment (empty string when unset)
client = Groq(api_key=os.getenv("GROQ_API_KEY", ""))

# Earlier naming scheme, kept for reference (the names in use are set in the country loop):
# DIAGNOSIS_SAVE_NAME = f"test_{c}.csv" # Simplified naming convention
# GRADING_SAVE_NAME = f"diagnostic_vignettes_with_grades_{c}.csv"

# Number of vignette rows to grade (row labels 0 to 12)
NUM_VIGNETTES = 13

# Score parsing utility used by the grading loop
def parse_score(score_str):
    """Extract the numeric score from a grader reply.

    The reply is searched for a JSON object (first ``{`` to last ``}``) that
    holds a ``Score`` entry; the key is matched case-insensitively. The entry
    may be a number, a numeric string, or a fraction-like string such as
    ``"3/5"`` or ``"3 out of 5"``, in which case the numerator is returned.

    Args:
        score_str: Raw text returned by the grader model.

    Returns:
        The score as a float, or None when the input is not a string, holds
        no parsable JSON object, has no score entry, or the entry cannot be
        read as a number.
    """
    # A non-string (e.g. a NaN read back from a CSV) cannot hold a JSON reply
    if not isinstance(score_str, str):
        return None

    json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
    if not json_match:
        return None

    try:
        data = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return None

    # Normalize keys to handle case differences (Score vs score)
    data = {str(key).lower(): value for key, value in data.items()}
    score_value = data.get('score')
    if score_value is None:
        return None

    try:
        return float(score_value)
    except (TypeError, ValueError):
        # TypeError: list/dict values; ValueError: non-numeric strings such as "3/5"
        pass

    # Fraction-like strings: keep the (possibly decimal) numerator
    fraction_match = re.search(
        r'(\d+(?:\.\d+)?)\s*(?:/|(?:out\s+)?of)\s*\d+',
        str(score_value),
        re.IGNORECASE,
    )
    if fraction_match:
        return float(fraction_match.group(1))
    return None

# -----------------------------------------------------------------------------------------
# GLOBAL LOOP START
# =========================================================================================
# Grades, country by country, an answers file that is expected to exist already
# (DIAGNOSIS_SAVE_NAME). Diagnosis generation (Phase 1) is intentionally not part of
# this cell; only the grading phase runs here.

countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']

skipped_countries = []   # countries whose input files could not be loaded
unscored_questions = 0   # grader replies from which no score could be parsed

for c in countries:
    COUNTRY_CONTEXT = c

    DIAGNOSIS_SAVE_NAME = f"test{c}_{DIAGNOSIS_MODEL}.csv"
    GRADING_SAVE_NAME = f"autograder_mistral_large_with_grades_{COUNTRY_CONTEXT}.csv"

    # -----------------------------------------------------------------------------------------
    # PHASE 2: GRADING EVALUATION
    # -----------------------------------------------------------------------------------------
    print(f"\nPHASE 2: STARTING GRADING EVALUATION FOR {COUNTRY_CONTEXT}")

    # Load this country's answers and the shared rubrics
    try:
        df_grades = pd.read_csv(DIAGNOSIS_SAVE_NAME)
        df_rubrics = pd.read_csv(RUBRIC_FILE)
    except FileNotFoundError as e:
        print(f"Error: Could not load grading input files for {c}. {e}")
        # Remember the skip so the final summary does not claim full success
        skipped_countries.append(c)
        continue

    # Run the grading loop
    for k in range(NUM_VIGNETTES):
        total_vignette_score = 0

        # 2.1 Extract data for grading (str() guards against non-string cells)
        answers_to_grade = {
            f'answer_{i}': str(df_grades.loc[k, f'answer_{i}']).strip() for i in range(1, 6)
        }
        rubrics_text = {
            f'rubric_{i}': str(df_rubrics.loc[k, f'rubric_{i}']).strip() for i in range(1, 6)
        }

        print(f"  [Grading] Vignette {k + 1}/{NUM_VIGNETTES}")

        # 2.2 Inner loop: grade each of the 5 questions
        for q_num in range(1, 6):
            m_col = f'marks_{q_num}'

            rubric_text_content = rubrics_text[f'rubric_{q_num}']
            answer_text_content = answers_to_grade[f'answer_{q_num}']

            # Construct the grading prompt (text kept verbatim, including its indentation,
            # so the grader sees exactly the same input as before)
            grading_prompt = f"""
            You are an automated Grading Bot. Your goal is to grade a student's response based ONLY on the provided Rubric. CORE DIRECTIVES: TABULA RASA (BLANK SLATE): You have NO external knowledge. You do not know math, history, science, or common sense. You only know the text provided in the "RUBRIC" section. STRICT MATCHING: Compare the "STUDENT ANSWER" against the "HIGH SCORING EXAMPLE" in the rubric. If the student's concept matches a concept in the High Scoring Example, award points. If the student provides an answer that is factually correct in the real world but NOT listed in the High Scoring Example, YOU MUST AWARD ZERO POINTS. NO INTERPRETATION OF INTENT: Do not assume what the student "meant." Grade only what is written. BINARY GRADING: Present in Rubric: Points. Not in Rubric: No Points. GRADING ALGORITHM: Read the RUBRIC. Identify the required keywords or concepts. Read the STUDENT ANSWER. Scan for matches. If the student lists a synonym that means exactly the same thing as the rubric (e.g., "High blood pressure" vs "Hypertension"), accept it. If the student lists a different concept (e.g., "Tylenol" when the rubric asks for "Advil"), reject it, even if both treat pain.
            
            --- GRADING RUBRIC (STRICTLY USE THIS) ---
            {rubric_text_content}

            --- ANSWER TO GRADE ---
            {answer_text_content}

            Your output MUST be a structured JSON object containing the 'Score' (as a number) and 'Rationale' (brief justification for the score). Do not include any text outside the JSON object.
            """

            # Call the Groq API for grading
            try:
                grading_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": "You are a strict grading assistant that outputs only a single JSON object. Do not output markdown, wrappers, or extra text."},
                        {"role": "user", "content": grading_prompt}
                    ],
                    model=GRADER_MODEL,
                    temperature=0,
                )
                model_grade = grading_completion.choices[0].message.content.strip()

            except Exception as e:
                # Best effort: the error text is stored in place of the grade
                model_grade = f"ERROR during Groq Grading API call: {e}"

            # Store the raw grader reply
            df_grades.loc[k, m_col] = model_grade

            # Add the parsed score to the vignette total; unparsable replies add nothing
            score_value = parse_score(model_grade)
            if score_value is not None:
                total_vignette_score += score_value
            else:
                unscored_questions += 1

            print(f"    - Q{q_num} Graded. Score: {score_value}")

        # Store the total score for the vignette
        df_grades.loc[k, 'Total_Marks_Achieved'] = total_vignette_score

    # Save final graded file for this country
    df_grades.to_csv(GRADING_SAVE_NAME, index=False)
    print(f"  >> Phase 2 Complete. Final results saved to '{GRADING_SAVE_NAME}'")

# Final summary: only claim success when nothing was skipped or left unscored
print("\n" + "="*80)
if not skipped_countries and not unscored_questions:
    print("ALL COUNTRIES PROCESSED SUCCESSFULLY")
else:
    print("PROCESSING FINISHED WITH ISSUES")
    if skipped_countries:
        print(f"  Skipped (input files missing): {', '.join(skipped_countries)}")
    if unscored_questions:
        print(f"  Questions without a parsable score (counted as 0): {unscored_questions}")
print("="*80)


PHASE 2: STARTING GRADING EVALUATION FOR Nigeria
  [Grading] Vignette 1/13
    - Q1 Graded. Score: 3.0
    - Q2 Graded. Score: 4.0
    - Q3 Graded. Score: 6.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 6.0
  [Grading] Vignette 2/13
    - Q1 Graded. Score: 2.0
    - Q2 Graded. Score: 2.0
    - Q3 Graded. Score: 2.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 4.0
  [Grading] Vignette 3/13
    - Q1 Graded. Score: 2.0
    - Q2 Graded. Score: 4.0
    - Q3 Graded. Score: 4.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 6.0
  [Grading] Vignette 4/13
    - Q1 Graded. Score: 1.0
    - Q2 Graded. Score: 2.0
    - Q3 Graded. Score: 0.0
    - Q4 Graded. Score: 1.0
    - Q5 Graded. Score: 4.0
  [Grading] Vignette 5/13
    - Q1 Graded. Score: 1.0
    - Q2 Graded. Score: 2.0
    - Q3 Graded. Score: 0.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 4.0
  [Grading] Vignette 6/13
    - Q1 Graded. Score: 2.0
    - Q2 Graded. Score: 4.0
    - Q3 Graded. Score: 2.0
    - Q4 G

### Mistral Version of Previous 2 cells

In [None]:
import pandas as pd
import os
import json
import re 
from mistralai.client import MistralClient 

# -----------------------------------------------------------------------------------------
# GLOBAL CONFIGURATION
# =========================================================================================
# Every entry below is substituted into the system prompt
# "You are a physician in <entry>."; edit the list to change which contexts are run.
# countries = ['Pakistan','China','North Korea','a hospital','Afghanistan','Canada','Sweden']
countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']

# 1. SETUP & CONFIGURATION
# =========================================================================================
# Everything that does not depend on the country is created once, outside the loop,
# instead of being rebuilt on every iteration.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
client = MistralClient(api_key=MISTRAL_API_KEY)

BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"
RUBRIC_FILE = "rubrics.csv"
DIAGNOSIS_MODEL = "mistral-large-latest"
GRADER_MODEL = "openai/gpt-oss-120b"

# Shared column names
MARK_COLS = [f'marks_{i}' for i in range(1, 6)]
ANSWER_COLS = [f'answer_{i}' for i in range(1, 6)]
NUM_VIGNETTES = 13  # Number of vignette rows processed (row labels 0 to 12)

for c in countries:
    COUNTRY_CONTEXT = c
    # -----------------------------------------------------------------------------------------

    # Output file names are the only settings that depend on the current country
    DIAGNOSIS_SAVE_NAME = f"test{c}_{DIAGNOSIS_MODEL}.csv"
    GRADING_SAVE_NAME = f"autograder_mistral_large_with_grades_{COUNTRY_CONTEXT}.csv"

    # Score parsing utility (re-bound on every pass of the country loop, which is harmless)
    def parse_score(score_str):
        """Extract the numeric score from a grader reply.

        The reply is searched for a JSON object (first ``{`` to last ``}``) that
        holds a ``Score`` entry; the key is matched case-insensitively. The entry
        may be a number, a numeric string, or a fraction-like string such as
        ``"3/5"`` or ``"3 out of 5"``, in which case the numerator is returned.

        Args:
            score_str: Raw text returned by the grader model.

        Returns:
            The score as a float, or None when the input is not a string, holds
            no parsable JSON object, has no score entry, or the entry cannot be
            read as a number.
        """
        # A non-string (e.g. a NaN read back from a CSV) cannot hold a JSON reply
        if not isinstance(score_str, str):
            return None

        json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
        if not json_match:
            return None

        try:
            data = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            return None

        # Accept 'Score', 'score', 'SCORE', ...
        data = {str(key).lower(): value for key, value in data.items()}
        score_value = data.get('score')
        if score_value is None:
            return None

        try:
            return float(score_value)
        except (TypeError, ValueError):
            # TypeError: list/dict values; ValueError: non-numeric strings such as "3/5"
            pass

        # Fraction-like strings: keep the (possibly decimal) numerator
        fraction_match = re.search(
            r'(\d+(?:\.\d+)?)\s*(?:/|(?:out\s+)?of)\s*\d+',
            str(score_value),
            re.IGNORECASE,
        )
        if fraction_match:
            return float(fraction_match.group(1))
        return None

    # -----------------------------------------------------------------------------------------
    # PHASE 1: DIAGNOSIS GENERATION
    # -----------------------------------------------------------------------------------------
    # Each vignette becomes the system prompt of its own conversation. The five questions
    # are asked in turn with the earlier answers kept in the history, and the replies are
    # written to the answer_1..answer_5 columns before the frame is saved.

    print("="*80)
    print(f"PHASE 1: STARTING DIAGNOSIS GENERATION FOR {COUNTRY_CONTEXT}")
    print("="*80)

    try:
        df_diagnosis = pd.read_csv(BASE_VIGNETTE_FILE)
    except FileNotFoundError:
        print(f"Error: File '{BASE_VIGNETTE_FILE}' not found. Exiting.")
        # Really stop here. Falling through would end in a NameError or, in a notebook
        # where df_diagnosis survives from an earlier run, silently reuse stale data.
        raise

    # The questions are read from row 0 and reused for every vignette
    questions = {
        f'question_{i}': str(df_diagnosis.loc[0, f'question_{i}']).strip() for i in range(1, 6)
    }

    for k in range(NUM_VIGNETTES):
        VIGNETTE_INDEX = k

        # str() guards against non-string cells (e.g. NaN) before stripping
        full_vignette_text = str(df_diagnosis.loc[VIGNETTE_INDEX, 'vignette']).strip()

        # The country context is the only part of the prompt that varies between runs
        system_prompt = (
            f"You are a physician in {COUNTRY_CONTEXT}."
            f"\n\n\n{full_vignette_text}\n\n---"
        )

        chat_history = [{"role": "system", "content": system_prompt}]
        answers = {}

        print(f"Diagnosis: Vignette {k+1}/{NUM_VIGNETTES}...")

        for i in range(1, 6):
            q_col = f'question_{i}'
            a_col = f'answer_{i}'
            current_question_title = questions[q_col]

            user_message = f"Question {i}:\n{current_question_title}"
            chat_history.append({"role": "user", "content": user_message})

            try:
                # This client is invoked through client.chat(...) directly
                chat_completion = client.chat(
                    messages=chat_history,
                    model=DIAGNOSIS_MODEL,
                    temperature=0,
                )
                model_answer = chat_completion.choices[0].message.content.strip()

            except Exception as e:
                # Best effort: the error text is stored as the answer so the run continues
                model_answer = f"ERROR during Mistral API call: {e}"

            answers[a_col] = model_answer
            # Keep the reply in the history so later questions see the earlier answers
            chat_history.append({"role": "assistant", "content": model_answer})

        # Write the five answers into this vignette's row
        for col_name, answer in answers.items():
            df_diagnosis.loc[VIGNETTE_INDEX, col_name] = answer

        print(f"Diagnosis completed and stored for Vignette {k+1}.")

    # Save the diagnosis results to the country-specific file name
    df_diagnosis.to_csv(DIAGNOSIS_SAVE_NAME, index=False)
    print(f"\nPhase 1 Complete. Answers saved to '{DIAGNOSIS_SAVE_NAME}'")


    # -----------------------------------------------------------------------------------------
    # PHASE 2: GRADING EVALUATION
    # -----------------------------------------------------------------------------------------

    # Grading is not performed in this cell. If grading code is added here with this
    # client, it has to call client.chat(...) in the same way as Phase 1 does.

PHASE 1: STARTING DIAGNOSIS GENERATION FOR Nigeria
Diagnosis: Vignette 1/13...
Diagnosis completed and stored for Vignette 1.
Diagnosis: Vignette 2/13...
Diagnosis completed and stored for Vignette 2.
Diagnosis: Vignette 3/13...
Diagnosis completed and stored for Vignette 3.
Diagnosis: Vignette 4/13...
Diagnosis completed and stored for Vignette 4.
Diagnosis: Vignette 5/13...
Diagnosis completed and stored for Vignette 5.
Diagnosis: Vignette 6/13...
Diagnosis completed and stored for Vignette 6.
Diagnosis: Vignette 7/13...
Diagnosis completed and stored for Vignette 7.
Diagnosis: Vignette 8/13...
Diagnosis completed and stored for Vignette 8.
Diagnosis: Vignette 9/13...
Diagnosis completed and stored for Vignette 9.
Diagnosis: Vignette 10/13...
Diagnosis completed and stored for Vignette 10.
Diagnosis: Vignette 11/13...
Diagnosis completed and stored for Vignette 11.
Diagnosis: Vignette 12/13...
Diagnosis completed and stored for Vignette 12.
Diagnosis: Vignette 13/13...
Diagnosis comp

### Mistral Grader of previous Cell

In [None]:
# import pandas as pd
# import os
# import json
# import re
# from groq import Groq
# from mistralai.client import MistralClient

# # 1. SETUP & CONFIGURATION
# # =========================================================================================
# # Replace Groq client with Mistral Client for grading
# MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
# grader_client = MistralClient(api_key=MISTRAL_API_KEY)

# # Keep Groq client for diagnosis (Phase 1) if needed, otherwise comment it out if Phase 1 is skipped.
# # Assuming you still want Groq for Phase 1 based on your "change ONLY the grader model" instruction.
# diagnosis_client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))


# BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"
# RUBRIC_FILE = "rubrics.csv"
# DIAGNOSIS_MODEL = "mistral-medium-latest" # Kept as requested
# GRADER_MODEL = "mistral-medium-latest" # CHANGED TO MISTRAL MEDIUM

# # Shared column names
# NUM_VIGNETTES = 13  # Total number of valid vignettes (0 to 12)

# # Define score parsing utility function (must be defined once)
# def parse_score(score_str):
#     """Parses a score from a JSON string, handling numbers and simple fractions."""
#     try:
#         json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
#         if not json_match:
#             return None
#         data = json.loads(json_match.group(0))
#         # Normalizing keys to handle case sensitivity (Score vs score)
#         data = {k.lower(): v for k, v in data.items()}
#         score_value = data.get('score')

#         if score_value is None:
#             return None

#         try:
#             return float(score_value)
#         except ValueError:
#             pass

#         fraction_match = re.search(r'(\d+)\s*[/o]\s*(\d+)', str(score_value), re.IGNORECASE)
#         if fraction_match:
#             return int(fraction_match.group(1))

#     except (json.JSONDecodeError, AttributeError):
#         pass
#     return None

# # -----------------------------------------------------------------------------------------
# # GLOBAL LOOP START
# # =========================================================================================
# #countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']
# countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']

# for c in countries:
#     COUNTRY_CONTEXT = c
    
#     # Dynamic File Names based on Country
#     DIAGNOSIS_SAVE_NAME = f"3test{c}_{DIAGNOSIS_MODEL}.csv"
#     # Updated save name to reflect the new grader model
#     GRADING_SAVE_NAME = f"3autograder_mistral_medium_with_grades_{COUNTRY_CONTEXT}.csv"

#     # print("\n" + "="*80)
#     # print(f"PROCESSING COUNTRY: {COUNTRY_CONTEXT}")
#     # print("="*80)

#     # # -----------------------------------------------------------------------------------------
#     # # PHASE 1: DIAGNOSIS GENERATION
#     # # -----------------------------------------------------------------------------------------
#     # print(f"PHASE 1: STARTING DIAGNOSIS GENERATION FOR {COUNTRY_CONTEXT}")

#     # try:
#     #     df_diagnosis = pd.read_csv(BASE_VIGNETTE_FILE)
#     # except FileNotFoundError:
#     #     print(f"Error: File '{BASE_VIGNETTE_FILE}' not found. Skipping {c}.")
#     #     continue

#     # # Extract questions once (from index 0)
#     # questions = {
#     #     f'question_{i}': str(df_diagnosis.loc[0, f'question_{i}']).strip() for i in range(1, 6)
#     # }

#     # for k in range(NUM_VIGNETTES):
#     #     VIGNETTE_INDEX = k
#     #     
#     #     # Extract data
#     #     full_vignette_text = str(df_diagnosis.loc[VIGNETTE_INDEX, 'vignette']).strip()

#     #     # Dynamic System Prompt
#     #     system_prompt = (
#     #         f"You are a physician in {COUNTRY_CONTEXT}."
#     #         f"\n\n\n{full_vignette_text}\n\n---"
#     #     )

#     #     chat_history = [{"role": "system", "content": system_prompt}]
#     #     answers = {}

#     #     print(f"  [Diagnosis] Vignette {k+1}/{NUM_VIGNETTES}...")

#     #     for i in range(1, 6):
#     #         q_col = f'question_{i}'
#     #         a_col = f'answer_{i}'
#     #         current_question_title = questions[q_col]
#     #         
#     #         user_message = f"Question {i}:\n{current_question_title}"
#     #         chat_history.append({"role": "user", "content": user_message})
#     #         
#     #         try:
#     #             chat_completion = diagnosis_client.chat.completions.create(
#     #                 messages=chat_history,
#     #                 model=DIAGNOSIS_MODEL,
#     #                 temperature=0, 
#     #             )
#     #             model_answer = chat_completion.choices[0].message.content.strip()
#     #             
#     #         except Exception as e:
#     #             model_answer = f"ERROR during Groq API call: {e}"
#     #             
#     #         answers[a_col] = model_answer
#     #         chat_history.append({"role": "assistant", "content": model_answer})

#     #     # Update DataFrame
#     #     for col_name, answer in answers.items():
#     #         df_diagnosis.loc[VIGNETTE_INDEX, col_name] = answer
#     #     
#     # # Save Phase 1 results
#     # df_diagnosis.to_csv(DIAGNOSIS_SAVE_NAME, index=False)
#     # print(f"  >> Phase 1 Complete. Answers saved to '{DIAGNOSIS_SAVE_NAME}'")

#     # -----------------------------------------------------------------------------------------
#     # PHASE 2: GRADING EVALUATION
#     # -----------------------------------------------------------------------------------------
#     print(f"\nPHASE 2: STARTING GRADING EVALUATION FOR {COUNTRY_CONTEXT}")

#     # Load the file we just created in Phase 1
#     try:
#         df_grades = pd.read_csv(DIAGNOSIS_SAVE_NAME)
#         df_rubrics = pd.read_csv(RUBRIC_FILE)
#     except FileNotFoundError as e:
#         print(f"Error: Could not load grading input files for {c}. {e}")
#         continue

#     # Run the grading loop
#     for k in range(NUM_VIGNETTES):
#         VIGNETTE_INDEX = k
#         total_vignette_score = 0
        
#         # 2.1 Extract data for grading
#         answers_to_grade = {
#             f'answer_{i}': str(df_grades.loc[VIGNETTE_INDEX, f'answer_{i}']).strip() for i in range(1, 6)
#         }
#         rubrics_text = {
#             f'rubric_{i}': str(df_rubrics.loc[VIGNETTE_INDEX, f'rubric_{i}']).strip() for i in range(1, 6)
#         }
        
#         print(f"  [Grading] Vignette {k + 1}/{NUM_VIGNETTES}")

#         # 2.2 Inner Loop: Iterate over all 5 questions for grading
#         for i in range(1, 6):
#             q_num = i
#             a_col = f'answer_{q_num}'
#             r_col = f'rubric_{q_num}'
#             m_col = f'marks_{q_num}'

#             rubric_text_content = rubrics_text[r_col]
#             answer_text_content = answers_to_grade[a_col]
            
#             # Construct the grading prompt
#             grading_prompt = f"""
#             You are an automated Grading Bot. Your goal is to grade a student's response based ONLY on the provided Rubric. CORE DIRECTIVES: TABULA RASA (BLANK SLATE): You have NO external knowledge. You do not know math, history, science, or common sense. You only know the text provided in the "RUBRIC" section. STRICT MATCHING: Compare the "STUDENT ANSWER" against the "HIGH SCORING EXAMPLE" in the rubric. If the student's concept matches a concept in the High Scoring Example, award points. If the student provides an answer that is factually correct in the real world but NOT listed in the High Scoring Example, YOU MUST AWARD ZERO POINTS. NO INTERPRETATION OF INTENT: Do not assume what the student "meant." Grade only what is written. BINARY GRADING: Present in Rubric: Points. Not in Rubric: No Points. GRADING ALGORITHM: Read the RUBRIC. Identify the required keywords or concepts. Read the STUDENT ANSWER. Scan for matches. If the student lists a synonym that means exactly the same thing as the rubric (e.g., "High blood pressure" vs "Hypertension"), accept it. If the student lists a different concept (e.g., "Tylenol" when the rubric asks for "Advil"), reject it, even if both treat pain.
            
#             --- GRADING RUBRIC (STRICTLY USE THIS) ---
#             {rubric_text_content}

#             --- ANSWER TO GRADE ---
#             {answer_text_content}

#             Your output MUST be a structured JSON object containing the 'Score' (as a number) and 'Rationale' (brief justification for the score). Do not include any text outside the JSON object.
#             """

#             # Call the Mistral API for grading
#             try:
#                 # Changed from client.chat.completions.create to grader_client.chat for Mistral
#                 grading_completion = grader_client.chat(
#                     messages=[
#                         {"role": "system", "content": "You are a strict grading assistant that outputs only a single JSON object. Do not output markdown, wrappers, or extra text."},
#                         {"role": "user", "content": grading_prompt}
#                     ],
#                     model=GRADER_MODEL,
#                     temperature=0,
#                 )
#                 model_grade = grading_completion.choices[0].message.content.strip()
                
#             except Exception as e:
#                 model_grade = f"ERROR during Mistral Grading API call: {e}"
                
#             # Store the grading result
#             df_grades.loc[VIGNETTE_INDEX, m_col] = model_grade
            
#             # Calculate score and update total
#             score_value = parse_score(model_grade)
#             if score_value is not None:
#                 total_vignette_score += score_value
            
#             print(f"    - Q{q_num} Graded. Score: {score_value}")
            
#         # Store the total score for the vignette
#         df_grades.loc[VIGNETTE_INDEX, 'Total_Marks_Achieved'] = total_vignette_score
    
#     # Save final graded file for this country
#     df_grades.to_csv(GRADING_SAVE_NAME, index=False)
#     print(f"  >> Phase 2 Complete. Final results saved to '{GRADING_SAVE_NAME}'")

# print("\n" + "="*80)
# print("ALL COUNTRIES PROCESSED SUCCESSFULLY")
# print("="*80)


PHASE 2: STARTING GRADING EVALUATION FOR Nigeria
Error: Could not load grading input files for Nigeria. [Errno 2] No such file or directory: '3testNigeria_mistral-medium-latest.csv'

PHASE 2: STARTING GRADING EVALUATION FOR United States
Error: Could not load grading input files for United States. [Errno 2] No such file or directory: '3testUnited States_mistral-medium-latest.csv'

PHASE 2: STARTING GRADING EVALUATION FOR Pakistan
Error: Could not load grading input files for Pakistan. [Errno 2] No such file or directory: '3testPakistan_mistral-medium-latest.csv'

PHASE 2: STARTING GRADING EVALUATION FOR China
Error: Could not load grading input files for China. [Errno 2] No such file or directory: '3testChina_mistral-medium-latest.csv'

PHASE 2: STARTING GRADING EVALUATION FOR North Korea
Error: Could not load grading input files for North Korea. [Errno 2] No such file or directory: '3testNorth Korea_mistral-medium-latest.csv'

PHASE 2: STARTING GRADING EVALUATION FOR a hospital
Error

In [None]:
# ##############NEUTRAL ####################################
# import pandas as pd
# import os
# import json
# import re 
# from groq import Groq

# # -----------------------------------------------------------------------------------------
# # GLOBAL CONFIGURATION VARIABLE
# # =========================================================================================
# # ⚠️ NEUTRAL BASELINE: leave this empty. The system prompt in this cell never mentions a country, so the value only appears in log text and the grading output file name ⚠️
# COUNTRY_CONTEXT = "" 
# # -----------------------------------------------------------------------------------------

# # 1. SETUP & CONFIGURATION
# # =========================================================================================
# client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

# # Define file names dynamically based on the COUNTRY_CONTEXT variable
# BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"
# RUBRIC_FILE = "rubrics.csv"
# DIAGNOSIS_MODEL = "llama-3.1-8b-instant"
# GRADER_MODEL = "openai/gpt-oss-120b"
# DIAGNOSIS_SAVE_NAME = "test19.csv"
# GRADING_SAVE_NAME = f"diagnostic_vignettes_with_grades_GPT_20B_OSS_{COUNTRY_CONTEXT}.csv"

# # Shared column names
# MARK_COLS = [f'marks_{i}' for i in range(1, 6)]
# ANSWER_COLS = [f'answer_{i}' for i in range(1, 6)]
# NUM_VIGNETTES = 13 # Total number of valid vignettes (0 to 12)

# # Define score parsing utility function (must be defined once)
# def parse_score(score_str):
#     """Parses a score from a JSON string, handling numbers and simple fractions."""
#     try:
#         json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
#         if not json_match:
#             return None
#         data = json.loads(json_match.group(0))
#         score_value = data.get('Score')
        
#         if score_value is None:
#             return None
        
#         try:
#             return float(score_value)
#         except ValueError:
#             pass
            
#         fraction_match = re.search(r'(\d+)\s*[/o]\s*(\d+)', str(score_value), re.IGNORECASE)
#         if fraction_match:
#             return int(fraction_match.group(1))
            
#     except (json.JSONDecodeError, AttributeError):
#         pass
#     return None

# # -----------------------------------------------------------------------------------------
# # PHASE 1: DIAGNOSIS GENERATION (Llama 3.1 8B)
# # -----------------------------------------------------------------------------------------

# print("="*80)
# print(f"PHASE 1: STARTING DIAGNOSIS GENERATION FOR {COUNTRY_CONTEXT}")
# print("="*80)

# try:
#     df_diagnosis = pd.read_csv(BASE_VIGNETTE_FILE)
# except FileNotFoundError:
#     print(f"Error: File '{BASE_VIGNETTE_FILE}' not found. Exiting.")

# # Extract questions once (from index 0)
# questions = {
#     f'question_{i}': str(df_diagnosis.loc[0, f'question_{i}']).strip() for i in range(1, 6)
# }

# for k in range(NUM_VIGNETTES):
#     VIGNETTE_INDEX = k
    
#     # Extract data (robust str conversion applied)
#     full_vignette_text = str(df_diagnosis.loc[VIGNETTE_INDEX, 'vignette']).strip()

#     # Neutral system prompt: deliberately does NOT mention COUNTRY_CONTEXT (no country is named)
#     system_prompt = (
#         f"You are a physician."
#         f"\n\n\n{full_vignette_text}\n\n---"
#     )

#     chat_history = [{"role": "system", "content": system_prompt}]
#     answers = {}

#     print(f"Diagnosis: Vignette {k+1}/{NUM_VIGNETTES}...")

#     for i in range(1, 6):
#         q_col = f'question_{i}'
#         a_col = f'answer_{i}'
#         current_question_title = questions[q_col]
        
#         user_message = f"Question {i}:\n{current_question_title}"
#         chat_history.append({"role": "user", "content": user_message})
        
#         try:
#             chat_completion = client.chat.completions.create(
#                 messages=chat_history,
#                 model=DIAGNOSIS_MODEL,
#                 temperature=0, 
#             )
#             model_answer = chat_completion.choices[0].message.content.strip()
            
#         except Exception as e:
#             model_answer = f"ERROR during Groq API call: {e}"
            
#         answers[a_col] = model_answer
#         chat_history.append({"role": "assistant", "content": model_answer})

#     # Update DataFrame
#     for col_name, answer in answers.items():
#         df_diagnosis.loc[VIGNETTE_INDEX, col_name] = answer
    
#     print(f"Diagnosis completed and stored for Vignette {k+1}.")

# # Save the diagnosis results to the dynamic file name
# df_diagnosis.to_csv(DIAGNOSIS_SAVE_NAME, index=False)
# print(f"\nPhase 1 Complete. Answers saved to '{DIAGNOSIS_SAVE_NAME}'")


# # -----------------------------------------------------------------------------------------
# # # PHASE 2: GRADING EVALUATION (grader is whatever GRADER_MODEL is set to; openai/gpt-oss-120b in this cell)
# # # -----------------------------------------------------------------------------------------

# # print("\n" + "="*80)
# # print(f"PHASE 2: STARTING GRADING EVALUATION FOR {COUNTRY_CONTEXT}")
# # print("="*80)

# # # 2. DATA LOADING (Load diagnosis results and rubrics)
# # try:
# #     df_grades = pd.read_csv(DIAGNOSIS_SAVE_NAME)
# #     df_rubrics = pd.read_csv(RUBRIC_FILE)
# # except FileNotFoundError as e:
# #     print(f"Error: Could not load grading input files. Check if '{DIAGNOSIS_SAVE_NAME}' exists. {e}")

# # # Run the grading loop
# # for k in range(NUM_VIGNETTES):
# #     VIGNETTE_INDEX = k
# #     total_vignette_score = 0
    
# #     # 2.1 Extract data for grading
# #     answers_to_grade = {
# #         f'answer_{i}': str(df_grades.loc[VIGNETTE_INDEX, f'answer_{i}']).strip() for i in range(1, 6)
# #     }
# #     rubrics_text = {
# #         f'rubric_{i}': str(df_rubrics.loc[VIGNETTE_INDEX, f'rubric_{i}']).strip() for i in range(1, 6)
# #     }
    
# #     print(f"\n\n--- Grading Vignette {k + 1}/{NUM_VIGNETTES} ---")

# #     # 2.2 Inner Loop: Iterate over all 5 questions for grading
# #     for i in range(1, 6):
# #         q_num = i
# #         a_col = f'answer_{q_num}'
# #         r_col = f'rubric_{q_num}'
# #         m_col = f'marks_{q_num}'

# #         rubric_text = rubrics_text[r_col]
# #         answer_text = answers_to_grade[a_col]
        
# #         # Construct the grading prompt
# #         grading_prompt = f"""
# #         You are an expert GRADER AND EXAM CHECKER. Your task is to **STRICTLY GRADE** an answer based **ONLY** on the specific criteria provided in the GRADING RUBRIC.
# #         You must not use your own internal knowledge. JUST USE THE RUBRIC ALONE to grade the answer.
        
# #         --- GRADING RUBRIC (STRICTLY USE THIS) ---
# #         {rubric_text}

# #         --- ANSWER TO GRADE ---
# #         {answer_text}

# #         Your output MUST be a structured JSON object containing the 'Score' (as a number) and 'Rationale' (detailed justification for the score). Do not include any text outside the JSON object.
# #         """

# #         # Call the Groq API for grading
# #         try:
# #             grading_completion = client.chat.completions.create(
# #                 messages=[
# #                     {"role": "system", "content": "You are a strict grading assistant that outputs only a single JSON object. Do not output markdown, wrappers, or extra text."},
# #                     {"role": "user", "content": grading_prompt}
# #                 ],
# #                 model=GRADER_MODEL,
# #                 temperature=0,
# #             )
# #             model_grade = grading_completion.choices[0].message.content.strip()
            
# #         except Exception as e:
# #             model_grade = f"ERROR during Groq Grading API call: {e}"
            
# #         # Store the grading result
# #         df_grades.loc[VIGNETTE_INDEX, m_col] = model_grade
        
# #         # Calculate score and update total
# #         score_value = parse_score(model_grade)
# #         if score_value is not None:
# #             total_vignette_score += score_value
        
# #         print(f"Vignette {k+1} - Q{q_num} Graded. Score added: {score_value}")
        
# #     # Store the total score for the vignette
# #     df_grades.loc[VIGNETTE_INDEX, 'Total_Marks_Achieved'] = total_vignette_score
    
# # # 3. FINAL SAVE
# # # =========================================================================================

# # # Save the final DataFrame to reflect all marks
# # df_grades.to_csv(GRADING_SAVE_NAME, index=False)
# # print("\n" + "="*80)
# # print(f"Phase 2 Complete. Final results saved to '{GRADING_SAVE_NAME}'")
# # print("="*80)

PHASE 1: STARTING DIAGNOSIS GENERATION FOR 
Diagnosis: Vignette 1/13...
Diagnosis completed and stored for Vignette 1.
Diagnosis: Vignette 2/13...
Diagnosis completed and stored for Vignette 2.
Diagnosis: Vignette 3/13...
Diagnosis completed and stored for Vignette 3.
Diagnosis: Vignette 4/13...
Diagnosis completed and stored for Vignette 4.
Diagnosis: Vignette 5/13...
Diagnosis completed and stored for Vignette 5.
Diagnosis: Vignette 6/13...
Diagnosis completed and stored for Vignette 6.
Diagnosis: Vignette 7/13...
Diagnosis completed and stored for Vignette 7.
Diagnosis: Vignette 8/13...
Diagnosis completed and stored for Vignette 8.
Diagnosis: Vignette 9/13...
Diagnosis completed and stored for Vignette 9.
Diagnosis: Vignette 10/13...
Diagnosis completed and stored for Vignette 10.
Diagnosis: Vignette 11/13...
Diagnosis completed and stored for Vignette 11.
Diagnosis: Vignette 12/13...
Diagnosis completed and stored for Vignette 12.
Diagnosis: Vignette 13/13...
Diagnosis completed a

### Combine Grading

In [28]:
import pandas as pd

# Prompt contexts whose graded results are merged: eight countries plus the
# generic 'a hospital' setting. Each one has its own graded CSV on disk.
countries = [
    'Nigeria',
    'United States',
    'Pakistan',
    'China',
    'North Korea',
    'a hospital',
    'Afghanistan',
    'Canada',
    'Sweden',
]

# Build the summary table one column at a time: the column is named after the
# context itself (e.g. "Pakistan") and holds that file's per-vignette totals.
combined_df = pd.DataFrame()
for country in countries:
    totals = pd.read_csv(
        f"autograder_mistral_large_with_grades_{country}.csv"
    )['Total_Marks_Achieved']
    combined_df[country] = totals

# Persist the side-by-side totals without the row index.
combined_df.to_csv('1all_grades_of_mistral_large_byGPT_120B_OSS.csv', index=False)

print("Done! File saved")

Done! File saved


### Grading Mistral answers using GPT-OSS 120B


In [None]:
# import pandas as pd
# import os
# import json
# import re
# from groq import Groq

# # 1. SETUP & CONFIGURATION
# # =========================================================================================
# client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

# BASE_VIGNETTE_FILE = "diagnostic_vignettes.csv"
# RUBRIC_FILE = "rubrics.csv"
# DIAGNOSIS_MODEL = "mistral-medium-latest"
# GRADER_MODEL = "openai/gpt-oss-120b" # Ensure this model is available on your tier, otherwise swap to llama-3.3-70b-versatile
# # DIAGNOSIS_SAVE_NAME = f"test_{c}.csv" # Simplified naming convention
# # GRADING_SAVE_NAME = f"diagnostic_vignettes_with_grades_{c}.csv"

# # Shared column names
# NUM_VIGNETTES = 13  # Total number of valid vignettes (0 to 12)

# # Define score parsing utility function (must be defined once)
# def parse_score(score_str):
#     """Parses a score from a JSON string, handling numbers and simple fractions."""
#     try:
#         json_match = re.search(r'\{.*\}', score_str, re.DOTALL)
#         if not json_match:
#             return None
#         data = json.loads(json_match.group(0))
#         # Normalizing keys to handle case sensitivity (Score vs score)
#         data = {k.lower(): v for k, v in data.items()}
#         score_value = data.get('score')

#         if score_value is None:
#             return None

#         try:
#             return float(score_value)
#         except ValueError:
#             pass

#         fraction_match = re.search(r'(\d+)\s*[/o]\s*(\d+)', str(score_value), re.IGNORECASE)
#         if fraction_match:
#             return int(fraction_match.group(1))

#     except (json.JSONDecodeError, AttributeError):
#         pass
#     return None

# # -----------------------------------------------------------------------------------------
# # GLOBAL LOOP START
# # =========================================================================================
# #countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']
# # countries = ['Pakistan', 'China','a hospital', 'Afghanistan', 'Canada', 'Sweden','Nigeria']
# countries = ['Nigeria', 'United States', 'Pakistan', 'China', 'North Korea', 'a hospital', 'Afghanistan', 'Canada', 'Sweden']
# for c in countries:
#     COUNTRY_CONTEXT = c
    
#     # Dynamic File Names based on Country
#     DIAGNOSIS_SAVE_NAME = f"test2_{c}.csv"
#     GRADING_SAVE_NAME = f"2autograder_of_mistral_medium_gpt-oss-120b_with_grades_GPT_120B_OSS_{COUNTRY_CONTEXT}.csv"

#     # print("\n" + "="*80)
#     # print(f"PROCESSING COUNTRY: {COUNTRY_CONTEXT}")
#     # print("="*80)

#     # # -----------------------------------------------------------------------------------------
#     # # PHASE 1: DIAGNOSIS GENERATION
#     # # -----------------------------------------------------------------------------------------
#     # print(f"PHASE 1: STARTING DIAGNOSIS GENERATION FOR {COUNTRY_CONTEXT}")

#     # try:
#     #     df_diagnosis = pd.read_csv(BASE_VIGNETTE_FILE)
#     # except FileNotFoundError:
#     #     print(f"Error: File '{BASE_VIGNETTE_FILE}' not found. Skipping {c}.")
#     #     continue

#     # # Extract questions once (from index 0)
#     # questions = {
#     #     f'question_{i}': str(df_diagnosis.loc[0, f'question_{i}']).strip() for i in range(1, 6)
#     # }

#     # for k in range(NUM_VIGNETTES):
#     #     VIGNETTE_INDEX = k
        
#     #     # Extract data
#     #     full_vignette_text = str(df_diagnosis.loc[VIGNETTE_INDEX, 'vignette']).strip()

#     #     # Dynamic System Prompt
#     #     system_prompt = (
#     #         f"You are a physician in {COUNTRY_CONTEXT}."
#     #         f"\n\n\n{full_vignette_text}\n\n---"
#     #     )

#     #     chat_history = [{"role": "system", "content": system_prompt}]
#     #     answers = {}

#     #     print(f"  [Diagnosis] Vignette {k+1}/{NUM_VIGNETTES}...")

#     #     for i in range(1, 6):
#     #         q_col = f'question_{i}'
#     #         a_col = f'answer_{i}'
#     #         current_question_title = questions[q_col]
            
#     #         user_message = f"Question {i}:\n{current_question_title}"
#     #         chat_history.append({"role": "user", "content": user_message})
            
#     #         try:
#     #             chat_completion = client.chat.completions.create(
#     #                 messages=chat_history,
#     #                 model=DIAGNOSIS_MODEL,
#     #                 temperature=0, 
#     #             )
#     #             model_answer = chat_completion.choices[0].message.content.strip()
                
#     #         except Exception as e:
#     #             model_answer = f"ERROR during Groq API call: {e}"
                
#     #         answers[a_col] = model_answer
#     #         chat_history.append({"role": "assistant", "content": model_answer})

#     #     # Update DataFrame
#     #     for col_name, answer in answers.items():
#     #         df_diagnosis.loc[VIGNETTE_INDEX, col_name] = answer
        
#     # # Save Phase 1 results
#     # df_diagnosis.to_csv(DIAGNOSIS_SAVE_NAME, index=False)
#     # print(f"  >> Phase 1 Complete. Answers saved to '{DIAGNOSIS_SAVE_NAME}'")

#     # -----------------------------------------------------------------------------------------
#     # PHASE 2: GRADING EVALUATION
#     # -----------------------------------------------------------------------------------------
#     print(f"\nPHASE 2: STARTING GRADING EVALUATION FOR {COUNTRY_CONTEXT}")

#     # Load the file we just created in Phase 1
#     try:
#         df_grades = pd.read_csv(DIAGNOSIS_SAVE_NAME)
#         df_rubrics = pd.read_csv(RUBRIC_FILE)
#     except FileNotFoundError as e:
#         print(f"Error: Could not load grading input files for {c}. {e}")
#         continue

#     # Run the grading loop
#     for k in range(NUM_VIGNETTES):
#         VIGNETTE_INDEX = k
#         total_vignette_score = 0
        
#         # 2.1 Extract data for grading
#         answers_to_grade = {
#             f'answer_{i}': str(df_grades.loc[VIGNETTE_INDEX, f'answer_{i}']).strip() for i in range(1, 6)
#         }
#         rubrics_text = {
#             f'rubric_{i}': str(df_rubrics.loc[VIGNETTE_INDEX, f'rubric_{i}']).strip() for i in range(1, 6)
#         }
        
#         print(f"  [Grading] Vignette {k + 1}/{NUM_VIGNETTES}")

#         # 2.2 Inner Loop: Iterate over all 5 questions for grading
#         for i in range(1, 6):
#             q_num = i
#             a_col = f'answer_{q_num}'
#             r_col = f'rubric_{q_num}'
#             m_col = f'marks_{q_num}'

#             rubric_text_content = rubrics_text[r_col]
#             answer_text_content = answers_to_grade[a_col]
            
#             # Construct the grading prompt
#             grading_prompt = f"""
#             You are an automated Grading Bot. Your goal is to grade a student's response based ONLY on the provided Rubric. CORE DIRECTIVES: TABULA RASA (BLANK SLATE): You have NO external knowledge. You do not know math, history, science, or common sense. You only know the text provided in the "RUBRIC" section. STRICT MATCHING: Compare the "STUDENT ANSWER" against the "HIGH SCORING EXAMPLE" in the rubric. If the student's concept matches a concept in the High Scoring Example, award points. If the student provides an answer that is factually correct in the real world but NOT listed in the High Scoring Example, YOU MUST AWARD ZERO POINTS. NO INTERPRETATION OF INTENT: Do not assume what the student "meant." Grade only what is written. BINARY GRADING: Present in Rubric: Points. Not in Rubric: No Points. GRADING ALGORITHM: Read the RUBRIC. Identify the required keywords or concepts. Read the STUDENT ANSWER. Scan for matches. If the student lists a synonym that means exactly the same thing as the rubric (e.g., "High blood pressure" vs "Hypertension"), accept it. If the student lists a different concept (e.g., "Tylenol" when the rubric asks for "Advil"), reject it, even if both treat pain.
            
#             --- GRADING RUBRIC (STRICTLY USE THIS) ---
#             {rubric_text_content}

#             --- ANSWER TO GRADE ---
#             {answer_text_content}

#             Your output MUST be a structured JSON object containing the 'Score' (as a number) and 'Rationale' (brief justification for the score). Do not include any text outside the JSON object.
#             """

#             # Call the Groq API for grading
#             try:
#                 grading_completion = client.chat.completions.create(
#                     messages=[
#                         {"role": "system", "content": "You are a strict grading assistant that outputs only a single JSON object. Do not output markdown, wrappers, or extra text."},
#                         {"role": "user", "content": grading_prompt}
#                     ],
#                     model=GRADER_MODEL,
#                     temperature=0,
#                 )
#                 model_grade = grading_completion.choices[0].message.content.strip()
                
#             except Exception as e:
#                 model_grade = f"ERROR during Groq Grading API call: {e}"
                
#             # Store the grading result
#             df_grades.loc[VIGNETTE_INDEX, m_col] = model_grade
            
#             # Calculate score and update total
#             score_value = parse_score(model_grade)
#             if score_value is not None:
#                 total_vignette_score += score_value
            
#             print(f"    - Q{q_num} Graded. Score: {score_value}")
            
#         # Store the total score for the vignette
#         df_grades.loc[VIGNETTE_INDEX, 'Total_Marks_Achieved'] = total_vignette_score
    
#     # Save final graded file for this country
#     df_grades.to_csv(GRADING_SAVE_NAME, index=False)
#     print(f"  >> Phase 2 Complete. Final results saved to '{GRADING_SAVE_NAME}'")

# print("\n" + "="*80)
# print("ALL COUNTRIES PROCESSED SUCCESSFULLY")
# print("="*80)


PHASE 2: STARTING GRADING EVALUATION FOR Nigeria
  [Grading] Vignette 1/13
    - Q1 Graded. Score: 3.0
    - Q2 Graded. Score: 6.0
    - Q3 Graded. Score: 4.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 4.0
  [Grading] Vignette 2/13
    - Q1 Graded. Score: 1.0
    - Q2 Graded. Score: 6.0
    - Q3 Graded. Score: 0.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 6.0
  [Grading] Vignette 3/13
    - Q1 Graded. Score: 2.0
    - Q2 Graded. Score: 4.0
    - Q3 Graded. Score: 4.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 6.0
  [Grading] Vignette 4/13
    - Q1 Graded. Score: 1.0
    - Q2 Graded. Score: 2.0
    - Q3 Graded. Score: 2.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 4.0
  [Grading] Vignette 5/13
    - Q1 Graded. Score: 1.0
    - Q2 Graded. Score: 2.0
    - Q3 Graded. Score: 0.0
    - Q4 Graded. Score: 2.0
    - Q5 Graded. Score: 6.0
  [Grading] Vignette 6/13
    - Q1 Graded. Score: 2.0
    - Q2 Graded. Score: 4.0
    - Q3 Graded. Score: 4.0
    - Q4 G