# In [7]:
import pandas as pd
import sympy
from sympy import symbols, Abs
import re
import ollama
from typing import Dict, List, Tuple, Optional, Union
import logging
import os
import time
import json

###70B parameter models
#ollama run llama3.3
#ollama run llama3.1:70b - this is lame
#ollama run deepseek-r1:70b

###14B Parameter Models
#ollama run phi4:14b-fp16
#ollama run deepseek-r1:14b-qwen-distill-fp16

###14B Parameter Models 8-bit quantised
#ollama run deepseek-r1:14b-qwen-distill-q8_0 
#ollama run phi4:14b-q8_0

#7B Parameter
#ollama run mathstral:7b-v0.1-fp16
#ollama run deepseek-r1:7b-qwen-distill-fp16

#3B Parameter Models 16-bit quantised
#ollama run llama3.2:3b-instruct-fp16
#ollama run gemma2:2b-instruct-fp16

#1B Parameter Models 16 bit quantised
#ollama run llama3.2:1b-instruct-fp16
#ollama run deepseek-r1:1.5b-qwen-distill-fp16

###For Debugging
# Show DataFrames in full when printing (no column, row or width truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Model configuration
MODEL_ID_1 = "phi4"
MODEL_ID_2 = "phi4:14b-q8_0"
MODEL_IDS = [MODEL_ID_1, MODEL_ID_2]

# Configuration variables
# A raw string literal cannot end with a single backslash (it would escape the
# closing quote and leave the string unterminated), so the directory is written
# without a trailing separator; os.path.join supplies it when paths are built.
FILE_DIRECTORY = r"D:\AI_Marking\Datasets"  # Replace with actual directory
ORIGINAL_FILE = os.path.join(FILE_DIRECTORY, r"Filename.txt")  # Replace with actual filename
# The output filename embeds both model IDs; characters that are invalid in
# Windows filenames (such as the ':' in an Ollama tag) are replaced with '-'.
model_filename = "_".join(MODEL_IDS)
model_filename = re.sub(r'[\\/:*?"<>|]', '-', model_filename)
PROCESSED_FILE = os.path.join(FILE_DIRECTORY, f"QuestionNumber{model_filename}.txt")  # Change output filename as desired
DICTIONARY_DIRECTORY = r"D:\AI_Marking\Datasets\Cache"

# Expected variables in student responses.
# Ensure that the variable being solved for is the first entry in the list (if relevant).
EXPECTED_VARIABLES = ["v", "S", "w", "u", "t"]

# Marker list for response parsing.
# To achieve multiple tasks in one LLM call, list one marker per part so each
# part can be captured; otherwise this should just match the marker used in
# the LLM prompt.
MARKER_LIST = [
    "List of Equations: "
]

# If the student response is blank, this output is given instead of calling the LLM.
Default_Response = (
    "List of Equations: [] "
)

# Column names for DataFrame
def marker_to_base_name(marker: str) -> str:
    """Turn a response marker into a lowercase, underscore-separated column name.

    Colons are dropped, surrounding whitespace is trimmed, and both hyphens
    and spaces become underscores.

    Examples:
        "List of Equations: "       -> "list_of_equations"
        "Student notes v-u limit: " -> "student_notes_v_u_limit"
    """
    trimmed = marker.replace(":", "").strip()
    # Hyphens and spaces are both mapped to underscores in a single pass.
    separators = str.maketrans({"-": "_", " ": "_"})
    return trimmed.translate(separators).lower()
# One base column name per marker, in marker order.
base_column_names = list(map(marker_to_base_name, MARKER_LIST))

# Every base name gets one column per model: Model1_<name> followed by Model2_<name>.
COLUMN_NAMES = []
for bc in base_column_names:
    COLUMN_NAMES.extend((f"Model1_{bc}", f"Model2_{bc}"))

# Columns holding per-model failure counts.
MODEL_ERROR_COLUMNS = [
    'Model1_Batch1_Fails',
    'Model2_Batch2_Fails',
    'Model1_Consensus_Fails',
    'Model2_Consensus_Fails',
]

# Columns holding timing information.
TIME_TRACKING_COLUMNS = [
    'Model1_Raw_Time',
    'Model2_Raw_Time',
    'Model1_Batch1_Time',
    'Model2_Batch2_Time',
    'Consensus_Time',
    'Total_Processing_Time',
]

# Columns holding token counts.
TOKEN_TRACKING_COLUMNS = [
    'Model1_Raw_Tokens',
    'Model2_Raw_Tokens',
    'Model1_Batch1_Tokens',
    'Model2_Batch2_Tokens',
    'Consensus_Tokens',
]


# In [8]:
#Defining Class and Functions
def sanitize_filename(filename):
    """Return *filename* with every character that is invalid in a Windows filename replaced by '-'."""
    invalid_chars = r'[\\/:*?"<>|]'
    return re.sub(invalid_chars, '-', filename)

class LLMResponseValidator:
    def __init__(self, max_repair_attempts: int = 3, verbose: bool = False):
        """Set up configuration, counters and logging for a validation run.

        Args:
            max_repair_attempts: Maximum number of repair attempts for parsing/consensus.
            verbose: Stored on the instance; other methods print debug output when it is True.
        """
        # Configuration
        self.model_ids = MODEL_IDS
        self.max_repair_attempts = max_repair_attempts
        self.verbose = verbose
        self.variables = symbols(' '.join(EXPECTED_VARIABLES))

        # Error and attempt tracking: most recent error, parse/mismatch error
        # counts, current attempt number and the two error histories.
        self.reset_state()

        # Per-model failure counters
        self.model1_fails = self.model2_fails = 0

        # Timing accumulators
        self.model1_batch_time = self.model2_batch_time = 0
        self.model1_raw_time = self.model2_raw_time = 0.0
        self.consensus_time = 0
        self.total_processing_time = 0

        # Token accumulators
        self.model1_raw_tokens = self.model2_raw_tokens = 0
        self.model1_batch1_tokens = self.model2_batch2_tokens = 0
        self.consensus_tokens = 0

        # Set up logging (kept for debugging purposes)
        logging.basicConfig(
            filename='llm_validation.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
        )
        self.logger = logging.getLogger(__name__)
        
    def reset_state(self):
        """Reset all state tracking variables for new processing attempt"""
        self.current_error = None
        self.parse_error_count = 0
        self.mismatch_error_count = 0
        self.current_attempt = 0
        self.error_history = {
            'parse_errors': [],
            'mismatch_errors': []
        }

    def parse_equation(self, equation_str: str, model_label: str) -> Optional[Union[sympy.Expr, List[sympy.Expr]]]:
        #Parse equation string into SymPy expression(s).
        try:
            # Handle empty or invalid input
            if not equation_str: 
                if model_label == "Model1":
                    self.model1_fails +=1
                elif model_label == "Model2":
                    self.model2_fails +=1
                else:
                    print(f"Model Label was {model_label}; neither Model1 or Model2")
                return None

            if equation_str == '[]':
                return 0
        
            # Clean the equation string
            equation_str = equation_str.strip('[]')
            if '=' in equation_str:
                # Split multiple equations if present
                equations = [eq.strip() for eq in equation_str.split(',')]
                results = []
                for eq in equations:
                    if '=' in eq:
                        eq = eq.split('=')[1].strip()
                    eq = eq.lower()
                    results.append(sympy.sympify(eq)) #Turns strings into Sympy equations/expressions
                return results if len(results) > 1 else results[0]
            
            # Convert to lowercase and parse
            equation_str = equation_str.lower()
            result = sympy.sympify(equation_str)
            self.current_error = None  # Clear error on successful parse
            return result
        except Exception as e:
            error_msg = f"Error parsing equation '{equation_str}': {str(e)}"
            self.current_error = error_msg
            self.parse_error_count += 1
            self.error_history['parse_errors'].append(error_msg)
            self.logger.error(error_msg)
            if model_label == "Model1":
                self.model1_fails +=1
            elif model_label == "Model2":
                self.model2_fails +=1
            else:
                print(f"Model Label was {model_label}; neither Model1 or Model2")
            return None

    def equations_match(self, eq1: str, eq2: str) -> Tuple[bool, List[Tuple[int, int]]]:
        #Check if two equation strings are mathematically equivalent.
        #Returns:
        #    Tuple[bool, List[Tuple[int, int]]]: 
        #        - Boolean indicating if all equations match
        #        - List of matching equation indices (idx1, idx2)

        parsed1 = self.parse_equation(eq1, "Model1")
        parsed2 = self.parse_equation(eq2, "Model2")
        
        if parsed1 is None or parsed2 is None:
            return False, []
            
        try:
            # Convert single equations to lists for consistent handling
            eqs1 = parsed1 if isinstance(parsed1, list) else [parsed1]
            eqs2 = parsed2 if isinstance(parsed2, list) else [parsed2]
            
            # Keep track of which equations match
            matches = []
            used_indices = set()
            
            # Try to find matches for each equation in eqs1
            for i, e1 in enumerate(eqs1):
                for j, e2 in enumerate(eqs2):
                    if j in used_indices:
                        continue
                        
                    try:
                        # Try direct comparison
                        diff = sympy.simplify(e1 - e2)
                        if diff == 0:
                            matches.append((i, j))
                            used_indices.add(j)
                            break
                    except Exception:
                        # If we can't subtract, try string comparison
                        if str(e1) == str(e2):
                            matches.append((i, j))
                            used_indices.add(j)
                            break
            
            # Print debug info about matches
            if self.verbose:
                print(f"\nFound {len(matches)} matching equations:")
                for i, j in matches:
                    print(f"Equation {i} from first set matches equation {j} from second set")
                if len(matches) < max(len(eqs1), len(eqs2)):
                    print("Some equations did not match:")
                    for i, e1 in enumerate(eqs1):
                        if i not in [m[0] for m in matches]:
                            print(f"Unmatched from first set: {e1}")
                    for j, e2 in enumerate(eqs2):
                        if j not in [m[1] for m in matches]:
                            print(f"Unmatched from second set: {e2}")
            
            # All equations match if we found matches for all equations in both sets
            all_match = (len(matches) == len(eqs1) == len(eqs2))
            
            return all_match, matches
            
        except Exception as e:
            error_msg = f"Error comparing equations: {str(e)}"
            self.mismatch_error_count += 1
            self.error_history['mismatch_errors'].append(error_msg)
            self.logger.error(error_msg)
            return False

    def create_repair_prompt(self, original_response: str, error_msg: str, 
                           student_answer: str) -> str:
        """Build the prompt asking a model to repair an equation list that failed to parse.

        Args:
            original_response: The previously recorded equation text that could not be parsed.
            error_msg: The parsing error text shown to the model.
            student_answer: The student's answer, embedded between "<<<" and ">>>".

        Returns:
            The prompt text. It also interpolates the module-level
            EXPECTED_VARIABLES list and its first entry.
        """
        # NOTE(review): the text below is sent to the model verbatim, including
        # its leading indentation and wording slips ("write record",
        # "capatilisation"); changing it may change model output.
        return f"""You are an expert in interpreting student mathematical responses and converting them into valid SymPy syntax. 
        Your overall goal is to extract all equations that students have explicitly written in their response to a physics question. 
        Note that equations described in words do not count, only record equations that students have written in SymPy syntax.
        The student may have made syntactic errors, please use your expertise to interpret what the student meant and write record the syntactically correct version.
        Ensure that only the expected variables are used, they are listed here: {EXPECTED_VARIABLES}

        The student's answer is as follows: <<<{student_answer}>>>
        Previously you recorded this equation: {original_response}
        However it could not be parsed, giving this error: {error_msg}
        
        Your task is to repair your previous response which had a parsing error. 

        Guidelines for your corrected response:
        1. Begin with "List of Equations: [" and end with "]".
        2. All equations must be in SymPy format using Eq(left, right), e.g., "Eq(v, u + w)" for "v = u + w"
        3. If student writes expression without "{EXPECTED_VARIABLES[0]}=", assume that this is intended to be the right hand side of the equation and the left side would be {EXPECTED_VARIABLES[0]}
        4. If multiple equations, separate them with commas
        5. Only use these variables: {EXPECTED_VARIABLES}. Capitalisation of the variables is very important, please adjust the student's variables to match the capatilisation of the expected variables given here.
        6. Correct notation errors (capitalization, brackets) but not mathematical errors
        7. Use "**(1/2)" for square root
        8. If there are multiple equations, separate them by commas but include all of them within "[" and "]".
        9. Example formats (pay careful attention to brackets and order of operations):
           - Student writes "v=2u+w": Output should be "Eq(v, (2*u) + w)"
           - Student writes "v0=u^2/w": Output should be "Eq(v_0, (u**2)/w)"
           - Student writes "u+w^2": Output should be "Eq(v, u + (w**2))"
        
        Provide only the "List of Equations: [...]" response, with no explanation or justification.
        """

    def create_consensus_prompt(self, student_answer: str, response1: str, 
                              response2: str, model1_id: str, model2_id: str) -> str:
        """Build the prompt asking a model to reconcile two interpretations of a student answer.

        Args:
            student_answer: The student's answer, embedded between "<<<" and ">>>".
            response1: Interpretation shown under the label model1_id.
            response2: Interpretation shown under the label model2_id.
            model1_id: Label printed in front of response1.
            model2_id: Label printed in front of response2.

        Returns:
            The prompt text. It also interpolates the module-level
            EXPECTED_VARIABLES list and its first entry.
        """
        # NOTE(review): the text below is sent to the model verbatim, including
        # its leading indentation and wording slips ("capatilisation",
        # "Not subscripts"); changing it may change model output.
        return f"""You are an expert in interpreting student mathematical responses and converting them into valid SymPy syntax.
        
        Two different interpretations were given for a student's answer, and we need your expert analysis to determine the most accurate interpretation.

        Original student answer: <<<{student_answer}>>>
        {model1_id} interpretation: {response1}
        {model2_id} interpretation: {response2}

        Guidelines for your consensus response:
        1. Begin with "List of Equations: [". 
        2. Then write the equation(s) in the student response. 
        3. Finally end your response with "]".
        
        All equations must be in SymPy format using Eq(left, right).
           - If student writes expression without "{EXPECTED_VARIABLES[0]}=", assume that this is intended to be the right hand side of the equation and the left side would be {EXPECTED_VARIABLES[0]}
           - Separate multiple equations with commas
           - Only use these variables: {EXPECTED_VARIABLES}. Capitalisation of the variables is very important, please adjust the student's variables to match the capatilisation of the expected variables given here.
           - Correct notation (v0 → v_0, x^2 → x**2) but not math errors
           - Use brackets for proper order of operations
           - Use "**(1/2)" for square root
           - If there are multiple equations, separate them by commas but include all of them within "[" and "]".
           Examples:
           - "v=2u+w" → "Eq(v, (2*u) + w)"
           - "v0=u^2/w" → "Eq(v_0, (u**2)/w)" (Not subscripts should only be included if they are in the variables list)
           - "u+w^2" → "Eq(v, u + (w**2))"

        Analyze both interpretations and the original student answer, then provide your expert consensus in the exact format specified above. 
        Provide only the "List of Equations: [...]" response, with no explanation or justification.
        """

    def build_initial_prompt(self, student_answer: str) -> str:
        """Build the first-pass prompt asking a model to list the equations in a student answer.

        Args:
            student_answer: The student's answer; it is embedded twice in the
                prompt, each time between "<<<" and ">>>".

        Returns:
            The prompt text. It also interpolates the module-level
            EXPECTED_VARIABLES list and its first entry.
        """
        # NOTE(review): the text below is sent to the model verbatim, including
        # its leading indentation and wording slips ("The based on these
        # guidlines", "capatilisation"); changing it may change model output.
        initial_prompt = f"""You are an expert in interpreting student mathematical responses to physics questions. Please read this student answer: <<<{student_answer}>>>

        When writing equations, use SymPy syntax following these guidelines exactly:
           - Use Eq(left, right) syntax, e.g., "Eq(v, (2*u) + w)"
           - If student writes an expression without "{EXPECTED_VARIABLES[0]}=", assume that this is intended to be the right hand side of the equation and the left side would be {EXPECTED_VARIABLES[0]}
           - Separate multiple equations with commas
           - Only use these variables: {EXPECTED_VARIABLES}. Capitalisation of the variables is very important, please adjust the student's variables to match the capatilisation of the expected variables given here.
           - Correct notation (for example v0 → v_0 or x^2 → x**2) but do not correct math errors
           - Use brackets for proper order of operations
           - Use "**(1/2)" for square root
           - If there are multiple equations, separate them by commas but include all of them within "[" and "]".
           Examples:
           - "v=2u+w" → "Eq(v, (2*u) + w)"
           - "v0=u^2/w" → "Eq(v_0, (u**2)/w)" (Not subscripts should only be included if they are in the variables list)
           - "u+w^2" → "Eq(v, u + (w**2))"
        
        The based on these guidlines analyse the provided student answer and complete the following task exactly:
        Write "List of Equations: [". Then write all equations that are contained in the student text using SymPy format following the guidelines above. Separate each equation with a comma (,). Then write "]".  If the student has not written an equation using symbols (only describing in words) then leave the list blank "[]".
        
        When completing the task, if the student writes any variables that are not in this list: {EXPECTED_VARIABLES} use your expert judgement to interpret and rewrite what they meant in terms of these variables.
        Use the exact marker text and provide no additional text or justification.
        
        Student answer: <<<{student_answer}>>>
        Your response to the task: """
        return initial_prompt

    def extract_sections(self, text: str) -> Dict[str, str]:
        """Extract the section that follows each marker in an LLM response.

        A section runs from the end of its marker to the next marker in
        MARKER_LIST (searched for only after the current marker), or to the
        end of the text.

        Args:
            text: Raw LLM response text.

        Returns:
            Dict keyed by the snake_case form of each marker. When text is
            missing or empty every value is "Error in Extraction". A marker
            that is not found yields "[]" for the first marker and "0" for
            the others. Keys containing "equation" keep the stripped section
            text (or "[]" if it is empty); other keys keep the first digit
            found in the section, or "0".
        """
        if pd.isna(text) or not text:
            return {self._marker_to_key(marker): "Error in Extraction"
                    for marker in MARKER_LIST}

        results = {}
        last_position = len(MARKER_LIST) - 1

        for i, current_marker in enumerate(MARKER_LIST):
            key = self._marker_to_key(current_marker)

            # Find start of current section
            start_idx = text.find(current_marker)
            if start_idx == -1:
                # Marker not found
                results[key] = "[]" if i == 0 else "0"
                continue

            # Move index to end of marker
            start_idx += len(current_marker)

            # Find end of section. The next marker is searched for from
            # start_idx onwards so an earlier occurrence of it cannot produce
            # an end position that lies before the section start.
            end_idx = -1
            if i < last_position:
                end_idx = text.find(MARKER_LIST[i + 1], start_idx)
            if end_idx == -1:
                end_idx = len(text)

            # Extract and clean the value
            value = text[start_idx:end_idx].strip()

            if "equation" in key:
                # Keep brackets for equations (both list_of_equations and final_equation)
                results[key] = value if value else "[]"
            else:
                # For other markers, just get the first digit or default to 0
                results[key] = next((char for char in value if char.isdigit()), "0")

        if self.verbose:
            print("\nExtracted results:")
            for k, v in results.items():
                print(f"{k}: {v}")

        return results
    
    def _marker_to_key(self, marker: str) -> str:
        """Convert a marker to a snake_case dictionary key."""
        # Remove the trailing ': ' and convert to lowercase
        key = marker.rstrip(': ').lower()
        # Replace hyphens with underscores
        key = key.replace('-', '_')
        # Replace spaces with underscores and remove special characters
        key = re.sub(r'[^a-z0-9_\s]', '', key)
        key = re.sub(r'\s+', '_', key)
        return key
        
    
    def get_llm_response(self, model_id: str, prompt: str) -> tuple[str, int]:
        """Send *prompt* to *model_id* through ollama.chat as a single user message.

        Returns:
            (content, eval_count) read from the response. Callers add the
            second value to their token counters. If the call or either
            lookup raises, the error is logged and ("", 0) is returned.
        """
        if self.verbose:
            print(f"\nCalling {model_id}...")

        chat_messages = [{'role': 'user', 'content': prompt}]
        generation_options = {"temperature": 0.0, "num_predict": 1500}

        try:
            response = ollama.chat(
                model=model_id,
                messages=chat_messages,
                options=generation_options,
            )
            content = response['message']['content']
            token_count = response['eval_count']
        except Exception as e:
            self.logger.error(f"Error getting response from {model_id}: {str(e)}")
            return "", 0
        return content, token_count


    def reach_consensus(self, student_answer: str, equations1: str, equations2: str, 
                   df: pd.DataFrame, idx: int) -> str:
        """Attempt to reach consensus between two model responses.

        Both equation strings are parsed first. If exactly one parses, it is
        returned immediately. Otherwise up to two consensus rounds are run:
        each round asks both models for a consensus response, parses (and if
        needed repairs) each reply, and keeps a reply only if it parsed. The
        loop ends as soon as the two current equation strings match, only one
        reply parses, or the rounds are exhausted.

        Args:
            student_answer: Original student answer.
            equations1: First model's equation response.
            equations2: Second model's equation response.
            df: DataFrame for storing results; the running consensus token
                total is written to its 'Consensus_Tokens' column.
            idx: Row index in DataFrame.

        Returns:
            str: Consensus equation string, one of the current equation
            strings as a fallback, or a string starting with "Error:" when
            nothing usable is available.
        """
        # Step 0: Check initial parsing of both models' equations
        current_eq1, parsed1 = self.attempt_parse(equations1)
        current_eq2, parsed2 = self.attempt_parse(equations2)
        # TODO(author, 18/2/25): this step was switched to attempt_parse so that
        # no repair calls are made before the equations have even been compared;
        # only parsed1/parsed2 are needed here. The author also notes that the
        # helper takes unnecessary arguments that should be removed.
        
        # If only one parses, return that one
        if parsed1 is not None and parsed2 is None:
            return current_eq1
        elif parsed2 is not None and parsed1 is None:
            return current_eq2
            
        consensus_attempts = 0
        max_consensus_attempts = 2  # Two attempts after initial
        
        while consensus_attempts <= max_consensus_attempts:
            # Check if current equations match (only possible when both parsed)
            if parsed1 is not None and parsed2 is not None:
                all_match, _ = self.equations_match(current_eq1, current_eq2)
                if all_match:
                    return current_eq1
            
            # Check if we've used up all attempts; fall back to whichever
            # side last parsed, preferring Model1
            if consensus_attempts >= max_consensus_attempts:
                if parsed1 is not None:
                    return current_eq1
                elif parsed2 is not None:
                    return current_eq2
                else:
                    return "Error: Unable to reach consensus and no valid equations available"
            
            consensus_attempts += 1
            self.logger.info(f"Starting consensus attempt {consensus_attempts}")
            
            # Get and check Model1's consensus response
            consensus_prompt = self.create_consensus_prompt(
                student_answer,
                current_eq1,
                current_eq2,
                self.model_ids[0],
                self.model_ids[1]
            )
            
            new_eq1, tokens = self.get_llm_response(self.model_ids[0], consensus_prompt)
            self.consensus_tokens += tokens
            df.at[idx, 'Consensus_Tokens'] = self.consensus_tokens
            # NOTE(review): new_eq1 is the raw model reply (marker text included)
            # — confirm the parser is expected to handle that form directly.
            new_eq1, parsed1 = self.attempt_parse_and_repair(
                new_eq1, student_answer, self.model_ids[0], df, idx, "Model1", in_consensus_mode=True
            )
            # Keep the previous equation string when the new reply did not parse
            if parsed1 is not None:
                current_eq1 = new_eq1
            
            # Get and check Model2's consensus response
            # NOTE(review): the model IDs are swapped here but the equations are
            # not, so current_eq1 is labelled with model_ids[1] and current_eq2
            # with model_ids[0] in this prompt — confirm this is intended.
            consensus_prompt = self.create_consensus_prompt(
                student_answer,
                current_eq1,
                current_eq2,
                self.model_ids[1],
                self.model_ids[0]
            )
            
            new_eq2, tokens = self.get_llm_response(self.model_ids[1], consensus_prompt)
            self.consensus_tokens += tokens
            df.at[idx, 'Consensus_Tokens'] = self.consensus_tokens
            new_eq2, parsed2 = self.attempt_parse_and_repair(
                new_eq2, student_answer, self.model_ids[1], df, idx, "Model2", in_consensus_mode=True
            )
            # Keep the previous equation string when the new reply did not parse
            if parsed2 is not None:
                current_eq2 = new_eq2
                
            # If only one response parses, return it
            if parsed1 is not None and parsed2 is None:
                self.logger.info("Only Model1 response parsed successfully")
                return current_eq1
            elif parsed2 is not None and parsed1 is None:
                self.logger.info("Only Model2 response parsed successfully")
                return current_eq2
                
            self.logger.info(f"Completed consensus attempt {consensus_attempts}")
            # Loop continues to check for matches if both parse
            
        # This should never be reached due to the checks in the loop
        return "Error: Unexpected end of consensus process"
    
    def extract_bracket_content(self, response) -> str:
        if not response:
            return ""
        pattern = r'\[(.*?)\]'  # non-greedy match of anything between [ and ]
        match = re.search(pattern, str(response), re.DOTALL)
        if match:
            return match.group(1).strip()
        return ""

    def store_extracted_sections_in_df(self, df: pd.DataFrame, row_idx: int, model_label: str, 
                                  response_text: str, repairing_equations: bool = False, in_consensus_mode: bool = False):
        #Stores extracted sections in the DataFrame.
        #Args:
        #    df: DataFrame to update
        #    row_idx: Index of row to update
        #    model_label: Label of model (e.g., "Model1" or "Model2")
        #    response_text: Text response to extract sections from
        #    repairing_equations: If True, only update equations column
        sections_dict = self.extract_sections(response_text)
    
        for base_col_name, content in sections_dict.items():
            # Skip non-equation columns if we're only repairing equations
            if repairing_equations and "equations" not in base_col_name:
                continue
                
            # Strip any existing model prefixes before adding the new one
            clean_base_name = base_col_name
            for prefix in ['Model1_', 'Model2_']:
                if clean_base_name.startswith(prefix):
                    clean_base_name = clean_base_name[len(prefix):]
        
            # Build final DF column name
            final_col_name = f"{model_label}_{clean_base_name}"

        
            if "equations" in base_col_name:
                # This is the line for the bracket content, e.g. "list_of_equations"
                bracket_content = self.extract_bracket_content(content)
                # Store with the enclosing brackets
                if not in_consensus_mode:#in_consensus_mode:
                    df.at[row_idx, final_col_name] = f"[{bracket_content}]"
    
            else:
                # Attempt to convert "1"/"0" to int, else store string
                try:
                    df.at[row_idx, final_col_name] = int(content)
                except ValueError:
                    df.at[row_idx, final_col_name] = content
        return df

    def attempt_parse_and_repair(self, equations_str: str, student_answer: str, 
                                   model_id: str, df: pd.DataFrame, idx: int, 
                                   model_label: str, in_consensus_mode: bool = False) -> Tuple[str, Optional[Union[sympy.Expr, List[sympy.Expr]]]]:
        #Helper function to attempt parsing and repair if needed.
        #Args:
        #    equations_str: Equation string to parse
        #    student_answer: Original student answer
        #    model_id: ID of the model to use for repairs
        #    df: DataFrame for storing results
        #    idx: Row index in DataFrame
        #    model_label: Label of model (e.g., "Model1" or "Model2")
        #Returns:
        #    Tuple of (final equation string, parsed expression or None)
        parsed_expr = self.parse_equation(equations_str, model_label)
        
        repair_attempts = 3
        attempt_count = 0
        while parsed_expr is None and attempt_count < repair_attempts:
            attempt_count += 1
            if self.error_history['parse_errors']:
                error_msg = self.error_history['parse_errors'][-1]
            else:
                error_msg = "Unknown parsing error."
            
            repair_prompt = self.create_repair_prompt(
                original_response=equations_str,
                error_msg=error_msg,
                student_answer=student_answer
            )
            
            repair_response, repair_tokens = self.get_llm_response(model_id, repair_prompt)
            if model_label == "Model1":
                self.model1_batch1_tokens += repair_tokens
                #df.at[idx, 'Model1_Batch1_Tokens'] = self.model1_batch1_tokens
            elif model_label == "Model2":
                self.model2_batch2_tokens += repair_tokens
                #df.at[idx, 'Model2_Batch2_Tokens'] = self.model2_batch2_tokens
            self.store_extracted_sections_in_df(df, idx, model_label, repair_response, repairing_equations=True, in_consensus_mode=in_consensus_mode)
            
            equations_str = df.at[idx, f'{model_label}_list_of_equations']
            parsed_expr = self.parse_equation(equations_str, model_label)
        
        return equations_str, parsed_expr

    def attempt_parse(self, equations_str: str) -> Tuple[str, Optional[Union[sympy.Expr, List[sympy.Expr]]]]:
        parsed_expr = self.parse_equation(equations_str, "Consensus Check")   
        return equations_str, parsed_expr

    
    def load_or_create_cache(self, model_id):
        """Load the JSON response cache for *model_id* from DICTIONARY_DIRECTORY.

        The directory is created if it does not exist. An empty dict is
        returned when the cache file is absent, or when it cannot be read or
        decoded (in which case the error is logged).
        """
        os.makedirs(DICTIONARY_DIRECTORY, exist_ok=True)

        # The model ID is sanitized before it is used in the filename.
        cache_file = os.path.join(
            DICTIONARY_DIRECTORY,
            f"response_cache_{sanitize_filename(model_id)}.json",
        )

        if not os.path.exists(cache_file):
            return {}

        try:
            with open(cache_file, 'r') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            self.logger.error(f"Error loading cache for {model_id}: {str(e)}")
        return {}
    
    def save_cache(self, cache_dict, model_id):
        """Write *cache_dict* as JSON to the response-cache file for *model_id*.

        The file lives in DICTIONARY_DIRECTORY and its name uses the sanitized
        model ID. An IOError while writing is logged rather than raised.
        """
        target_name = f"response_cache_{sanitize_filename(model_id)}.json"
        target_path = os.path.join(DICTIONARY_DIRECTORY, target_name)

        try:
            with open(target_path, 'w') as out:
                json.dump(cache_dict, out)
        except IOError as e:
            self.logger.error(f"Error saving cache for {model_id}: {str(e)}")
    
    def load_consensus_cache(self, filename):
        """Load a JSON consensus cache named *filename* from DICTIONARY_DIRECTORY.

        The directory is created if it does not exist and the filename is
        sanitized before use. An empty dict is returned when the file is
        absent, or when it cannot be read or decoded (the error is logged).
        """
        os.makedirs(DICTIONARY_DIRECTORY, exist_ok=True)

        cache_file = os.path.join(DICTIONARY_DIRECTORY, sanitize_filename(filename))

        if not os.path.exists(cache_file):
            return {}

        try:
            with open(cache_file, 'r') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            self.logger.error(f"Error loading consensus cache: {str(e)}")
        return {}
    
    def save_consensus_cache(self, cache_dict, filename):
        """Write the consensus cache to ``filename`` atomically.

        The JSON is written to a temporary file next to the real cache file
        and only moved over it once the dump has completed. An interrupted or
        failed write therefore leaves the previous cache intact instead of a
        truncated file.

        Args:
            cache_dict: JSON-serialisable mapping to persist.
            filename: Cache file name; sanitized before use.

        I/O errors are logged and swallowed (best-effort save). Serialisation
        errors raised by ``json.dump`` still propagate to the caller.
        """
        cache_dir = DICTIONARY_DIRECTORY

        # Sanitize filename before using
        safe_filename = sanitize_filename(filename)
        cache_file = os.path.join(cache_dir, safe_filename)
        temp_file = f"{cache_file}.tmp"

        try:
            # Do not rely on a previous load call having created the directory.
            os.makedirs(cache_dir, exist_ok=True)
            # json.dump escapes non-ASCII by default, so the bytes written are
            # plain ASCII and readable under any ASCII-compatible encoding.
            with open(temp_file, 'w', encoding='utf-8') as f:
                json.dump(cache_dict, f)
            # Same directory, so the replace swaps the file in one step.
            os.replace(temp_file, cache_file)
        except IOError as e:
            self.logger.error(f"Error saving consensus cache: {str(e)}")
        finally:
            # A temp file only remains if the dump or the replace failed.
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except OSError:
                    pass  # best-effort cleanup; the real cache is untouched
    
    def process_dataframe_in_batches(self, test: bool = False, do_consensus: bool = False):
        """Run the Model1 pass, the Model2 pass and, optionally, the consensus pass.

        Each pass walks every row and only does work for rows that still lack
        a usable result, so a whole pass uses a single model. The DataFrame is
        written to PROCESSED_FILE after every processed row so an interrupted
        run can be resumed; when that file already exists it is loaded instead
        of ORIGINAL_FILE and the cumulative counters are restored from it.

        Args:
            test: When True, only the first 10 rows are kept and processed
                (the saved PROCESSED_FILE then holds just those rows).
            do_consensus: When True, run the consensus pass after both model
                passes.

        Returns:
            Tuple ``(df, model1_fails, model2_fails)``.
        """
        # -----------------------------------------------------------------
        # 1. Load or create DataFrame
        # -----------------------------------------------------------------
        resuming = os.path.exists(PROCESSED_FILE)
        if resuming:
            if self.verbose:
                print(f"\nLoading existing processed file: {PROCESSED_FILE}")
            # PROCESSED_FILE is always written as utf-8 below; reading it back
            # as latin-1 would mangle every non-ASCII character on each resume.
            try:
                df = pd.read_csv(PROCESSED_FILE, sep='\t', encoding='utf-8')
            except UnicodeDecodeError:
                # Fallback for a processed file that is not valid utf-8.
                df = pd.read_csv(PROCESSED_FILE, sep='\t', encoding='latin-1')
        else:
            if self.verbose:
                print(f"\nStarting new processing from: {ORIGINAL_FILE}")
            df = pd.read_csv(ORIGINAL_FILE, sep='\t', encoding='latin-1')

        self._restore_progress_counters(df, resuming)

        if test:
            # copy() so the columns added below go to an independent frame.
            df = df.head(10).copy()
            if self.verbose:
                print(df)

        # Store original columns for reference
        self.original_columns = [
            col for col in df.columns
            if not col.startswith(('Model1_', 'Model2_', 'Consensus_', 'Repair', 'Final_', 'Required_'))
        ]

        # -----------------------------------------------------------------
        # 2. Ensure required columns exist
        # -----------------------------------------------------------------
        needed_columns = (
            COLUMN_NAMES
            + ['Consensus_Equation']
            + ['Model1_raw_list_of_equations', 'Model2_raw_list_of_equations']
            + MODEL_ERROR_COLUMNS
            + TIME_TRACKING_COLUMNS
            + TOKEN_TRACKING_COLUMNS
        )
        for col in needed_columns:
            if col not in df.columns:
                df[col] = None

        # -----------------------------------------------------------------
        # 3. / 4. Batch 1 (Model1) and Batch 2 (Model2)
        # -----------------------------------------------------------------
        self._run_model_batch(df, 1)
        self._run_model_batch(df, 2)

        # -----------------------------------------------------------------
        # 5. Check for matches and attempt consensus where needed
        # -----------------------------------------------------------------
        if do_consensus:
            self._run_consensus_batch(df)

        # -----------------------------------------------------------------
        # 6. Final summary
        # -----------------------------------------------------------------
        self._print_batch_summary(df)

        return df, self.model1_fails, self.model2_fails

    @staticmethod
    def _latest_cumulative(df, columns):
        """Return the largest non-null numeric value in ``columns``, or None."""
        best = None
        for col in columns:
            if col not in df.columns:
                continue
            values = pd.to_numeric(df[col], errors='coerce').dropna()
            if values.empty:
                continue
            candidate = float(values.max())
            if best is None or candidate > best:
                best = candidate
        return best

    @staticmethod
    def _equations_or_empty(value):
        """Return ``value`` unchanged, or '' when it is missing (None/NaN) or falsy."""
        if pd.isna(value):
            return ""
        return value or ""

    def _restore_progress_counters(self, df, resuming):
        """Restore cumulative fail/time/token counters from a previously saved run."""
        # Every counter saved to the file is a running total that is only ever
        # added to in this method's passes, so the largest saved value is the
        # most recent one. Looking at a single "last" row is not reliable:
        # later passes update only some columns of a row, and a run stopped
        # during the Model1 pass has no Model2 values at all.
        # NOTE(review): the fail counters are incremented outside this block;
        # this assumes they never decrease - confirm.
        def latest(*columns):
            return self._latest_cumulative(df, columns) if resuming else None

        # Fail counts: the consensus pass records the same counters, so both
        # columns are considered.
        m1_fails = latest('Model1_Batch1_Fails', 'Model1_Consensus_Fails')
        m2_fails = latest('Model2_Batch2_Fails', 'Model2_Consensus_Fails')
        self.model1_fails = int(m1_fails) if m1_fails is not None else 0
        self.model2_fails = int(m2_fails) if m2_fails is not None else 0
        if self.verbose and (m1_fails is not None or m2_fails is not None):
            print(f"Resuming with Model1 fails: {self.model1_fails}, Model2 fails: {self.model2_fails}")

        # Batch/consensus/total times always start from a defined value.
        m1_time = latest('Model1_Batch1_Time')
        m2_time = latest('Model2_Batch2_Time')
        cons_time = latest('Consensus_Time')
        total_time = latest('Total_Processing_Time')
        self.model1_batch_time = m1_time if m1_time is not None else 0.0
        self.model2_batch_time = m2_time if m2_time is not None else 0.0
        self.consensus_time = cons_time if cons_time is not None else 0.0
        self.total_processing_time = total_time if total_time is not None else 0.0

        # Raw-time and token counters are only overwritten when a saved value
        # exists; otherwise whatever value they already hold is kept.
        for attr, column in (
            ('model1_raw_time', 'Model1_Raw_Time'),
            ('model2_raw_time', 'Model2_Raw_Time'),
            ('model1_raw_tokens', 'Model1_Raw_Tokens'),
            ('model2_raw_tokens', 'Model2_Raw_Tokens'),
            ('model1_batch1_tokens', 'Model1_Batch1_Tokens'),
            ('model2_batch2_tokens', 'Model2_Batch2_Tokens'),
            ('consensus_tokens', 'Consensus_Tokens'),
        ):
            saved = latest(column)
            if saved is not None:
                setattr(self, attr, saved)

        if self.verbose and total_time is not None:
            print("Resuming with cumulative times (seconds):")
            print(f"  Model1 batch: {self.model1_batch_time:.1f}")
            print(f"  Model2 batch: {self.model2_batch_time:.1f}")
            print(f"  Consensus: {self.consensus_time:.1f}")
            print(f"  Total: {self.total_processing_time:.1f}")

    def _run_model_batch(self, df, model_number):
        """Run model 1 or 2 over every row that still lacks a usable equation list."""
        label = f"Model{model_number}"
        batch = f"Batch{model_number}"
        model_id = self.model_ids[model_number - 1]

        # DataFrame columns written by this pass.
        equations_col = f"{label}_list_of_equations"
        raw_equations_col = f"{label}_raw_list_of_equations"
        raw_tokens_col = f"{label}_Raw_Tokens"
        raw_time_col = f"{label}_Raw_Time"
        batch_tokens_col = f"{label}_{batch}_Tokens"
        batch_fails_col = f"{label}_{batch}_Fails"
        batch_time_col = f"{label}_{batch}_Time"

        # Per-model counters on self, e.g. model1_raw_time / model2_batch2_tokens.
        raw_time_attr = f"model{model_number}_raw_time"
        raw_tokens_attr = f"model{model_number}_raw_tokens"
        batch_tokens_attr = f"model{model_number}_batch{model_number}_tokens"
        batch_time_attr = f"model{model_number}_batch_time"
        fails_attr = f"model{model_number}_fails"

        def add_to(attr, amount):
            setattr(self, attr, getattr(self, attr) + amount)

        def raw_equations(response):
            sections = self.extract_sections(response)
            return f'[{self.extract_bracket_content(sections["list_of_equations"])}]'

        for idx, row in df.iterrows():
            start_time = time.time()
            # The cache file is re-read for every row, as before.
            # NOTE(review): possibly deliberate so that several runs can share
            # one cache file - confirm before hoisting this out of the loop.
            cache = self.load_or_create_cache(model_id)
            print(f"{label} processing for row {idx}")

            # An empty cell is read by pandas as NaN; str(NaN) would be the
            # text 'nan' and skip the blank-answer branch below.
            raw_answer = row[self.original_columns[0]]
            student_answer = "" if pd.isna(raw_answer) else str(raw_answer)

            current = row[equations_col]
            if not (pd.isna(current) or current in ('', 'Error in Extraction.')):
                continue  # this row already has a usable result

            stripped_answer = student_answer.strip()
            if not stripped_answer or stripped_answer == '-':
                # Handle blank answers: no model call, no tokens spent.
                response = Default_Response
                df.at[idx, raw_equations_col] = raw_equations(response)
                add_to(raw_time_attr, time.time() - start_time)
                df.at[idx, raw_tokens_col] = getattr(self, raw_tokens_attr)
                df.at[idx, raw_time_col] = getattr(self, raw_time_attr)
                df.at[idx, batch_tokens_col] = getattr(self, batch_tokens_attr)
                self.store_extracted_sections_in_df(df, idx, label, response)
            elif student_answer in cache:
                if self.verbose:
                    print(f"Cache hit for {label}, row {idx}")
                cached_data = cache[student_answer]

                # Store the raw response
                response = cached_data['raw_response']
                df.at[idx, raw_equations_col] = raw_equations(response)
                df.at[idx, raw_tokens_col] = getattr(self, raw_tokens_attr)
                df.at[idx, batch_tokens_col] = getattr(self, batch_tokens_attr)
                add_to(raw_time_attr, time.time() - start_time)
                df.at[idx, raw_time_col] = getattr(self, raw_time_attr)
                self.store_extracted_sections_in_df(df, idx, label, response)

                # Store the final equations after repair
                if 'final_equations' in cached_data:
                    df.at[idx, equations_col] = cached_data['final_equations']
            else:
                # Get initial response from the model
                prompt = self.build_initial_prompt(student_answer)
                response, raw_tokens = self.get_llm_response(model_id, prompt)
                add_to(raw_tokens_attr, raw_tokens)
                add_to(batch_tokens_attr, raw_tokens)
                df.at[idx, batch_tokens_col] = getattr(self, batch_tokens_attr)
                df.at[idx, raw_tokens_col] = getattr(self, raw_tokens_attr)
                df.at[idx, raw_equations_col] = raw_equations(response)
                self.store_extracted_sections_in_df(df, idx, label, response)

                # "Raw" time stops here, before any parsing or repair.
                add_to(raw_time_attr, time.time() - start_time)
                df.at[idx, raw_time_col] = getattr(self, raw_time_attr)

                # Attempt parsing and repair if needed
                equations_str = df.at[idx, equations_col]
                self.attempt_parse_and_repair(
                    equations_str, student_answer, model_id, df, idx, label
                )

                # The cached final equations are re-read from the DataFrame
                # after the repair step has run.
                cache[student_answer] = {
                    'raw_response': response,
                    'final_equations': df.at[idx, equations_col],
                }
                self.save_cache(cache, model_id)

            # Record time and save
            elapsed = time.time() - start_time
            add_to(batch_time_attr, elapsed)
            self.total_processing_time += elapsed
            df.at[idx, batch_tokens_col] = getattr(self, batch_tokens_attr)
            df.at[idx, batch_fails_col] = getattr(self, fails_attr)
            df.at[idx, batch_time_col] = getattr(self, batch_time_attr)
            df.at[idx, 'Total_Processing_Time'] = self.total_processing_time
            df.to_csv(PROCESSED_FILE, sep='\t', index=False, encoding='utf-8')

    def _run_consensus_batch(self, df):
        """Fill Consensus_Equation for every row that does not already have one."""
        consensus_cache_filename = (
            f"consensus_cache_{sanitize_filename(self.model_ids[0])}"
            f"_{sanitize_filename(self.model_ids[1])}.json"
        )

        for idx, row in df.iterrows():
            start_time = time.time()
            # Re-read per row, as before (see the note in _run_model_batch).
            consensus_cache = self.load_consensus_cache(consensus_cache_filename)

            print(f"Consensus processing for row {idx}")
            # Same blank handling as the model passes: NaN means "no answer".
            raw_answer = row[self.original_columns[0]]
            student_answer = "" if pd.isna(raw_answer) else str(raw_answer)

            existing = row['Consensus_Equation']
            if pd.notna(existing) and existing not in ('', 'Error in Extraction.'):
                print(f"  Skipping row {idx} - consensus already exists")
                continue

            # A missing cell is NaN, which is truthy, so "value or ''" alone
            # would leave a float here and break the .strip() calls below.
            eq1 = self._equations_or_empty(row['Model1_list_of_equations'])
            eq2 = self._equations_or_empty(row['Model2_list_of_equations'])

            if not eq1.strip('[]') and not eq2.strip('[]'):
                # Handle empty equations: nothing to reconcile.
                df.at[idx, 'Consensus_Equation'] = '[]'
            elif student_answer in consensus_cache:
                print(f"  Consensus cache hit for row {idx}")
                df.at[idx, 'Consensus_Equation'] = consensus_cache[student_answer]
            else:
                # Try to reach consensus
                consensus_equation = self.reach_consensus(student_answer, eq1, eq2, df, idx)
                df.at[idx, 'Consensus_Equation'] = consensus_equation

                # Add to cache
                consensus_cache[student_answer] = consensus_equation
                self.save_consensus_cache(consensus_cache, consensus_cache_filename)

            # Record time and save progress
            elapsed = time.time() - start_time
            self.consensus_time += elapsed
            self.total_processing_time += elapsed
            df.at[idx, 'Consensus_Time'] = self.consensus_time
            df.at[idx, 'Total_Processing_Time'] = self.total_processing_time
            df.at[idx, 'Model1_Consensus_Fails'] = self.model1_fails
            df.at[idx, 'Model2_Consensus_Fails'] = self.model2_fails
            df.at[idx, 'Consensus_Tokens'] = self.consensus_tokens
            df.to_csv(PROCESSED_FILE, sep='\t', index=False, encoding='utf-8')

    def _print_batch_summary(self, df):
        """Print row, consensus, failure and timing totals for the finished run."""
        total_rows = len(df)
        consensus_reached = df['Consensus_Equation'].notna().sum()
        # Guard against an empty DataFrame (no rows to average over).
        average_time = self.total_processing_time / total_rows if total_rows else 0.0

        print("\n" + "="*50)
        print("BATCH PROCESSING SUMMARY")
        print("="*50)
        print(f"Total rows: {total_rows}")
        print(f"Consensus reached: {consensus_reached}")
        print(f"Model1 fails: {self.model1_fails}")
        print(f"Model2 fails: {self.model2_fails}")
        print("Time elapsed (seconds):")
        print(f"  Model1 batch: {self.model1_batch_time:.1f}")
        print(f"  Model2 batch: {self.model2_batch_time:.1f}")
        print(f"  Consensus: {self.consensus_time:.1f}")
        print(f"  Total: {self.total_processing_time:.1f}")
        print(f"  Average per row: {average_time:.1f}")
        print("="*50)


In [None]:
# Driver cell: runs the full batch pipeline and prints a short report.

# Instantiate the validator (verbose=False is passed to the constructor).
validator = LLMResponseValidator(max_repair_attempts=3, verbose=False)

print(f"Processing file: {ORIGINAL_FILE}")
print(f"Results will be saved to: {PROCESSED_FILE}")

# Run the batch method over the whole file (test = False) with the consensus
# pass enabled. It returns the DataFrame and the two fail counts.
processed_df, model1fails, model2fails = validator.process_dataframe_in_batches(test = False, do_consensus = True)

print("\nProcessing complete!")
print(f"Total rows processed: {len(processed_df)}")

# Print final statistics (only the header is printed at present; the
# repair-count line below is disabled).
print("\nFinal Statistics:")
#print(f"Total rows needing repair: {processed_df['RepairNeeded'].sum()}")

# Display first few rows of the processed dataframe
print("\nFirst few rows of processed data:")
display(processed_df.head())  # presumably the IPython/Jupyter display(); use print(processed_df.head()) outside a notebook
# The "name=" f-string form prints each variable name together with its value.
print(f"{model1fails=}")
print(f"{model2fails=}")
