In [None]:
import numpy as np
import torch
from task import Task
import trainer
from tqdm import tqdm

def evaluate_model(model, tasks):
    """
    Evaluate model performance on a set of tasks with explicit shape prediction metrics.
    
    Args:
        model: The model to evaluate
        tasks: List of Task objects
        
    Returns:
        Dictionary of evaluation metrics including node-level accuracy and shape prediction metrics
    """
    total_tasks = len(tasks)
    total_train_grids = sum(len(task.train_pairs) for task in tasks)
    total_test_grids = sum(len(task.test_pairs) for task in tasks)
    total_grids = total_train_grids + total_test_grids
    
    correct_tasks = 0
    correct_train_grids = 0
    correct_test_grids = 0
    
    # Node-level accuracy tracking
    total_train_nodes = 0
    total_test_nodes = 0
    correct_train_nodes = 0
    correct_test_nodes = 0
    
    # Shape prediction metrics
    total_shape_predictions = 0
    correct_shape_predictions = 0
    
    # Evaluate each task
    with tqdm(total=len(tasks), desc="Evaluating model on tasks") as pbar:
        for task in tasks:
            # Run model to get predictions
            predictions = model.solve(task)
            
            # Check predictions match
            if len(predictions) != len(task.test_pairs):
                print(f"Warning: Number of predictions ({len(predictions)}) doesn't match number of test pairs ({len(task.test_pairs)}) for task {task.task_id}")
            
            # Count correct train grids and nodes
            train_correct = 0

            train_predictions = model.solve(task)
            
            for i, ((_, expected), predicted) in enumerate(zip(task.train_pairs, train_predictions)):
                expected_np = np.array(expected)
                
                # First check if shape is correct
                shape_is_correct = (predicted.shape == expected_np.shape)
                
                # For grid-level accuracy: if shape is wrong, it's automatically incorrect
                if not shape_is_correct:
                    is_correct = False
                else:
                    # Shapes match, now check content
                    is_correct = np.array_equal(predicted, expected_np)
                
                # For node-level metrics, still calculate based on the overlapping area
                # This gives us some measure of content accuracy even with wrong shape
                if not shape_is_correct:
                    # If shape doesn't match, resize for node comparison only
                    resized_prediction = np.zeros(expected_np.shape, dtype=predicted.dtype)
                    min_rows = min(predicted.shape[0], expected_np.shape[0])
                    min_cols = min(predicted.shape[1], expected_np.shape[1])
                    resized_prediction[:min_rows, :min_cols] = predicted[:min_rows, :min_cols]
                    predicted_for_node_check = resized_prediction
                else:
                    predicted_for_node_check = predicted
                
                # Calculate node-level accuracy
                train_grid_total_nodes = expected_np.size
                train_grid_correct_nodes = np.sum(predicted_for_node_check == expected_np)
                
                total_train_nodes += train_grid_total_nodes
                correct_train_nodes += train_grid_correct_nodes
                    
                # Update grid level metrics
                if is_correct:
                    train_correct += 1
                    correct_train_grids += 1
            
            # Count correct test grids and nodes
            test_correct = 0
            
            for i, ((_, expected), predicted) in enumerate(zip(task.test_pairs, predictions)):
                expected_np = np.array(expected)
                
                # Track shape prediction accuracy
                total_shape_predictions += 1
                shape_is_correct = (predicted.shape == expected_np.shape)
                if shape_is_correct:
                    correct_shape_predictions += 1
                    
                # For grid-level accuracy: if shape is wrong, it's automatically incorrect
                if not shape_is_correct:
                    is_correct = False
                else:
                    # Shapes match, now check content
                    is_correct = np.array_equal(predicted, expected_np)
                
                # For node-level metrics, still calculate based on the overlapping area
                if not shape_is_correct:
                    # If shape doesn't match, resize for node comparison only
                    resized_prediction = np.zeros(expected_np.shape, dtype=predicted.dtype)
                    min_rows = min(predicted.shape[0], expected_np.shape[0])
                    min_cols = min(predicted.shape[1], expected_np.shape[1])
                    resized_prediction[:min_rows, :min_cols] = predicted[:min_rows, :min_cols]
                    predicted_for_node_check = resized_prediction
                else:
                    predicted_for_node_check = predicted
                
                # Calculate node-level accuracy
                test_grid_total_nodes = expected_np.size
                test_grid_correct_nodes = np.sum(predicted_for_node_check == expected_np)
                
                total_test_nodes += test_grid_total_nodes
                correct_test_nodes += test_grid_correct_nodes
                    
                # Update grid level metrics
                if is_correct:
                    test_correct += 1
                    correct_test_grids += 1
            
            # A task is correct if all its test pairs are correct
            if test_correct == len(task.test_pairs):
                correct_tasks += 1
            pbar.update(1)
    
    # Calculate overall grid metrics
    correct_grids = correct_train_grids + correct_test_grids
    train_grid_accuracy = correct_train_grids / total_train_grids if total_train_grids > 0 else 0
    test_grid_accuracy = correct_test_grids / total_test_grids if total_test_grids > 0 else 0
    overall_grid_accuracy = correct_grids / total_grids if total_grids > 0 else 0
    task_accuracy = correct_tasks / total_tasks if total_tasks > 0 else 0
    
    # Calculate shape prediction metrics
    shape_accuracy = correct_shape_predictions / total_shape_predictions if total_shape_predictions > 0 else 0
    
    # Calculate node-level metrics
    total_nodes = total_train_nodes + total_test_nodes
    correct_nodes = correct_train_nodes + correct_test_nodes
    train_node_accuracy = correct_train_nodes / total_train_nodes if total_train_nodes > 0 else 0
    test_node_accuracy = correct_test_nodes / total_test_nodes if total_test_nodes > 0 else 0
    overall_node_accuracy = correct_nodes / total_nodes if total_nodes > 0 else 0
    
    # Print summary
    print(f"\nEvaluation Results:")
    print(f"Correct tasks: {correct_tasks}/{total_tasks} ({task_accuracy:.2%})")
    print(f"Correct test grids: {correct_test_grids}/{total_test_grids} ({test_grid_accuracy:.2%})")
    print(f"Shape prediction accuracy: {correct_shape_predictions}/{total_shape_predictions} ({shape_accuracy:.2%})")
    print(f"Correct train grids: {correct_train_grids}/{total_train_grids} ({train_grid_accuracy:.2%})")
    print(f"Correct total grids: {correct_grids}/{total_grids} ({overall_grid_accuracy:.2%})")
    print(f"\nNode-level Accuracy:")
    print(f"Test nodes: {correct_test_nodes}/{total_test_nodes} ({test_node_accuracy:.2%})")
    print(f"Train nodes: {correct_train_nodes}/{total_train_nodes} ({train_node_accuracy:.2%})")
    print(f"Overall nodes: {correct_nodes}/{total_nodes} ({overall_node_accuracy:.2%})")
    
    return {
        "task_accuracy": task_accuracy,
        "test_grid_accuracy": test_grid_accuracy,
        "shape_accuracy": shape_accuracy,
        "train_grid_accuracy": train_grid_accuracy,
        "overall_grid_accuracy": overall_grid_accuracy,
        "correct_tasks": correct_tasks,
        "total_tasks": total_tasks,
        "correct_test_grids": correct_test_grids,
        "total_test_grids": total_test_grids,
        "correct_shape_predictions": correct_shape_predictions,
        "total_shape_predictions": total_shape_predictions,
        "correct_train_grids": correct_train_grids,
        "total_train_grids": total_train_grids,
        "correct_grids": correct_grids,
        "total_grids": total_grids,
        # Node-level metrics
        "test_node_accuracy": test_node_accuracy,
        "train_node_accuracy": train_node_accuracy,
        "overall_node_accuracy": overall_node_accuracy,
        "correct_test_nodes": correct_test_nodes,
        "total_test_nodes": total_test_nodes,
        "correct_train_nodes": correct_train_nodes,
        "total_train_nodes": total_train_nodes,
        "correct_nodes": correct_nodes,
        "total_nodes": total_nodes
    }

def preprocess_task_graphs(tasks, padding_value=10):
    """
    Preprocess all graphs in the given tasks once, instead of during each solve call.
    
    Args:
        tasks: List of Task objects
        padding_value: Padding value for standardizing dimensions
        
    Returns:
        The tasks with preprocessed graphs
    """
    import torch
    
    expected_dim = 3  # Standardized input dimension
    
    for task in tasks:
        # Preprocess test graphs
        for test_graph in task.test_graphs:
            if hasattr(test_graph, 'x'):
                # Convert from one-hot to class labels if needed
                if test_graph.x.dim() == 2 and test_graph.x.size(1) == 11:
                    test_graph.x = test_graph.x.argmax(dim=1)

                # Ensure x is long and 2D
                test_graph.x = test_graph.x.long()
                if test_graph.x.dim() == 1:
                    test_graph.x = test_graph.x.unsqueeze(1)  # Shape: (nodes, 1)

                # Standardize shape to (nodes, expected_dim)
                if test_graph.x.size(1) < expected_dim:
                    pad = torch.full((test_graph.x.size(0), expected_dim), padding_value, dtype=torch.long)
                    pad[:, :test_graph.x.size(1)] = test_graph.x
                    test_graph.x = pad
                elif test_graph.x.size(1) > expected_dim:
                    test_graph.x = test_graph.x[:, :expected_dim]

                # Extract positional info
                test_graph.pos = test_graph.x[:, 1:3].float() if expected_dim >= 3 else None
                test_graph.x = test_graph.x[:, 0].long().unsqueeze(1)  # Final x: shape (nodes, 1)
                
                # Mark as preprocessed
                test_graph.preprocessed = True
        
        # If the task also has train graphs, preprocess them too
        if hasattr(task, 'train_graphs'):
            for train_graph in task.train_graphs:
                if hasattr(train_graph, 'x'):
                    # Convert from one-hot to class labels if needed
                    if train_graph.x.dim() == 2 and train_graph.x.size(1) == 11:
                        train_graph.x = train_graph.x.argmax(dim=1)
    
                    # Ensure x is long and 2D
                    train_graph.x = train_graph.x.long()
                    if train_graph.x.dim() == 1:
                        train_graph.x = train_graph.x.unsqueeze(1)  # Shape: (nodes, 1)
    
                    # Standardize shape to (nodes, expected_dim)
                    if train_graph.x.size(1) < expected_dim:
                        pad = torch.full((train_graph.x.size(0), expected_dim), padding_value, dtype=torch.long)
                        pad[:, :train_graph.x.size(1)] = train_graph.x
                        train_graph.x = pad
                    elif train_graph.x.size(1) > expected_dim:
                        train_graph.x = train_graph.x[:, :expected_dim]
    
                    # Extract positional info
                    train_graph.pos = train_graph.x[:, 1:3].float() if expected_dim >= 3 else None
                    train_graph.x = train_graph.x[:, 0].long().unsqueeze(1)  # Final x: shape (nodes, 1)
                    
                    # Mark as preprocessed
                    train_graph.preprocessed = True
    
    return tasks

In [2]:
import torch
import gc
from unified_shape import UnifiedReasoningModule
from nlm_shape import NLMReasoningModule
import trainer
from task5 import Blackboard

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define model path
UNIFIED_PATH = "output/models/unified_shape/unified_shape_final.pt"
NLM_PATH = "output/models/nlm_shape/nlm_shape_final.pt"
DATA_PATH = "precomputed_tasks/evaluation_400"

results_all_runs = []

# Load tasks
tasks = trainer.load_precomputed_tasks(DATA_PATH)


for i in range(5):
    print(f"\nStarting evaluation run {i+1}/5")
    
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # Force garbage collection
    gc.collect()
    
    # Reset task states if needed
    for task in tasks:
        if hasattr(task, 'blackboard'):
            task.blackboard = Blackboard()
    
    # Initialize model
    # print("Initializing unified model...")
    # model = UnifiedReasoningModule(
    #     input_dim=3,
    #     hidden_dim=128,
    #     output_dim=11,
    #     device=device
    # )
    # model.load_complete_state(UNIFIED_PATH)

    
    # Initialize model and preprocess tasks for nlm
    trainer.preprocess_task_graphs(tasks)
    print("Initializing nlm model...")
    model = NLMReasoningModule(
            input_dim=3,
            hidden_dim=128,
            output_dim=11,
            device=device
        )
    model.load_complete_state(NLM_PATH)

    # Move model to device
    model.model = model.model.to(device)
    model.model.eval() # Set to evaluation mode
    
    # Run evaluation
    results = evaluate_model(model, tasks)
    results_all_runs.append(results)
    
    # Delete model to ensure clean state for next iteration
    del model
    
    # Clear CUDA cache again
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Analyze results across all runs
print("\nResults across all runs:")
for i, run_results in enumerate(results_all_runs):
    print(f"Run {i+1}: {run_results}")

# Calculate average performance
if results_all_runs:
    avg_accuracy = sum(r.get('accuracy', 0) for r in results_all_runs) / len(results_all_runs)
    print(f"Average accuracy: {avg_accuracy:.4f}")

Using device: cuda
Loading precomputed tasks from precomputed_tasks/evaluation_400


Loading tasks from precomputed_tasks/evaluation_400: 100%|██████████| 400/400 [00:02<00:00, 144.07it/s]


Loaded 400 precomputed tasks

Starting evaluation run 1/5
Initializing nlm model...
Model state loaded from output/models/nlm_shape/nlm_shape_final.pt


Evaluating model on tasks:   0%|          | 2/400 [00:01<05:39,  1.17it/s]


KeyboardInterrupt: 

In [2]:
import torch
import gc
import numpy as np
from unified_module import UnifiedReasoningModule
from nlm_module import NLMReasoningModule
from task_adaptation_runner import TaskAdaptationRunner
import trainer
from task5 import Blackboard, Task
from tqdm import tqdm
import json

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define model paths
UNIFIED_MAML_PATH = "output/models/unified_shape/unified_maml_final_model.pt"
NLM_MAML_PATH = "output/models/nlm_shape/nlm_maml_final_model.pt"
USE_BLACKBOARD_INSIGHTS = True  # Whether to use insights from blackboard

def evaluate_task_adaptation(tasks, model_path, reasoning_module_class, method="maml", inner_lr=0.001, inner_steps=5):
    """
    Evaluate using task adaptation with the TaskAdaptationRunner
    
    Args:
        tasks: List of tasks to evaluate
        model_path: Path to the pre-trained meta-learning model
        reasoning_module_class: Class of reasoning module (UnifiedReasoningModule or NLMReasoningModule)
        method: Meta-learning method ("maml" or "proto")
        inner_lr: Inner loop learning rate
        inner_steps: Number of inner loop steps
        
    Returns:
        Dictionary with evaluation results including shape metrics
    """
    print(f"Evaluating with {method.upper()} task adaptation with ILR:{inner_lr} and IS:{inner_steps}...")
    
    # Determine reasoning method string
    reasoning_method = "unified" if reasoning_module_class.__name__ == "UnifiedReasoningModule" else "nlm"
    
    # Initialize task adaptation runner with the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    runner = TaskAdaptationRunner(
        inner_lr=inner_lr,
        inner_steps=inner_steps,
        model_path=model_path,
        model_config={'input_dim': 3},
        reasoning_module_class=reasoning_module_class,
        method=method,
        device=device
    )
    
    # Create a wrapper class to make TaskAdaptationRunner compatible with evaluate_model
    class TaskAdaptationWrapper:
        def __init__(self, runner):
            self.runner = runner
            
        def solve(self, task):
            return self.runner.adapt_to_task(
                task=task,
                visualize=False,
                save_dir=None
            )
    
    # Create the wrapper instance
    model_wrapper = TaskAdaptationWrapper(runner)
    
    # Get full results from evaluate_model
    results = evaluate_model(model_wrapper, tasks)
    
    # Return the metrics without redundant recalculation
    return {
        "full_accuracy": results['test_grid_accuracy'],
        "shape_accuracy": results['shape_accuracy'],
        "content_accuracy": results['test_node_accuracy'],
        "task_accuracy": results['task_accuracy'],
        "test_grid_accuracy": results['test_grid_accuracy'],
        "test_node_accuracy": results['test_node_accuracy']
    }

def run_evaluation(tasks, reasoning_module_class, model_path, inner_lr, inner_steps):
    """Run multiple evaluation runs and return averaged results with shape metrics"""
    num_runs = 5  # Number of evaluation runs
    all_results = []
    
    for run in range(num_runs):
        print(f"\n=== Starting evaluation run {run+1}/{num_runs} ===")
        
        # Clear CUDA cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        # Force garbage collection
        gc.collect()
        
        # Reset tasks' blackboards if needed
        for task in tasks:
            if hasattr(task, 'blackboard'):
                task.blackboard = Blackboard()
        
        # Run evaluation with task adaptation
        run_results = evaluate_task_adaptation(
            tasks=tasks,
            model_path=model_path,
            reasoning_module_class=reasoning_module_class,
            inner_lr=inner_lr,
            inner_steps=inner_steps,
            method="maml"
        )
        
        all_results.append(run_results)
        
        # Print comprehensive metrics for this run
        print(f"Run {run+1} metrics:")
        print(f"  Full accuracy: {run_results.get('full_accuracy', 0.0):.4f} (shape + content)")
        print(f"  Shape accuracy: {run_results.get('shape_accuracy', 0.0):.4f} (correct dimensions)")
        print(f"  Content accuracy: {run_results.get('content_accuracy', 0.0):.4f} (overlapping cells)")
        print(f"  Grid accuracy: {run_results.get('test_grid_accuracy', 0.0):.4f}")
        print(f"  Node accuracy: {run_results.get('test_node_accuracy', 0.0):.4f}")
    
    # Calculate average performance for all metrics
    metrics = [
        "full_accuracy", "shape_accuracy", "content_accuracy", 
        "test_grid_accuracy", "test_node_accuracy", "task_accuracy"
    ]
    
    avg_metrics = {}
    for metric in metrics:
        # Use get() to handle cases where old metrics might be missing
        values = [result.get(metric, 0.0) for result in all_results]
        avg_metrics[f"average_{metric}"] = sum(values) / len(values) if values else 0.0
    
    # Print comprehensive average metrics
    print(f"\nAverage metrics across {num_runs} runs:")
    print(f"Full accuracy: {avg_metrics.get('average_full_accuracy', 0.0):.4f} (shape + content)")
    print(f"Shape accuracy: {avg_metrics.get('average_shape_accuracy', 0.0):.4f} (correct dimensions)")
    print(f"Content accuracy: {avg_metrics.get('average_content_accuracy', 0.0):.4f} (overlapping cells)")
    print(f"Task accuracy: {avg_metrics.get('average_task_accuracy', 0.0):.4f}")
    print(f"Grid accuracy: {avg_metrics.get('average_test_grid_accuracy', 0.0):.4f}")
    print(f"Node accuracy: {avg_metrics.get('average_test_node_accuracy', 0.0):.4f}")
    
    # Return comprehensive results
    return {
        "all_runs": all_results,
        **avg_metrics  # Include all average metrics
    }

if __name__ == "__main__":
    MODELS = ["unified", "nlm"]
    INNER_LR = 0.05
    INNER_STEPS = [5, 10, 15]
    
    for MODEL in MODELS:
        for IS in INNER_STEPS:
            
            # Load evaluation tasks
            print("Loading evaluation tasks...")
            tasks = trainer.load_precomputed_tasks("precomputed_tasks/evaluation_400")
            
            # Run evaluation
            if MODEL=="nlm":
                tasks = trainer.preprocess_task_graphs(tasks)
                results = run_evaluation(tasks, NLMReasoningModule, NLM_MAML_PATH, INNER_LR, IS)
            else:
                results = run_evaluation(tasks, UnifiedReasoningModule, UNIFIED_MAML_PATH, INNER_LR, IS)
            
            # Convert numpy values to Python native types for JSON serialization
            def convert_numpy(obj):
                if isinstance(obj, np.number):
                    return float(obj)
                elif isinstance(obj, np.ndarray):
                    return obj.tolist()
                elif isinstance(obj, dict):
                    return {k: convert_numpy(v) for k, v in obj.items()}
                elif isinstance(obj, list):
                    return [convert_numpy(i) for i in obj]
                return obj
            
            with open(f"{MODEL}_{IS}_05_task_adaptation_results.json", "w") as f:
                json.dump(convert_numpy(results), f, indent=2)
            
            print("Evaluation completed and results saved.")

Using device: cuda
Loading evaluation tasks...
Loading precomputed tasks from precomputed_tasks/evaluation_400


Loading tasks from precomputed_tasks/evaluation_400: 100%|██████████| 400/400 [00:02<00:00, 148.47it/s]


Loaded 400 precomputed tasks

=== Starting evaluation run 1/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [06:05<00:00,  1.10it/s]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 1 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 2/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [06:07<00:00,  1.09it/s]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 2 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 3/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [06:06<00:00,  1.09it/s]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 3 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 4/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [06:06<00:00,  1.09it/s]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 4 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 5/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [06:05<00:00,  1.09it/s]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 5 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

Average metrics across 5 runs:
Full accuracy: 0.0000 (shape + content)
Shape accuracy: 0.6611 (correct dimensions)
Content accuracy: 0.5306 (overlapping cells)
Task accuracy: 0.0000
Grid accuracy: 0.0000
Node accuracy: 0.5306
Evaluation completed and results saved.
Loading evaluation tasks...
Loading precomputed tasks from precomputed_tasks/evaluation_400


Loading tasks from precomputed_tasks/evaluation_400: 100%|██████████| 400/400 [00:02<00:00, 147.61it/s]


Loaded 400 precomputed tasks

=== Starting evaluation run 1/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [09:07<00:00,  1.37s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 1 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 2/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [09:09<00:00,  1.37s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 2 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 3/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [09:07<00:00,  1.37s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 3 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 4/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [09:07<00:00,  1.37s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 4 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 5/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/unified_shape/unified_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [09:08<00:00,  1.37s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 5 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

Average metrics across 5 runs:
Full accuracy: 0.0000 (shape + content)
Shape accuracy: 0.6611 (correct dimensions)
Content accuracy: 0.5306 (overlapping cells)
Task accuracy: 0.0000
Grid accuracy: 0.0000
Node accuracy: 0.5306
Evaluation completed and results saved.
Loading evaluation tasks...
Loading precomputed tasks from precomputed_tasks/evaluation_400


Loading tasks from precomputed_tasks/evaluation_400: 100%|██████████| 400/400 [00:02<00:00, 154.01it/s]


Loaded 400 precomputed tasks

=== Starting evaluation run 1/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [24:04<00:00,  3.61s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 1 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 2/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [24:05<00:00,  3.61s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 2 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 3/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [23:59<00:00,  3.60s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 3 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 4/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [23:39<00:00,  3.55s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 4 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 5/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:10...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [23:55<00:00,  3.59s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 5 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

Average metrics across 5 runs:
Full accuracy: 0.0000 (shape + content)
Shape accuracy: 0.6611 (correct dimensions)
Content accuracy: 0.5306 (overlapping cells)
Task accuracy: 0.0000
Grid accuracy: 0.0000
Node accuracy: 0.5306
Evaluation completed and results saved.
Loading evaluation tasks...
Loading precomputed tasks from precomputed_tasks/evaluation_400


Loading tasks from precomputed_tasks/evaluation_400: 100%|██████████| 400/400 [00:02<00:00, 155.31it/s]


Loaded 400 precomputed tasks

=== Starting evaluation run 1/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [26:35<00:00,  3.99s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 1 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 2/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [26:37<00:00,  3.99s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 2 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 3/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [26:39<00:00,  4.00s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 3 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 4/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [26:39<00:00,  4.00s/it]



Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 4 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

=== Starting evaluation run 5/5 ===
Evaluating with MAML task adaptation with ILR:0.05 and IS:15...
Loaded MAML model from output/models/nlm_shape/nlm_maml_final_model.pt


Evaluating model on tasks: 100%|██████████| 400/400 [26:41<00:00,  4.00s/it]


Evaluation Results:
Correct tasks: 0/400 (0.00%)
Correct test grids: 0/419 (0.00%)
Shape prediction accuracy: 277/419 (66.11%)
Correct train grids: 0/1363 (0.00%)
Correct total grids: 0/1782 (0.00%)

Node-level Accuracy:
Test nodes: 52273/98515 (53.06%)
Train nodes: 44892/80688 (55.64%)
Overall nodes: 97165/179203 (54.22%)
Run 5 metrics:
  Full accuracy: 0.0000 (shape + content)
  Shape accuracy: 0.6611 (correct dimensions)
  Content accuracy: 0.5306 (overlapping cells)
  Grid accuracy: 0.0000
  Node accuracy: 0.5306

Average metrics across 5 runs:
Full accuracy: 0.0000 (shape + content)
Shape accuracy: 0.6611 (correct dimensions)
Content accuracy: 0.5306 (overlapping cells)
Task accuracy: 0.0000
Grid accuracy: 0.0000
Node accuracy: 0.5306
Evaluation completed and results saved.





### LLM Module

In [None]:
import os
import json
import numpy as np
import torch
from tqdm.notebook import tqdm
from task4 import Task
from llm_module import LLMReasoningModule
import traceback
import gc
import time

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Paths
DATA_DIR = "data/training"
RESULTS_DIR = "output/evaluation/llm"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Load tasks
def load_tasks(directory, limit=None):
    """Load tasks from directory with optional limit"""
    tasks = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                with open(file_path, "r") as f:
                    try:
                        data = json.load(f)
                        if "train" not in data or "test" not in data:
                            print(f"Warning: Invalid task format in {file_path}")
                            continue
                        
                        task = Task(
                            task_id=os.path.basename(file_path),
                            train_pairs=[(pair["input"], pair["output"]) for pair in data["train"]],
                            test_pairs=[(pair["input"], pair["output"]) for pair in data["test"]],
                        )
                        tasks.append(task)
                        
                        if limit and len(tasks) >= limit:
                            return tasks
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
    return tasks

# Initialize LLM module with your fine-tuned model
def create_llm_module(model_name="gpt-4", api_key=None, temperature=0.3):
    """Create LLM module with specified model"""
    # Get API key from environment if not provided
    api_key = api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable.")
        
    # Create module
    llm_module = LLMReasoningModule(
        model=model_name,
        api_key=api_key,
        temperature=temperature,
        max_tokens=2048,
        log_path="logs/llm_evaluation",
        cache_responses=True
    )
    
    return llm_module

# Evaluate a single prediction
def evaluate_prediction(prediction, target):
    """Check if prediction exactly matches target"""
    prediction = np.array(prediction)
    target = np.array(target)
    
    # Check if shapes match
    if prediction.shape != target.shape:
        return {
            "exact_match": False,
            "shape_match": False,
            "accuracy": 0.0
        }
    
    # Shape matches, check cell-by-cell accuracy
    total_cells = target.size
    matching_cells = np.sum(prediction == target)
    cell_accuracy = matching_cells / total_cells if total_cells > 0 else 0.0
    
    return {
        "exact_match": np.array_equal(prediction, target),
        "shape_match": True,
        "accuracy": float(cell_accuracy)
    }

# Function to examine blackboard state
def examine_blackboard(task, prefix=""):
    """Examine and print blackboard state"""
    if not hasattr(task, 'blackboard'):
        print(f"{prefix}No blackboard found")
        return {}
    
    # Get blackboard state
    blackboard_info = {}
    
    # Extract reasoning history
    reasoning_history = task.get_reasoning_history()
    blackboard_info["reasoning_steps"] = len(reasoning_history)
    
    # Extract confidence scores
    if hasattr(task.blackboard, 'confidence_scores'):
        blackboard_info["confidence_scores"] = task.blackboard.confidence_scores

    # Check for transformations
    if hasattr(task.blackboard, 'knowledge_base'):
        # New blackboard format
        transformations_keys = [k for k in task.blackboard.knowledge_base.keys() 
                                if 'transformation' in k]
        blackboard_info["has_transformations"] = len(transformations_keys) > 0
        blackboard_info["transformation_keys"] = transformations_keys
        
        # Count total knowledge items
        blackboard_info["knowledge_items"] = len(task.blackboard.knowledge_base)
    
    # For readable output, print summary
    if prefix:
        print(f"{prefix}Blackboard summary:")
        print(f"{prefix}  - Reasoning steps: {blackboard_info['reasoning_steps']}")
        print(f"{prefix}  - Knowledge items: {blackboard_info.get('knowledge_items', 'N/A')}")
        print(f"{prefix}  - Has transformations: {blackboard_info.get('has_transformations', 'N/A')}")
        
    return blackboard_info

# Detailed evaluation of LLM module
def evaluate_llm_module(llm_module, tasks, verbose=True, save_path=None):
    """Evaluate LLM module on tasks"""
    results = {
        "model_name": llm_module.model,
        "task_results": {},
        "overall_accuracy": 0.0,
        "exact_matches": 0,
        "total_grids": 0,
        "errors": [],
        "timing": {
            "total_time": 0,
            "avg_time_per_task": 0
        }
    }
    
    start_time = time.time()
    
    # Process each task
    for i, task in enumerate(tqdm(tasks, desc=f"Evaluating {llm_module.model}")):
        if verbose:
            print(f"\n\n{'='*80}")
            print(f"Task {i+1}/{len(tasks)}: {task.task_id}")
            print(f"{'='*80}")
        
        task_start_time = time.time()
        
        try:
            # Examine blackboard before LLM reasoning
            if verbose:
                print("\nInitial blackboard state:")
                examine_blackboard(task, prefix="  ")
            
            # Use the module's solve method
            if verbose:
                print("\nSolving with LLM module...")
            
            predictions = llm_module.solve(task)
            
            # Examine blackboard after LLM reasoning
            if verbose:
                print("\nBlackboard state after LLM reasoning:")
                blackboard_info = examine_blackboard(task, prefix="  ")
            
            # Evaluate each prediction
            task_exact_matches = 0
            task_total = len(task.test_pairs)
            prediction_results = []
            
            for i, (_, target_grid) in enumerate(task.test_pairs):
                if i < len(predictions):
                    eval_result = evaluate_prediction(predictions[i], target_grid)
                    prediction_results.append(eval_result)
                    task_exact_matches += int(eval_result["exact_match"])
                    
                    if verbose:
                        print(f"\nTest example {i+1}:")
                        print(f"  - Exact match: {eval_result['exact_match']}")
                        print(f"  - Cell accuracy: {eval_result['accuracy']:.4f}")
            
            # Calculate task accuracy
            task_accuracy = task_exact_matches / task_total if task_total > 0 else 0.0
            task_duration = time.time() - task_start_time
            
            # Store results
            results["task_results"][task.task_id] = {
                "exact_match_accuracy": task_accuracy,
                "exact_matches": task_exact_matches,
                "total": task_total,
                "prediction_details": prediction_results,
                "execution_time": task_duration,
            }
            
            # Update overall counts
            results["exact_matches"] += task_exact_matches
            results["total_grids"] += task_total
            
            if verbose:
                print(f"\nTask summary:")
                print(f"  - Exact matches: {task_exact_matches}/{task_total}")
                print(f"  - Task accuracy: {task_accuracy:.4f}")
                print(f"  - Execution time: {task_duration:.2f} seconds")
            
        except Exception as e:
            error_info = {
                "task_id": task.task_id,
                "error": str(e),
                "traceback": traceback.format_exc()
            }
            results["errors"].append(error_info)
            if verbose:
                print(f"\nError evaluating task {task.task_id}: {e}")
                print(traceback.format_exc())
    
    # Calculate overall metrics
    total_time = time.time() - start_time
    results["overall_accuracy"] = (
        results["exact_matches"] / results["total_grids"] 
        if results["total_grids"] > 0 else 0.0
    )
    results["timing"]["total_time"] = total_time
    results["timing"]["avg_time_per_task"] = total_time / len(tasks) if tasks else 0
    
    # Print summary
    print(f"\n{llm_module.model} Evaluation Results:")
    print(f"Overall Accuracy: {results['overall_accuracy']:.4f}")
    print(f"Exact Matches: {results['exact_matches']}/{results['total_grids']}")
    print(f"Total evaluation time: {total_time:.2f} seconds")
    print(f"Average time per task: {results['timing']['avg_time_per_task']:.2f} seconds")
    
    if results["errors"]:
        print(f"Encountered {len(results['errors'])} errors during evaluation")
    
    # Save results if requested
    if save_path:
        with open(save_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {save_path}")
    
    return results

# Main evaluation function
def main(model_name="gpt-4", limit_tasks=5, verbose=True):
    print(f"Loading up to {limit_tasks} tasks...")
    tasks = load_tasks(DATA_DIR, limit=limit_tasks)
    print(f"Loaded {len(tasks)} tasks")
    
    try:
        # Create LLM module with specified model
        print(f"Creating LLM module with model: {model_name}")
        llm_module = create_llm_module(model_name=model_name)
        
        # Evaluate the module
        print("Evaluating LLM module...")
        results_path = os.path.join(RESULTS_DIR, f"{model_name.replace('-', '_')}_evaluation.json")
        results = evaluate_llm_module(
            llm_module=llm_module,
            tasks=tasks,
            verbose=verbose,
            save_path=results_path
        )
        
        return results
    
    except Exception as e:
        print(f"Critical error in evaluation: {e}")
        traceback.print_exc()
        return {"error": str(e), "traceback": traceback.format_exc()}

# Run the main function
if __name__ == "__main__":
    # You can adjust these parameters as neededwhich
    results = main(
        model_name="ft:gpt-4o-mini-2024-07-18:personal:arc-agi-blackboard:BJoraD1a",  # Replace with your fine-tuned model name
        limit_tasks=3,
        verbose=True         # Set to True for detailed output
    )

Loading up to 3 tasks...
Loaded 3 tasks
Creating LLM module with model: ft:gpt-4o-mini-2024-07-18:personal:arc-agi-blackboard:BJoraD1a
Loaded 7 cached responses
Evaluating LLM module...


Evaluating ft:gpt-4o-mini-2024-07-18:personal:arc-agi-blackboard:BJoraD1a:   0%|          | 0/3 [00:00<?, ?it/…



Task 1/3: a85d4709.json

Initial blackboard state:
  Blackboard summary:
    - Reasoning steps: 0
    - Knowledge items: 1
    - Has transformations: False

Solving with LLM module...
Analyzing grid transformation pattern.

BLACKBOARD_OUTPUT:
```json
```
{"confidence": 0.6, "logical_predicates": {"transform_from_0_to_3": ["0", "2", "3", "4", "6", "8"], "transform_from_5_to_3": ["1", "5", "7"], "transform_from_0_to_4": ["0", "2", "3", "4", "6", "8"], "transform_from_5_to_4": ["1", "5", "7"], "transform_from_5_to_2": ["0", "4", "6"]}, "transformations": [], "predictions": [{"test_index": 0, "input_grid": [[0, 0, 5], [5, 0, 0], [0, 5, 0]], "output_grid": [[3, 3, 3], [2, 2, 2], [4, 4, 4]], "changes": {"total_changes": 9, "percentage_changed": 100.0, "changes_by_color": {"0->3": 2, "5->3": 1, "5->2": 1, "0->2": 2, "0->4": 2, "5->4": 1}}}]}
```

PREDICTED OUTPUTS:
Test Example 1 Input:
0 0 5
5 0 0
0 5 0

Test Example 1 Output:
3 3 3
2 2 2
4 4 4


Error parsing blackboard data: Expecting va