In [None]:
import tyro
from pathlib import Path
from entropix.config import MODEL_CONFIGS, create_model_params
from entropix.weights import load_weights
from entropix.sampler import EntropixSampler
from evals.gpqa_eval import GPQAEval

def main(
    weights_path: Path = Path("weights/1B-Instruct"),
    variant: str = "diamond",
    num_examples: int | None = None,  # For testing with fewer examples
    n_repeats: int = 1,
):
    """Run the GPQA evaluation with an Entropix sampler and print the results.

    The model parameters are always built from the ``"1B"`` entry of
    ``MODEL_CONFIGS``; only the weights location is configurable.

    Args:
        weights_path: Directory holding the model weights. It is made
            absolute before being handed to ``load_weights``.
        variant: GPQA variant name, forwarded to ``GPQAEval`` and echoed in
            the report header.
        num_examples: Forwarded to ``GPQAEval``; intended for test runs on
            fewer examples. (Presumably ``None`` means "use all" -- confirm
            against ``GPQAEval``.)
        n_repeats: Forwarded to ``GPQAEval`` unchanged.
    """
    # Model + sampler: parameters first, since the weight loader needs them.
    params = create_model_params(MODEL_CONFIGS["1B"])
    weights = load_weights(weights_path.absolute(), model_params=params)
    sampler = EntropixSampler(xfmr_weights=weights, model_params=params)

    # Configure the benchmark, then invoke it on the sampler.
    evaluator = GPQAEval(variant=variant, num_examples=num_examples, n_repeats=n_repeats)
    outcome = evaluator(sampler)

    # Report: headline numbers followed by every entry in the metrics mapping.
    print(f"GPQA {variant} Results:")
    print(f"Average Score: {outcome.score:.3f}")
    print(f"Number of Examples: {len(evaluator.examples)}")
    print("\nDetailed Metrics:")
    for name, val in outcome.metrics.items():
        print(f"{name}: {val}")

if __name__ == "__main__":
    # Script entry point: hand `main` to tyro so its parameters can be
    # supplied from the command line.
    tyro.cli(main)