# Context Horizons Benchmark Analysis

This notebook analyzes the results of the LLM context benchmarking suite.

In [None]:
import os
import json
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Global plot styling: seaborn "whitegrid" theme, "paper" context, fonts scaled by 1.2.
sns.set_theme(style="whitegrid", context="paper", font_scale=1.2)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Results

In [None]:
RESULTS_DIR = "results"  # directory scanned for "*_results.json" files
PLOTS_DIR = "plots"

def load_results():
    """Load every ``*_results.json`` file found in ``RESULTS_DIR``.

    A file whose top-level JSON object contains an ``"experiment_metadata"``
    key is wrapped as
    ``{"type": "detailed", "metadata": ..., "results": ...}``; any other
    object is wrapped as ``{"type": "standard", "data": ...}``.

    Files are processed in sorted path order so the returned list is the
    same from run to run. A file that cannot be read or parsed, whose
    top-level value is not a JSON object, or that has metadata but no
    ``"results"`` key is reported with ``print`` and skipped.

    Returns:
        list[dict]: one wrapper dict per successfully loaded file.
    """
    results = []
    pattern = os.path.join(RESULTS_DIR, "*_results.json")
    # glob returns paths in arbitrary order; sort for reproducible output.
    for path in sorted(glob.glob(pattern)):
        try:
            # Read as UTF-8 explicitly instead of relying on the locale default.
            with open(path, "r", encoding="utf-8") as fh:
                data = json.load(fh)
            # Later cells index the payload by key, so reject lists/scalars here.
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
            if "experiment_metadata" in data:
                results.append({
                    "type": "detailed",
                    "metadata": data["experiment_metadata"],
                    "results": data["results"],
                })
            else:
                results.append({"type": "standard", "data": data})
        except Exception as e:
            # Best-effort loading: report the bad file and keep going.
            print(f"Error loading {path}: {e}")
    return results

# Load all result files once; the experiment cells below iterate over `results`.
results = load_results()
print(f"Loaded {len(results)} result files.")

## Experiment 1: Needle in Haystack Analysis

We analyze the accuracy of retrieval at different positions (Start, Middle, End).

In [None]:
# Collect one row per (model, needle position) from the "standard" result files.
data = []
for res in results:
    if res.get("type") != "standard":
        continue
    model = res["data"]["model"]
    # A result file without this experiment contributes no rows instead of
    # raising KeyError (same tolerance as the exp3_rag lookup).
    needle_metrics = res["data"].get("exp1_needle") or {}
    for pos, metrics in needle_metrics.items():
        data.append({
            "Model": model,
            "Position": pos.capitalize(),
            "Accuracy": metrics["accuracy"]
        })

if data:
    df = pd.DataFrame(data)
    # pivot_table averages repeated (Model, Position) pairs, e.g. several result
    # files for one model; DataFrame.pivot would raise ValueError on duplicates.
    pivot_df = df.pivot_table(
        index="Model", columns="Position", values="Accuracy", aggfunc="mean"
    )
    # Show the known positions in reading order (pivoting sorts them
    # alphabetically); any other position labels follow afterwards.
    preferred = ["Start", "Middle", "End"]
    ordered = [p for p in preferred if p in pivot_df.columns]
    ordered += [p for p in pivot_df.columns if p not in preferred]
    pivot_df = pivot_df[ordered]
    plt.figure(figsize=(10, 6))
    # Colour scale is pinned to [0, 1].
    sns.heatmap(pivot_df, annot=True, cmap="RdYlGn", vmin=0, vmax=1)
    plt.title("Needle in Haystack Accuracy")
    plt.show()

## Experiment 2: Context Scaling

This section analyzes how accuracy and latency change as the context size increases.

In [None]:
# Collect one row per (model, context size) measurement from the "standard" result files.
data = []
for res in results:
    if res.get("type") != "standard":
        continue
    model = res["data"]["model"]
    # A result file without this experiment contributes no rows instead of
    # raising KeyError (same tolerance as the exp3_rag lookup).
    size_entries = res["data"].get("exp2_size") or []
    for entry in size_entries:
        data.append({
            "Model": model,
            "Tokens": entry["token_count"],
            "Accuracy": entry["accuracy"],
            "Latency": entry["latency"]
        })

if data:
    df = pd.DataFrame(data)
    # Side-by-side panels: accuracy on the left, latency on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    sns.lineplot(data=df, x="Tokens", y="Accuracy", hue="Model", marker="o", ax=ax1)
    ax1.set_title("Accuracy vs Context Size")
    sns.lineplot(data=df, x="Tokens", y="Latency", hue="Model", marker="s", ax=ax2)
    ax2.set_title("Latency vs Context Size")
    plt.show()

## Experiment 3: RAG vs Full Context

This section compares the efficiency and accuracy of RAG against stuffing the full context.

In [None]:
# Collect one latency row per (model, method) from the "standard" result files.
data = []
for res in results:
    if res.get("type") != "standard":
        continue
    model = res["data"]["model"]
    rag_data = res["data"].get("exp3_rag", {})
    if not rag_data:
        continue
    for method, key in (("Full", "full_context"), ("RAG", "rag")):
        latency = (rag_data.get(key) or {}).get("latency")
        # Skip missing or non-positive latencies: they cannot be drawn on the
        # log-scaled axis below, and a placeholder 0 would look like a
        # real measurement.
        if latency is None or latency <= 0:
            continue
        data.append({"Model": model, "Method": method, "Latency": latency})

if data:
    df = pd.DataFrame(data)
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x="Model", y="Latency", hue="Method")
    # Log-scaled y axis, as stated in the title.
    plt.yscale("log")
    plt.title("Latency: RAG vs Full Context (Log Scale)")
    plt.show()