# C2C Efficiency Evaluation

**Research Question:** Is latent cache-to-cache (C2C) communication faster and cheaper than communicating through natural-language text?

**Date:** 2025-11-05

**Metrics:**
- Token count comparison (C2C vs text)
- Latency comparison
- Information density
- Compression ratio

In [None]:
# 1. Add parent directory to path for code imports
import sys
sys.path.append('../code')

# 2. Import required libraries
import time       # For timing measurements
import torch      # For PyTorch operations
import numpy as np  # For numerical operations
import pandas as pd  # For data analysis
import matplotlib.pyplot as plt  # For visualization
from transformers import AutoModelForCausalLM, AutoTokenizer  # For model loading

# 3. Import C2C communication modules
from communication.ai_to_ai_comm import (
    RosettaModel,            # Multi-model orchestrator
    create_c2c_projector,    # Create projector network
    generate_kv_cache_index, # Generate cache routing index
)

# 4. Import harness experiment tracking
from harness import ExperimentConfig, get_tracker

# 5. Choose the compute device: CUDA when PyTorch can use it, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 1. Setup Models and Tasks

In [None]:
# Test prompts covering different communication scenarios
test_prompts = [
    "Explain the concept of machine learning",
    "Describe the feeling of excitement",
    "What are the implications of quantum computing?",
    "Summarize the theory of evolution",
    "Compare democracy and authoritarianism",
]

# Checkpoint identifiers for the base and source models.
# Only the tokenizer is loaded in this cell; the models themselves can be
# reused from the quickstart notebook or reloaded.
base_model_name = "Qwen/Qwen2-0.5B-Instruct"
source_model_name = "Qwen/Qwen2-1.5B-Instruct"

# The tokenizer comes from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loaded {len(test_prompts)} test prompts")

## 2. Benchmark C2C Communication

Measure latency and token usage for C2C communication

In [None]:
def benchmark_c2c(rosetta, prompt, max_new_tokens=50):
    """Benchmark C2C communication for a single prompt.

    The prompt is tokenized with the module-level ``tokenizer`` and moved to
    the module-level ``device``; only the ``rosetta.generate`` call is timed.

    Args:
        rosetta: Object exposing ``generate(cache_idx=..., input_ids=...,
            max_new_tokens=..., do_sample=...)``.
        prompt: Prompt text to send.
        max_new_tokens: Generation budget forwarded to ``rosetta.generate``.

    Returns:
        dict with keys:
            ``latency_s``: seconds spent in ``rosetta.generate``.
            ``tokens_in``: number of prompt tokens.
            ``tokens_out``: length of the first returned sequence minus
                ``tokens_in``.
            ``output_text``: first returned sequence decoded without special
                tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # NOTE(review): sequence_length is fixed at 10 regardless of the prompt
    # length -- confirm this is what generate_kv_cache_index expects.
    cache_idx = generate_kv_cache_index(sequence_length=10, model_idx=1)

    # CUDA kernels run asynchronously; synchronize around the timed region so
    # the measurement covers the work actually queued by generate().
    on_gpu = inputs.input_ids.is_cuda
    if on_gpu:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high resolution, unlike time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = rosetta.generate(
            cache_idx=cache_idx,
            input_ids=inputs.input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    if on_gpu:
        torch.cuda.synchronize()
    latency = time.perf_counter() - start_time

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    tokens_in = len(inputs.input_ids[0])
    # Assumes generate() returns prompt + continuation -- TODO confirm for
    # RosettaModel.
    tokens_out = len(outputs[0]) - tokens_in

    return {
        "latency_s": latency,
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
        "output_text": output_text,
    }

# TODO: Set up RosettaModel (or import from previous notebook)
# c2c_results = [benchmark_c2c(rosetta, prompt) for prompt in test_prompts]
# Status message only: no benchmark runs in this cell until the line above is enabled.
print("C2C benchmarking function defined")

## 3. Benchmark Text-Based Communication

Baseline: two-step text generation (source -> text -> base)

In [None]:
def benchmark_text_based(source_model, base_model, prompt, max_new_tokens=50):
    """Benchmark text-based communication.

    Step 1: Source model generates text response
    Step 2: Base model processes that text

    The module-level ``tokenizer`` is used for both models.
    NOTE(review): this assumes the two models share a vocabulary -- confirm.

    Args:
        source_model: Object exposing ``generate(input_ids, max_new_tokens=...,
            do_sample=...)``; produces the intermediate text.
        base_model: Object with the same ``generate`` interface; consumes the
            intermediate text.
        prompt: Prompt text given to the source model.
        max_new_tokens: Generation budget applied to each of the two steps.

    Returns:
        dict with keys:
            ``latency_s``: seconds from the start of step 1 to the end of
                step 2, including the decode/re-tokenize in between.
            ``tokens_in``: number of prompt tokens.
            ``tokens_intermediate``: tokens generated by the source model.
            ``tokens_out``: tokens generated by the base model.
            ``total_tokens``: ``tokens_intermediate + tokens_out``.
            ``output_text``: base model's first returned sequence decoded
                without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # CUDA kernels run asynchronously; synchronize around the timed region so
    # the measurement covers the work actually queued by generate().
    on_gpu = inputs.input_ids.is_cuda
    if on_gpu:
        torch.cuda.synchronize()

    # Step 1: Source model generates
    # perf_counter is monotonic and high resolution, unlike time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        source_outputs = source_model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # The decoded sequence (prompt + source continuation) is what the base
    # model receives as its input text.
    intermediate_text = tokenizer.decode(source_outputs[0], skip_special_tokens=True)

    # Step 2: Base model processes intermediate text
    base_inputs = tokenizer(intermediate_text, return_tensors="pt").to(device)
    with torch.no_grad():
        base_outputs = base_model.generate(
            base_inputs.input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    if on_gpu:
        torch.cuda.synchronize()
    latency = time.perf_counter() - start_time

    output_text = tokenizer.decode(base_outputs[0], skip_special_tokens=True)

    # Count only newly generated tokens, matching benchmark_c2c's tokens_out.
    # generate() on decoder-only models returns prompt + continuation, so raw
    # sequence lengths would count the prompt and the intermediate text again.
    tokens_in = len(inputs.input_ids[0])
    tokens_intermediate = len(source_outputs[0]) - tokens_in
    tokens_out = len(base_outputs[0]) - len(base_inputs.input_ids[0])
    total_tokens = tokens_intermediate + tokens_out

    return {
        "latency_s": latency,
        "tokens_in": tokens_in,
        "tokens_intermediate": tokens_intermediate,
        "tokens_out": tokens_out,
        "total_tokens": total_tokens,
        "output_text": output_text,
    }

# Status message only: this cell defines the function and runs no benchmark.
print("Text-based benchmarking function defined")

## 4. Run Experiments

Compare C2C vs text-based communication across all test prompts

In [None]:
# TODO: Uncomment and run after models are loaded
# NOTE: expects `rosetta`, `source_model` and `base_model` to exist; none of
# them is created in the cells above.
# Derived columns: speedup = text latency / C2C latency (> 1 means C2C was
# faster); token_savings = 1 - C2C output tokens / text total tokens.
# results = []
# 
# for prompt in test_prompts:
#     c2c_result = benchmark_c2c(rosetta, prompt)
#     text_result = benchmark_text_based(source_model, base_model, prompt)
#     
#     results.append({
#         "prompt": prompt,
#         "c2c_latency": c2c_result["latency_s"],
#         "text_latency": text_result["latency_s"],
#         "c2c_tokens": c2c_result["tokens_out"],
#         "text_tokens": text_result["total_tokens"],
#         "speedup": text_result["latency_s"] / c2c_result["latency_s"],
#         "token_savings": 1 - (c2c_result["tokens_out"] / text_result["total_tokens"]),
#     })
# 
# df = pd.DataFrame(results)
# print(df)

# Status message only: nothing is executed until the block above is enabled.
print("Experiment code ready (uncomment to run)")

## 5. Visualize Results

In [None]:
# TODO: Uncomment after running experiments
# NOTE: reads the `df` DataFrame built by the experiment loop in section 4,
# which must have been run first.
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 
# # Latency comparison
# axes[0].bar(['C2C', 'Text'], [df['c2c_latency'].mean(), df['text_latency'].mean()])
# axes[0].set_ylabel('Latency (s)')
# axes[0].set_title('Average Latency Comparison')
# 
# # Token usage comparison
# axes[1].bar(['C2C', 'Text'], [df['c2c_tokens'].mean(), df['text_tokens'].mean()])
# axes[1].set_ylabel('Tokens')
# axes[1].set_title('Average Token Usage')
# 
# plt.tight_layout()
# plt.show()
# 
# print(f"\nAverage Speedup: {df['speedup'].mean():.2f}x")
# print(f"Average Token Savings: {df['token_savings'].mean()*100:.1f}%")

# Status message only: no figure is produced until the block above is enabled.
print("Visualization code ready")

## 6. Track Results

Log to harness experiment tracker

In [None]:
# Describe this run for the harness experiment tracker.
config = ExperimentConfig(
    experiment_name="c2c_efficiency_eval",
    task_type="c2c_evaluation",
    strategy="cache_to_cache",
    provider="local",
    # NOTE(review): the label reads "<base>_to_<source>", while the text
    # baseline is described as source -> text -> base -- confirm the intended
    # direction.
    model=f"{base_model_name}_to_{source_model_name}",
)

# Open the experiment; run_dir holds whatever start_experiment returns.
tracker = get_tracker()
run_dir = tracker.start_experiment(config)

# NOTE(review): ExperimentResult is not among the names imported from harness
# at the top of this notebook -- import it before enabling the block below.
# TODO: Log results after experiments run
# for idx, row in df.iterrows():
#     result = ExperimentResult(
#         config=config,
#         task_input=row['prompt'],
#         output=f"C2C: {row['c2c_latency']:.2f}s, Text: {row['text_latency']:.2f}s",
#         latency_s=row['c2c_latency'],
#         eval_scores={
#             "speedup": row['speedup'],
#             "token_savings": row['token_savings'],
#         },
#         success=True,
#     )
#     tracker.log_result(result)

# Close the experiment. As written, no results are logged between start and
# finish because the logging loop above is commented out.
summary = tracker.finish_experiment()
print(f"Experiment logged in: {run_dir}")

## Key Findings

**Expected Results (hypotheses to verify against the measurements above):**
1. C2C should reduce token count (no intermediate text generation)
2. Latency depends on projector efficiency vs text generation speed
3. Information density may be higher in C2C (direct semantic transfer)

**Research Questions:**
- Does C2C provide measurable efficiency gains?
- What is the overhead of cache projection vs text tokenization?
- Does efficiency vary by task type or concept complexity?