# Addition Task Training Notebook

This notebook trains a small transformer from scratch to add two k-digit numbers, with k = 4.

## Task Format

- Input: `"1234 + 5678 ="`
- Output: `"6912"`

## Evaluation

- In-distribution (ID): Same format as training
- Out-of-distribution (OOD): `"1 234 + 5 678="` → accepts `"6912"` or `"6 912"` (any spacing)


## 1. Setup and Installation


In [None]:
# Install dependencies
!pip install -q torch transformers datasets trl wandb hydra-core matplotlib seaborn pandas

# Check GPU
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(
        f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
    )


In [None]:
# Make the project sources importable: mount Google Drive or upload files.
# Uncomment exactly one of the options below.
#
# Option 1: If repo is in Drive
#   from google.colab import drive
#   drive.mount('/content/drive')
#   import sys
#   sys.path.append('/content/drive/MyDrive/path/to/repo')
#
# Option 2: Upload files directly (run this cell and upload the src/ folder)
#   from google.colab import files
#   uploaded = files.upload()
#
# Option 3: Clone from GitHub
#   !git clone https://github.com/yourusername/cbai_tt.git
#   import sys
#   sys.path.append('/content/cbai_tt')

# For now, we'll assume the repo is available
import os
import sys

# Ensure a local src/ directory exists (no-op when it is already there).
os.makedirs("src", exist_ok=True)


## 2. Task Setup Documentation


In [None]:
# Task configuration: one dictionary describing the task, the model
# architecture and the tokenizer, reused later when the report is compiled.
task_setup = dict(
    task="Addition of two k-digit numbers",
    k=4,
    input_format="1234 + 5678 =",
    output_format="6912",
    model_architecture=dict(
        num_layers=4,
        hidden_size=128,
        num_attention_heads=4,
        intermediate_size=512,
        vocab_size=13,  # 0-9, +, =, space
    ),
    tokenizer=dict(
        type="character-level",
        vocab_size=13,
        tokens=list("0123456789+= "),  # ten digits, "+", "=", and the space
    ),
)

# Print a short human-readable summary of the configuration.
_tok_cfg = task_setup["tokenizer"]
_summary_lines = [
    "Task Setup:",
    f"  Task: {task_setup['task']}",
    f"  k: {task_setup['k']}",
    f"  Format: {task_setup['input_format']} → {task_setup['output_format']}",
    f"  Model: {task_setup['model_architecture']}",
    f"  Tokenizer: {_tok_cfg['type']} with {_tok_cfg['vocab_size']} tokens",
]
print("\n".join(_summary_lines))


## 3. Data Generation and Diagnostics


In [None]:
# Import necessary modules
import random
import numpy as np
from datasets import Dataset

from src.tokenizer import CharTokenizer
from src.dataset import FinetuningDataset, get_datasets

# Initialize tokenizer
tokenizer = CharTokenizer()
print(f"Tokenizer vocab size: {len(tokenizer)}")

# Sanity check: encode a prompt in the training format and decode the very
# same ids again. Using the encoder's own output (instead of a hand-written id
# list) keeps the check independent of how the tokenizer numbers its tokens.
sample_text = "1234 + 5678 ="
sample_ids = tokenizer.encode(sample_text)
print(f"Sample encode: {sample_ids}")
print(f"Sample decode: {tokenizer.decode(sample_ids)}")


In [None]:
# Generate datasets
# Generation settings: RNG seed, number of digits per operand, split sizes.
seed = 1
k = 4
train_size = 50000
val_size = 10000  # same value as the two test split sizes below
test_id_size = 10000
test_ood_size = 10000

# Builder object for all splits; the OOD split is generated from it later.
dataset = FinetuningDataset(
    tokenizer=tokenizer,
    seed=seed,
    k=k,
    apply_chat_template=False,
    train_size=train_size,
    val_size=val_size,
    test_id_size=test_id_size,
    test_ood_size=test_ood_size,
)

train_dataset, val_dataset, test_id_dataset = dataset.generate_data()

# Report how many examples each split actually contains.
for split_label, split_data in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test ID", test_id_dataset),
):
    print(f"{split_label} size: {len(split_data)}")

# Show samples
print("\nSample training examples:")
for sample_idx in range(3):
    sample = train_dataset[sample_idx]
    print(f"  {sample['prompt']} → {sample['completion']}")


In [None]:
# Verify no overlap between splits
def _operand_pairs(split):
    """Return the set of ``(a, b)`` operand tuples found in ``split``."""
    return {(row["a"], row["b"]) for row in split}


train_pairs = _operand_pairs(train_dataset)
val_pairs = _operand_pairs(val_dataset)
test_pairs = _operand_pairs(test_id_dataset)

# Number of operand pairs shared by each pair of splits.
overlap_train_val = len(train_pairs.intersection(val_pairs))
overlap_train_test = len(train_pairs.intersection(test_pairs))
overlap_val_test = len(val_pairs.intersection(test_pairs))

print(f"Overlap train-val: {overlap_train_val} (should be 0)")
print(f"Overlap train-test: {overlap_train_test} (should be 0)")
print(f"Overlap val-test: {overlap_val_test} (should be 0)")

# Collected for the final report.
data_diagnostics = dict(
    train_size=len(train_dataset),
    val_size=len(val_dataset),
    test_id_size=len(test_id_dataset),
    overlap_train_val=overlap_train_val,
    overlap_train_test=overlap_train_test,
    overlap_val_test=overlap_val_test,
)


In [None]:
# Plot distribution of sums
import matplotlib.pyplot as plt

# Collect the target value of every training example.
train_sums = [row["result"] for row in train_dataset]

# Histogram of the sums, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(train_sums, bins=50, edgecolor="black")
ax.set_xlabel("Sum (a + b)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Sums in Training Data")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig("sum_distribution.png", dpi=150)
plt.show()

# Basic summary statistics of the sums.
smallest_sum, largest_sum = min(train_sums), max(train_sums)
print(f"Sum range: {smallest_sum} to {largest_sum}")
print(f"Mean sum: {np.mean(train_sums):.2f}")
print(f"Std sum: {np.std(train_sums):.2f}")


## 4. Training Execution


In [None]:
# Training configuration
import time
import wandb
from transformers import AutoConfig, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer

# Hyperparameters
training_config = {
    "batch_size": 64,  # Batch size per GPU (64 examples per training step)
    "learning_rate": 5e-4,
    "num_epochs": 20,  # Reduced to prevent overfitting, will select best epoch later
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.0,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
}

# Architecture of the model that is trained from scratch.
model_config = {
    "num_hidden_layers": 4,
    "hidden_size": 128,
    "num_attention_heads": 4,
    "intermediate_size": 512,
    "vocab_size": len(tokenizer),
}

print("Training Configuration:")
print(f"  Batch size: {training_config['batch_size']}")
print(f"  Learning rate: {training_config['learning_rate']}")
print(f"  Epochs: {training_config['num_epochs']}")
print("\nModel Configuration:")
# NOTE: the loop variables are deliberately NOT called `k`/`v`: `k` is the
# notebook-level number of digits, and reusing the name here would silently
# overwrite it with a string.
for param_name, param_value in model_config.items():
    print(f"  {param_name}: {param_value}")

# Track GPU info (None when no CUDA device is available)
if torch.cuda.is_available():
    gpu_info = {
        "device": torch.cuda.get_device_name(0),
        "memory_gb": torch.cuda.get_device_properties(0).total_memory / 1e9,
    }
    print(f"\nGPU: {gpu_info['device']}")
    print(f"GPU Memory: {gpu_info['memory_gb']:.2f} GB")
else:
    gpu_info = None


In [None]:
# Initialize wandb (optional - comment out if not using)
# wandb.login()
# wandb.init(
#     project="addition-task",
#     config={**training_config, **model_config}
# )


In [None]:
# Create model from scratch: only the GPT-2 *architecture* definition is taken
# from the "gpt2" config; the weights are randomly initialised.
torch.manual_seed(seed)  # make the random weight initialisation reproducible

config = AutoConfig.from_pretrained("gpt2")
config.num_hidden_layers = model_config["num_hidden_layers"]
config.hidden_size = model_config["hidden_size"]
config.num_attention_heads = model_config["num_attention_heads"]
# GPT2Config has no `intermediate_size` field: the feed-forward width is
# `n_inner` (which defaults to 4 * hidden size when left unset). Setting
# `intermediate_size` alone would be ignored by the model.
config.n_inner = model_config["intermediate_size"]
config.vocab_size = model_config["vocab_size"]

# The pretrained "gpt2" config ships bos/eos ids that belong to GPT-2's own
# 50k-token vocabulary and lie outside this tiny vocabulary. Take the special
# token ids from our tokenizer instead (None when the tokenizer defines none).
for token_attr in ("bos_token_id", "eos_token_id", "pad_token_id"):
    setattr(config, token_attr, getattr(tokenizer, token_attr, None))

model = AutoModelForCausalLM.from_config(config)

num_params = sum(p.numel() for p in model.parameters())
num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model created with {num_params} parameters")
print(f"Trainable parameters: {num_trainable}")


In [None]:
# Setup training arguments
output_dir = "./checkpoints"
os.makedirs(output_dir, exist_ok=True)

effective_batch_size = (
    training_config["batch_size"] * training_config["gradient_accumulation_steps"]
)
# Ceiling division: the final, partially filled batch of an epoch still counts
# as a step (assumes a single device and the default `dataloader_drop_last=False`).
steps_per_epoch = -(-len(train_dataset) // effective_batch_size)
total_steps = steps_per_epoch * training_config["num_epochs"]

# Evaluate and checkpoint five times per epoch, log ten times per epoch.
eval_save_interval = max(1, steps_per_epoch // 5)
use_cuda = torch.cuda.is_available()

training_args = SFTConfig(
    output_dir=output_dir,
    # Use the explicit string "none" to disable reporting: depending on the
    # transformers version, `None` may enable every installed integration
    # (wandb included) instead of turning reporting off.
    report_to="wandb" if wandb.run else "none",
    logging_strategy="steps",
    logging_steps=max(1, steps_per_epoch // 10),
    num_train_epochs=training_config["num_epochs"],
    completion_only_loss=True,
    per_device_train_batch_size=training_config["batch_size"],
    per_device_eval_batch_size=training_config["batch_size"],
    save_steps=eval_save_interval,
    save_strategy="steps",
    save_total_limit=5,
    eval_steps=eval_save_interval,
    eval_strategy="steps",
    gradient_accumulation_steps=training_config["gradient_accumulation_steps"],
    learning_rate=training_config["learning_rate"],
    lr_scheduler_type="linear",
    weight_decay=training_config["weight_decay"],
    warmup_ratio=training_config["warmup_ratio"],
    max_grad_norm=training_config["max_grad_norm"],
    bf16=use_cuda and torch.cuda.is_bf16_supported(),
    fp16=False,
    prediction_loss_only=True,
    # The fused AdamW kernel is only requested when a CUDA device is present;
    # it may be unsupported on CPU with older PyTorch versions.
    optim="adamw_torch_fused" if use_cuda else "adamw_torch",
)

print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total steps: {total_steps}")


In [None]:
# Create trainer
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Track training time (wall clock around trainer.train() only)
start_time = time.time()
print(f"Starting training at {time.strftime('%Y-%m-%d %H:%M:%S')}")

# Train
trainer.train()

end_time = time.time()
training_time = end_time - start_time
training_minutes = training_time / 60
print(
    f"\nTraining completed in {training_minutes:.2f} minutes ({training_time:.2f} seconds)"
)

# Save model and tokenizer side by side so they can be reloaded together
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")

# Everything about this run that goes into the final report.
training_details = dict(training_config)
training_details.update(model_config)
training_details.update(
    training_time_seconds=training_time,
    training_time_minutes=training_minutes,
    gpu_info=gpu_info,
)


## 5. Training Results Visualization


In [None]:
# Load training logs
import json

# Get training history from trainer
logs = getattr(trainer.state, "log_history", None)

if logs is not None:
    # Training loss and validation loss are logged at different intervals, so
    # each series keeps its own list of global steps and is plotted against
    # those steps (not against its list index) to keep the curves aligned.
    train_points = [
        (log["step"], log["loss"])
        for log in logs
        if "loss" in log and "eval_loss" not in log
    ]
    eval_points = [(log["step"], log["eval_loss"]) for log in logs if "eval_loss" in log]

    train_steps = [step for step, _ in train_points]
    train_losses = [loss for _, loss in train_points]
    eval_loss_steps = [step for step, _ in eval_points]
    eval_losses = [loss for _, loss in eval_points]

    # Two panels with the same data: linear y-axis (left), log y-axis (right).
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    panels = (
        (axes[0], "Loss", "Training and Validation Loss", False),
        (axes[1], "Loss (log scale)", "Training and Validation Loss (Log Scale)", True),
    )
    for ax, y_label, title, use_log_scale in panels:
        if train_losses:
            ax.plot(train_steps, train_losses, label="Train Loss")
        if eval_losses:
            ax.plot(eval_loss_steps, eval_losses, label="Val Loss")
        ax.set_xlabel("Step")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        if use_log_scale:
            ax.set_yscale("log")
        if train_losses or eval_losses:
            # Only draw a legend when there is at least one labelled curve.
            ax.legend()
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig("training_curves.png", dpi=150)
    plt.show()

    print(f"Final train loss: {train_losses[-1] if train_losses else 'N/A'}")
    print(f"Final val loss: {eval_losses[-1] if eval_losses else 'N/A'}")
else:
    print("No training logs available")


In [None]:
# Load trained model
from transformers import AutoModelForCausalLM

# Reload the tokenizer and the model from the checkpoint directory written
# after training, and put the model into evaluation mode.
tokenizer = CharTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir).eval()

print("Model and tokenizer loaded")


In [None]:
# Evaluation function
def evaluate_dataset(model, tokenizer, dataset, max_new_tokens=8, max_samples=10):
    """Greedy-decode every example of ``dataset`` and score the predictions.

    Each example is processed on its own (batch size 1): the prompt is encoded,
    up to ``max_new_tokens`` tokens are generated, and the decoded, stripped
    continuation is compared with the reference completion.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``. When it has a
            ``device`` attribute, the inputs are moved to that device.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``pad_token_id``.
        dataset: Iterable of mappings with ``"prompt"`` and ``"completion"`` keys.
        max_new_tokens: Maximum number of tokens generated per example.
        max_samples: Number of leading examples whose predictions are returned
            in ``sample_predictions`` for inspection.

    Returns:
        dict with the keys ``exact_accuracy``, ``normalized_accuracy``,
        ``exact_correct``, ``normalized_correct``, ``total`` and
        ``sample_predictions``. "Exact" compares the strings as they are,
        "normalized" compares them after removing all whitespace. Accuracies
        are 0 for an empty dataset.
    """

    def normalize_whitespace(text):
        # Drop every whitespace character, so "6 912" and "6912" compare equal.
        return "".join(text.split()) if text else ""

    model.eval()
    # Inputs must live on the same device as the model; otherwise generation
    # fails for a model that was moved to the GPU.
    device = getattr(model, "device", None)

    exact_correct = 0
    normalized_correct = 0
    total = 0
    sample_predictions = []

    for i, example in enumerate(dataset):
        prompt = example["prompt"]
        ground_truth = example["completion"]

        # Encode prompt as a batch of one sequence
        input_ids = tokenizer.encode(prompt)
        input_tensor = torch.tensor([input_ids])
        if device is not None:
            input_tensor = input_tensor.to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_tensor,
                max_new_tokens=max_new_tokens,
                do_sample=False,  # Greedy decoding
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode only the new tokens (everything after the prompt)
        generated_ids = outputs[0][len(input_ids) :]
        prediction = tokenizer.decode(generated_ids).strip()

        # Evaluate
        exact_match = prediction == ground_truth
        normalized_match = normalize_whitespace(prediction) == normalize_whitespace(
            ground_truth
        )

        exact_correct += int(exact_match)
        normalized_correct += int(normalized_match)
        total += 1

        # Store the first few predictions for manual inspection
        if len(sample_predictions) < max_samples:
            sample_predictions.append(
                {
                    "prompt": prompt,
                    "ground_truth": ground_truth,
                    "prediction": prediction,
                    "exact_match": exact_match,
                    "normalized_match": normalized_match,
                }
            )

        if (i + 1) % 1000 == 0:
            print(f"Evaluated {i + 1}/{len(dataset)} examples")

    exact_accuracy = exact_correct / total if total > 0 else 0
    normalized_accuracy = normalized_correct / total if total > 0 else 0

    return {
        "exact_accuracy": exact_accuracy,
        "normalized_accuracy": normalized_accuracy,
        "exact_correct": exact_correct,
        "normalized_correct": normalized_correct,
        "total": total,
        "sample_predictions": sample_predictions,
    }


In [None]:
# Evaluate on ID test set
print("Evaluating on ID test set...")
id_results = evaluate_dataset(model, tokenizer, test_id_dataset)

# Headline numbers: accuracy followed by the raw correct/total counts.
id_total = id_results["total"]
print("\nID Test Results:")
print(
    f"  Exact match accuracy: {id_results['exact_accuracy']:.4f} "
    f"({id_results['exact_correct']}/{id_total})"
)
print(
    f"  Normalized accuracy: {id_results['normalized_accuracy']:.4f} "
    f"({id_results['normalized_correct']}/{id_total})"
)

# Show the first five stored predictions next to their references.
print("\nSample ID predictions:")
for rank, sample in enumerate(id_results["sample_predictions"][:5], start=1):
    print(
        f"  {rank}. {sample['prompt']} → GT: '{sample['ground_truth']}', Pred: '{sample['prediction']}' "
    )
    print(
        f"     Exact: {sample['exact_match']}, Normalized: {sample['normalized_match']}"
    )


In [None]:
# Generate and evaluate on OOD test set
print("Generating OOD test set...")

# Operand pairs already used by any ID split are passed as exclusions to the
# OOD generator.
all_id_pairs = train_pairs.union(val_pairs, test_pairs)
test_ood_dataset = dataset.generate_ood_data(exclude_pairs=all_id_pairs)

print(f"OOD test size: {len(test_ood_dataset)}")
print("\nSample OOD examples:")
for sample_idx in range(3):
    ood_example = test_ood_dataset[sample_idx]
    print(f"  {ood_example['prompt']} → {ood_example['completion']}")

print("\nEvaluating on OOD test set...")
ood_results = evaluate_dataset(model, tokenizer, test_ood_dataset)

# Headline numbers: accuracy followed by the raw correct/total counts.
ood_total = ood_results["total"]
print("\nOOD Test Results:")
print(
    f"  Exact match accuracy: {ood_results['exact_accuracy']:.4f} "
    f"({ood_results['exact_correct']}/{ood_total})"
)
print(
    f"  Normalized accuracy: {ood_results['normalized_accuracy']:.4f} "
    f"({ood_results['normalized_correct']}/{ood_total})"
)

# Show the first five stored predictions next to their references.
print("\nSample OOD predictions:")
for rank, sample in enumerate(ood_results["sample_predictions"][:5], start=1):
    print(
        f"  {rank}. {sample['prompt']} → GT: '{sample['ground_truth']}', Pred: '{sample['prediction']}' "
    )
    print(
        f"     Exact: {sample['exact_match']}, Normalized: {sample['normalized_match']}"
    )


In [None]:
# Save results
# (output key, key in the evaluate_dataset() result) - in the order in which
# the fields appear in results.json.
_RESULT_FIELDS = (
    ("exact", "exact_accuracy"),
    ("normalized", "normalized_accuracy"),
    ("exact_correct", "exact_correct"),
    ("normalized_correct", "normalized_correct"),
    ("total", "total"),
)

# Keep only the scalar metrics; the sample predictions are not stored.
results = {
    split_key: {out_key: split_results[src_key] for out_key, src_key in _RESULT_FIELDS}
    for split_key, split_results in (("id_test", id_results), ("ood_test", ood_results))
}

with open("results.json", "w") as f:
    f.write(json.dumps(results, indent=2))

print("Results saved to results.json")


## 7. Results Visualization and Tables


In [None]:
# Create results table
import pandas as pd

# One row per test set: (display label, metrics dict).
_table_rows = [("ID Test", results["id_test"]), ("OOD Test", results["ood_test"])]

results_table = pd.DataFrame(
    {
        "Test Set": [label for label, _ in _table_rows],
        "Exact Match": [metrics["exact"] for _, metrics in _table_rows],
        "Normalized Match": [metrics["normalized"] for _, metrics in _table_rows],
    }
)

print("Results Summary:")
print(results_table.to_string(index=False))

# Save as CSV
results_table.to_csv("results_table.csv", index=False)


In [None]:
# Plot accuracy comparison
# Grouped bar chart: exact vs. normalized accuracy for each test set.
x = ["ID Test", "OOD Test"]
_split_keys = ("id_test", "ood_test")
exact_accs = [results[key]["exact"] for key in _split_keys]
norm_accs = [results[key]["normalized"] for key in _split_keys]

x_pos = np.arange(len(x))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 5))

# The two bar series sit half a bar width left/right of each tick position.
for offset, heights, series_label in (
    (-width / 2, exact_accs, "Exact Match"),
    (width / 2, norm_accs, "Normalized Match"),
):
    ax.bar(x_pos + offset, heights, width, label=series_label, alpha=0.8)

ax.set_ylabel("Accuracy")
ax.set_title("Model Performance: ID vs OOD Test Sets")
ax.set_xticks(x_pos)
ax.set_xticklabels(x)
ax.set_ylim([0, 1])
ax.legend()
ax.grid(True, alpha=0.3, axis="y")

fig.tight_layout()
fig.savefig("accuracy_comparison.png", dpi=150)
plt.show()


## 8. Report Summary Generation


In [None]:
# Compile all information for report
report_data = dict(
    task_setup=task_setup,
    data_diagnostics=data_diagnostics,
    training_details=training_details,
    results=results,
)

# Save to JSON
with open("report_data.json", "w") as f:
    f.write(json.dumps(report_data, indent=2))

print("Report data saved to report_data.json")

# Print summary for easy copy-paste
_banner = "=" * 60
print("\n" + _banner)
print("REPORT SUMMARY")
print(_banner)

# Section 1: task format, model and tokenizer.
_tokenizer_cfg = task_setup["tokenizer"]
print("\n1. Task Setup:")
print(f"   Format: {task_setup['input_format']} → {task_setup['output_format']}")
print(f"   k: {task_setup['k']}")
print(f"   Model: {task_setup['model_architecture']}")
print(f"   Tokenizer: {_tokenizer_cfg['type']} ({_tokenizer_cfg['vocab_size']} tokens)")

# Section 2: split sizes and pairwise overlaps between splits.
print("\n2. Data Design:")
for _label, _size_key in (
    ("Train", "train_size"),
    ("Val", "val_size"),
    ("Test ID", "test_id_size"),
):
    print(f"   {_label}: {data_diagnostics[_size_key]}")
_overlap_counts = [
    data_diagnostics[_key]
    for _key in ("overlap_train_val", "overlap_train_test", "overlap_val_test")
]
print("   Overlaps: " + ", ".join(str(_count) for _count in _overlap_counts))

# Section 3: hyperparameters, wall-clock training time and hardware.
print("\n3. Training Details:")
print(f"   Batch size: {training_config['batch_size']}")
print(f"   Learning rate: {training_config['learning_rate']}")
print(f"   Epochs: {training_config['num_epochs']}")
print(f"   Training time: {training_details['training_time_minutes']:.2f} minutes")
if gpu_info:
    print(f"   GPU: {gpu_info['device']}")

# Section 4: accuracy on both test sets.
print("\n4. Results:")
for _label, _split_key in (("ID Test", "id_test"), ("OOD Test", "ood_test")):
    _metrics = results[_split_key]
    print(
        f"   {_label} - Exact: {_metrics['exact']:.4f}, Normalized: {_metrics['normalized']:.4f}"
    )

print("\n" + _banner)


## 9. Interpretation and Discussion

Use the cells below to add your interpretation and discussion:

### Interpretation

- What did the model learn?
- How well does it generalize?
- What patterns do you observe in errors?

### Discussion

- How did you design the task format and why?
- How did you design the train/test split and why?
- What remains uncertain?
- What would you do next?
- Why is this task hard for the model?
- What could improve its performance?
