In [None]:
import torch
import random
import numpy as np
from datasets import Dataset
from transformers import (
    BartTokenizer, 
    BartForConditionalGeneration, 
    BartConfig,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import wandb


# Seed every RNG that is used before the Trainer is constructed, so that a
# fresh "Restart & Run All" produces the same dataset and the same initial
# model weights. `random` drives dataset generation and the final test cases;
# `torch` drives weight initialisation; `numpy` is seeded for completeness.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set best available device
device = torch.device("cuda" if torch.cuda.is_available() else 
                      "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


# Initialize wandb
wandb.init(project="addition-bart", name="fixed-addition-model")

# Addition Task with BART Sequence-to-Sequence Model

This project demonstrates:

1. **Task**: Building a model that adds two integers (e.g., "5 + 7 = ?" → "12")
2. **Architecture**: Using BART, a sequence-to-sequence transformer architecture (only the tokenizer is pre-trained; the model weights are initialized from scratch)
3. **Implementation**:
    - Configuring a smaller BART model from Hugging Face
    - Creating a custom dataset of addition problems
    - Training the model from scratch on this mathematical task

BART combines a bidirectional encoder (like BERT) with an autoregressive decoder (like GPT), making it well-suited for various sequence transformation tasks including our numeric addition problem.

In [None]:

# Reuse the pretrained BART tokenizer (only the vocabulary is pretrained here)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Describe a deliberately small BART: narrow hidden size, three layers per
# stack and two attention heads per layer.
config = BartConfig(
    # Embedding table must cover every id the tokenizer can emit
    vocab_size=len(tokenizer),
    max_position_embeddings=64,
    d_model=128,
    # Encoder stack
    encoder_layers=3,
    encoder_attention_heads=2,
    encoder_ffn_dim=512,
    # Decoder stack
    decoder_layers=3,
    decoder_attention_heads=2,
    decoder_ffn_dim=512,
    # Special-token ids come from the tokenizer so model and tokenizer agree
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    decoder_start_token_id=tokenizer.bos_token_id,
)

# Build the model from the config alone: weights are randomly initialised,
# not loaded from a checkpoint.
model = BartForConditionalGeneration(config)
print(f"Model parameters: {model.num_parameters():,}")


## Dataset Generation

We'll create a synthetic addition dataset by:
- Sampling two random integers between 1 and 100 for each example
- Formatting the input as a prompt such as "5 + 7 = ?"
- Using the sum, written as a string (e.g., "12"), as the target
- Splitting the examples into training (80%) and test (20%) sets

This data will help us evaluate our model's ability to learn addition.

In [None]:

# Generate dataset directly with Hugging Face structures
def generate_dataset(num_examples=5000, min_num=1, max_num=100, train_ratio=0.8, seed=None):
    """Build a synthetic addition dataset and split it into train and test sets.

    Each example pairs a prompt of the form "<a> + <b> = ?" with the sum
    written as a string.

    Args:
        num_examples: Number of (prompt, answer) pairs to generate.
        min_num: Smallest operand value (inclusive).
        max_num: Largest operand value (inclusive).
        train_ratio: Fraction of examples kept for training; the remainder
            becomes the test split.
        seed: Optional seed for operand sampling. When None (the default) the
            module-level ``random`` state is used, as before; when given, a
            private generator is used so the global state is left untouched.

    Returns:
        The result of ``Dataset.train_test_split``, holding "train" and
        "test" splits with "input_text" and "target_text" columns.

    Note:
        Operand pairs are drawn independently with replacement, so the same
        problem can occur more than once and may land in both splits.
    """
    # Fall back to the global generator so unseeded calls behave as before
    rng = random if seed is None else random.Random(seed)

    # Build the two columns directly instead of a list of per-row dicts
    input_texts = []
    target_texts = []
    for _ in range(num_examples):
        num1 = rng.randint(min_num, max_num)
        num2 = rng.randint(min_num, max_num)
        input_texts.append(f"{num1} + {num2} = ?")
        target_texts.append(str(num1 + num2))

    # Create and split datasets
    dataset = Dataset.from_dict({
        "input_text": input_texts,
        "target_text": target_texts
    })
    return dataset.train_test_split(test_size=1-train_ratio, seed=42)


# Build the train/test splits with the default generation settings
dataset_dict = generate_dataset()

# Print the first three training rows as a quick sanity check
print("\nSample Data:")
for position, row in enumerate(dataset_dict["train"].select(range(3)), start=1):
    print(f"Sample {position}: Input: '{row['input_text']}', Target: '{row['target_text']}'")


## Data Tokenization

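For our addition model to process the data effectively:

- We'll tokenize each input prompt and each target sum with the BART tokenizer, padding to a fixed length (16 tokens for inputs, 8 for targets)
- Padding positions in the labels are replaced with `-100` so they are ignored by the loss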

In [None]:
# Define preprocessing function - fixed tokenization approach
def preprocess_function(examples):
    """Tokenize a batch of addition prompts and their answers.

    Args:
        examples: Batch with "input_text" and "target_text" columns, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 16 tokens)
        with an added "labels" entry: the tokenized answers (padded/truncated
        to 8 tokens) where every padding id is replaced by -100.
    """
    # Encode the prompts as plain Python lists
    encoded_inputs = tokenizer(
        examples["input_text"],
        max_length=16,
        padding="max_length",
        truncation=True,
        return_tensors=None,
    )

    # Encode the answers through the target-side tokenizer path
    encoded_targets = tokenizer(
        text_target=examples["target_text"],
        max_length=8,
        padding="max_length",
        truncation=True,
        return_tensors=None,
    )

    # -100 is the transformers convention for label positions to ignore
    pad_id = tokenizer.pad_token_id
    encoded_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_targets["input_ids"]
    ]
    return encoded_inputs

# Tokenize both splits in batches and drop the raw text columns afterwards
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    remove_columns=["input_text", "target_text"],
    batched=True,
)

# Inspect the first tokenized training example
print("\nVerifying tokenized data:")
first_example = tokenized_datasets["train"][0]
print(f"Input_ids sample: {first_example['input_ids'][:10]}")
print(f"Labels sample: {first_example['labels'][:10]}")



# FINALLY WE TRAIN! 

### Data Collator
The Data Collator is responsible for batching our processed examples together for efficient training. It performs several critical functions:

- **Padding**: Ensures all sequences in a batch have the same length by adding padding tokens
- **Tensor conversion**: Converts data from Python lists to PyTorch tensors
- **Special handling for labels**: Properly masks padded tokens in labels with -100 so they don't contribute to the loss

### Training Arguments
We configure the training process with parameters like:
- Learning rate and optimization settings
- Batch sizes and number of epochs
- Evaluation and checkpointing frequency
- Logging configuration for monitoring training progress

After setting up these components, we'll initialize the trainer and start the training process to teach our model how to add numbers!

In [None]:
# Collator that batches the tokenized examples, padding to a fixed length
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=16,
    pad_to_multiple_of=8,
)

# Training configuration, grouped by concern
training_args = Seq2SeqTrainingArguments(
    # Output location and experiment tracking
    output_dir="./addition_results",
    run_name="addition-bart-fixed",
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=100,
    # Training length and batch sizes
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Optimizer and learning-rate schedule
    learning_rate=5e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Run generation during evaluation
    predict_with_generate=True,
    generation_max_length=8,
)

# Wire model, data and configuration together
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Run training
trainer.train()
print("Training completed!")

# Persist the model weights and config for the evaluation cell
model.save_pretrained("./addition_model")

# Close the wandb run
wandb.finish()

In [None]:

# Define test function
def test_addition(model, tokenizer, num1, num2):
    """Ask the model to add two numbers and return its decoded answer.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        num1: First operand.
        num2: Second operand.

    Returns:
        The generated text with special tokens removed and surrounding
        whitespace stripped.
    """
    # Encode the prompt in the same "<a> + <b> = ?" format used for training
    prompt = f"{num1} + {num2} = ?"
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Make sure the model sits on the same device and is in inference mode
    model.to(device)
    model.eval()

    # Beam search without gradient tracking
    generation_kwargs = dict(max_length=8, num_beams=4, early_stopping=True)
    with torch.no_grad():
        generated = model.generate(encoded["input_ids"], **generation_kwargs)

    return tokenizer.decode(generated[0], skip_special_tokens=True).strip()

# Reload the weights saved after training and place them on the chosen device
model = BartForConditionalGeneration.from_pretrained("./addition_model")
model.to(device)

# Ten random operand pairs from the 1-100 range.
# NOTE: the pairs are not checked against the training split, so some of
# them may have been seen during training.
test_cases = [(random.randint(1, 100), random.randint(1, 100)) for _ in range(10)]

print("\nTesting the model:")
correct = 0
for num1, num2 in test_cases:
    expected = str(num1 + num2)
    predicted = test_addition(model, tokenizer, num1, num2)
    print(f"{num1} + {num2} = {predicted} (Expected: {expected})")
    # Exact string match counts as a correct answer
    correct += int(predicted == expected)

print(f"\nAccuracy: {correct/len(test_cases):.2%}")
