# English to Swedish Poetry Translation with Unsloth

This notebook fine-tunes a language model using Unsloth to translate English poetry to Swedish.

## Hardware Requirements
- GPU: RTX 3060 (12GB) or better
- RAM: 16GB+ recommended

## Dataset
- Training data: `english_to_swedish_poetry_translation.json`
- **790 translation examples** from **46 poems**
- Format: Alpaca (instruction, input, output)
- Poets: Viktor Rydberg, Verner von Heidenstam, Esaias Tegnér
- Multiple granularities: full poems, stanzas, and multi-line excerpts

## 1. Install Dependencies

In [None]:
# Install Unsloth and dependencies
# These are IPython shell escapes (`!`), so this cell only runs inside a
# notebook kernel, not as plain Python.
# First command: install Unsloth from its GitHub repository with the
# `colab-new` extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Second command: install the training stack with --no-deps so pip does not
# pull in (or upgrade) the dependencies of these packages; xformers and trl
# are held below the stated versions.
# NOTE(review): the trl pin presumably keeps the SFTTrainer keyword arguments
# used in section 7 valid — confirm before relaxing it.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

## 2. Import Libraries

In [None]:
from unsloth import FastLanguageModel
import torch
import json
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import os

## 3. Configuration

In [None]:
# --- Model loading options ---
max_seq_length = 2048  # Context length; Unsloth supports RoPE scaling internally
dtype = None  # None = auto-detect (float16 for Tesla T4/V100, bfloat16 for Ampere+)
load_in_4bit = True  # 4-bit quantization to reduce memory usage

# --- Optimisation hyperparameters ---
EPOCHS = 2  # Lowered from 3 because the dataset is larger (790 examples)
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
MAX_STEPS = -1  # -1 means full training (no step cap)
WARMUP_STEPS = 10  # Raised for the larger dataset

# --- Fraction of examples held out for validation ---
VALIDATION_SPLIT = 0.05  # 5%

# --- Filesystem locations ---
DATA_PATH = "../data/english_to_swedish_poetry_translation.json"
OUTPUT_DIR = "./outputs/translation_model"

# Echo the settings once so they are recorded in the notebook output.
# The effective batch size is the per-device batch multiplied by the number
# of gradient-accumulation steps.
print(
    "Configuration:",
    f"  Epochs: {EPOCHS}",
    f"  Batch size: {BATCH_SIZE}",
    f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}",
    f"  Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Validation split: {VALIDATION_SPLIT * 100}%",
    sep="\n",
)

## 4. Load Model

In [None]:
# Fetch the base checkpoint and its tokenizer through Unsloth's loader,
# using the sequence length, dtype and quantization chosen in the
# configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/llama-3.2-3b-instruct",  # Any of Unsloth's optimized models can be used here
)

# Report the architecture type and tokenizer size as a quick sanity check.
print(
    f"Model loaded: {model.config.model_type}",
    f"Vocabulary size: {len(tokenizer)}",
    sep="\n",
)

## 5. Configure LoRA for Fine-tuning

In [None]:
# Attach LoRA adapters to the loaded model for efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,  # LoRA rank: higher means more parameters, potentially better quality
    lora_alpha=16,
    lora_dropout=0,  # Any value is supported, but 0 is the optimized setting
    bias="none",  # Any value is supported, but "none" is the optimized setting
    use_rslora=False,  # Rank-stabilized LoRA disabled
    loftq_config=None,  # LoftQ disabled
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
)

print("LoRA adapters configured")

## 6. Load and Prepare Dataset

In [None]:
# Read the Alpaca-formatted records from the JSON file at DATA_PATH.
with open(DATA_PATH, encoding="utf-8") as fh:
    data = json.load(fh)

# Summarise the dataset and preview the first record; the input and output
# fields are truncated to their first 100 characters.
print(f"Loaded {len(data)} training examples")
print("\nExample entry:")
print(f"Instruction: {data[0]['instruction']}")
for label, key in (("Input", "input"), ("Output", "output")):
    print(f"{label}: {data[0][key][:100]}...")

In [None]:
# Define the alpaca prompt template.
# The three positional `{}` slots are filled, in order, with the instruction,
# the input text, and the response. formatting_prompts_func passes the target
# output as the response; the inference cells pass "" so the model writes it.
# The exact text matters: the same template is used for training and for
# inference, and the inference cells split on the "### Response:" marker.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token taken from the loaded tokenizer; it is appended to
# every training text by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into complete training strings.

    Args:
        examples: Column-oriented batch with "instruction", "input" and
            "output" entries, each a sequence aligned by position.

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        record, each terminated by EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # The EOS token must be appended, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(instr, source, target) + EOS_TOKEN
            for instr, source, target in records
        ]
    }

# Build a HuggingFace Dataset from the raw records and add the rendered
# "text" column in batched mode.
dataset = Dataset.from_list(data).map(formatting_prompts_func, batched=True)

# Hold out VALIDATION_SPLIT of the rows for validation; the fixed seed keeps
# the split reproducible across runs.
# NOTE(review): the notebook intro says examples come at several
# granularities (full poems, stanzas, excerpts) from the same poems, so a
# row-level random split may place overlapping text in both splits — confirm
# whether the validation loss is meant to be read with that in mind.
dataset_split = dataset.train_test_split(seed=3407, test_size=VALIDATION_SPLIT)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

for label, subset in (
    ("Total", dataset),
    ("Training", train_dataset),
    ("Validation", eval_dataset),
):
    print(f"{label} examples: {len(subset)}")
print("\nFormatted example (first 500 chars):")
print(train_dataset[0]["text"][:500])

## 7. Configure Training

In [None]:
# Evaluation and checkpointing share one step interval. With
# load_best_model_at_end=True, TrainingArguments requires the save strategy
# to match the eval strategy (and save_steps to be a multiple of eval_steps);
# the previous mix of eval_strategy="steps" and save_strategy="epoch" is
# rejected with a ValueError when the arguments are constructed.
EVAL_STEPS = 50

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # Column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Packing is off; enabling it can make training 5x faster for short sequences
    args=TrainingArguments(
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=WARMUP_STEPS,
        max_steps=MAX_STEPS,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,  # Log every 10 steps for larger dataset
        eval_strategy="steps",  # Evaluate during training
        eval_steps=EVAL_STEPS,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=OUTPUT_DIR,
        save_strategy="steps",  # Must match eval_strategy for best-model loading
        save_steps=EVAL_STEPS,  # Checkpoint at every evaluation point
        save_total_limit=2,
        load_best_model_at_end=True,  # Load best model after training
        metric_for_best_model="eval_loss",
    ),
)

# Approximate optimizer steps per epoch: one step per effective batch.
steps_per_epoch = len(train_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)

print("Trainer configured")
print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"Total training steps per epoch: {steps_per_epoch}")
print(f"Total training steps: {steps_per_epoch * EPOCHS}")
print(f"Evaluation every {EVAL_STEPS} steps")

## 8. Train the Model

In [None]:
# Snapshot GPU 0's properties and the peak memory already reserved, so the
# report after training can separate training overhead from this baseline.
# Byte counts are converted to GiB (1024**3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning; the return value is kept for later inspection.
trainer_stats = trainer.train()

# Peak reserved GPU memory after training, in GiB, and the share of it that
# was added on top of the pre-training baseline (start_gpu_memory).
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures expressed as a percentage of total GPU memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

## 9. Test the Model

In [None]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Test translation
test_instruction = "Translate the following English poem to Swedish."
test_input = """The moon walks her silent way,
The snow shines white on fir trees gray,
The snow shines white on the buildings.
Only the goblin is waking."""

# Format the input; the response slot is left blank so the model fills it in.
prompt = alpaca_prompt.format(
    test_instruction,
    test_input,
    "",  # output - leave blank for generation
)

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

print("Testing translation...\n")
print(f"Input English text:\n{test_input}\n")
print("=" * 50)

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    use_cache=True,
    # temperature/top_p only take effect when sampling is enabled; make that
    # explicit instead of relying on the checkpoint's default generation config.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the newly generated tokens, so the response cannot be confused with
# prompt text (e.g. an input that itself contains "### Response:").
prompt_length = inputs["input_ids"].shape[1]
decoded_output = tokenizer.batch_decode(
    outputs[:, prompt_length:], skip_special_tokens=True
)[0]

response = decoded_output.strip()

print(f"\nSwedish translation:\n{response}")

## 10. More Test Examples

In [None]:
def translate_to_swedish(
    english_text,
    instruction="Translate the following English text to Swedish.",
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
):
    """Translate English text to Swedish with the fine-tuned model.

    Relies on the notebook-level `model`, `tokenizer` and `alpaca_prompt`.

    Args:
        english_text: The English source text.
        instruction: Instruction placed in the prompt's instruction slot.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    # Leave the response slot empty so the model writes the translation.
    prompt = alpaca_prompt.format(instruction, english_text, "")

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        # temperature/top_p only take effect when sampling is enabled; make
        # that explicit rather than relying on the checkpoint's defaults.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # generate() returns prompt + continuation; decode only the continuation
    # so text inside the input (e.g. a literal "### Response:") cannot be
    # mistaken for the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0]
    return decoded.strip()

# Sample English lines to run through translate_to_swedish.
test_examples = [
    "Where are they now, they who sat at table with the banquet's song?",
    "Farewell now, warm sun! Feel how good our rest was!",
    "The stars are sparkling, gleaming there.",
]

print("Testing multiple translations:\n")
print("=" * 70)

# Print each source line next to its translation, numbered from 1 and
# separated by a dashed rule.
for number, english in enumerate(test_examples, start=1):
    print(f"\nTest {number}:")
    print(f"English: {english}")
    print(f"Swedish: {translate_to_swedish(english)}")
    print("-" * 70)

## 11. Save the Model

In [None]:
# Write the LoRA adapters and the tokenizer into the same directory.
# Saving adapters only is much smaller than saving a full model.
for artifact in (model, tokenizer):
    artifact.save_pretrained("translation_model_lora")

print("LoRA adapters saved to: translation_model_lora/")

In [None]:
# Optional: Save merged model (base + LoRA) in 16bit
# This creates a standalone model that doesn't need LoRA adapters
# The tokenizer is passed along so it is saved with the merged model, and
# the output goes to the "translation_model_merged_16bit" directory.
# NOTE(review): presumably this needs noticeably more disk space and memory
# than the adapter-only save — confirm before running on a small machine.
model.save_pretrained_merged(
    "translation_model_merged_16bit",
    tokenizer,
    save_method="merged_16bit",
)

print("Merged 16-bit model saved to: translation_model_merged_16bit/")

In [None]:
# Optional: Save as 4-bit quantized GGUF for llama.cpp
# Useful for running locally with CPU or smaller GPUs
# The output goes to the "translation_model" directory using the "q4_k_m"
# quantization method.
# NOTE(review): the GGUF export presumably relies on llama.cpp tooling being
# available to Unsloth at run time — confirm in the target environment.
model.save_pretrained_gguf(
    "translation_model",
    tokenizer,
    quantization_method="q4_k_m",
)

print("GGUF model saved to: translation_model/")

## 12. Push to Hugging Face Hub (Optional)

In [None]:
# Uncomment and configure to push to Hugging Face
# from huggingface_hub import login
# login(token="YOUR_HF_TOKEN")

# model.push_to_hub("your-username/english-swedish-poetry-translator", token="YOUR_HF_TOKEN")
# tokenizer.push_to_hub("your-username/english-swedish-poetry-translator", token="YOUR_HF_TOKEN")