In [None]:
!pip install transformers==4.51.0  # Use this specific version to avoid embedding mismatch bug
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes

In [None]:
from huggingface_hub import login
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# (None when the variable is unset). Keep the token out of the notebook itself
# so it is never saved or shared with the file.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Choose your model.
# NOTE(review): the default below is a vision-language checkpoint, while the rest
# of this notebook only performs text-based fine-tuning -- confirm that loading it
# through AutoModelForCausalLM behaves as intended, or use a text-only Llama model.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Or other Llama model

# Configure 4-bit NF4 quantization for memory efficiency; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model, letting `device_map="auto"` place it on the
# available devices.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally -- only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto", 
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Disable the generation cache while training.
model.config.use_cache = False

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Batches are padded by the data collator later in the notebook, which needs a
# padding token. Some tokenizers ship without one; fall back to EOS in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
from datasets import load_dataset

# Fetch the dataset to fine-tune on -- swap the placeholder for your real data source
dataset = load_dataset(path="your_dataset_name_or_path")

# Define formatting function for your data
def formatting_prompts_func(examples):
    """Render a batch of (input, output) pairs as chat-formatted training text.

    Args:
        examples: A batched mapping with the parallel lists
            ``examples["input_column"]`` (user prompts) and
            ``examples["output_column"]`` (assistant replies). Adjust the
            column names to match your dataset structure.

    Returns:
        ``{"text": [...]}`` with one rendered conversation per input pair.
        Each rendered string ends with the tokenizer's EOS token exactly once
        (when the tokenizer defines one).

    Notes:
        The EOS token is ensured *after* the chat template has been applied.
        Appending it to the assistant message beforehand would duplicate the
        end-of-turn marker whenever the template already closes each turn.
    """
    # `tokenizer` is the module-level tokenizer loaded earlier in the notebook.
    eos = tokenizer.eos_token or ""

    texts = []
    for input_text, output_text in zip(examples["input_column"], examples["output_column"]):
        # Drop an EOS already present in the raw reply; it is re-added once below.
        if eos and output_text.endswith(eos):
            output_text = output_text[: -len(eos)]

        # Create messages format
        messages = [
            {"role": "user", "content": input_text},
            {"role": "assistant", "content": output_text},
        ]

        # Apply chat template
        text = tokenizer.apply_chat_template(messages, tokenize=False)

        # Terminate the sequence only if the template did not already do so.
        if eos and not text.endswith(eos):
            text += eos
        texts.append(text)

    return {"text": texts}

# Render every example to chat-formatted text, one batch at a time
processed_dataset = dataset.map(function=formatting_prompts_func, batched=True)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-16 adapters (reduced from 64 to save memory) on the
# projection layers named in `target_modules`, with no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the loaded model with the LoRA adapters
model = get_peft_model(model=model, peft_config=peft_config)

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Training arguments optimized for A100 40GB
training_arguments = TrainingArguments(
    output_dir="./finetuned-llama",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,  # Keep evaluation within the same memory budget
    gradient_accumulation_steps=4,  # Simulates larger batch size
    optim="adamw_torch_fused",  # or "paged_adamw_32bit"
    num_train_epochs=1,
    logging_steps=10,
    # Evaluate on the held-out split at the same cadence as checkpointing;
    # without an eval strategy the eval dataset is never used.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=1e-4,  # Reduced from 2e-4 for stability
    # The model is loaded in bfloat16 and the 4-bit compute dtype is bfloat16,
    # so train in bf16 as well (supported on A100) instead of fp16.
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",  # Changed from constant for better convergence
    group_by_length=True,
    report_to="tensorboard",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

In [None]:
# Prepare data collator
from transformers import DataCollatorForLanguageModeling

# Collator for causal language modelling (mlm=False disables masked-LM masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Resolve the train / eval splits.
# - A single Dataset (it exposes `train_test_split`) is split directly.
# - A DatasetDict with a "validation" split is used as-is.
# - Otherwise 5% of the "train" split is held out for evaluation.
# `train_test_split` returns the keys "train" and "test"; the fixed seed keeps
# the hold-out set identical across re-runs.
if hasattr(processed_dataset, "train_test_split"):
    _splits = processed_dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset, eval_dataset = _splits["train"], _splits["test"]
elif "validation" in processed_dataset:
    train_dataset = processed_dataset["train"]
    eval_dataset = processed_dataset["validation"]
else:
    _splits = processed_dataset["train"].train_test_split(test_size=0.05, seed=42)
    train_dataset, eval_dataset = _splits["train"], _splits["test"]

# Initialize the Trainer.
# `model` was already wrapped with the LoRA adapters via get_peft_model(), so no
# peft_config is passed here. The tokenizer is passed explicitly so the trainer
# uses the same one as the formatting function and the collator.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [None]:
# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory
final_model_dir = "./finetuned-llama-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved fine-tuned model directory
model_path = "./finetuned-llama-final"
pipe = pipeline(
    task="text-generation",
    model=model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Smoke test: send one user message and print what the pipeline returns
messages = [{"role": "user", "content": "Your test prompt here"}]
outputs = pipe(messages, do_sample=True, max_new_tokens=128)
first_result = outputs[0]
print(first_result["generated_text"])

Key Differences from Failed Examples
Looking at the Reddit post about the "terrible llama 3.2 vision finetune," here are some improvements I've incorporated in the code above:

1) Improved LoRA Configuration: The failed example used r=8 with only ["q_proj", "v_proj"] as target modules. I've increased to r=16 and targeted more modules.
2) Learning Rate: Lowered from 2e-4 to 1e-4 for better stability.
3) Learning Schedule: Changed from constant to cosine learning rate schedule.
4) Validation Integration: Added a validation split (5% of the training data is held out when the dataset does not already provide one) so that overfitting can be monitored.
5) Batch Processing: Set the per-device batch size to 1 with 4 gradient-accumulation steps (an effective batch size of 4) to fit your hardware.

These changes should help address the overfitting issues mentioned in the Reddit post while making efficient use of your A100's memory.
For working specifically with vision models, you would need to make further adjustments to handle image inputs, but this code provides the foundation for text-based fine-tuning of Llama models.