In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# --- 1️⃣ Load tokenizer and model ---
model_name = "AI-Sweden-Models/Llama-3-8B-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    load_in_4bit=True,
)

# --- 2️⃣ LoRA setup ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.91s/it]


trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [2]:
# --- 3️⃣ Load dataset ---
dataset = load_dataset("json", data_files="swe-prompts.json")["train"]

# --- 4️⃣ Fixed preprocessing function ---
def preprocess(example, max_length=512):
    """
    Properly handles multi-turn conversations and masks only system/user tokens.
    """
    # Get the full conversation
    full_text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    
    # Tokenize the full conversation
    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )
    
    # Initialize labels as a copy of input_ids
    labels = tokenized["input_ids"].copy()
    
    # Method 1: Mask everything initially, then unmask assistant responses
    labels = [-100] * len(labels)  # Mask everything first
    
    # Build conversation incrementally to find assistant response positions
    current_pos = 0
    
    for i, msg in enumerate(example["messages"]):
        if msg["role"] == "assistant":
            # Find where this assistant message starts and ends
            # Build conversation up to this point (without the assistant message)
            conv_before = example["messages"][:i]
            if conv_before:
                text_before = tokenizer.apply_chat_template(
                    conv_before, 
                    tokenize=False, 
                    add_generation_prompt=True
                )
                tokens_before = tokenizer(text_before, add_special_tokens=False)["input_ids"]
                start_pos = len(tokens_before)
            else:
                start_pos = 0
            
            # Build conversation including this assistant message
            conv_including = example["messages"][:i+1]
            text_including = tokenizer.apply_chat_template(
                conv_including, 
                tokenize=False, 
                add_generation_prompt=False
            )
            tokens_including = tokenizer(text_including, add_special_tokens=False)["input_ids"]
            end_pos = len(tokens_including)
            
            # Unmask the assistant tokens (but keep some special tokens masked)
            if start_pos < len(labels) and end_pos <= len(labels):
                for j in range(start_pos, min(end_pos, len(labels))):
                    if j < len(tokenized["input_ids"]):
                        labels[j] = tokenized["input_ids"][j]
    
    tokenized["labels"] = labels
    
    # Count trainable tokens
    num_trainable = len([l for l in labels if l != -100])
    print(f"Trainable tokens contributing to loss: {num_trainable}/{len(labels)}")
    
    return tokenized

tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names, batched=False)

# --- 5️⃣ Data collator ---
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)

# --- 6️⃣ Training arguments ---
training_args = TrainingArguments(
    output_dir="/media/petter/Data/lora-chat-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    dataloader_drop_last=True,  # Helps with consistency
)

# --- 7️⃣ Trainer setup ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# --- 8️⃣ Start training ---
trainer.train()

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.76s/ examples]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Trainable tokens contributing to loss: 278/512


Step,Training Loss


TrainOutput(global_step=3, training_loss=1.0672237078348796, metrics={'train_runtime': 3.8191, 'train_samples_per_second': 0.786, 'train_steps_per_second': 0.786, 'total_flos': 69196792725504.0, 'train_loss': 1.0672237078348796, 'epoch': 3.0})