# Finetuning Models in Parallel
This Jupyter notebook fine-tunes a causal language model on a Hugging Face dataset, running the training function in parallel across multiple processes with Accelerate.

In [1]:
!pip install accelerate transformers datasets huggingface-hub deepspeed

^C


In [None]:
from accelerate import notebook_launcher

import warnings

# Silence all warnings (every category, every module) for the rest of the session.
warnings.simplefilter("ignore")

def training_function():
    """Fine-tune Meta-Llama-3-8B on the lead-vision dataset with DeepSpeed ZeRO-2.

    Meant to be handed to ``accelerate.notebook_launcher``, which calls it
    with no arguments in each worker process. The function:

    1. imports the training libraries locally,
    2. logs in to the Hugging Face Hub,
    3. loads the tokenizer and model,
    4. loads the ``train`` split of the dataset,
    5. joins every conversation's message contents into one string and
       tokenizes it to a fixed length of 512 tokens,
    6. builds a causal-LM (``mlm=False``) data collator,
    7. defines the DeepSpeed ZeRO stage-2 configuration,
    8. defines the training arguments,
    9. builds the ``Trainer``,
    10. resumes training from ``checkpoint-3000`` in the output directory.

    The Hub token is read from the ``HF_TOKEN`` environment variable when it
    is set; otherwise the placeholder string below is used and must be
    replaced with a real token.

    Returns:
        None. Checkpoints and logs are written under
        ``Lead-Vision-Finetuned-8B``.
    """

    # 1. Import libraries inside the function
    import os

    from transformers import (
        AutoTokenizer,
        AutoModelForCausalLM,
        Trainer,
        TrainingArguments,
        DataCollatorForLanguageModeling
    )
    from datasets import load_dataset
    from huggingface_hub import login

    # 2. Login to Hugging Face
    # Prefer the HF_TOKEN environment variable so the secret does not have to
    # be stored in the notebook; fall back to the placeholder otherwise.
    login(token=os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN_HERE"))

    # 3. Load model and tokenizer
    model_name = "meta-llama/Meta-Llama-3-8B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Ensure pad token is set: padding="max_length" below needs one, so reuse
    # the end-of-sequence token when the tokenizer does not define its own.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # 4. Load dataset
    dataset = load_dataset("Unisyn-corp/lead-vision", split="train")

    # 5. Define tokenization
    def tokenize_function(examples):
        """Join each conversation's message contents and tokenize the batch.

        ``examples["messages"]`` is a batch of message lists; every message
        is indexed by its ``"content"`` key.
        """
        merged_texts = [
            " ".join(message["content"] for message in msg_list)
            for msg_list in examples["messages"]
        ]

        return tokenizer(
            merged_texts,
            truncation=True,
            padding="max_length",
            max_length=512
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # 6. Create data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # 7. Define DeepSpeed config
    # "auto" entries are filled in by the Trainer from TrainingArguments, which
    # keeps the two configurations from drifting apart.
    deepspeed_config = {
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        # Without a bf16 section the bf16=True flag below is not propagated
        # to the DeepSpeed engine.
        "bf16": {"enabled": "auto"},
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "overlap_comm": True,
            "reduce_scatter": True,
            "reduce_bucket_size": 5e8,
            "allgather_bucket_size": 5e8
        }
    }

    # 8. Training arguments
    training_args = TrainingArguments(
        output_dir="Lead-Vision-Finetuned-8B",
        per_device_train_batch_size=1,
        bf16=True,  # Mirrored into the DeepSpeed config via its "auto" bf16 entry
        num_train_epochs=3,
        logging_steps=50,
        save_steps=200,
        gradient_accumulation_steps=4,
        deepspeed=deepspeed_config
    )

    # 9. Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )

    # 10. Resume training from checkpoint
    # NOTE(review): this path is hard-coded; presumably checkpoint-3000 must
    # already exist in the output directory from an earlier run - confirm.
    trainer.train(
        resume_from_checkpoint="Lead-Vision-Finetuned-8B/checkpoint-3000"
    )

# 11. Launch training: run training_function (no arguments) in 8 worker processes
notebook_launcher(
    training_function,
    args=(),
    num_processes=8,
)
