In [None]:
%%capture
# Install Unsloth from PyPI, then upgrade to the current GitHub build with the
# "colab-new" extras. %%capture hides the installer output.
# NOTE(review): neither install is version-pinned, so re-runs may pull different code.
!pip install unsloth
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch
# Sequence-length / precision settings. `max_seq_length` is read again when the
# trainer is built, so it stays a module-level name.
max_seq_length = 8192  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Base checkpoint; the name indicates a bitsandbytes 4-bit build of Llama 3.1 8B Instruct.
base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    # "embed_tokens", "lm_head",
]

# Attach the adapters; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=True,  # Rank stabilized LoRA is enabled for this run
    loftq_config=None,  # And LoftQ
)

<a name="Data"></a>
### Data Prep
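This notebook fine-tunes on the [`quocdat25/fast-apply-coding`](https://huggingface.co/datasets/quocdat25/fast-apply-coding) dataset, using its `original_code`, `update_snippet`, and `final_code` columns to build each training prompt. (This section was originally written for the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html); that dataset is not used in the code below.) You can replace this code section with your own data prep.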

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [None]:
# End-of-sequence marker taken from the tokenizer. formatting_prompts_func appends
# it to every training text; without it generations may never terminate.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def print_dataset_info(dataset):
    """Report that loading succeeded and show the dataset's string form.

    Args:
        dataset: Object to describe; it is interpolated into the message, so
            whatever its string conversion produces is what gets printed.

    Returns:
        None. Two lines are written to standard output.
    """
    print("Dataset loaded successfully.")
    summary_line = f"Dataset info: {dataset}"
    print(summary_line)

In [None]:
def load_fast_apply_coding_dataset():
    """
    Load the fast-apply-coding dataset from Hugging Face.

    The dataset is returned as loaded, with all of its columns. Each split is
    checked for the three columns the training prompt is built from, so a
    schema problem surfaces here instead of in the middle of prompt formatting.

    Returns:
        dataset: The object returned by ``load_dataset``, unmodified.

    Raises:
        KeyError: If a split lacks 'original_code', 'update_snippet' or 'final_code'.
    """
    from datasets import load_dataset

    required_columns = ("original_code", "update_snippet", "final_code")
    dataset = load_dataset("quocdat25/fast-apply-coding")

    # `map` merges the returned dict into each row and keeps every other column,
    # so re-emitting these three columns through `map` would rewrite the whole
    # dataset without narrowing it. A membership check gives the same
    # missing-column failure (KeyError) without the extra pass over the data.
    for split_name, split in dataset.items():
        missing = [name for name in required_columns if name not in split.column_names]
        if missing:
            raise KeyError(
                f"Split '{split_name}' is missing required column(s): {missing}"
            )
    return dataset

def formatting_prompts_func(examples):
    """Build one training string per example in a batch.

    Args:
        examples: Batch mapping that provides parallel "original_code",
            "update_snippet" and "final_code" sequences.

    Returns:
        dict: ``{"text": [...]}`` holding one filled-in prompt per example,
        each with EOS_TOKEN appended.
    """
    # Read the three columns up front, in the same order the prompt consumes them.
    code_column = examples["original_code"]
    snippet_column = examples["update_snippet"]
    merged_column = examples["final_code"]

    prompt_template = """Merge all changes from the update snippet to the code below, ensuring that every modification is fully integrated. 
Maintain the code's structure, order, comments, and indentation precisely. 
Do not use any placeholders, ellipses, or omit any sections in <updated-code>.
Only output the updated code; do not include any additional text, explanations, or fences.
\n
<update>{update_snippet}</update>
\n
<code>{original_code}</code>
\n
The updated code MUST be enclosed in <updated-code> tags.
Here's the updated-code with fully integrated changes, start the tag now:
<updated-code>{final_code}</updated-code>"""

    def render(original_code, update_snippet, final_code):
        """Fill the template for one example and terminate it with EOS_TOKEN."""
        filled = prompt_template.format(
            original_code=original_code,
            update_snippet=update_snippet,
            final_code=final_code,
        )
        return filled + EOS_TOKEN

    rows = zip(code_column, snippet_column, merged_column)
    return {"text": [render(*row) for row in rows]}

In [None]:
# Optional length filter. Set to an integer (e.g. 2500) to drop examples whose
# 'Token Count' value exceeds it; None keeps every example.
# NOTE(review): filtering assumes the dataset has a 'Token Count' column -- confirm
# before enabling.
TOKEN_LIMIT = None

dataset = load_fast_apply_coding_dataset()

if TOKEN_LIMIT is None:
    filtered_dataset = dataset
else:
    def filter_by_token_count(example):
        """Keep examples whose 'Token Count' is at or below TOKEN_LIMIT."""
        return example['Token Count'] <= TOKEN_LIMIT
    filtered_dataset = dataset.filter(filter_by_token_count)

# Render every example into a prompt string and drop the source columns, leaving
# only the "text" column that the trainer is configured to read.
formatted_dataset = filtered_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=filtered_dataset["train"].column_names,
)

print_dataset_info(formatted_dataset)

print("\nFormatted Dataset Sample:")
print(formatted_dataset["train"].select(range(1)))

In [None]:
# Show the fully rendered prompt of the first training example.
first_example = formatted_dataset["train"][0]
print(first_example['text'])

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for one full epoch (`num_train_epochs = 1`); for a quick test run, set `max_steps` (for example `max_steps = 60`) instead. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once; fp16 is used only when bf16 is unavailable.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,
    warmup_steps=15,
    num_train_epochs=1,  # Set this for 1 full training run.
    # max_steps=70,
    learning_rate=1e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.02,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning over the "text" column of the formatted training split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [None]:
#@title Show current memory stats
BYTES_PER_GIB = 1024 ** 3  # same divisor as / 1024 / 1024 / 1024

# Baseline captured before training so the post-training cell can subtract it.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_before_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_before_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training. The returned object's `.metrics` (e.g. 'train_runtime') is read
# by the final stats cell.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
def _percent_of_max(gigabytes):
    """Express a GB figure as a percentage of `max_memory`, rounded to 3 places."""
    return round(gigabytes / max_memory * 100, 3)

# Peak reserved memory so far, and the part of it added since the pre-training baseline.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _percent_of_max(used_memory)
lora_percentage = _percent_of_max(used_memory_for_lora)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
import os

# Label embedded in the local save path and in the Hub repository names.
# NOTE(review): the label says "Llama3.2" while the checkpoint loaded at the top of
# the notebook is Meta-Llama-3.1-8B-Instruct -- confirm the intended name before
# publishing (changing it changes the target repositories).
version='13.3-Llama3.2-8B'

# Read the Hugging Face token from the environment so no credential is stored in
# the notebook. When HF_TOKEN is unset this is None, which leaves authentication
# to the Hub client's own credential lookup (e.g. a cached login).
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Where the adapter-wrapped model and tokenizer are written locally and published.
local_save_dir = f"lora_model/v{version}"
lora_repo_id = f"quocdat25/fast-apply_lora-4b-v0.{version}"

# Local copy first, then the Hub upload.
model.save_pretrained(local_save_dir)
tokenizer.save_pretrained(local_save_dir)

model.push_to_hub(lora_repo_id, token=hf_token)
tokenizer.push_to_hub(lora_repo_id, token=hf_token)



In [None]:
# Upload the model together with the tokenizer to a separate "16bit" repository.
# NOTE(review): save_method="merged_16bit" presumably merges the LoRA weights into
# the base model at 16-bit precision -- confirm against the Unsloth docs.
model.push_to_hub_merged(f"quocdat25/fast-apply-16bit-v0.{version}", tokenizer, save_method="merged_16bit", token=hf_token)

In [None]:
# model.push_to_hub_merged(f"quocdat25/vLLM-fast-apply-4bit-v0.{version}", tokenizer, save_method = "merged_4bit_forced", token = hf_token)

In [None]:
# Stop this RunPod pod through runpodctl; the pod id comes from $RUNPOD_POD_ID.
# NOTE(review): presumably here to end the pod once the uploads finish -- confirm.
! bash -c "runpodctl stop pod $RUNPOD_POD_ID"

In [None]:
# NOTE(review): unconditional raise with no message -- presumably a guard that halts
# "Run All" at this point; confirm the intent.
raise Exception