# Train LoRAs with HuggingFace APIs

### Install
pip install torch transformers datasets peft accelerate jupyterlab ipywidgets

### Notes
grimm = dts.load_dataset("Eugenememe/grimms")

Merging LoRA can be achieved with [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter)

In [None]:
# Set this var to download everything to the directory where this notebook is.
# Goes under "./hub"
%env HF_HOME=.

In [None]:
import transformers as tfs
import datasets as dts
import accelerate
import peft
import torch

This cell simply fetches the model from the Hugging Face Hub. We're using their SmolLM-135M model here, which has 135M parameters and a context window of 2048 tokens. However, we limit the length of all our data to 1024 tokens to reduce memory usage.

In [None]:
smol_lm = "HuggingFaceTB/SmolLM-135M"
# smol_lm = "HuggingFaceTB/SmolLM-360M"
# smol_lm = "HuggingFaceTB/SmolLM-1.7B"
amd_slm = "amd/AMD-Llama-135m"

def load_model(name: str, max_len: int = 1024):
    """Fetch a causal LM, its config and its tokenizer from the Hugging Face Hub.

    Args:
        name: Hub repository id of the model, e.g. "HuggingFaceTB/SmolLM-135M".
        max_len: Value handed to the tokenizer as ``model_max_length``. The
            default of 1024 reduces the usable context size to save VRAM.

    Returns:
        A ``(config, model, tokenizer)`` tuple. The tokenizer has a ChatML
        chat template installed and uses its EOS token as the pad token.
    """
    config = tfs.AutoConfig.from_pretrained(name)
    model = tfs.AutoModelForCausalLM.from_pretrained(name)
    tokenizer = tfs.AutoTokenizer.from_pretrained(
        name,
        model_max_length=max_len,
    )

    # TODO Required for PEFT to use gradient checkpointing https://github.com/huggingface/peft/issues/137
    # model.enable_input_require_grads()

    # chatml - requires <|im_start|> and <|im_end|> special tokens.
    # If they don't exist, tokenizer.add_special_tokens and model.resize_token_embeddings can be used, but
    # these tokens would come with randomly initialized embeddings and need finetuning.
    # Standard LoRA does not train input embeddings so this probably won't work without full fine tune.
    # See for details on chat template https://huggingface.co/docs/transformers/main/chat_templating#what-template-should-i-use
    tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    # Using eos as the pad token seems common practice.
    tokenizer.pad_token = tokenizer.eos_token

    return config, model, tokenizer

cfg, mdl, tok = load_model(smol_lm)

# Instruct LoRA
First we'll train a LoRA for instruction following using the dolly-15k dataset.

In [None]:
# Instruct dataset. A possible alternative is "tatsu-lab/alpaca"
dolly = dts.load_dataset("databricks/databricks-dolly-15k")

In [None]:
# tok is captured from the global namespace
# x keys are instruction, context, response, category
def dolly_chat(x):
    """Turn one dolly-15k row into a ChatML training sample.

    Args:
        x: A dataset row with the keys ``instruction``, ``context``,
            ``response`` and ``category``.

    Returns:
        A dict with the formatted chat (``text``), its token ids padded or
        truncated to the tokenizer's max length (``input_ids``) and the
        matching ``attention_mask``.
    """
    chat = [
        {"role": "system", "content": str(x["context"])},
        {"role": "user", "content": str(x["instruction"])},
        {"role": "assistant", "content": str(x["response"])},
    ]
    # The sample already ends with the assistant's answer, so no generation
    # prompt is requested: with add_generation_prompt=True the template would
    # append a dangling '<|im_start|>assistant\n' header after the response
    # of every training sample.
    chat_formatted = tok.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=False,
    )

    # TODO Padding to max length always seems to result in static VRAM usage, but
    # is slower on average since many samples are much shorter than max_length.
    # Want to debug why peak VRAM fluctuates a lot when length can vary, as this sometimes
    # OOMs midway through training.
    tokenized = tok(chat_formatted, padding="max_length", truncation=True)

    # Keep the attention mask so the padding positions stay marked as padding
    # instead of being silently dropped from the sample.
    return {
        "text": chat_formatted,
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
    }

dset_w_tokenized = dolly["train"].map(dolly_chat)

In [None]:
print(len(dset_w_tokenized["input_ids"][0]))
print(dset_w_tokenized["text"][0])

In [None]:
# LoRA rank shared by both adapters in this notebook (the corpus LoRA below
# reuses this variable).
rank = 32

# I got much better results by training embed_tokens. It's possible <|im_start|> <|im_end|> never
# appeared in the pretraining, but were included in the tokenizer anyway.
lora_config = peft.LoraConfig(
    r=rank, # This is the "rank"
    # Adapt the query/value projections plus the token embeddings.
    target_modules=["q_proj", "v_proj", "embed_tokens"],
    task_type=peft.TaskType.CAUSAL_LM,
    lora_alpha=rank, # Rule of thumb seems to be 1-2x the rank.
    lora_dropout=0.05
)
# Wrap the base model with the adapter and report how many parameters will train.
lora_model = peft.get_peft_model(mdl, lora_config)
lora_model.print_trainable_parameters()

In [None]:
len(dset_w_tokenized)

In [None]:
# Hyperparameters for the instruct LoRA run.
lr=1e-5
batchsize=8
epochs=1

# No checkpoints are requested here (save_strategy="no"); the adapter is saved
# explicitly with save_pretrained after training. Nothing is pushed to the Hub
# and no experiment tracker is used.
args = tfs.TrainingArguments(
    output_dir='./finetune',
    optim='adamw_torch',
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    save_strategy="no",
    weight_decay=0.01,
    push_to_hub=False,
    report_to='none',
    torch_empty_cache_steps=100,
    bf16=True,
    tf32=True,
)

# mlm=False selects the causal (next-token) language-modeling objective
# rather than masked language modeling.
collator = tfs.DataCollatorForLanguageModeling(tok, mlm=False)

# Train the PEFT-wrapped model on the tokenized dolly samples.
trainer = tfs.Trainer(
    model=lora_model,
    args=args,
    train_dataset=dset_w_tokenized,
    processing_class=tok,
    data_collator=collator
)


In [None]:
trainer.train()

In [None]:
# From the Wikipedia page for the American crow.
sys = "From beak to tail, an American crow measures 40–50 cm (16–20 in), almost half of which is tail. Its wingspan is 85–100 cm (33–39 in). Mass varies from about 300 to 600 g (11 to 21 oz), with males tending to be larger than females. Plumage is all black, with iridescent feathers. It looks much like other all-black corvids. They are very intelligent, and adaptable to human environments. The most usual call is CaaW!-CaaW!-CaaW! They can be distinguished from the common raven (C. corax) because American crows are smaller and the beak is slightly less pronounced; from the fish crow (C. ossifragus) because American crows do not hunch and fluff their throat feathers when they call; and from the carrion crow (C. corone) by size, as the carrion crow is larger and of a stockier build. "
question = "Can you tell me about crows?"
# The context goes in the system turn and the question in the user turn,
# mirroring how the dolly samples were formatted for training.
chat = [
    {"role": "system", "content": sys},
    {"role": "user", "content": question},
]

# add_generation_prompt=True appends the assistant header so the model
# continues with its answer.
text = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tok(text, return_tensors='pt', truncation=True).to(lora_model.device)
print("Prompt has", len(inputs["input_ids"][0]), "tokens")

# The model has just been used for training; switch it to eval mode so that
# dropout (lora_dropout=0.05) is not applied while sampling.
lora_model.eval()

with torch.no_grad():
    output = lora_model.generate(
        **inputs,
        do_sample=True,
        pad_token_id=tok.pad_token_id,
        # Change the following 4 parameters to control how the outputs are sampled.
        max_new_tokens=128,
        temperature=0.5,
        top_k=50,
        repetition_penalty=1.10,
    )

    output = tok.batch_decode(output)[0]
    print(output)


In [None]:
lora_model.save_pretrained(save_directory="mlhi-lora-instruct", save_embedding_layers=True)

# Corpus LoRA
Next we're going to try a separate LoRA on the same base model with a different dataset, a text dump of all the Grimm Fairytale stories.

In [None]:
# Load a fresh copy of the model
cfg, mdl, tok = load_model(smol_lm)

In [None]:
grimm = dts.load_dataset("Eugenememe/grimms")

In [None]:
def grimm_corpus(x):
    """Tokenize one Grimm story for corpus training.

    Args:
        x: A dataset row; only its ``story`` field is read.

    Returns:
        A dict with the raw story (``text``) and its token ids
        (``input_ids``).
    """
    # Stories shorter than the tokenizer's max length are padded up to it.
    # Longer ones are deliberately NOT truncated here, so that
    # RandomCropDataCollator can pick a different random window each time.
    encoded = tok(x["story"], padding="max_length", truncation=False)

    # "text" holds the raw story (as dolly_chat does for its samples), not the
    # tokenizer's encoding object.
    return {"text": x["story"], "input_ids": encoded["input_ids"]}

grimm_tokenized = grimm["train"].map(grimm_corpus)

In [None]:
# Adapted from https://github.com/huggingface/transformers/issues/18075
from dataclasses import dataclass
import random


@dataclass
class RandomCropDataCollator(tfs.DataCollatorForLanguageModeling):
    """Language-modeling collator that randomly crops over-long samples.

    Every feature whose ``input_ids`` is longer than
    ``random_truncation_token_length`` is replaced by a contiguous window of
    exactly that many tokens, starting at a uniformly random offset. Features
    that already fit are passed through untouched. The (possibly cropped)
    features are then collated by the parent class.
    """

    # Maximum number of tokens kept per sample.
    random_truncation_token_length: int = 1024

    def __call__(self, features):
        """Crop over-long features and delegate batching to the parent.

        Args:
            features: Sequence of mappings, each with an ``input_ids`` entry.

        Returns:
            Whatever the parent collator returns for the cropped features.
        """
        window = self.random_truncation_token_length
        # Per-token fields that must be cropped with the same window as
        # input_ids so they stay aligned with it.
        token_aligned = (
            "input_ids",
            "attention_mask",
            "token_type_ids",
            "special_tokens_mask",
        )

        cropped_features = []
        for feature in features:
            length = len(feature["input_ids"])
            if length > window:
                start = random.randint(0, length - window)
                stop = start + window
                # Build a new mapping rather than mutating the caller's feature.
                feature = {
                    key: value[start:stop] if key in token_aligned else value
                    for key, value in feature.items()
                }
            cropped_features.append(feature)

        return super().__call__(cropped_features)


In [None]:
crop_collator = RandomCropDataCollator(
    tok,
    random_truncation_token_length=tok.model_max_length,
    mlm=False
)

In [None]:
# LoRA configuration for the corpus adapter. Unlike the instruct adapter,
# embed_tokens is not targeted here.
lora_config = peft.LoraConfig(
    r=rank, # Keep this rank the same as the instruct model
    target_modules=["q_proj", "v_proj"],
    task_type=peft.TaskType.CAUSAL_LM,
    lora_alpha=rank,
    lora_dropout=0.05
)

# Wrap the freshly loaded base model and report how many parameters will train.
lora_model = peft.get_peft_model(mdl, lora_config)
lora_model.print_trainable_parameters()

# Hyperparameters for the corpus LoRA run (10 epochs instead of 1).
lr=1e-5
batchsize=8
epochs=10

# Same training arguments as the instruct run; no checkpoints are requested
# (save_strategy="no") and the adapter is saved explicitly afterwards.
args = tfs.TrainingArguments(
    output_dir='./finetune',
    optim='adamw_torch',
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    save_strategy="no",
    weight_decay=0.01,
    push_to_hub=False,
    report_to='none',
    torch_empty_cache_steps=100,
    bf16=True,
    tf32=True,
)

# crop_collator randomly crops stories longer than the tokenizer's max length.
trainer = tfs.Trainer(
    model=lora_model,
    args=args,
    train_dataset=grimm_tokenized,
    processing_class=tok,
    data_collator=crop_collator
)


In [None]:
trainer.train()

In [None]:
# Plain-text prompt (no chat template): the corpus adapter was trained on raw
# story text.
text = "Once upon a time,"
inputs = tok(text, return_tensors='pt', truncation=True).to(lora_model.device)
print("Prompt has", len(inputs["input_ids"][0]), "tokens")

# The model has just been used for training; switch it to eval mode so that
# dropout (lora_dropout=0.05) is not applied while sampling.
lora_model.eval()

with torch.no_grad():
    output = lora_model.generate(
        **inputs,
        do_sample=True,
        pad_token_id=tok.pad_token_id,
        # Change the following 4 parameters to control how the outputs are sampled.
        max_new_tokens=128,
        temperature=0.5,
        top_k=50,
        repetition_penalty=1.10,
    )

    output = tok.batch_decode(output)[0]
    print(output)

In [None]:
lora_model.save_pretrained(save_directory="mlhi-lora-corpus")

# Loading LoRA back for inference

In [None]:
# Load a fresh copy of the model
cfg, mdl, tok = load_model(smol_lm)

In [None]:
adapted_model = peft.PeftModel.from_pretrained(mdl, "mlhi-lora-instruct", adapter_name="mlhi-lora-instruct")
print(adapted_model.active_adapters)

In [None]:
# Load the corpus adapter next to the already loaded instruct adapter, then
# create a third adapter named "merged" that combines both with equal weights
# using the "svd" combination type.
adapted_model.load_adapter("mlhi-lora-corpus", adapter_name="mlhi-lora-corpus")
adapters = ["mlhi-lora-corpus", "mlhi-lora-instruct"]
weights = [0.5, 0.5]
adapter_name = "merged"
adapted_model.add_weighted_adapter(adapters, weights, adapter_name, combination_type="svd")

In [None]:
adapted_model.set_adapter("merged")
print(adapted_model.active_adapters)

In [None]:
# Empty system turn: this time the model has to answer without any context
# supplied in the prompt.
sys=""
question = "Can you summarize the story of Cinderella?"
chat = [
    {"role": "system", "content": sys},
    {"role": "user", "content": question},
]

# add_generation_prompt=True appends the assistant header so the model
# continues with its answer.
text = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tok(text, return_tensors='pt', truncation=True).to(adapted_model.device)
print("Prompt has", len(inputs["input_ids"][0]), "tokens")

# Make sure dropout is disabled before sampling.
adapted_model.eval()

with torch.no_grad():
    # Generate through the PEFT wrapper (the object the adapters were loaded
    # into and whose device the inputs were moved to) rather than through the
    # bare base-model handle, consistent with the earlier sampling cells.
    output = adapted_model.generate(
        **inputs,
        do_sample=True,
        pad_token_id=tok.pad_token_id,
        # Change the following 4 parameters to control how the outputs are sampled.
        max_new_tokens=128,
        temperature=0.5,
        top_k=50,
        repetition_penalty=1.10,
    )

    output = tok.batch_decode(output)[0]
    print(output)