# Train LoRAs with HuggingFace APIs

### Install
pip install torch transformers datasets peft accelerate jupyterlab ipywidgets

### Notes
grimm = dts.load_dataset("Eugenememe/grimms")

Multiple LoRA adapters can be merged with [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter).

In [1]:
# Point the Hugging Face cache at the current working directory (normally the
# directory containing this notebook) so that everything is downloaded there.
# Downloads go under "./hub".
# NOTE(review): presumably this has to run before the Hugging Face imports in the
# next cell, which are expected to read HF_HOME when imported -- confirm.
%env HF_HOME=.

env: HF_HOME=.


In [2]:
import transformers as tfs
import datasets as dts
import accelerate
import peft
import torch

I couldn't find many options under 1B parameters, but we want something small so that we can train on limited hardware.

In [3]:
# Hugging Face Hub id of the small base model that is loaded below.
amd_slm = "amd/AMD-Llama-135m"

def load_model(name: str, use_chatml: bool = False):
    """Load a causal LM and its tokenizer and prepare both for LoRA fine-tuning.

    Args:
        name: Model id or path accepted by the ``from_pretrained`` loaders.
        use_chatml: Selects the chat format installed on the tokenizer.
            ``False`` (default) installs the Mistral ``[INST] ... [/INST]``
            template and adds no tokens. ``True`` adds the ChatML marker tokens
            plus a dedicated pad token, installs the ChatML template and
            resizes the model's embedding matrix. The added embedding rows are
            randomly initialised and LoRA does not train input embeddings, so
            this mode is probably not suitable for basic LoRA.

    Returns:
        A ``(config, model, tokenizer)`` tuple.
    """
    config = tfs.AutoConfig.from_pretrained(name)
    model = tfs.AutoModelForCausalLM.from_pretrained(name)
    tokenizer = tfs.AutoTokenizer.from_pretrained(name)

    if use_chatml:
        # One of our LoRAs could teach the model a chat/instruct format, which
        # needs the marker tokens used by ChatML.
        tokenizer.add_special_tokens(special_tokens_dict={"pad_token":"<|pad|>","additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

        # Chatml
        tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"

        # Extend the model's token embedding matrix to add the new tokens.
        model.resize_token_embeddings(len(tokenizer))
    else:
        # Mistral
        tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

        # LoRA does not train input embeddings, so no new pad token is introduced;
        # an existing token is reused instead. A pad token equal to EOS is avoided
        # when possible: DataCollatorForLanguageModeling replaces every label that
        # equals pad_token_id with -100, which would also remove the real
        # end-of-sequence tokens from the loss and the model would never be trained
        # to stop. The unknown token is the preferred stand-in; EOS remains the
        # last resort for tokenizers that have nothing else.
        if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
            unk = tokenizer.unk_token
            if unk is not None and unk != tokenizer.eos_token:
                tokenizer.pad_token = unk
            else:
                tokenizer.pad_token = tokenizer.eos_token

    # Required for PEFT to use gradient checkpointing https://github.com/huggingface/peft/issues/137
    # Called after any embedding resize so that it applies to the final embedding layer.
    model.enable_input_require_grads()

    return config, model, tokenizer

# AMD-Llama-135m was chosen because it is:
# - small (there aren't many models in the 100M-parameter range),
# - trained with a 2048-token context window,
# - Llama-like.
# Most other models on the HF Hub should work with this notebook.
# NOTE(review): other architectures may name their attention projections
# differently from the target_modules used in the LoRA config -- confirm per model.
cfg, mdl, tok = load_model(amd_slm)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [4]:
# Instruction dataset used for fine-tuning. A possible alternative is "tatsu-lab/alpaca".
dolly = dts.load_dataset("databricks/databricks-dolly-15k")

In [5]:
# `tok` is read from the notebook's global namespace.
def dolly_chat(x):
    """Render one Dolly record as a single-turn chat and tokenize it.

    Args:
        x: A dataset row with the keys ``instruction``, ``context``,
            ``response`` and ``category``. Only ``instruction`` and
            ``response`` are used.

    Returns:
        A dict with ``text`` (the conversation rendered through the tokenizer's
        chat template) and ``input_ids`` (the same conversation as token ids).
    """
    chat = [
        {"role": "user", "content": str(x["instruction"])},
        {"role": "assistant", "content": str(x["response"])},
    ]
    # The conversation already ends with the assistant's reply, so no generation
    # prompt is requested for training samples.
    chat_formatted = tok.apply_chat_template(chat, tokenize=False)
    tokenized = tok.apply_chat_template(chat, tokenize=True)

    return {"text": chat_formatted, "input_ids": tokenized}

# Apply dolly_chat to every row of the train split; the returned "text" and
# "input_ids" fields are added to each record.
dset_w_tokenized = dolly["train"].map(dolly_chat)

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# LoRA hyperparameters for a causal-LM adapter on the modules named
# "q_proj" and "v_proj".
lora_config = peft.LoraConfig(
    task_type=peft.TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,  # the adapter "rank"
    lora_alpha=16,  # rule of thumb seems to be 2x the rank
    lora_dropout=0.05,
)

# Wrap the base model with the adapter and report how many parameters will train.
lora_model = peft.get_peft_model(mdl, lora_config)
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 134,400,768 || trainable%: 0.2194


In [7]:
# Sanity check: number of examples in the mapped training set.
len(dset_w_tokenized)

15011

In [9]:
# Hyperparameters a reader is most likely to tune.
lr=1e-4
batchsize=4
epochs=3

# Enable reduced-precision training only where the hardware supports it.
# Requesting bf16/tf32 on a device without support makes TrainingArguments raise,
# which would stop this notebook from running on CPUs and older GPUs.
cuda_ok = torch.cuda.is_available()
use_bf16 = cuda_ok and torch.cuda.is_bf16_supported()
# TF32 matmuls need an Ampere-class GPU or newer (compute capability >= 8.0).
use_tf32 = cuda_ok and torch.cuda.get_device_capability()[0] >= 8

args = tfs.TrainingArguments(
    output_dir='./finetune',
    optim='adamw_torch',
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    # Effective batch size per device is batchsize * gradient_accumulation_steps.
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    save_strategy="no",
    weight_decay=0.01,
    push_to_hub=False,
    report_to='none',
    bf16=use_bf16,
    tf32=use_tf32,
)

# mlm=False selects causal-LM collation: batches are padded and the labels are
# a copy of input_ids with padding positions masked out.
collator = tfs.DataCollatorForLanguageModeling(tok, mlm=False)

trainer = tfs.Trainer(
    model=lora_model,
    args=args,
    train_dataset=dset_w_tokenized,
    processing_class=tok,
    data_collator=collator
)


In [None]:
# Run the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


In [None]:
# Reference passage about owls. It is not inserted into the prompt below.
# NOTE(review): this name would shadow the stdlib `sys` module if that were
# imported in this notebook.
sys = "Owls are birds from the order Strigiformes which includes over 200 species of mostly solitary and nocturnal birds of prey typified by an upright stance, a large, broad head, binocular vision, binaural hearing, sharp talons, and feathers adapted for silent flight. Exceptions include the diurnal northern hawk-owl and the gregarious burrowing owl. "
question = "What are some common qualities of owls?"
chat = [
    {"role": "user", "content": question},
]

# `add_generation_prompt` is the keyword apply_chat_template recognises for
# cueing an assistant turn.
text = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# The chat template already writes bos_token into `text`, so the tokenizer must
# not prepend a second one; the prompt then matches how the training samples
# were tokenized.
inputs = tok(text, return_tensors='pt', add_special_tokens=False).to(lora_model.device)

# Leave training mode so that dropout (including the LoRA dropout) is inactive
# while sampling.
lora_model.eval()

with torch.no_grad():
    generated = lora_model.generate(
        **inputs,
        do_sample=True,
        pad_token_id=tok.pad_token_id,
        # Change the following 4 parameters to control how the outputs are sampled.
        max_new_tokens=128,
        temperature=0.50,
        top_k=50,
        repetition_penalty=1.10,
    )

# The decoded string contains the prompt followed by the sampled continuation.
output = tok.batch_decode(generated)[0]
print(output)
