# Simplified LoRA Implementation

In [1]:
from google.colab import drive
drive.mount('/content/drive/')#, force_remount=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
save_dir = '/content/drive/MyDrive/notebook-data'

#### Install Dependencies

In [3]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


#### Confirm CUDA

In [4]:
import torch
torch.cuda.is_available()

True

#### Load Base Model

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    torch_dtype=torch.float16,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

##### View Model Summary

In [6]:
print(model)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

Freeze all parameters

In [7]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

#### Helper Function

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

#### Obtain LoRA Model

In [9]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


#### Load Sample Dataset

In [10]:
import datasets

wikitext = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1', split='train[:10%]')

```
### INPUT
{context}

### FIXED
{commas-fixed}
```

In [11]:
wikitext

Dataset({
    features: ['text'],
    num_rows: 180135
})

In [12]:
def create_prompt(text):
  prompt_template = f"### INPUT\n{text.replace(',', '')}\n\n### FIXED\n{text}</s>"
  return prompt_template

mapped_wikitext = wikitext.map(lambda samples: tokenizer(create_prompt(samples['text'])), batch_size=32)

In [13]:
print(tokenizer.decode(mapped_wikitext[1]["input_ids"]))

### INPUT
 = Valkyria Chronicles III = 


### FIXED
 = Valkyria Chronicles III = 
</s>


#### Train LoRA

In [14]:
torch.cuda.empty_cache()

In [15]:
import transformers

trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_wikitext,
    # eval_dataset=mapped_wikitext["validation"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=1000,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir=save_dir,
        save_steps=.1,
        save_total_limit=3,
        # evaluation_strategy="steps",
        # eval_steps=.1,
        # load_best_model_at_end=True,
        # metric_for_best_model=TODO
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.0788
2,3.1952
3,2.1837
4,2.5418
5,4.5083
6,2.0963
7,2.4768
8,4.344
9,2.7503
10,5.2083


TrainOutput(global_step=1000, training_loss=1.3154083265066148, metrics={'train_runtime': 1097.8147, 'train_samples_per_second': 7.287, 'train_steps_per_second': 0.911, 'total_flos': 3300074161176576.0, 'train_loss': 1.3154083265066148, 'epoch': 0.04})

In [22]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = f"{save_dir}/checkpoint-1000"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
comma_model = PeftModel.from_pretrained(model, peft_model_id).cuda()

In [31]:
from IPython.display import display, Markdown

def make_inference(text_wrong_commas):
  batch = tokenizer(f"### INPUT\n{text_wrong_commas.replace(',', '')}\n\n### FIXED\n", return_tensors='pt')
  batch = {k: v.cuda() for k, v in batch.items()}

  with torch.cuda.amp.autocast():
    output_tokens = comma_model.generate(**batch, max_new_tokens=200)

  display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [35]:
context = "Afterwards however he moved to Paris France."

make_inference(context)

{'input_ids': tensor([[105311, 191657,    189,  23691,  12548,  14789,   1683,  38033,    427,
           9316,   7138,   6149, 105311, 128778,   5871,    189]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


### INPUT
Afterwards however he moved to Paris France.

### FIXED
Afterwards however he moved to Paris France.