### Parameter Efficient Fine-Tuning
In this notebook, we're gonna fine-tune large language models within limited GPU memory.

In [1]:
%pip install --quiet transformers==4.34.1 accelerate==0.24.0 sentencepiece==0.1.99 optimum==1.13.2 peft==0.5.0 bitsandbytes==0.41.2.post2

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm, trange
assert torch.cuda.is_available(), "you need cuda for this part"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.0/301.0 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m44.0 MB/s[0m e

In [2]:
model_name = 'Enoch/llama-7b-hf'

# loading Llama tokenizer ...
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, device_map=device)
tokenizer.pad_token_id = tokenizer.eos_token_id

# ... and the model itself
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map='auto', low_cpu_mem_usage=True, offload_state_dict=True,
    load_in_4bit=True, torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
for param in model.parameters():
    param.requires_grad=False

model.gradient_checkpointing_enable()  # only store a small subset of activations, re-compute the rest.
model.enable_input_require_grads()     # override an implementation quirk in gradient checkpoints that disables backprop unless inputs require grad
# more on gradient checkpointing: https://pytorch.org/docs/stable/checkpoint.html https://arxiv.org/abs/1604.06174

Downloading tokenizer_config.json:   0%|          | 0.00/218 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

### Prompt tuning: the story of a fox 

![img](https://i.imgur.com/Ux3qQAu.png) (source: theodd1souts.fandom.com)

In [3]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

for i in range(10):
    next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
    batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput:", tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist()))


Output: <s>A quick brown fox jumps over the lazy dog.
A quick


What a blatant lie! This particular fox assures you that it didn't in fact jump over the lazy dog. No, sir! The fox was just minding its own business. __We train the model to say truth: no dog was jumped over today.__

In [4]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
outputs = model(**batch)

next_word_logits = outputs.logits[:, :-1]
true_next_tokens = batch['input_ids'][:, 1:]
loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))

print("Loss:", loss)

Loss: tensor(3.0725, device='cuda:0', grad_fn=<NllLossBackward0>)


Except, we can't train the entire model - that would be 28GB gradients in float32. Instead, let's run [prompt tuning](https://arxiv.org/abs/2104.08691).

![img](https://i.imgur.com/VwNNKnb.png)


In [10]:
class WordEmbeddingsWithLearnedPrompts(nn.Module):
    """
    To perform prompt tuning, you will need to replace model's original word embeddings with a layer - THIS layer
     - that inserts trainable prompts instead of the first N token embeddings. """

    def __init__(self, word_embeddings: nn.Embedding, num_prompts: int):
        super().__init__()
        self.original_word_embeddings = word_embeddings
        self.num_prompts = num_prompts
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, num_prompts, word_embeddings.embedding_dim), requires_grad=True)

    def forward(self, input_ids: torch.LongTensor):
        # input_ids shape: [batch_size, seq length]
        assert input_ids.dtype == torch.int64
        assert input_ids.shape[1] > self.num_prompts
        assert torch.all(input_ids[:, :self.num_prompts] == tokenizer.pad_token_id).item(), "don't forget to prepend several BOS tokens to input_ids"

        # embed input_ids, but replace the first :num_prompts: tokens with self.learnable_prompts
        # This is because we will prepend :num_prompts: padding tokens at the beginning

        # After that, we must produce a word embedding vector for each token in input_ids,
        # except that the first :num_prompts: vectors should equal learnable_prompts;
        # any additional vectors after first :num_prompts: ones should be embedded as usual

        embs = self.original_word_embeddings(input_ids)
        out = torch.cat((self.learnable_prompts, embs[:, num_prompts:, :]), dim = 1)

        return out

In [11]:
num_prompts = 16
test_emb_layer = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=num_prompts).to(device)
test_input_ids = tokenizer("a cat say on a may", return_tensors='pt')['input_ids'].to(device)

space_for_prompts = torch.full([len(test_input_ids), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
test_inputs_with_prompts = torch.cat([space_for_prompts, test_input_ids], dim=1)

with torch.cuda.amp.autocast():
  test_prompt_embeddings = test_emb_layer(test_inputs_with_prompts)

assert test_prompt_embeddings.shape[:2] == test_inputs_with_prompts.shape
assert test_prompt_embeddings.shape[-1] == model.config.hidden_size
assert torch.allclose(test_prompt_embeddings[:, :num_prompts], test_emb_layer.learnable_prompts.float())
assert torch.allclose(test_prompt_embeddings[:, num_prompts:], model.model.embed_tokens(test_input_ids).float())
print("Looks legit!")

Looks legit!


__Now that it works,__ let's inject learnable prompts into the main model and teach it about foxes.

In [12]:
assert isinstance(model.model.embed_tokens, nn.Embedding), "you have already replaced the embedding layer. If the replacement is broken, please reload the model"

model.model.embed_tokens = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=num_prompts).to(device)

opt = torch.optim.Adam([model.model.embed_tokens.learnable_prompts], lr=0.01)

In [13]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
space_for_prompts = torch.full([len(test_input_ids), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
batch['input_ids'] = torch.cat([space_for_prompts, batch['input_ids']], dim=1)
batch['attention_mask'] = torch.cat([torch.ones_like(space_for_prompts), batch['attention_mask']], dim=1)

outputs = model(**batch)
next_word_logits = outputs.logits[:, num_prompts : -1, :]
true_next_tokens = batch['input_ids'][:, num_prompts + 1:]
loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
print("Loss:", loss)

i = 0
while loss.item() > 0.1:
  i += 1
  outputs = model(**batch)
  next_word_logits = outputs.logits[:, num_prompts : -1, :]
  true_next_tokens = batch['input_ids'][:, num_prompts + 1:]
  loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
  opt.zero_grad()
  loss.backward()
  opt.step()
  if i % 5 == 0:
    print("Loss:", loss)


assert loss.item() <= 0.1
print("Good job!")

Loss: tensor(7.1027, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(6.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(5.1451, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.2497, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.3428, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(2.6435, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(1.9583, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(1.4028, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.9304, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.3501, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward0>)
Good job!


In [14]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)
batch['input_ids'] = torch.cat([space_for_prompts, batch['input_ids']], dim=1)
batch['attention_mask'] = torch.cat([torch.ones_like(space_for_prompts), batch['attention_mask']], dim=1)


for i in range(15):
    next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
    batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput:", tokenizer.decode(batch['input_ids'][0, num_prompts:].cpu().numpy().tolist()))

# the model deny that the fox jumped over the lazy dog


Output: <s>A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it


Действительно получилось как и должно было

### Using HuggingFace PEFT 

[`peft`](https://huggingface.co/docs/peft/index) is a transformer's sister library that allows you to apply various __p__arameter __e__fficient __f__ine-__t__uning methods to pre-trained transformers. The library imlements both prompt tuning, prefix tuning, as well as several adapter-based techniques under a common interface:



In [18]:
import peft
assert isinstance(model.model.embed_tokens, nn.Embedding), "please reload the model"

peft_config = peft.PromptTuningConfig(task_type=peft.TaskType.CAUSAL_LM, num_virtual_tokens=16)
model = peft.get_peft_model(model, peft_config)  # note: for most peft methods, this line also modifies model in-place
print("Trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))
print("Total parameters (excluding quantization):", sum(p.numel() for p in model.parameters()))

Trainable parameters: 65536
Total parameters (excluding quantization): 3500478464


In [None]:
# We optimize the PEFT-wrapped model to achieve next token prediction loss < 0.1, but this time using PEFT
# Please note: we no longer need to prepend PAD tokens, but you still need to skip :num_virtual_tokens: first logits.
# Finally, generate the sentence to make sure that the model learned the truth.

In [19]:
opt = torch.optim.Adam(model.parameters(), lr=0.01)

In [27]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)

outputs = model(**batch)
next_word_logits = outputs.logits[:, peft_config.num_virtual_tokens : -1, :]
true_next_tokens = batch['input_ids'][:, 1:]
print(next_word_logits.size(), true_next_tokens.size())
loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
print("Loss:", loss)

i = 0
while loss.item() > 0.1:
  i += 1
  outputs = model(**batch)
  next_word_logits = outputs.logits[:, peft_config.num_virtual_tokens : -1, :]
  true_next_tokens = batch['input_ids'][:, 1:]
  loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
  opt.zero_grad()
  loss.backward()
  opt.step()
  if i % 5 == 0:
    print("Loss:", loss)


assert loss.item() <= 0.1
print("Good job!")

torch.Size([1, 22, 32000]) torch.Size([1, 22])
Loss: tensor(7.4109, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(5.7420, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.5398, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.6194, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(2.8103, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(2.1042, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(1.4752, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.9519, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.5722, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.3161, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(0.1004, device='cuda:0', grad_fn=<NllLossBackward0>)
Good job!


In [30]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

for i in range(15):
    next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
    batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput:", tokenizer.decode(batch['input_ids'][0, :].cpu().numpy().tolist()))


Output: <s>A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it


Тоже отлично :)

### Parameter-efficient finetuning with LoRA 

When training on more serious tasks, we can use low-rank adapters based on the [LoRA paper](https://arxiv.org/pdf/2106.09685.pdf).

The core idea is to add low-rank adapters __in parallel with existing linear layers,__ like this:
<center><img src="https://i.imgur.com/6bQLNiG.png" width=240px></center>

In the original LoRA paper, the adapters were only added to attention projection matrices. However, [subsequent works](https://arxiv.org/abs/2305.14314) show that it is useful to adapt FFNs as well. But before we do any training, we need to implement the basic LoRA layer.

In [3]:
# re-load the model to remove any previous PEFT tuners
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map='auto', low_cpu_mem_usage=True, offload_state_dict=True,
    load_in_4bit=True, torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
for param in model.parameters():
    param.requires_grad=False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [4]:
class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter. Wraps an existing OPT linear layer"""
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module  # pre-trained (frozen) linear layer
        self.adapter_A = nn.Parameter(torch.empty(module.in_features, rank, device=module.weight.device))
        nn.init.kaiming_uniform_(self.adapter_A, a=5 ** 0.5)
        self.adapter_B = nn.Parameter(torch.zeros(rank, module.out_features, device=module.weight.device))

    def forward(self, input):
        # Apply self.module and LoRA adapter, return the sum (self.module outputs + adapter outputs)
        adapter_A_out = F.linear(input, self.adapter_A.transpose(0, 1))
        adapter_outputs = F.linear(adapter_A_out, self.adapter_B.transpose(0, 1))
        return self.module(input) + adapter_outputs

In [21]:
# test our implementation
test_linear = nn.Linear(128, 128)
test_linear.weight.data[...] = torch.eye(128)
test_adapter = LoRALayer(test_linear, rank=8)

assert torch.allclose(test_adapter(torch.ones(1, 1, 128)), test_linear.bias + 1), "please check your forward pass"

test_adapter.adapter_A.data[...] = torch.linspace(0.1, -0.5, 128 * 8).view(128, 8)
test_adapter.adapter_B.data[...] = torch.linspace(0.5, -0.1, 128 * 8).view(8, 128)
test_linear.bias.data[...] = torch.linspace(1., -1., 128)

dummy_loss = F.mse_loss(test_adapter(torch.ones(1, 128) / 128).squeeze(), torch.linspace(-1, 1, 128))
assert torch.allclose(dummy_loss, torch.tensor(1.3711389), rtol=0, atol=1e-4)
dummy_loss.backward()
assert all(w.grad is not None for w in [test_adapter.adapter_A, test_adapter.adapter_B]), "some adapter weights have no grad"
assert torch.allclose(test_adapter.adapter_A.grad.sum(), torch.tensor(-0.60158), rtol=0, atol=1e-4), "bad grad w.r.t. A"
assert torch.allclose(test_adapter.adapter_B.grad.sum(), torch.tensor(0.9931), rtol=0, atol=1e-4), "bad grad w.r.t. B"
# note: bad grad means that your code is different from LoRA paper OR that your code is not autograd-friendly (e.g. no_grad)
del dummy_loss, test_linear, test_adapter
print("All tests passed!")

All tests passed!


### Apply LoRA to the model

The code below applies LoRA adapters on top of Q/K/V linear layers in Llama attention. We may also choose to modify other layers:
* self_attn.o_proj - attention output projection
* mlp.up_proj, mlp.gate_proj, mlp.down_proj - transformer feedforward layers
* lm_head - output LM head

In [8]:
lora_rank = 8

for name, module in model.model.layers.named_modules():
    if 'LlamaDecoderLayer' in repr(type(module)):
        module.self_attn.q_proj = LoRALayer(module.self_attn.q_proj, rank=lora_rank).to(device)
        module.self_attn.k_proj = LoRALayer(module.self_attn.k_proj, rank=lora_rank).to(device)
        module.self_attn.v_proj = LoRALayer(module.self_attn.v_proj, rank=lora_rank).to(device)

assert sum(isinstance(module, LoRALayer) for module in model.modules()) == 96  # for Llama-7B

In [9]:
batch = tokenizer("This model wants to share its greatest secret:", return_tensors='pt', return_token_type_ids=False)
# test a single training step, make sure we get meaningful gradients
with torch.cuda.amp.autocast(dtype=torch.float32):
    out = model.forward(**batch)
    (out.logits.norm() / 100).backward()

for i, module in enumerate(model.modules()):
    if isinstance(module, LoRALayer):
        assert module.adapter_B.grad is not None
        assert module.adapter_B.grad.norm().item() > 0

model.zero_grad(set_to_none=True)
print("Grad check successful, well done!")

Grad check successful, well done!


### Example how to train model

The example below shows how to train the LoRA adapters on a dummy dataset. 

In [5]:
import datasets

Немного смотрим как датасет устроен, чтобы потом оформить похожим образом наш codeparrot

In [6]:
data = datasets.load_dataset("Abirate/english_quotes", split="train[:32]") # 32 lines

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
data[0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [8]:
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [9]:
data[0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator'],
 'input_ids': [1, 1346, 3629, 7535, 29936, 14332, 1683, 338, 2307, 4586, 3178],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# checking if the model can learn. Change max_steps for proper training
data = datasets.load_dataset("Abirate/english_quotes", split="train[:32]") # 32 lines
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, gradient_accumulation_steps=1,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        warmup_steps=250, max_steps=100, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.8912
2,1.696
3,0.8969
4,1.7447
5,1.1681
6,0.7311
7,1.5252
8,1.0626
9,0.6691
10,1.4262


TrainOutput(global_step=100, training_loss=0.5376536306366324, metrics={'train_runtime': 149.9708, 'train_samples_per_second': 1.334, 'train_steps_per_second': 0.667, 'total_flos': 621258424123392.0, 'train_loss': 0.5376536306366324, 'epoch': 6.25})

### *Actually* train the model 

We fine-tune the model to _generate python code_. More specifically,

* __dataset:__ use [codeparrot-clean](https://huggingface.co/datasets/codeparrot/codeparrot-clean) or any other data containing python code. Since you do not need much data for this excercise, it is enough to use just shorter validation subset of `codeparrots`
* __preprocessing:__ select python code based on file extentions (.py)  (may skip in case of codeparrot - it is 100% python)
* __short lines:__ take the first 512 characters of each line
* __adapter type:__ use LoRA as defined above __plus at least one of:__
   - extra adapter on lm_head
   - extra adapter on MLP components (mlp.*)
   - trainable input embeddings (requires tweaking memory usage)

* __training:__ we do not have to train to convergence. If all goes well, our model should `.generate` code after 500 steps. Please use batch size of at least 4 (4 x 1 x 512 tokens) using `gradient_accumulation_steps=4`.

Грузим датасет

In [10]:
data = datasets.load_dataset("codeparrot/codeparrot-clean-valid")

Downloading readme:   0%|          | 0.00/401 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Смотрим что тут у нас

In [11]:
data = data['train']
data[0]

{'repo_name': 'pansapiens/mytardis',
 'path': 'tardis/apps/mx_views/views.py',
 'copies': '3',
 'size': '2892',
 'content': 'from django.conf import settings\nfrom django.core.paginator import Paginator, InvalidPage, EmptyPage\nfrom django.http import HttpResponse\n\nfrom tardis.tardis_portal.auth import decorators as authz\nfrom tardis.tardis_portal.models import Dataset\nfrom tardis.tardis_portal.shortcuts import get_experiment_referer\nfrom tardis.tardis_portal.shortcuts import render_response_index\n\n\n@authz.dataset_access_required\ndef view_full_dataset(request, dataset_id):\n    """Displays a MX Dataset and associated information.\n\n    Shows a full (hundreds of images) dataset its metadata and a list\n    of associated files with the option to show metadata of each file\n    and ways to download those files.  With write permission this page\n    also allows uploading and metadata editing.\n\n    Settings for this view:\n    INSTALLED_APPS += ("tardis.apps.mx_views",)\n    DAT

Обрезаем все примеры до 512 символов

In [12]:
data = data.map(lambda samples: {'content': samples['content'][:512]})
data[0]

Map:   0%|          | 0/61373 [00:00<?, ? examples/s]

{'repo_name': 'pansapiens/mytardis',
 'path': 'tardis/apps/mx_views/views.py',
 'copies': '3',
 'size': '2892',
 'content': 'from django.conf import settings\nfrom django.core.paginator import Paginator, InvalidPage, EmptyPage\nfrom django.http import HttpResponse\n\nfrom tardis.tardis_portal.auth import decorators as authz\nfrom tardis.tardis_portal.models import Dataset\nfrom tardis.tardis_portal.shortcuts import get_experiment_referer\nfrom tardis.tardis_portal.shortcuts import render_response_index\n\n\n@authz.dataset_access_required\ndef view_full_dataset(request, dataset_id):\n    """Displays a MX Dataset and associated information.\n\n ',
 'license': 'bsd-3-clause',
 'hash': -8726488663588781404,
 'line_mean': 37.0526315789,
 'line_max': 79,
 'alpha_frac': 0.6549100968,
 'autogenerated': False}

Токенизируем

In [13]:
data = data.map(lambda samples: tokenizer(samples['content']), batched=True)

Map:   0%|          | 0/61373 [00:00<?, ? examples/s]

Добавляем Лора-адаптеры к self_attn и mlp

In [14]:
lora_rank = 8

for name, module in model.model.layers.named_modules():
    if 'LlamaDecoderLayer' in repr(type(module)):
        module.self_attn.q_proj = LoRALayer(module.self_attn.q_proj, rank=lora_rank).to(device)
        module.self_attn.k_proj = LoRALayer(module.self_attn.k_proj, rank=lora_rank).to(device)
        module.self_attn.v_proj = LoRALayer(module.self_attn.v_proj, rank=lora_rank).to(device)
        module.mlp.up_proj = LoRALayer(module.mlp.up_proj, rank=lora_rank).to(device)
        module.mlp.gate_proj = LoRALayer(module.mlp.gate_proj, rank=lora_rank).to(device)
        module.mlp.down_proj  = LoRALayer(module.mlp.down_proj, rank=lora_rank).to(device)

In [16]:
sum(isinstance(module, LoRALayer) for module in model.modules())

192

Стало в два раза больше Лора-layers, что логично, мы же еще к mlp добавили адаптеры

In [15]:
batch = tokenizer("This model wants to share its greatest secret:", return_tensors='pt', return_token_type_ids=False)
# test a single training step, make sure we get meaningful gradients
with torch.cuda.amp.autocast(dtype=torch.float32):
    out = model.forward(**batch)
    (out.logits.norm() / 100).backward()

for i, module in enumerate(model.modules()):
    if isinstance(module, LoRALayer):
        assert module.adapter_B.grad is not None
        assert module.adapter_B.grad.norm().item() > 0

model.zero_grad(set_to_none=True)
print("Grad check successful, well done!")

Grad check successful, well done!


In [16]:
prompts =  ['', 'import', 'from', 'while', 'try', 'if', 'for', 'torch']  # feel free to add a few more that are not 100% assiciated with Python

# <A WHOLE LOT OF YOUR CODE>
# generate baseline samples with the selected prompts before finetuning
# please feel free to use transformers.Trainer (as above) or your custom training code
# after the training concludes, please show examples of text generated by your model. It is expected to look like Python code fragments
# print the generation examples nicely (suggestion: use pandas or HTML) for easier comparison
# note: your LoRA-enhanced model can run generation the same way as the non-trained model (above)

Сначала посмотрим что модель генерирует до файн тюна

In [20]:
before_finetune = []
for i, prompt in enumerate(prompts):
  batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

  for i in range(50):
      next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
      batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
      batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

  out = tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist())
  before_finetune.append(out)
  print("\nOutput:", out)


Output: <s>▶▶ 2019-2020 School Year
The 2019-2020 school year is here! We are so excited to welcome our new students and families to the school. We are also excited

Output: <s> import Foundation

public extension NSURL {
    public var absoluteString: String {
        return String(cString: CFBundleGetBundleWithURL(self).UTF8String)
    }
}</s><s>package com.google.

Output: <s>from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys

from absl import flags

from tensorflow.python import pywrap

Output: <s>while(1)
while(1) {
    // do something
}
\end{code}

Comment: This is not the same as the OP's code.

Comment: @Jeffrey: It's

Output: <s>try to find the best solution for your needs.
We are a team of professionals with a long experience in the field of web development.
We are a team of professionals with a long experience in the field of web development. We are a

Output: <s>if ( !window.atmosphere ) {
    

Иногда пытается сгенерировать что-то похожее на код, но тут явно не везде питон. В некоторых случаях вообще какой-то текст

Теперь обучим на питонячем коде :)

In [21]:
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, gradient_accumulation_steps=4,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        warmup_steps=250, max_steps=100, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.3947
2,1.4262
3,1.3524
4,1.0935
5,1.4304
6,1.4579
7,1.174
8,1.0476
9,1.1603
10,1.5437


TrainOutput(global_step=100, training_loss=1.2879127061367035, metrics={'train_runtime': 960.3161, 'train_samples_per_second': 0.833, 'train_steps_per_second': 0.104, 'total_flos': 5653524985233408.0, 'train_loss': 1.2879127061367035, 'epoch': 0.01})

In [23]:
after_finetune = []
for i, prompt in enumerate(prompts):
  batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

  for i in range(50):
      next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
      batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
      batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

  out = tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist())
  after_finetune.append(out)
  print("\nOutput:", out)




Output: <s># -*- coding: utf-8 -*-
#
# Copyright 2015, 2016, 2017, 2018, 2019, 2

Output: <s>import os
import sys
import time
import logging
import logging.handlers
import logging.config
import logging.config_file
import logging.manager
import logging.rootlogger
import logging.handlers
import logging.hand

Output: <s>from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.contrib.

Output: <s>while (true) {
    if (fgets(buf, sizeof(buf), fp) != NULL) {
        if (buf[0] == '#') {
            continue;
        }
        if (buf[0

Output: <s>try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(
    name='py-slack-bot',
    version='0.1.0',
   

Output: <s>if ( ! defined( 'ABSPATH' ) ) {
    exit;
}

/**
 * Class WPML_TM_Widget_Translation_Status
 *
 * @package WPML
 * @subpackage


Уже везде виден код, а не текст, но не всегда он питонячий, к сожалению. Обучим еще

In [26]:
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, gradient_accumulation_steps=4,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        warmup_steps=250, max_steps=100, learning_rate=2e-5, fp16=True,
        logging_steps=1, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

Step,Training Loss
1,1.1675
2,1.2235
3,1.1017
4,0.8293
5,1.1343
6,1.2507
7,1.0102
8,0.7834
9,0.9059
10,1.3665


TrainOutput(global_step=100, training_loss=1.093346555829048, metrics={'train_runtime': 957.9758, 'train_samples_per_second': 0.835, 'train_steps_per_second': 0.104, 'total_flos': 5653524985233408.0, 'train_loss': 1.093346555829048, 'epoch': 0.01})

In [27]:
after_finetune1 = []
for i, prompt in enumerate(prompts):
  batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

  for i in range(50):
      next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
      batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
      batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

  out = tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist())
  after_finetune1.append(out)
  print("\nOutput:", out)




Output: <s># -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2016, 2018-2020, 2021 by the

Output: <s>import os
import sys
import time
import logging

from django.core.management import BaseCommand
from django.core.management.base import NoArgsCommand

from django.contrib.auth.models import User
from django.

Output: <s>from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.framework import dtypes
from tensorflow.python.framework

Output: <s>while (true) {
    if (input.hasNext()) {
        String word = input.next();
        if (word.length() > 0) {
            System.out.println(word);
        }


Output: <s>try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(
    name='py-slack-api',
    version='0.1.0',
   

Output: <s>if ( ! defined('BASEPATH')) exit('No direct script access allowed');

class M_user extends CI_Model {

    public functio

Ладно, это я хотел сэкономить время, но видимо не получится (все еще не везде питон) :) Обучим честно 500 итераций, как посоветовали выше

In [17]:
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, gradient_accumulation_steps=4,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        warmup_steps=250, max_steps=500, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.3947
2,1.4262
3,1.3487
4,1.0935
5,1.4328
6,1.4592
7,1.1757
8,1.0517
9,1.1652
10,1.5472




UnboundLocalError: ignored

Это он не смог сохранить, так что все норм - модель обучилась

In [18]:
after_finetune2 = []
for i, prompt in enumerate(prompts):
  batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

  for i in range(50):
      next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
      batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
      batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

  out = tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist())
  after_finetune2.append(out)
  print("\nOutput:", out)




Output: <s># -*- coding: utf-8 -*-
#
# Copyright (c) 2012-2015, 2016-2017, 2018-2

Output: <s>import os
import sys
import time
import json
import logging
import requests
import requests_cache
import requests_cache.backends
import requests_cache.backends.redis
import requests_cache.backends.red

Output: <s>from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import sys
import tempfile
import

Output: <s>while (true)
    {
        if (input.char() == ' ')
        {
            input.get();
        }
        else
        {
            break;
        }
    }
    return input.

Output: <s>try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

import os
import sys


def read(fname):
    return open(os.path.join(

Output: <s>if ( ! defined( 'ABSPATH' ) ) {
    exit;
}

if ( ! class_exists( 'Wp_List_Table' ) ) {
    class Wp_List_Table extends WP


In [20]:
# This template helps to compare generated code samples in pretty table form

from IPython.display import HTML, display
table_template = """<table style="border:1px solid black" >
  <tr>
    <th style="text-align: center; border:1px solid black">PROMPT</th>
    <th style="text-align: center; border:1px solid black">BEFORE</th>
    <th style="text-align: center; border:1px solid black">AFTER</th>
  </tr>
{}
</table>"""

row_template = '''  <tr>
    <td style="width:20%; border:1px solid black"><pre align="left">`{}`</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
  </tr>'''

rows = []

for i, prompt in enumerate(prompts):
    # replace placeholders in the format() arguments
    rows.append(row_template.format(prompt, before_finetune[i][3:], after_finetune2[i][3:]))

display(HTML(table_template.format('\n'.join(rows))))

PROMPT,BEFORE,AFTER
``,▶▶ 2019-2020 School Year The 2019-2020 school year is here! We are so excited to welcome our new students and families to the school. We are also excited,"# -*- coding: utf-8 -*- # # Copyright (c) 2012-2015, 2016-2017, 2018-2"
`import`,import Foundation public extension NSURL {  public var absoluteString: String {  return String(cString: CFBundleGetBundleWithURL(self).UTF8String)  } }package com.google.,import os import sys import time import json import logging import requests import requests_cache import requests_cache.backends import requests_cache.backends.redis import requests_cache.backends.red
`from`,from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from absl import flags from tensorflow.python import pywrap,from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import os import sys import tempfile import
`while`,while(1) while(1) {  // do something } \end{code} Comment: This is not the same as the OP's code. Comment: @Jeffrey: It's,while (true)  {  if (input.char() == ' ')  {  input.get();  }  else  {  break;  }  }  return input.
`try`,try to find the best solution for your needs. We are a team of professionals with a long experience in the field of web development. We are a team of professionals with a long experience in the field of web development. We are a,try:  from setuptools import setup except ImportError:  from distutils.core import setup import os import sys def read(fname):  return open(os.path.join(
`if`,"if ( !window.atmosphere ) {  window.atmosphere = {}; } (function () {  var o = atmosphere.util,  atmosphere = atmosphere.atmosphere = function () {",if ( ! defined( 'ABSPATH' ) ) {  exit; } if ( ! class_exists( 'Wp_List_Table' ) ) {  class Wp_List_Table extends WP
`for`,for the 2019-2020 school year. The application process for the 2019-2020 school year is now open. The application process for the 2019-20,"for i in range(1, 10):  print(i)  print(i * 2)  print(i * 3)  print(i * 4)  print(i *"
`torch`,"torchbearer 2017-05-18 19:55:25 UTC #1 I’m a newbie to the world of RPGs, and I’m looking for a game that",torch.setDefaultTensorNorm(torch.TensorNorm.MaxNorm) # torch.setDefaultTensorNorm(torch.TensorNorm.MaxNorm) # torch.


Ну супер, генерит код (в первом случае комменты, видимо, лицензия), но видно, что модель обучилась генерировать что нужно