In [None]:
%pip install --upgrade transformers accelerate sentencepiece optimum peft bitsandbytes

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm, trange

# Fail fast: the rest of the notebook targets a CUDA device.
assert torch.cuda.is_available(), "you need cuda for this part"
# The assert above already guarantees CUDA, so the former `else "cpu"` fallback
# could never be taken; select the CUDA device directly.
device = torch.device("cuda")

Collecting transformers
  Using cached transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)


In [None]:
model_name = "Enoch/llama-7b-hf"

# A tokenizer holds no tensors, so it takes no device placement argument.
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
# Reuse the EOS id as the padding id; the same id later marks the placeholder
# positions that the learned prompts overwrite.
tokenizer.pad_token_id = tokenizer.eos_token_id

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    low_cpu_mem_usage=True,
    offload_state_dict=True,
    # Passing `load_in_4bit=True` directly is deprecated; the quantization
    # settings go through a BitsAndBytesConfig object instead.
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float32,  # weights are 4-bit, layernorms and activations are fp32
)
# Freeze the whole base model: only the prompt vectors added later are trained.
for param in model.parameters():
    param.requires_grad = False

model.gradient_checkpointing_enable()  # only store a small subset of activations, re-compute the rest.
model.enable_input_require_grads()  # override an implementation quirk in gradient checkpoints that disables backprop unless inputs require grad

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
The `load_in_4bit` and `load_in_8bit` 

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

# Learned prompts

Реализуем слой эмбеддингов с обучаемыми промптами: его forward подставляет обучаемые векторы вместо эмбеддингов первых N токенов.

In [None]:
class WordEmbeddingsWithLearnedPrompts(nn.Module):
    """
    Replacement for a model's word-embedding layer that performs prompt tuning.

    The first ``num_prompts`` positions of every input sequence are placeholders:
    instead of embedding them, the layer inserts trainable prompt vectors that are
    shared by all sequences in the batch. The remaining positions are embedded by
    the wrapped layer.

    Args:
        word_embeddings: the original embedding layer to wrap.
        num_prompts: number of leading positions replaced by trainable prompts.
        pad_token_id: token id expected in the placeholder positions. When ``None``
            (the default), the module-level ``tokenizer.pad_token_id`` is read at
            call time.
    """

    def __init__(self, word_embeddings: nn.Embedding, num_prompts: int, pad_token_id=None):
        super().__init__()
        self.original_word_embeddings = word_embeddings
        self.num_prompts = num_prompts
        self.pad_token_id = pad_token_id
        # A single set of prompts with a leading batch dimension of 1; it is
        # broadcast over the batch in forward().
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, num_prompts, word_embeddings.embedding_dim),
            requires_grad=True,
        )

    def forward(self, input_ids: torch.LongTensor):
        """
        Embed ``input_ids``, substituting learned prompts for the leading placeholders.

        Args:
            input_ids: int64 tensor of shape [batch_size, seq_length] whose first
                ``num_prompts`` columns all hold the placeholder (pad) token id.

        Returns:
            Tensor of shape [batch_size, seq_length, embedding_dim].

        Raises:
            AssertionError: on a wrong dtype, a sequence no longer than
                ``num_prompts``, or missing placeholder tokens.
        """
        placeholder_id = (
            tokenizer.pad_token_id if self.pad_token_id is None else self.pad_token_id
        )
        assert input_ids.dtype == torch.int64
        assert input_ids.shape[1] > self.num_prompts
        assert torch.all(
            input_ids[:, : self.num_prompts] == placeholder_id
        ).item(), "don't forget to prepend num_prompts pad tokens to input_ids"

        token_embeddings = self.original_word_embeddings(
            input_ids[:, self.num_prompts :]
        )
        # torch.cat requires matching batch sizes, so repeat the shared prompts
        # for every sequence; expand() creates a view and keeps gradients flowing
        # into the single underlying parameter.
        batch_prompts = self.learnable_prompts.expand(input_ids.shape[0], -1, -1)

        return torch.cat([batch_prompts, token_embeddings], dim=1)

In [None]:
# Sanity check of the prompt-embedding layer on a single short sentence.
num_prompts = 16
test_emb_layer = WordEmbeddingsWithLearnedPrompts(
    model.model.embed_tokens, num_prompts=num_prompts
).to(device)
test_input_ids = tokenizer("a cat say on a may", return_tensors="pt")["input_ids"].to(
    device
)

# Placeholder ids that the layer will overwrite with the learned prompts.
space_for_prompts = torch.full(
    [test_input_ids.shape[0], num_prompts],  # (1, 16)
    fill_value=tokenizer.pad_token_id,
    dtype=torch.int64,
    device=device,
)

test_inputs_with_prompts = torch.cat([space_for_prompts, test_input_ids], dim=1)  # (1, 16 + 7 = 23)
print(test_inputs_with_prompts.shape)

# `torch.cuda.amp.autocast()` is deprecated; `torch.autocast` with an explicit
# device type is the supported spelling of the same context manager.
with torch.autocast(device_type=device.type):
    test_prompt_embeddings = test_emb_layer(test_inputs_with_prompts)


# Output keeps the [batch, sequence] layout of the input ids ...
assert test_prompt_embeddings.shape[:2] == test_inputs_with_prompts.shape
# ... and has the model's hidden size as its last dimension.
assert test_prompt_embeddings.shape[-1] == model.config.hidden_size
# The leading positions are exactly the trainable prompts.
assert torch.allclose(
    test_prompt_embeddings[:, :num_prompts], test_emb_layer.learnable_prompts.float()
)
# The remaining positions are the ordinary token embeddings.
assert torch.allclose(
    test_prompt_embeddings[:, num_prompts:],
    model.model.embed_tokens(test_input_ids).float(),
)
print("Looks legit!")

torch.Size([1, 23])


  with torch.cuda.amp.autocast():


Looks legit!


Подготовим оптимизатор и модель

In [None]:
num_prompts = 16

# The wrapper must be installed exactly once: wrapping an already wrapped layer
# would nest two prompt layers, so refuse to run this cell a second time.
if not isinstance(model.model.embed_tokens, nn.Embedding):
    raise AssertionError(
        "you have already replaced the embedding layer. If the replacement is broken, please reload the model"
    )

# Swap the model's embedding layer for the prompt-inserting wrapper.
model.model.embed_tokens = WordEmbeddingsWithLearnedPrompts(
    word_embeddings=model.model.embed_tokens,
    num_prompts=num_prompts,
).to(device)

# The optimizer only sees the new prompt vectors, so they are the only
# parameters that get updated.
opt = torch.optim.Adam(
    params=[model.model.embed_tokens.learnable_prompts],
    lr=0.01,
)

# Датасет

Будем учить модель переворачивать фразу. Для этого составим датасет из примеров, в которых сначала идёт фраза с прямым порядком слов, а затем — та же фраза с обратным порядком слов.

In [None]:
import datasets
from tqdm.auto import tqdm as tqdma

# Number of quotes taken from the dataset and used as training examples.
num = 48

In [None]:
def reverse_phrases(phrase):
  """Return `phrase` with its single-space-separated words in reverse order."""
  words = phrase.split(' ')
  words.reverse()
  return ' '.join(words)

In [None]:
# A fixed seed makes the shuffle, and therefore the selected training quotes,
# the same on every run.
data = datasets.load_dataset("Abirate/english_quotes", split="train").shuffle(seed=42)
data = data['quote'][:num]
# Each training string is "<phrase> <phrase with reversed word order>".
# phrase[1:-2] drops the first character and the last two characters of a quote
# (presumably the surrounding quotation marks and final punctuation; verify
# against the dataset).
data_truth = [
    core + ' ' + reverse_phrases(core)
    for core in (phrase[1:-2] for phrase in data)
]

# Train

In [None]:
# Tune the prompts on one example at a time: up to 20 optimizer steps per
# example, stopping early once the loss on that example is small enough.
for phrase in data_truth[:num]:
  batch = tokenizer(phrase, return_tensors="pt", return_token_type_ids=False).to(device)  # (1, seq_len)

  # Placeholder ids that the embedding layer replaces with the learned prompts.
  space_for_prompts = torch.full(
      [batch['input_ids'].shape[0], num_prompts],
      fill_value=tokenizer.pad_token_id,
      dtype=torch.int64,
      device=device,
  )  # (1, num_prompts)

  batch["input_ids"] = torch.cat([space_for_prompts, batch["input_ids"]], dim=1)  # (1, num_prompts + seq_len)
  batch["attention_mask"] = torch.cat(
      [torch.ones_like(space_for_prompts), batch["attention_mask"]], dim=1
  )

  # Train from the end of the forward phrase through the whole reversed phrase:
  # k is half of the sequence length after the prompts and the leading <s> token.
  # It depends only on the batch, so it is computed once per example.
  k = (batch["input_ids"].shape[1] - num_prompts - 1) // 2
  true_next_tokens = batch["input_ids"][:, num_prompts + k + 1 :]

  for j in range(20):
    outputs = model(**batch)
    # Logits at position t predict the token at t + 1, hence the shifted slices.
    next_word_logits = outputs.logits[:, num_prompts + k : -1, :]
    loss = F.cross_entropy(
        next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1)
    )
    loss_value = loss.item()
    print("Loss:", loss_value)
    loss.backward()

    opt.step()
    opt.zero_grad()

    if loss_value <= 0.1:
      break


# Only the loss of the last training example is checked here.
assert loss.item() <= 0.1
print("Good job!")

Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% | Total     15360MB
Loss: tensor(2.0855, device='cuda:0', grad_fn=<NllLossBackward0>)
Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% | Total     15360MB
Loss: tensor(2.0277, device='cuda:0', grad_fn=<NllLossBackward0>)
Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% | Total     15360MB
Loss: tensor(1.9370, device='cuda:0', grad_fn=<NllLossBackward0>)
Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% | Total     15360MB
Loss: tensor(1.8283, device='cuda:0', grad_fn=<NllLossBackward0>)
Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% | Total     15360MB
Loss: tensor(1.7145, device='cuda:0', grad_fn=<NllLossBackward0>)
Gen RAM Free: 10.3 GB  |     Proc size: 1.7 GB
GPU RAM Free: 185MB | Used: 14917MB | Util  97% 

In [None]:
prompt = "This phrase should be inverse, I hope it will work"
batch = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)

# Build the placeholder ids here instead of reusing a variable left over from
# the training loop, so this cell does not depend on that loop having run.
prompt_placeholder = torch.full(
    [batch["input_ids"].shape[0], num_prompts],
    fill_value=tokenizer.pad_token_id,
    dtype=torch.int64,
    device=device,
)
batch["input_ids"] = torch.cat([prompt_placeholder, batch["input_ids"]], dim=1)
batch["attention_mask"] = torch.cat(
    [torch.ones_like(prompt_placeholder), batch["attention_mask"]], dim=1
)

# Generate as many new tokens as the tokenized prompt contains.
num_new_tokens = batch["input_ids"].shape[1] - num_prompts

# Greedy decoding. No gradients are needed at inference time, and skipping the
# autograd graph avoids holding activations for every forward pass.
with torch.no_grad():
    for _ in range(num_new_tokens):
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch["input_ids"] = torch.cat([batch["input_ids"], next_token], dim=-1)
        batch["attention_mask"] = torch.cat(
            [batch["attention_mask"], torch.ones_like(next_token)], dim=-1
        )

# Skip the placeholder positions when decoding the result.
print(
    "\nOutput:",
    tokenizer.decode(batch["input_ids"][0, num_prompts:].cpu().numpy().tolist()),
)


Output: <s>This phrase should be inverse, I hope it will work work will it hope inverse, should This phrase be should This
