In [1]:
import os
import random
from argparse import ArgumentParser
import logging

import torch
from trl import SFTConfig, SFTTrainer

from lima_dataset import load_lima_dataset, tokenize_text, format_prompt_func, EOT_TOKEN
from utils import (
    read_yaml,
    get_model_config,
    get_tokenizer_config,
    get_split_config,
    get_dataset_config,
    get_trainer_config,
    get_generation_config,
    get_generation_samples,
    get_lora_config,
    _handle_seed,
    DEVICE,
)
from model import (
    tokenize_text,
    load_model,
    load_tokenizer,
    load_lora_model,
    generate,
    compute_metrics,
)

In [2]:
# config = read_yaml("./configs/train_config_llama_lora.yaml")
config = read_yaml("./configs/train_config_llama_qlora.yaml")

In [3]:
tokenizer_name, tokenizer_path, tokenizer_config = get_tokenizer_config(config)
tokenizer = load_tokenizer(
    tokenizer_name=tokenizer_name,
    tokenizer_path=tokenizer_path,
    tokenizer_config=tokenizer_config,
)
tokenizer_name, tokenizer_path, tokenizer_config

('llama2',
 'meta-llama/Llama-2-7b-hf',
 {'add_bos_token': True, 'add_eos_token': True})

In [4]:
tokenizer.add_eos_token

True

In [5]:
model_name, model_path, base_model_path, model_config = get_model_config(
    config, pad_token_id=tokenizer.pad_token_id, tokenizer_length=len(tokenizer)
)
model = load_model(
    model_string=model_name,
    model_path=model_path,
    base_model_path=base_model_path,
    model_config=model_config,
)
# model_config["pad_token_id"] = tokenizer.pad_token_id
# model_config["tokenizer_length"] = len(tokenizer)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
# model.config.eos_token_id

In [7]:
lora_config = get_lora_config(config)
model = load_lora_model(model, lora_config)

In [8]:
dataset_desc, (train_split_config, val_split_config, test_split_config) = (
    get_split_config(config)
)
dataset_desc, train_split_config

('LIMA Instruct Finetunning Dataset',
 {'dataset_path': 'GAIR/lima', 'sub_split_size': None})

In [9]:
train_dataset_path, train_sub_split_size, train_dataset_config = get_dataset_config(
    train_split_config
)
train_dataset_path, train_sub_split_size, train_dataset_config

('GAIR/lima', None, {})

In [10]:
train_dataset = load_lima_dataset(
    train_dataset_path, "train", train_sub_split_size, **train_dataset_config
)

In [11]:
trainer_config = get_trainer_config(config)
trainer_config["logging_dir"] = os.path.join(
    trainer_config["output_dir"], "runs", trainer_config["run_name"]
)
save_trained_model = trainer_config.pop("save_trained_model", True)
sft_trainer_args = SFTConfig(**trainer_config)

In [12]:
save_trained_model = trainer_config.pop("save_trained_model", True)
resume_from_checkpoint = trainer_config.pop("resume_from_checkpoint", None)
sft_trainer_args = SFTConfig(**trainer_config)

sft_trainer = SFTTrainer(
    model,
    args=sft_trainer_args,
    train_dataset=train_dataset,
    formatting_func=format_prompt_func,
    processing_class=tokenizer,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
sft_training_outs = sft_trainer.train(resume_from_checkpoint=resume_from_checkpoint,)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,6.921
2,0.0
3,6.7454


KeyboardInterrupt: 

In [None]:
prompt = f'{train_dataset[0]["conversations"][0]}'
prompt

'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).'

In [None]:
generation_config = get_generation_config(config)
generation_config

In [None]:
prompt_temp = f"{prompt} {EOT_TOKEN}"
tokenized_prompts = tokenize_text(
    prompt_temp, tokenizer, use_encode=True, return_tensors="pt"
)

In [None]:
from transformers import GenerationConfig

In [None]:
model.config.bos_token_id, model.config.eos_token_id, model.config.pad_token_id

(1, 2, None)

In [None]:
# gen_config_obj = GenerationConfig(**gen_config_temp)
# gen_config_obj.bos_token_id = 

GenerationConfig {
  "max_length": 2048
}

In [None]:
generated_tokens = model.generate(tokenized_prompts, generation_config)
generated_tokens.shape

AttributeError: 'dict' object has no attribute 'bos_token_id'

In [None]:
tokenized_prompts

tensor([[    1, 14350,   385,  4876,   304,   590, 18385, 17088, 10554,   292,
         12251,  6721,   363,  1371,   373,   263,  3271,  1287, 12827,   373,
          4327,   414, 29889, 26321, 29892,   306,  1016, 29915, 29873,  2274,
           278,  4328,  1546,  2094,   397,   414,   322,  1602,   397,   414,
         29889, 29871, 32000]])

In [None]:
from model import generate

In [None]:
outs = generate(
    model,
    prompt_samples=prompt,
    tokenizer=tokenizer,
    generation_config=generation_config,
    use_encode=True,
)

  return fn(*args, **kwargs)


In [None]:
print(outs[0])

Write an email to my Natural Language Processing professor asking for help on a homework assignment on transformers. Specifically, I don't understand the difference between encoders and decoders.

My professor has been very helpful in responding to questions that I have had regarding the class material so far. However, he is currently out of town traveling with his family overseas (I believe). And since this was due yesterday evening at midnight EST, it seemed like it might be difficult or impossible for him to get back into contact quickly enough before the deadline. Thus, I am writing this e-mail as a way of getting my question answered while also helping me complete my assigned work in time. 

Please note: This is not meant to plagiarize any part of your class materials. In fact, one could argue that using these resources will make it easier for you to answer my question without having to do much research yourself! That said here are some links where they talk about how transformers

In [None]:
prompt

"Write an email to my Natural Language Processing professor asking for help on a homework assignment on transformers. Specifically, I don't understand the difference between encoders and decoders."

In [None]:
prompt = tokenize_text(
    "What is a LLM?[EOT]", tokenizer, use_encode=True, return_tensors="pt"
).to(device="cuda:0")

In [None]:
from transformers import GenerationConfig

In [None]:
generation_config = GenerationConfig(max_length=300)
generation_config

GenerationConfig {
  "max_length": 300
}

In [None]:
outs = model.generate(prompt, generation_config=generation_config)
outs_text = tokenizer.batch_decode(outs)

  return fn(*args, **kwargs)


In [None]:
print(outs_text[0])

<s> What is a LLM? [EOT] a Master of Laws? A a Master? with "









PA
MS
MSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSPAMSPAMSPAMSMSMSPAMSPAMSPAMSPAMSPAMSPAMSPAMSPAMSMSMSPAMSMSPAMSPAMSPAMSMSMSMSMSPAMSMSMSMSMSPAPAPAMSMSMSPAMSPAMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMSMS
