In [1]:
import os
import sys
# Put the parent of the current working directory at the front of the import
# search path, so that modules living one level above this notebook
# (presumably the project-local ones imported in the next cell) take priority.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, project_root)

In [2]:

import random
from argparse import ArgumentParser
import logging

import torch
from trl import SFTConfig, SFTTrainer

from lima_dataset import load_lima_dataset, tokenize_text, format_prompt_func, EOT_TOKEN
from utils import (
    read_yaml,
    get_model_config,
    get_tokenizer_config,
    get_generation_config,
    get_generation_samples,
)
from model import (
    load_model,
    load_tokenizer,
    generate,
)

In [3]:
# Path of the YAML file that drives the tokenizer/model/generation setup below.
# Alternative config kept for quick switching:
# config_path = "./configs/generate_config_llama.yaml"
config_path = "../configs/generate_config_llama_qlora.yaml"
config = read_yaml(config_path)

In [4]:
# Resolve the tokenizer settings from the loaded config, then build the tokenizer.
tokenizer_name, tokenizer_path, tokenizer_config = get_tokenizer_config(config)
tokenizer_kwargs = {
    "tokenizer_name": tokenizer_name,
    "tokenizer_path": tokenizer_path,
    "tokenizer_config": tokenizer_config,
}
tokenizer = load_tokenizer(**tokenizer_kwargs)

# Echo the resolved settings as the cell's output.
(tokenizer_name, tokenizer_path, tokenizer_config)

('llama2',
 'meta-llama/Llama-2-7b-hf',
 {'add_bos_token': True, 'add_eos_token': False})

In [5]:
# The model settings are derived from the config together with two values
# taken from the tokenizer: its pad token id and its length.
pad_id = tokenizer.pad_token_id
n_tokens = len(tokenizer)
model_name, model_path, base_model_path, model_config = get_model_config(
    config, pad_token_id=pad_id, tokenizer_length=n_tokens
)

# Display the resolved model settings as the cell's output.
model_config

{'force_download': False,
 'device_map': 'cuda:0',
 'bnb_config': {'load_in_4bit': True,
  'bnb_4bit_quant_type': 'nf4',
  'bnb_4bit_compute_dtype': 'float16',
  'bnb_4bit_use_double_quant': False},
 'pad_token_id': 32000,
 'tokenizer_length': 32002}

In [6]:
# Collect the values resolved by get_model_config and hand them to the loader.
# Note that the loader's `model_string` argument receives `model_name`.
load_kwargs = {
    "model_string": model_name,
    "model_path": model_path,
    "base_model_path": base_model_path,
    "model_config": model_config,
}
model = load_model(**load_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
# Read the YAML file again before building the generation settings.
# NOTE(review): this is the same path loaded earlier in the notebook; presumably
# the re-read picks up edits to the file without reloading the model - confirm.
config_path = "../configs/generate_config_llama_qlora.yaml"
config = read_yaml(config_path)

In [8]:
# Build the generation settings from the loaded config; they are passed to
# generate() further down as `generation_config`.
generation_config = get_generation_config(config)
# Optional manual overrides, currently disabled. The item assignment implies
# generation_config supports `[...] = value` (presumably a dict - confirm).
# generation_config["pad_token_id"] = tokenizer.pad_token_id
# generation_config['max_new_tokens'] = 1024

In [None]:
# Fetch the prompt samples described by the config; they are passed to
# generate() as `prompt_samples` in a later cell, so this cell must be run first.
samples = get_generation_samples(config)
# Display the samples as the cell's output.
samples

In [11]:
# # prompt = "I'm writing a NeurIPS paper about a new model architecture for processing and generating long texts. Here are some facts about the paper:\n* The main trick is to replace some of the attention heads with an exponential moving average, where the decay rate is learned for each head. We call this architecture ExeMA.\n* On language modeling, the perplexity difference between our model and a vanilla transformer is negligible, but that's because next-token prediction is almost always a local task, so perplexity won't be sensitive enough to detect any improvements in long-range understanding.\n* However, on the SCROLLS benchmark, our model improves by 10% over the baseline.\n* We also have a new metric for measuring coherence in generated text (CoGnaTe), where our model generates text that is 43% more coherent than the baseline.\nHelp me write the paper's introduction."
# # prompt = "Plan a day trip in Tokyo. The spots need to be within walking distance to each other."
# prompt = "What medicine should I take when I get a cold?"
# # prompt = f"{prompt}{EOT_TOKEN}"

# Run generation over the prompt samples loaded from the config.
# Requires `samples` and `generation_config` from the preceding cells.
# NOTE(review): the meaning of `use_encode` / `eot_token` is defined by the
# project's generate(); the values below are the ones this notebook uses.
generate_kwargs = {
    "prompt_samples": samples,
    "generation_config": generation_config,
    "use_encode": False,
    "eot_token": True,
}
outs = generate(model, tokenizer, **generate_kwargs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
# Show the first element of the generation results via print().
first_generation = outs[0]
print(first_generation)

What is reinforcement learning?True or false?  avid readers of this book will have noticed that I often refer to reinforcement learning (RL) as the “third pillar” of machine learning. How can that be? After all, machine learning is a very broad term, and we have already covered a lot of ground in the previous chapters. How can we squeeze in one more pillar?  Well, it is true that machine learning is a very broad term. It can be used to refer to supervised learning, unsupervised learning, and reinforcement learning, and in fact, the term machine learning can be applied to all three of these areas. The “third pillar” is a term that I use to refer to reinforcement learning, because it is the least commonly used of the three pillars. However, it is the most powerful of the three. In this chapter, I will introduce you to reinforcement learning, and show you why it is such a powerful technique.  We will start by explaining what reinforcement learning is. Then we will dive into the mathematic