In [1]:
import os
import random
from argparse import ArgumentParser
import logging

import torch
from trl import SFTConfig, SFTTrainer

from lima_dataset import load_lima_dataset, tokenize_text, format_prompt_func, EOT_TOKEN
from utils import (
    read_yaml,
    get_model_config,
    get_tokenizer_config,
    get_split_config,
    get_dataset_config,
    get_trainer_config,
    get_generation_config,
    get_generation_samples,
    get_lora_config,
    _handle_seed,
    DEVICE,
)
from model import (
    tokenize_text,
    load_model,
    load_tokenizer,
    load_pretrained_base_llama2_model,
    load_lora_model,
    generate,
    compute_metrics,
)

In [2]:
# Load the run configuration from YAML. The `*_llama_qlora.yaml` file is the
# active one; the plain `*_llama.yaml` file is kept as a commented-out
# alternative (swap the two lines to switch).
# config = read_yaml("./configs/generate_config_llama.yaml")
config = read_yaml("./configs/generate_config_llama_qlora.yaml")

In [3]:
# Resolve the tokenizer name, path and settings from the config, then build the
# tokenizer from exactly those three values.
tokenizer_name, tokenizer_path, tokenizer_config = get_tokenizer_config(config)
tokenizer = load_tokenizer(
    tokenizer_name=tokenizer_name,
    tokenizer_path=tokenizer_path,
    tokenizer_config=tokenizer_config,
)
# Bare last expression: display the resolved settings as the cell output.
tokenizer_name, tokenizer_path, tokenizer_config

('llama2',
 'meta-llama/Llama-2-7b-hf',
 {'add_bos_token': True, 'add_eos_token': False})

In [4]:
# Resolve the model name, paths and settings from the config. The tokenizer's
# pad token id and length (`len(tokenizer)`) are passed in as well; the recorded
# output shows them as the `pad_token_id` / `tokenizer_length` keys of
# `model_config`.
model_name, model_path, base_model_path, model_config = get_model_config(
    config,
    pad_token_id=tokenizer.pad_token_id,
    tokenizer_length=len(tokenizer),
)
# Bare last expression: display the resolved model settings.
model_config

{'force_download': False,
 'device_map': 'cuda:0',
 'bnb_config': {'load_in_4bit': True,
  'bnb_4bit_quant_type': 'nf4',
  'bnb_4bit_compute_dtype': 'float16',
  'bnb_4bit_use_double_quant': False},
 'pad_token_id': 32000,
 'tokenizer_length': 32002}

In [5]:
# Load the model from the name, paths and settings resolved in the previous cell.
# NOTE(review): the recorded output reports new embeddings / lm_head weights
# being initialized, so load_model presumably resizes the token embeddings
# (likely to `tokenizer_length`) — confirm in model.py.
model = load_model(
    model_string=model_name,
    model_path=model_path,
    base_model_path=base_model_path,
    model_config=model_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
# model

In [56]:
# Re-read the same YAML file that was already loaded near the top of the notebook.
# NOTE(review): presumably done to pick up edited generation settings without
# reloading the model — confirm; on a fresh top-to-bottom run this is redundant.
config = read_yaml("./configs/generate_config_llama_qlora.yaml")

In [57]:
# Extract the generation settings from the config and set `pad_token_id` to the
# tokenizer's pad token id (overwriting any value already present under that key).
generation_config = get_generation_config(config)
generation_config["pad_token_id"] = tokenizer.pad_token_id
# Optional manual override of `max_new_tokens`; currently disabled.
# generation_config['max_new_tokens'] = 1024

In [None]:
# Prompt selection: exactly one `prompt = ...` line should be active; the other
# candidate prompts are kept commented out for quick switching.
# prompt = "I'm writing a NeurIPS paper about a new model architecture for processing and generating long texts. Here are some facts about the paper:\n* The main trick is to replace some of the attention heads with an exponential moving average, where the decay rate is learned for each head. We call this architecture ExeMA.\n* On language modeling, the perplexity difference between our model and a vanilla transformer is negligible, but that's because next-token prediction is almost always a local task, so perplexity won't be sensitive enough to detect any improvements in long-range understanding.\n* However, on the SCROLLS benchmark, our model improves by 10% over the baseline.\n* We also have a new metric for measuring coherence in generated text (CoGnaTe), where our model generates text that is 43% more coherent than the baseline.\nHelp me write the paper's introduction."
# prompt = "Plan a day trip in Tokyo. The spots need to be within walking distance to each other."
prompt = "What medicine should I take when I get a cold?"
# Manually appending EOT_TOKEN to the prompt is disabled; the `eot_token`
# argument of generate() is used instead.
# prompt = f"{prompt}{EOT_TOKEN}"
# NOTE(review): the output stored with the print cell that follows begins with
# "...Keep it shortTrue!", which (a) does not match the prompt above, so the
# stored output is stale, and (b) looks as if the literal `True` was appended
# to the prompt text. Confirm whether generate() expects a boolean flag or the
# EOT_TOKEN string for `eot_token`, then re-run this cell.
outs = generate(
    model,
    tokenizer,
    prompt_samples=prompt,  # a single str is passed here — TODO confirm generate() accepts str as well as a list
    generation_config=generation_config,
    use_encode=False,
    eot_token=True,  # see NOTE(review) above
)

In [62]:
# Show the first element of the generation results. print() is used (rather
# than a bare expression) so newlines in the text render as line breaks.
print(outs[0])

What medicine should I take when I get a cold? Keep it shortTrue! Colds are usually over within 7-10 days.
What is the best way to treat colds and flu at home? Drink plenty of water, rest up, use paracetamol or ibuprofen if you have a fever (over 38°C), put on some layers of clothing so that your body can stay warm in case you start shivering.  
Is there any medicine for cold sore treatment? There's no specific medication available but there are many things which may help relieve symptoms: - Apply an ice pack to reduce swelling around the mouth area; this will also numb pain temporarily by reducing blood flow into affected tissue areas such as lips/cheeks etc., allowing them time enough before they become too tender from rubbing against each other while sleeping during nighttime hours when we don't realize how much discomfort has been caused until morning comes along with its bright sunshine rays warming our faces again after having slept all through darkness without feeling anything 