In [None]:
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from deepmultilingualpunctuation import PunctuationModel

In [None]:
# Notebook-wide settings read by the loading cells below.
# Base checkpoint identifier handed to the model and tokenizer loaders.
model_name = "NousResearch/Meta-Llama-3-8B"
# Device placement passed to the base-model loader.
device_map = "cuda:0"
# Adapter location handed to PeftModel.from_pretrained.
# NOTE(review): left empty here -- presumably must be set to the fine-tuned
# adapter path/repo before running; confirm.
new_model = ""

In [None]:
# ------------------------------------------------------------------------------
# bitsandbytes parameters (consumed when building the BitsAndBytesConfig)
# ------------------------------------------------------------------------------

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models; resolved later as an attribute of `torch`
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
# Build the quantization configuration used when loading the base model.
# The dtype is configured as a string name and looked up on the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
)

In [None]:
# Check GPU compatibility with bfloat16.
# This is purely informational: it only prints a hint when float16 was chosen
# as the 4-bit compute dtype but the GPU reports compute capability >= 8.
# torch.cuda.get_device_capability() needs a usable CUDA device, so the check
# is skipped when CUDA is unavailable rather than failing on a hint message.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the base checkpoint with the quantization config built above, placed
# according to `device_map`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Config overrides applied right after loading.
# NOTE(review): use_cache=False is commonly a training-time setting; confirm
# it is intended for this generation-only notebook.
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False

# Load tokenizer
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to be executed while loading; confirm it is required for
# `model_name` and drop it otherwise.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# This notebook only generates text with a causal LM, so pad on the left:
# when several prompts are batched, generation then continues from real prompt
# tokens instead of from padding. A single prompt (as used below) needs no
# padding, so its output is unaffected by this setting.
tokenizer.padding_side = "left"

In [None]:
# Attach the adapter found at `new_model` to the base model and immediately
# merge it; the object returned by merge_and_unload() is what the rest of the
# notebook uses as `model`.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

In [None]:
# Characters treated as the end of a sentence when trimming generated text.
punctuation_ends = list(".!?")

In [None]:
def output_formating(text, punc_model, sentence_ends=None):
    """Restore punctuation in ``text`` and trim it to its last complete sentence.

    Steps:
      1. Every ``'...'`` is swapped for the placeholder ``'--'`` so the dots of
         an ellipsis are not mistaken for a sentence end.
      2. ``punc_model.restore_punctuation`` is applied (skipped for blank text).
      3. Everything after the last sentence-ending mark is dropped, i.e. a
         trailing unfinished sentence is removed.
      4. The placeholder is turned back into ``'...'``.

    If no sentence-ending mark exists, the whole text is kept and an ellipsis
    is appended to signal that it trails off -- unless it already ends in one.

    Args:
        text: The raw text to clean up.
        punc_model: Object exposing ``restore_punctuation(str) -> str``.
        sentence_ends: Iterable of strings that terminate a sentence. Defaults
            to the module-level ``punctuation_ends``.

    Returns:
        The formatted string.

    Note:
        A literal ``'--'`` already present in ``text`` is indistinguishable
        from the placeholder and is also returned as ``'...'``.
    """
    if sentence_ends is None:
        sentence_ends = punctuation_ends

    ellipsis, placeholder = '...', '--'

    # replace the ellipsis with the placeholder
    text = text.replace(ellipsis, placeholder)

    # restore the punctuation; there is nothing to restore in blank text, so
    # the model is not invoked for it
    if text.strip():
        text = punc_model.restore_punctuation(text)

    # locate the end of the last complete sentence (0 means "none found")
    cut = 0
    for mark in sentence_ends:
        pos = text.rfind(mark) if mark else -1
        if pos != -1:
            cut = max(cut, pos + len(mark))

    if cut:
        # drop the unfinished tail and restore the ellipsis
        return text[:cut].replace(placeholder, ellipsis)

    # no sentence end at all: keep everything and mark it as trailing off,
    # without doubling an ellipsis that is already there
    restored = text.replace(placeholder, ellipsis)
    if restored.endswith(ellipsis):
        return restored
    return restored + ellipsis

In [None]:
def generate(prompt, pipe, **generation_overrides):
    """Run ``pipe`` on ``prompt`` and return the first generated text.

    Args:
        prompt: Prompt passed as the first positional argument to ``pipe``.
        pipe: Callable invoked as ``pipe(prompt, **kwargs)``; it must return a
            sequence whose first item is a mapping with a
            ``'generated_text'`` entry.
        **generation_overrides: Optional keyword arguments that replace or
            extend the default generation settings listed below. Calling
            without overrides sends exactly the original settings.

    Returns:
        ``result[0]['generated_text']`` from the pipeline output.
    """
    # Default generation settings.
    # NOTE(review): penalty_alpha is the contrastive-search knob in
    # transformers and is presumably ignored while do_sample=True -- confirm
    # which decoding strategy is intended.
    # NOTE(review): no_repeat_ngram_size=1 forbids repeating any single token;
    # confirm this is deliberate.
    generation_kwargs = dict(
        do_sample=True,
        temperature=0.7,
        penalty_alpha=0.65,
        top_k=4,
        max_length=64,
        num_return_sequences=1,
        no_repeat_ngram_size=1,
    )

    # max_length and max_new_tokens both bound the output length; when the
    # caller picks max_new_tokens, do not also send the default max_length.
    if 'max_new_tokens' in generation_overrides and 'max_length' not in generation_overrides:
        del generation_kwargs['max_length']

    generation_kwargs.update(generation_overrides)

    result = pipe(prompt, **generation_kwargs)
    return result[0]['generated_text']

In [None]:
# Only let ERROR-level messages through the transformers logging helper.
logging.set_verbosity(logging.ERROR)

# Punctuation-restoration model, built with its default constructor arguments.
punc_model = PunctuationModel()

# Text-generation pipeline wrapping the merged model and its tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Demo: generate a continuation for one prompt, then clean it up for display.
prompt = "Tell me about yourself."

# Raw text returned by the generation pipeline.
result = generate(prompt=prompt, pipe=pipe)

# Punctuation restored and trimmed by output_formating.
pruned_result = output_formating(text=result, punc_model=punc_model)

print(pruned_result)