In [1]:
from transformers import AutoTokenizer, AutoConfig, LlamaTokenizerFast
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model, BaseQuantizeConfig

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ATTR

torch.backends.cuda.matmul.allow_tf32 = True

import argparse

parser = argparse.ArgumentParser(description="""
How load ft Llama2 model, with PAD token.""")
parser.add_argument('--load_emb', type=bool, default=True,
                    help="Load the embedding tokens layer with LoRA.")
parser.add_argument('--load_model_config', type=bool, default=True,
                    help="Load the config of the model.")         
parser.add_argument('--load_tokenizer', type=bool, default=True,
                    help="Save the tokenizer.")
parser.add_argument('--seed', type=int, default=123,
                    help="Seed.")

args = parser.parse_args([])

In [3]:
# OTHERS

# ------------------

save_path = './ft_llama2'

# ------------------

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

# ------------------

CHECK = True

In [4]:
import random
import numpy as np

import transformers

def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed).

    Args:
        seed (`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available
    transformers.set_seed(seed)

from typing import Optional, Dict, Sequence

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

In [5]:
# MODEL

# ------------------

set_seed(args.seed)

# ------------------

model_name_or_path = "TheBloke/Luna-AI-Llama2-Uncensored-GPTQ"
model_basename = "gptq_model-4bit-128g"

use_triton = True

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        # revision="gptq-8bit-64g-actorder_True",
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=False,
        device="cuda:0",
        use_triton=use_triton,
        inject_fused_mlp=False,
        quantize_config=None)

if args.load_model_config:
    model_config = AutoConfig.from_pretrained(f'{save_path}_model_config.json')
    model.config = model_config

The safetensors archive passed at /home/lazzaro/.cache/huggingface/hub/models--TheBloke--Luna-AI-Llama2-Uncensored-GPTQ/snapshots/67eb2ec5cad2c73dd7cb63e4acab78f7acae164c/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


In [6]:
# TOKENIZER

if args.load_tokenizer:
    tokenizer_id = f'{save_path}_tokenizer'

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_id,
        padding_side="right",
        use_fast=True,
    )

    # it is necessary to resizing the emb
    model.resize_token_embeddings(len(tokenizer))

else:
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        padding_side="right",
        use_fast=True,
    )
    
    if tokenizer.pad_token == '<unk>':
        # print('AAA')
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )

        # 1. Non sembra essere questo il problema...
        # 2. Inoltre secondo me non corretto mettere l'unk_token come pad...
        if isinstance(tokenizer, LlamaTokenizerFast):
            # LLaMA tokenizer may not have correct special tokens set.
            # Check and add them if missing to prevent them from being parsed into different tokens.
            # Note that these are present in the vocabulary. 
            # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
            tokenizer.add_special_tokens(
                {
                    "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
                    "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
                    "unk_token": tokenizer.convert_ids_to_tokens(model.config.pad_token_id),
                }
            )

model.config.pad_token_id = tokenizer.pad_token_id

In [7]:
if CHECK:
    print(model.model.model.embed_tokens.weight.data.shape)
    print(model.model.model.embed_tokens.weight.data)

torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 2.3651e-03, -3.4142e-03,  9.9373e-04,  ..., -8.8348e-03,
          2.5311e-03, -3.8948e-03],
        [ 1.0674e-02,  1.0468e-02, -5.1956e-03,  ...,  2.9011e-03,
          6.0844e-04, -4.6196e-03],
        ...,
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05],
        [-2.3209e-02,  1.5396e-02,  5.8632e-03,  ...,  2.2049e-02,
          1.6346e-03,  1.6388e-02]], device='cuda:0', dtype=torch.float16)


In [8]:
# MODEL ID AND LOAD

if args.load_emb:
    model_id = f'{save_path}_emb_model'
else:
    model_id = f'{save_path}_model'

model = get_gptq_peft_model(model, model_id=model_id)

In [9]:
if CHECK:
    if args.load_emb:
        print('ORIGINAL')
        print(model.model.model.embed_tokens.original_module.weight.data.shape)
        print(model.model.model.embed_tokens.original_module.weight.data)
        print('SAVED')
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data.shape)
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data)
    else:
        print(model.model.model.embed_tokens.weight.data.shape)
        print(model.model.model.embed_tokens.weight.data)
    
    print('LORA A')
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data.shape)
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data)

ORIGINAL
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 2.3651e-03, -3.4142e-03,  9.9373e-04,  ..., -8.8348e-03,
          2.5311e-03, -3.8948e-03],
        [ 1.0674e-02,  1.0468e-02, -5.1956e-03,  ...,  2.9011e-03,
          6.0844e-04, -4.6196e-03],
        ...,
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05],
        [-2.3209e-02,  1.5396e-02,  5.8632e-03,  ...,  2.2049e-02,
          1.6346e-03,  1.6388e-02]], device='cuda:0', dtype=torch.float16)
SAVED
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [-1.8291e-03, -5.7182e-03, -1.0538e-03,  ..., -6.0844e-03,
          6.8054e-03, -8.3542e-03],
        [ 6.4583e-03,  6.3248e-03, -9.1934e-04,  ..., -1.

In [10]:
# prove

from langchain import PromptTemplate

template_input= """Traduci il testo dall'inglese all'italiano.

EN: {query}

IT:"""

prompt_template_input = PromptTemplate(
    input_variables=["query"],
    template=template_input
)

query = "To believe, to dare, to get their imagination unlocked for what used to be impossible."

prompt = prompt_template_input.format(query=query)

In [11]:
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.4, max_new_tokens=100)

print(tokenizer.decode(output[0]))

<s> Traduci il testo dall'inglese all'italiano.

EN: To believe, to dare, to get their imagination unlocked for what used to be impossible.

IT: Avere, avere, avere la testa un'idea, per credere, per sostenere, per farsi spazio per ciò che era impossibile.</s>
