In [1]:
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

from peft import prepare_model_for_kbit_training, TaskType

from auto_gptq.utils.peft_utils import GPTQLoraConfig

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model, BaseQuantizeConfig

from peft.tuners.lora import LoraLayer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ATTR

torch.backends.cuda.matmul.allow_tf32 = True

import argparse

parser = argparse.ArgumentParser(description="""
How ft Llama2 model, with PAD token.""")
parser.add_argument('--save_emb', type=bool, default=True,
                    help="Save the embedding tokens layer with LoRA.")
parser.add_argument('--save_model_config', type=bool, default=True,
                    help="Save the config of the model.")         
parser.add_argument('--save_tokenizer', type=bool, default=True,
                    help="Save the tokenizer.")
parser.add_argument('--fp16', type=bool, default=False,
                    help="fp16.")
parser.add_argument('--bf16', type=bool, default=False,
                    help="bf16.")
parser.add_argument('--max_memory_MB', type=int, default=24000,
                    help="Free memory per gpu.")
parser.add_argument('--gradient_checkpointing', type=bool, default=True,
                    help="Use gradient checkpointing. You want to use this.")
parser.add_argument('--seed', type=int, default=123,
                    help="Seed.")

args = parser.parse_args([])

In [3]:
# OTHERS

# ------------------

save_path = './ft_llama2'

# ------------------

CHECK = True

# ------------------

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

# ------------------

DATA_TRAIN_LENGTH = 1000

# ------------------

R_LORA = 64
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# ------------------

CXT_LENGHT = 256

# ------------------

MICRO_BATCH_SIZE = 32
BATCH_SIZE = 512
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 1
# LEARNING_RATE = 3e-5
LEARNING_RATE = 3e-3

In [4]:
import random
import numpy as np

import transformers

def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed).

    Args:
        seed (`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available
    transformers.set_seed(seed)

from typing import Optional, Dict, Sequence

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

In [5]:
# MODEL

# ------------------

set_seed(args.seed)

# ------------------

n_gpus = torch.cuda.device_count()
max_memory = f'{args.max_memory_MB}MB'
max_memory = {i: max_memory for i in range(n_gpus)}

model_name_or_path = "TheBloke/Luna-AI-Llama2-Uncensored-GPTQ"
model_basename = "gptq_model-4bit-128g"

quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device_map='auto',
    # max_memory=max_memory,
    inject_fused_attention = True,
    inject_fused_mlp = False,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    quantize_config=quantize_config # for BaseQuantizeConfig from pretrained
)

## otherwise, set quantize_config in this way
# model.model.quantize_config = model.quantize_config

model.train()

model.config.torch_dtype=(torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

The safetensors archive passed at /home/lazzaro/.cache/huggingface/hub/models--TheBloke--Luna-AI-Llama2-Uncensored-GPTQ/snapshots/67eb2ec5cad2c73dd7cb63e4acab78f7acae164c/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


In [6]:
# TOKENIZER

from transformers import AutoTokenizer, LlamaTokenizerFast

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side="right",
    use_fast=True,
)

if tokenizer.pad_token == '<unk>':
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )
    
if isinstance(tokenizer, LlamaTokenizerFast):
    # LLaMA tokenizer may not have correct special tokens set.
    # Check and add them if missing to prevent them from being parsed into different tokens.
    # Note that these are present in the vocabulary. 
    # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
    tokenizer.add_special_tokens(
        {
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(model.config.pad_token_id),
        }
    )

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [7]:
if args.save_emb:
    # target_modules: default all linear layers of att, in our case (llama2): "up_proj", "down_proj", "qkv_proj", "gate_proj", "o_proj"

    config = GPTQLoraConfig(
        r=R_LORA,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        task_type="CAUSAL_LM",
        bias="none",
        modules_to_save=['embed_tokens']
    )
else:
    config = GPTQLoraConfig(
        r=R_LORA,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        task_type="CAUSAL_LM",
        bias="none",
    )

model = get_gptq_peft_model(model, config, auto_find_all_linears=True, train_mode=True)

if args.gradient_checkpointing:
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)


for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
        if args.bf16:
            module = module.to(torch.bfloat16)
    if 'norm' in name:
        module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
        if hasattr(module, 'weight'):
            if args.bf16 and module.weight.dtype == torch.float32:
                module = module.to(torch.bfloat16)

model.config.pad_token_id = tokenizer.pad_token_id

model.config.use_cache = False
model.print_trainable_parameters()

trainable params: 405,282,816 || all params: 1,347,485,696 || trainable%: 30.07696610087058


In [8]:
if CHECK:
    if args.save_emb:
        print('ORIGINAL')
        print(model.model.model.embed_tokens.original_module.weight.data.shape)
        print(model.model.model.embed_tokens.original_module.weight.data)
        print('SAVED')
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data.shape)
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data)
    else:
        print(model.model.model.embed_tokens.weight.data.shape)
        print(model.model.model.embed_tokens.weight.data)
    
    print('LORA A')
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data.shape)
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data)

ORIGINAL
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 2.3651e-03, -3.4142e-03,  9.9373e-04,  ..., -8.8348e-03,
          2.5311e-03, -3.8948e-03],
        [ 1.0674e-02,  1.0468e-02, -5.1956e-03,  ...,  2.9011e-03,
          6.0844e-04, -4.6196e-03],
        ...,
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05],
        [ 7.0462e-04,  6.7896e-04, -4.5277e-04,  ..., -4.3376e-04,
         -8.9103e-05, -4.8357e-04]], device='cuda:0')
SAVED
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 2.3651e-03, -3.4142e-03,  9.9373e-04,  ..., -8.8348e-03,
          2.5311e-03, -3.8948e-03],
        [ 1.0674e-02,  1.0468e-02, -5.1956e-03,  ...,  2.9011e-03,
          6

In [9]:
# DATASET

from datasets import load_dataset

dataset = load_dataset("opus100", "en-it")

from langchain import PromptTemplate

def template_prompt(eos_token):

    template_input_en_it = """Traduci il testo dall'inglese all'italiano.

    EN: {query}

    IT: {answer}{eos_token}'"""

    prompt_template_en_it = PromptTemplate(
        input_variables=["query", "answer", "eos_token"],
        template=template_input_en_it
    )

    template_input_it_en = """Translate the text from italian to english.

    IT: {query}

    EN: {answer}{eos_token}"""

    prompt_template_it_en = PromptTemplate(
        input_variables=["query", "answer", "eos_token"],
        template=template_input_it_en
    )

    def generate_prompt(data):
        
        rand = random.choice([0, 1])

        if(rand == 0):
            return prompt_template_en_it.format(query=data["translation"]['en'], answer=data["translation"]['it'], eos_token=eos_token)
        else:
            return prompt_template_it_en.format(query=data["translation"]['it'], answer=data["translation"]['en'], eos_token=eos_token)
    
    return generate_prompt

generate_prompt_funct = template_prompt(tokenizer.eos_token)

def tok_function(prompt):

    # print(prompt)

    return tokenizer(
            prompt,
            truncation=True,
            max_length=CXT_LENGHT + 1,
            # return_overflowing_tokens=True,
            # return_length=True,
            padding=True
        )

data_train = dataset['train'].shuffle().select(range(DATA_TRAIN_LENGTH)).map(lambda data: tok_function(generate_prompt_funct(data)), remove_columns=dataset['train'].column_names)
data_eval = dataset['validation'].shuffle().map(lambda data: tok_function(generate_prompt_funct(data)), remove_columns=dataset['validation'].column_names)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 6734.72 examples/s]


In [10]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_eval,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=0,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=1,
        output_dir="FIX_pad_trans_outputs",
        optim="paged_adamw_32bit",
        save_total_limit=3,
        save_strategy="steps",
        eval_steps=1
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False

In [11]:
trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 50%|█████     | 1/2 [03:37<03:37, 217.96s/it]

{'loss': 2.8979, 'learning_rate': 0.0015, 'epoch': 0.5}


100%|██████████| 2/2 [06:54<00:00, 207.11s/it]

{'loss': 2.1942, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 414.2142, 'train_samples_per_second': 2.414, 'train_steps_per_second': 0.005, 'train_loss': 2.5460572242736816, 'epoch': 1.0}





TrainOutput(global_step=2, training_loss=2.5460572242736816, metrics={'train_runtime': 414.2142, 'train_samples_per_second': 2.414, 'train_steps_per_second': 0.005, 'train_loss': 2.5460572242736816, 'epoch': 1.0})

In [12]:
# print require grad

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

print('\n-------------------------\n')

print(model)

base_model.model.model.embed_tokens.original_module.weight torch.Size([32001, 4096])
base_model.model.model.embed_tokens.modules_to_save.default.weight torch.Size([32001, 4096])
base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.default.weight torch.Size([64, 4096])
base_model.model.model.layers.0.self_attn.qkv_proj.lora_B.default.weight torch.Size([12288, 64])
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.Size([64, 4096])
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 64])
base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.Size([64, 11008])
base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.Size([4096, 64])
base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight torch.Size([64, 4096])
base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 64])
base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight torch.Siz

In [13]:
# SAVE

if args.save_emb:
    model.save_pretrained(f'{save_path}_emb_model')
else:
    model.save_pretrained(f'{save_path}_model')

if args.save_model_config: model.config.to_json_file(f'{save_path}_model_config.json')

if args.save_tokenizer: tokenizer.save_pretrained(f'{save_path}_tokenizer')

In [14]:
if CHECK:
    if args.save_emb:
        print('ORIGINAL')
        print(model.model.model.embed_tokens.original_module.weight.data.shape)
        print(model.model.model.embed_tokens.original_module.weight.data)
        print('SAVED')
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data.shape)
        print(model.model.model.embed_tokens.modules_to_save.default.weight.data)
    else:
        print(model.model.model.embed_tokens.weight.data.shape)
        print(model.model.model.embed_tokens.weight.data)
    
    print('LORA A')
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data.shape)
    print(model.model.model.layers[0].self_attn.qkv_proj.lora_A.default.weight.data)

ORIGINAL
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [ 2.3651e-03, -3.4142e-03,  9.9373e-04,  ..., -8.8348e-03,
          2.5311e-03, -3.8948e-03],
        [ 1.0674e-02,  1.0468e-02, -5.1956e-03,  ...,  2.9011e-03,
          6.0844e-04, -4.6196e-03],
        ...,
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05],
        [ 7.0462e-04,  6.7896e-04, -4.5277e-04,  ..., -4.3376e-04,
         -8.9103e-05, -4.8357e-04]], device='cuda:0')
SAVED
torch.Size([32001, 4096])
tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
         -6.5565e-06,  8.9407e-07],
        [-1.8287e-03, -5.7180e-03, -1.0537e-03,  ..., -6.0834e-03,
          6.8041e-03, -8.3523e-03],
        [ 6.4600e-03,  6.3234e-03, -9.1933e-04,  ..., -1.3286e-03,
          5