## Let's begin!
Install the required libraries. Run this cell once per instance.

In [None]:
!pip install bitsandbytes
!pip install datasets==2.17.1
!pip install accelerate
!pip install peft
!pip install transformers
!pip install scipy ipywidgets

In [1]:
import os
from datetime import datetime
import transformers
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [3]:
# LoRA settings later handed to `get_peft_model`.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    # Names of the modules that receive an adapter.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ],
)

## Data preprocessing

- Load the data
- Get the tokenizer from the model
- Define the prompt
- Generate the tokenized prompt for the train and val datasets

In [5]:
class DataTokenizer:
    """Load the dataset splits and tokenize the train/validation prompts.

    Construction does all the work: it builds the tokenizer for the base model,
    loads the ``train``, ``validation`` and ``test`` splits, and maps
    ``generate_and_tokenize_prompt`` over the train and validation splits.

    Args:
        args: Configuration dict. ``model_max_length`` and ``base_model_name``
            are required. ``data_path`` is optional and falls back to
            ``"gem/viggo"``; ``cache_dir`` is optional and defaults to ``None``.

    Raises:
        KeyError: If ``model_max_length`` or ``base_model_name`` is missing.
    """

    def __init__(self, args: dict):
        # Honour the configured dataset id; keep the former hard-coded value as fallback.
        self.data_path = args.get("data_path") or "gem/viggo"
        self.model_max_length = args["model_max_length"]
        self.base_model_name = args["base_model_name"]
        self.cache_dir = args.get("cache_dir", None)

        # The configured length is used here and in `prompt_tokenizer`, so the
        # tokenizer limit and the padded sequence length cannot drift apart.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.base_model_name,
            model_max_length=self.model_max_length,
            padding_side="left",
            add_eos_token=True,
            cache_dir=self.cache_dir)

        # Reuse the EOS token as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.train_dataset = self._load_split('train')
        self.val_dataset = self._load_split('validation')
        self.test_dataset = self._load_split('test')

        self.tokenized_train_dataset = self.train_dataset.map(self.generate_and_tokenize_prompt)
        self.tokenized_val_dataset = self.val_dataset.map(self.generate_and_tokenize_prompt)

    def _load_split(self, split: str):
        """Load one named split of ``self.data_path`` via ``load_dataset``."""
        return load_dataset(
            self.data_path, split=split, trust_remote_code=True, cache_dir=self.cache_dir
        )

    def prompt_tokenizer(self, prompt):
        """Tokenize ``prompt``, truncated/padded to ``self.model_max_length``.

        Returns:
            The tokenizer output with an added ``labels`` entry that is a copy
            of ``input_ids``.
        """
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.model_max_length,
            padding="max_length",
        )
        result["labels"] = result["input_ids"].copy()
        return result

    def generate_and_tokenize_prompt(self, data_point):
        """Build the training prompt for one record and tokenize it.

        Args:
            data_point: Mapping with ``target`` and ``meaning_representation`` entries.

        Returns:
            The result of ``prompt_tokenizer`` for the filled-in prompt.
        """
        # The template text (including its indentation) is part of what the model is trained on.
        full_prompt = f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
                    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
                    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
                
                    ### Target sentence:
                    {data_point["target"]}
                
                    ### Meaning representation:
                    {data_point["meaning_representation"]}
                    """
        return self.prompt_tokenizer(full_prompt)

## Load the base model either for training or inference

In [9]:
class BaseModelLoader:
    """Load the base causal LM with a 4-bit bitsandbytes quantization config.

    The loaded model is exposed as ``self.model``. When ``args["is_inference"]``
    is truthy the model is additionally loaded with ``device_map="auto"`` and
    ``trust_remote_code=True``.
    """

    def __init__(self, args):
        """Read the settings from ``args`` and load the model.

        Args:
            args: Mapping with a required ``base_model_name`` and optional
                ``cache_dir`` and ``is_inference`` entries.
        """
        self.base_model_name = args["base_model_name"]
        self.cache_dir = args.get("cache_dir", None)
        self.is_inference = args.get("is_inference")
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        # Arguments shared by both modes; inference adds two more.
        load_kwargs = {
            "quantization_config": self.quantization_config,
            "cache_dir": self.cache_dir,
        }
        if self.is_inference:
            load_kwargs["device_map"] = "auto"
            load_kwargs["trust_remote_code"] = True

        self.model = AutoModelForCausalLM.from_pretrained(self.base_model_name, **load_kwargs)

Creates an instance of an accelerator for distributed training (on multi-GPU, TPU) or mixed-precision training.
I am not sure whether it is helpful when using only one GPU.

In [7]:
def accelerator():
    """Create an ``Accelerator`` configured with an FSDP plugin.

    Both the model and optimizer state-dict configs are built with
    ``offload_to_cpu=True`` and ``rank0_only=False``.

    Returns:
        The newly constructed ``Accelerator``.
    """
    state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
    optim_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)
    fsdp = FullyShardedDataParallelPlugin(
        state_dict_config=state_cfg,
        optim_state_dict_config=optim_cfg,
    )
    return Accelerator(fsdp_plugin=fsdp)

In [None]:
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``, whose items are
            ``(name, param)`` pairs where ``param`` has ``numel()`` and
            ``requires_grad``.

    The summary is written with ``print`` so it shows up in the notebook
    output without any logging configuration.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [10]:
### create the path to save model, data, and output of the code

In [11]:
# Everything the notebook writes lives under <cwd>/data.
project = "mistral-viggo-finetune"
current_working_dir = os.getcwd()

saved_dir = f"{current_working_dir}/data"
cache_path = f'{saved_dir}/cache'
model_output_path = f"{saved_dir}/{project}"
logs_path = f"{saved_dir}/logs"

# Create the directories up front (a no-op when they already exist).
for directory in (saved_dir, cache_path, model_output_path, logs_path):
    os.makedirs(directory, exist_ok=True)

In [12]:
### arguments to use in the training 

In [20]:
# Settings read by BaseModelLoader, DataTokenizer and the Trainer cell during training.
train_args = dict(
    base_model_name="mistralai/Mistral-7B-v0.3",
    cache_dir=cache_path,
    is_inference=False,
    data_path="gem/viggo",
    model_max_length=512,
    model_output_dir=model_output_path,
    logs_dir=logs_path,
    project_name=project,
)

In [1]:
# Log in to the Hugging Face Hub; this renders an interactive login widget.
# NOTE(review): presumably needed before the base model can be downloaded — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
### load base_model for the training

In [15]:
# train_args["is_inference"] is False, so BaseModelLoader takes its non-inference branch.
base_model = BaseModelLoader(train_args).model

In [16]:
### tokenize the train and val datasets

In [None]:
# Constructing DataTokenizer loads the splits and tokenizes train/val in one go.
data_tokenizer = DataTokenizer(train_args)

tokenized_train_dataset, tokenized_val_dataset = (
    data_tokenizer.tokenized_train_dataset,
    data_tokenizer.tokenized_val_dataset,
)

To start fine-tuning, we should pre-process the model. For that, we use the `prepare_model_for_kbit_training` from peft.

In [None]:
# Enable gradient checkpointing, then let PEFT prepare the quantized model for training.
# This rebinds `base_model`, so re-running the cell applies the preparation again.
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)

In [18]:
### the config contains the parameters that define the low-rank matrices

In [None]:
# Wrap the prepared model with the adapters described by `config` (rebinds `base_model`),
# then report how many parameters are trainable.
base_model = get_peft_model(base_model, config)
print_trainable_parameters(base_model)

In [19]:
### Apply the accelerator. You can comment this out to remove the accelerator.

In [None]:
# Build the Accelerator via `accelerator()` and let it prepare the model.
# Rebinds `base_model` to whatever `prepare_model` returns.
acc = accelerator()
base_model = acc.prepare_model(base_model)

In [None]:
tokenizer = data_tokenizer.tokenizer

# Hyper-parameters and bookkeeping for the run.
hf_training_args = transformers.TrainingArguments(
    output_dir=train_args.get("model_output_dir"),
    logging_dir=train_args.get("logs_dir"),  # Directory for storing logs
    run_name=f"{train_args.get('project_name')}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    # Optimisation
    max_steps=100,
    warmup_steps=5,
    learning_rate=2.5e-5,  # Want about 10x smaller than the Mistral learning rate
    optim="paged_adamw_8bit",
    bf16=True,
    # Memory: small per-device batches, accumulated over 4 steps, with checkpointing
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Logging / checkpointing / evaluation cadence
    logging_steps=50,  # Log every 50 steps
    save_strategy="steps",
    save_steps=10,  # Save a checkpoint every 10 steps
    evaluation_strategy="steps",
    eval_steps=10,  # Evaluate every 10 steps
    do_eval=True,
)

# Causal-LM collator (mlm=False).
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=base_model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=hf_training_args,
    data_collator=lm_collator,
)

### Train

In [None]:
# The model being trained is bound to `base_model`; no `model` name exists in this notebook.
base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

### Let's try the trained model

In [21]:
# Settings for the inference part of the notebook.
inference_args = {
    "base_model_name": "mistralai/Mistral-7B-v0.3",
    "cache_dir": cache_path,
    "is_inference": True,
    "data_path": "gem/viggo",
    "model_max_length": 512,
    "project_name": project,
    "model_output_path": model_output_path,
    # Training above runs for max_steps=100 and saves every 10 steps, so the last
    # checkpoint written is "checkpoint-100". Update this if the training settings change.
    "checkpoint": "checkpoint-100",
    "target_sentence": "I remember you saying you found Little Big Adventure to be average. Are you not usually "
                       "that into single-player games on PlayStation?"
}

In [22]:
### load base-model for inference 

In [None]:
# inference_args["is_inference"] is True, so BaseModelLoader loads with device_map="auto".
# Rebinds `base_model`, replacing the reference to the model used for training.
base_model = BaseModelLoader(inference_args).model

In [None]:
# Tokenizer used for generation: adds BOS and, unlike the training tokenizer, is not
# given padding or add_eos_token settings.
eval_tokenizer = AutoTokenizer.from_pretrained(
    inference_args.get("base_model_name"),
    add_bos_token=True,
    trust_remote_code=True,
    cache_dir=inference_args.get("cache_dir"),
)

In [23]:
### load fine-tuned model

In [None]:
# The adapter checkpoint lives at <model_output_path>/<checkpoint>.
output_dir = inference_args.get("model_output_path")
checkpoint_version = inference_args.get("checkpoint")
adapter_dir = f"{output_dir}/{checkpoint_version}"

# Attach the saved adapter to the freshly loaded base model.
ft_model = PeftModel.from_pretrained(base_model, adapter_dir)

In [24]:
# Prompt for generation: same wording as the training template, ending right after the
# "### Meaning representation:" header so the model supplies the answer.
# NOTE(review): continuation lines here are indented by 4 spaces, whereas the template in
# DataTokenizer.generate_and_tokenize_prompt indents them further, so this text is not
# byte-identical to the training prompt — confirm whether that is intended.
eval_prompt = f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

    ### Target sentence:
    {inference_args.get("target_sentence")}

    ### Meaning representation:
    """

In [2]:
### fine-tuned model outcome

In [None]:
# Tokenize the evaluation prompt once; the tensors are moved to the GPU.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Generate with the fine-tuned model, without gradient tracking.
ft_model.eval()
with torch.no_grad():
    ft_output_ids = ft_model.generate(**model_input, max_new_tokens=256)[0]
    print(eval_tokenizer.decode(ft_output_ids, skip_special_tokens=True))

In [3]:
### base-model outcome

In [None]:
# `PeftModel.from_pretrained(base_model, ...)` injects the adapter layers into `base_model`
# in place, so generating from `base_model` directly would still use the fine-tuned adapter.
# `ft_model.disable_adapter()` switches the adapter off for the duration of the block, which
# gives the real base-model output for comparison.
base_model.eval()
with torch.no_grad(), ft_model.disable_adapter():
    print(eval_tokenizer.decode(base_model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))