In [1]:
!pip install --upgrade transformers
!pip install --upgrade datasets
!pip install --upgrade accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install peft
!pip install rouge_score
!pip install --upgrade evaluate

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.1.0
    Uninstalling datasets-2.1.0:
      Successfully uninstalled datasets-2.1.0
Successfully installed datasets-2.13.1
Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.3
    Uninstalling accelerate-0.20.3:
      Successfully uninstalled accelerate-0.20.3
Successfully installed accelerate-0.21.0
Looking in indexes: https://test.pypi.org/simple/
Collecting bitsandbytes

In [2]:
import transformers
from transformers import  BloomForCausalLM, AutoTokenizer
from transformers import get_scheduler

import datasets
from datasets import load_dataset

import torch
from torch.optim import AdamW
from typing import Union

import matplotlib.pyplot as plt

from contextlib import nullcontext
from torch.cuda.amp import GradScaler, autocast

import evaluate

from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel

from tqdm.auto import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 6.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so...


  warn(msg)
  warn(msg)
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
mixed_precision_dtype = torch.float16 if device.type == "cuda" else torch.bfloat16
ctx = torch.amp.autocast(device_type = device.type, dtype=mixed_precision_dtype)
scaler = GradScaler()

# **Build Modules**

### **Config tokenizer and model**

In [5]:
class Config:
    def __init__(self,
                 device):
        self.device = device
    
    def tokenizer(self, model_checkpoint):
        tok = AutoTokenizer.from_pretrained(model_checkpoint)
        return tok
    
    def load_pretrained_model(self, model_checkpoint):
        model = BloomForCausalLM.from_pretrained(model_checkpoint)
        return model.to(self.device)
    
    def add_lora(self, model, r: int, lora_alpha: int, lora_dropout: float):
        lora_config = LoraConfig(r = r,
                                 lora_alpha = lora_alpha,
                                 lora_dropout = lora_dropout,
                                 bias = "none",
                                 task_type = "CAUSAL_LM")
        lora_model = get_peft_model(model, lora_config)
        return lora_model

### **Create prompt**

In [6]:
class Prompter(object):
    __slots__ = ("template")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self.template = {
            "prompt_input": "Dưới đây là một Instruction mô tả một nhiệm vụ, được ghép nối với một Input cung cấp thêm ngữ cảnh. Viết một Response hoàn thành yêu cầu một cách thích hợp.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
            "prompt_no_input": "Dưới đây là một Instruction mô tả một nhiệm vụ. Việt một Response hoàn thành yêu cầu một cách thích hợp.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
            "response_split": "### Response:"
        }

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()

### **Data processing and analysis**

In [7]:
class DataProcess:
    def __init__(self,
                 data_path,
                 tokenizer):
        self.data_path = data_path
        self.tokenizer = tokenizer
        
    def load_data(self):
        dataset = load_dataset(self.data_path, "vi", split = "train")
        return dataset
    
    def statistical(self, dataset, prompter):
        prompt_len = []
        for line in dataset:
            full_prompt = prompter.generate_prompt(line["instruction"],
                                                   line["input"],
                                                   line["output"])
            prompt_len.append(len(self.tokenizer.encode(full_prompt)))
        return prompt_len
    
    def draw(self, prompt_len):
        freq = {}
        for num in prompt_len:
            if num in freq:
                freq[num] += 1
            else:
                freq[num] = 1
        
        max_freq = 0
        max_keys = None

        for k in freq.keys():
            if freq[k] >= max_freq:
                max_freq = freq[k]
                max_keys = k
                
        fig = plt.figure(figsize = ((8, 5)))
        plt.bar(freq.keys(), freq.values(), width = 0.6)   
        plt.xlabel("Prompt length")
        plt.ylabel("Prompt length frequency")
        plt.show()
        
        print("Length occupies the most frequency: ", max_keys)
        print("Maximum frequency: ", max_freq)

### **Create input for the model**

In [8]:
class MODEL_INPUTS:
    def __init__(self,
                 prompter,
                 tokenizer,
                 max_length: int):
        self.prompter = prompter
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def tokenize(self, prompt, add_eos_token = True):
        result = self.tokenizer(prompt,
                                truncation = True,
                                max_length = self.max_length,
                                padding = True,
                                return_tensors = None)
        if (   
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < self.max_length
            and add_eos_token
        ):
            
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()
        return result
        
    def generate_and_tokenize_prompt(self, dataset):
        full_prompt = self.prompter.generate_prompt(dataset["instruction"],
                                                    dataset["input"],
                                                    dataset["output"])
        
        tokenized_full_prompt = self.tokenize(full_prompt)
        return tokenized_full_prompt
    
    def prepare_dataloader(self,
                           train_data,
                           valid_data,
                           batch_size: int):
        
        train_dataloader = torch.utils.data.DataLoader(dataset = train_data,
                                                       batch_size = batch_size,
                                                       collate_fn = transformers.DataCollatorForSeq2Seq(tokenizer = self.tokenizer,
                                                                                                        padding = True,
                                                                                                        return_tensors = "pt"))
        valid_dataloader = torch.utils.data.DataLoader(dataset = valid_data,
                                                       batch_size = batch_size,
                                                       collate_fn = transformers.DataCollatorForSeq2Seq(tokenizer = self.tokenizer,
                                                                                                        padding = True,
                                                                                                        return_tensors = "pt"))
        return train_dataloader, valid_dataloader

### **Evaluate and Test**

In [9]:
class EVALUATEandTEST:
    def __init__(self,
                 tokenizer,
                 device,
                 metrics,
                 prompter,
                 ctx):
        self.tokenizer = tokenizer
        self.device = device
        self.metrics = metrics
        self.prompter = prompter
        self.ctx = ctx
        
    def evaluate(self,
                 model,
                 dataset,
                 gen_mode: bool = False,
                 samples_gen: int = None,
                 samples_eval: int = None):
        model.eval()
        total_loss = 0
        predicted_texts, correct_texts = [], []
        current_gen_mode = gen_mode
        for i, batch in enumerate(dataset):
            batch = {k:v.to(self.device) for k, v in batch.items()}
            with torch.no_grad():
                with self.ctx:
                    outputs = model(input_ids = batch["input_ids"],
                                    attention_mask = batch["attention_mask"],
                                    labels = batch["labels"],
                                    return_dict = True)
            loss = outputs.loss
            total_loss += loss.item()
            
            if current_gen_mode is True:
                outputs_gen = model.generate(input_ids = batch["input_ids"],
                                             attention_mask = batch["attention_mask"],
                                             top_k = 40,
                                             no_repeat_ngram_size = 3,
                                             num_beams = 1,
                                             max_new_tokens = 256,
                                             bos_token_id = self.tokenizer.bos_token_id,
                                             eos_token_id = self.tokenizer.eos_token_id,
                                             pad_token_id = self.tokenizer.pad_token_id,
                                             early_stopping = True)
                
                generate_batch = self.tokenizer.batch_decode(outputs_gen, skip_special_tokens = True)
                correct_batch = self.tokenizer.batch_decode(batch["input_ids"], skip_special_tokens = True)
                
                for j in len(generate_batch):       
                    prompt = generate_batch[j]
                    response = self.prompter.get_response(prompt)
                    generate_batch[j] = response
                        
                for k in len(correct_batch):
                    prompt = correct_batch[k]
                    response = self.prompter.get_response(prompt)
                    correct_batch[k] = response
                    
                predicted_texts += generate_batch
                correct_texts += correct_batch
                
            if samples_gen is not None:
                if i >= samples_gen:
                    current_gen_mode = False
            
            if samples_eval is not None:
                if i >= samples_eval:
                    break
                
        if gen_mode is True:
            rouge = self.metrics.compute(predictions = predicted_texts,
                                         references = correct_texts)
        
            return {"rouge1": rouge["rouge1"],
                    "rouge2": rouge["rouge2"],
                    "rougeL": rouge["rougeL"],
                    "rougeLsum": rouge["rougeLsum"],
                    "loss": total_loss/(samples_eval + 1 if samples_eval is not None else len(dataset))}
     
        else:
            return {"loss": total_loss/(samples_eval + 1 if samples_eval is not None else len(dataset))}
    
    def test(self,
             model,
             instruction: str,
             input: str = None,
             label: str = None):
        
        prompt = self.prompter.generate_prompt(instruction = instruction, input = input)
        inputs = self.tokenizer(prompt, return_tensors = "pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        outputs = model.generate(input_ids = inputs["input_ids"],
                                 attention_mask = inputs["attention_mask"],
                                 max_new_tokens = 256,
                                 no_repeat_ngram_size = 3,
                                 num_beams = 1,
                                 top_k = 40,
                                 bos_token_id = self.tokenizer.bos_token_id,
                                 eos_token_id = self.tokenizer.eos_token_id,
                                 pad_token_id = self.tokenizer.pad_token_id,
                                 early_stopping = True)
        text = self.tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]
        response = self.prompter.get_response(text)
        if label is not None:
            return {"label": label,
                    "response": response}
        else:
            return {"response": response}

### **Train and Save**

In [10]:
class Trainer:
    def __init__(self,
                 lr: float,
                 epochs: int,
                 model,
                 gradient_accumulation_steps: int,
                 device,
                 evaluate_fn,
                 mixed_precision_dtype,
                 scaler,
                 ctx):
        self.epochs = epochs
        self.model = model
        self.optimizer = AdamW(model.parameters(), lr = lr)
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.device = device
        self._eval = evaluate_fn
        self.mixed_precision_dtype = mixed_precision_dtype
        self.scaler = scaler
        self.ctx = ctx
        
    def train(self,
              train_dataloader, 
              display_steps: int,
              save_steps: int,
              save_name: str = None,
              save_checkpoint: bool = False,
              valid_dataloader = None,
              samples_gen: int = None,
              samples_eval: int = None,
              gen_mode: bool = False,
              checkpoint = None):
        
        num_update_steps_per_epoch = len(train_dataloader)
        
        if checkpoint is not None:
            current_steps = checkpoint["current_steps"]
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            num_steps = num_update_steps_per_epoch * self.epochs - current_steps
            progress_bar = tqdm(range(num_steps))
            lr_scheduler = get_scheduler("cosine",
                                         optimizer = self.optimizer,
                                         num_warmup_steps = 100,
                                         num_training_steps = num_steps)
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
            self.scaler.load_state_dict(checkpoint["scaler_state_dict"])
            self.model.load_state_dict(checkpoint["model_state_dict"])
            total_loss = checkpoint["total_loss"]
            
        else:
            current_steps = 0
            num_steps = num_update_steps_per_epoch * self.epochs
            progress_bar = tqdm(range(num_steps))
            lr_scheduler = get_scheduler("cosine",
                                         optimizer = self.optimizer,
                                         num_warmup_steps = 100,
                                         num_training_steps = num_steps)
            total_loss = 0
            
        idx = 0
        for epoch in range(self.epochs):
            
            self.model.train()
            for batch in train_dataloader:
                idx += 1
                if idx > current_steps:
                    batch = {k:v.to(self.device) for k, v in batch.items()}
                    self.optimizer.zero_grad()
                    with self.ctx:
                        outputs = self.model(input_ids = batch["input_ids"],
                                             attention_mask = batch["attention_mask"],
                                             labels = batch["labels"],
                                             return_dict = True)
                    loss = outputs.loss
                    total_loss += loss.item()
                    
                    loss /= self.gradient_accumulation_steps
                    if self.mixed_precision_dtype:
                        self.scaler.scale(loss).backward()
                        
                        if idx % self.gradient_accumulation_steps == 0:
                            self.scaler.step(self.optimizer)
                            lr_scheduler.step()
                            self.scaler.update()
                            
                    else:
                        loss.backward()
                        if idx % self.gradient_accumulation_steps == 0:
                            self.optimizer.step()
                            lr_scheduler.step()
                    
                    progress_bar.update(1)
                    current_steps += 1
                    
                    if current_steps % display_steps == 0:
                        if current_steps % len(train_dataloader) == 0:
                            if valid_dataloader is not None:
                                eval_ = self._eval(self.model, valid_dataloader, samples_gen = samples_gen, samples_eval = samples_eval, gen_mode = True)
                                print(f'Epoch: {epoch + 1} -- step: {current_steps} -- train_loss: {total_loss/current_steps} -- val_loss: {eval_["loss"]}')
                                print(f'rouge1: {eval_["rouge1"]} -- rouge2: {eval_["rouge2"]} -- rougeL: {eval_["rougeL"]} -- rougeLsum: {eval_["rougeLsum"]}')
                                print("----------------------- End of epoch {} -----------------------".format(epoch + 1))
                                
                            else:
                                print(f'Epoch: {epoch + 1} -- step: {current_steps} -- train_loss: {total_loss/current_steps}') 
                                print("----------------------- End of epoch {} -----------------------".format(epoch + 1))
                        else:
                            if valid_dataloader is not None:
                                eval_ = self._eval(self.model, valid_dataloader, samples_eval = samples_eval, gen_mode = False)
                                print(f'Epoch: {epoch + 1} -- step: {current_steps} -- train_loss: {total_loss/current_steps} -- val_loss: {eval_["loss"]}')
                            else:
                                print(f'Epoch: {epoch + 1} -- step: {current_steps} -- train_loss: {total_loss/current_steps}')
                    
                    if save_checkpoint is True:
                        if current_steps % save_steps == 0:
                            print("Saving..........")
                            torch.save({"model_state_dict": self.model.state_dict(),
                                        "optimizer_state_dict": self.optimizer.state_dict(),
                                        "scaler_state_dict": self.scaler.state_dict(),
                                        "lr_scheduler_state_dict": lr_scheduler.state_dict(),
                                        "current_steps": current_steps,
                                        "total_loss": total_loss},
                                       save_name)
                            print("****** Save successfully ******")
                    

# **Run Modules**

In [11]:
config = Config(device)
tokenizer = config.tokenizer(model_checkpoint = "bigscience/bloom")

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [12]:
model = config.load_pretrained_model(model_checkpoint = "bigscience/bloom-560m")

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [17]:
lora_model = config.add_lora(model = model, r = 8, lora_alpha = 16, lora_dropout = 0.05)

In [18]:
lora_model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 1024)
        (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): Linear(
                in_features=1024, out_features=3072, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict(

In [19]:
prompter = Prompter()

In [20]:
data_process = DataProcess(data_path = "MBZUAI/Bactrian-X", tokenizer = tokenizer)
dataset = data_process.load_data()

In [19]:
# data_process.draw(data_process.statistical(dataset, prompter))

In [21]:
splited_dataset = dataset.train_test_split(test_size = 0.1, seed = 42)

In [22]:
splited_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'id', 'output'],
        num_rows: 60315
    })
    test: Dataset({
        features: ['instruction', 'input', 'id', 'output'],
        num_rows: 6702
    })
})

In [23]:
model_inputs = MODEL_INPUTS(prompter = prompter,
                            tokenizer = tokenizer,
                            max_length = 512)

In [24]:
train_data = splited_dataset["train"].shuffle().map(model_inputs.generate_and_tokenize_prompt)

Map:   0%|          | 0/60315 [00:00<?, ? examples/s]

In [25]:
train_data = train_data.remove_columns(["instruction", "input", "id", "output"])

In [26]:
valid_data = splited_dataset["test"].map(model_inputs.generate_and_tokenize_prompt)

Map:   0%|          | 0/6702 [00:00<?, ? examples/s]

In [27]:
valid_data = valid_data.remove_columns(["instruction", "input", "id", "output"])

In [28]:
train_data.set_format("torch")
valid_data.set_format("torch")

In [29]:
train_dataloader, valid_dataloader = model_inputs.prepare_dataloader(train_data, valid_data, batch_size = 2)

In [30]:
metrics = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [31]:
evalntest = EVALUATEandTEST(tokenizer = tokenizer,
                            metrics = metrics,
                            device = device,
                            prompter = prompter,
                            ctx = ctx)

In [32]:
trainer = Trainer(lr = 1e-4,
                  epochs = 5,
                  model = lora_model,
                  gradient_accumulation_steps = 4,
                  device = device,
                  evaluate_fn = evalntest.evaluate,
                  mixed_precision_dtype = mixed_precision_dtype,
                  scaler = scaler, 
                  ctx = ctx)

In [None]:
trainer.train(train_dataloader = train_dataloader,
              display_steps = 500,
              save_steps = 3000,
              save_name = "bloom-560m-checkpoint.pt",
              valid_dataloader = valid_dataloader,
              samples_gen = 100,
              samples_eval = None,
              gen_mode = False,
              save_checkpoint = True,
              checkpoint = None)

  0%|          | 0/150790 [00:00<?, ?it/s]

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1 -- step: 500 -- train_loss: 2.4628809394836426 -- val_loss: 1.7886342240674786
Epoch: 1 -- step: 1000 -- train_loss: 2.113223148941994 -- val_loss: 1.7186852011777365
Epoch: 1 -- step: 1500 -- train_loss: 1.9838043514490127 -- val_loss: 1.7006095085916715
Epoch: 1 -- step: 2000 -- train_loss: 1.9104936387240887 -- val_loss: 1.6874646208635624
Epoch: 1 -- step: 2500 -- train_loss: 1.8636505910634995 -- val_loss: 1.677755925592043
Epoch: 1 -- step: 3000 -- train_loss: 1.8314305991927782 -- val_loss: 1.6700203587175733
