# Notebook Info
**Reference**: \\
- https://huggingface.co/docs/transformers/chat_templating \\
- https://github.com/MicrosoftDocs/azure-docs/blob/main/articles/ai-services/openai/includes/chat-markup-language.md \\
- https://huggingface.co/datasets/FreedomIntelligence/alpaca-gpt4-indonesian \\
- https://huggingface.co/datasets/FreedomIntelligence/sharegpt-indonesian \\
- https://huggingface.co/datasets/FreedomIntelligence/evol-instruct-indonesian \\
- https://huggingface.co/datasets/jakartaresearch/indoqa \\


**Task**: Chat or Conversational \\
**Input**: The user's prompt, rendered with the chat template, as a string \\
**Output**: Model's generated text in string format

**Experiment**:
- Use bos_token and eos_token to replace <|im_start|> and <|im_end|> in ChatML. (Inspired by: https://asmirnov.xyz/doppelganger) \\
- Use left padding and left truncation to conform to max_length. \\
- Set max_length = 256 in the training process, which consumes 33.7 GB of memory.

# Install Required Package

In [None]:
! pip install transformers[torch] datasets evaluate -q
! pip install accelerate -U -q

# Load Dataset

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import pandas as pd

# Base checkpoint to fine-tune and the fixed sequence length used for every example.
checkpoint = "bigscience/bloom-1b1"
max_length = 256

chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# ChatML-like template: each message is rendered as
#   <bos>{role}\n{content}\n<eos>
# with a newline between messages; when add_generation_prompt is true a
# trailing "\n<bos>assistant\n" header is appended for the model to complete.
chat_tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{ bos_token }}{{message['role'] + '\n' + message['content'] + '\n'}}{{ eos_token }}{% if not loop.last %}{{ '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n'}}{{ bos_token }}{{'assistant' + '\n' }}{% endif %}"
chat_tokenizer.model_max_length = max_length
# The tokenizer has no `padding` attribute that it reads; whether to pad is
# chosen per call. What is configured on the tokenizer is the *side*, so pin
# both sides explicitly to the left padding / left truncation the experiment
# notes describe instead of relying on the checkpoint's defaults.
chat_tokenizer.padding_side = 'left'
chat_tokenizer.truncation_side = 'left'

In [None]:
# Load the alpaca-gpt4-indonesian dataset.
alpaca_ds = load_dataset("FreedomIntelligence/alpaca-gpt4-indonesian")


def preprocess_alpaca(ds):
    """Tokenise one alpaca example with the chat template.

    Every entry of ``ds['conversations']`` becomes one chat message: a
    ``from`` value of ``'human'`` maps to the ``user`` role and any other
    value to ``assistant``; the ``value`` text is stripped of surrounding
    whitespace.

    Returns a dict whose only key, ``'input_ids'``, holds the result of
    ``chat_tokenizer.apply_chat_template`` for those messages (called with
    ``return_tensors="pt"`` and with truncation and padding switched on).
    """
    turns = [
        {
            'role': 'user' if turn['from'] == 'human' else 'assistant',
            'content': turn['value'].strip(),
        }
        for turn in ds['conversations']
    ]
    token_ids = chat_tokenizer.apply_chat_template(
        turns,
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    return {'input_ids': token_ids}


# Convert example by example on 4 worker processes; the raw columns are
# removed so only the function's output remains.
alpaca_chatML = alpaca_ds.map(
    preprocess_alpaca,
    remove_columns=alpaca_ds['train'].column_names,
    batched=False,
    num_proc=4,
)



In [None]:
# Load the evol-instruct-indonesian dataset.
evol_ds = load_dataset("FreedomIntelligence/evol-instruct-indonesian")


def preprocess_evol(ds):
    """Tokenise one evol-instruct example with the chat template.

    The entries of ``ds['conversations']`` are turned into chat messages in
    order. The speaker is ``user`` when ``from`` equals ``'human'`` and
    ``assistant`` for every other value; ``value`` is whitespace-stripped and
    used as the message content.

    Returns ``{'input_ids': ...}`` where the value is what
    ``chat_tokenizer.apply_chat_template`` produces (``return_tensors="pt"``,
    truncation and padding enabled, no generation prompt).
    """
    def to_message(turn):
        # Role first, then content: same evaluation order as a dict literal.
        speaker = 'user' if turn['from'] == 'human' else 'assistant'
        return {'role': speaker, 'content': turn['value'].strip()}

    dialogue = [to_message(turn) for turn in ds['conversations']]
    encoded = chat_tokenizer.apply_chat_template(
        dialogue,
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    return {'input_ids': encoded}


# One example per call, 4 worker processes, original columns discarded.
evol_chatML = evol_ds.map(
    preprocess_evol,
    remove_columns=evol_ds['train'].column_names,
    batched=False,
    num_proc=4,
)



In [None]:
# Load the sharegpt-indonesian dataset.
sharegpt_ds = load_dataset("FreedomIntelligence/sharegpt-indonesian")


def preprocess_sharegpt(ds):
    """Tokenise one sharegpt example with the chat template.

    ``ds['conversations']`` is walked in order and each turn is converted to
    a ``{'role', 'content'}`` message. Only a ``from`` value of ``'human'``
    yields the ``user`` role; anything else is labelled ``assistant``. The
    turn's ``value`` is stripped of leading/trailing whitespace.

    Returns a dict with the key ``'input_ids'`` set to the output of
    ``chat_tokenizer.apply_chat_template`` (tokenised, ``return_tensors="pt"``,
    truncation and padding enabled, without a generation prompt).
    """
    chat = []
    for turn in ds['conversations']:
        if turn['from'] == 'human':
            role = 'user'
        else:
            role = 'assistant'
        chat.append({'role': role, 'content': turn['value'].strip()})

    template_kwargs = dict(
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    return {'input_ids': chat_tokenizer.apply_chat_template(chat, **template_kwargs)}


# Apply the conversion per example with 4 worker processes and keep only
# the new column.
sharegpt_chatML = sharegpt_ds.map(
    preprocess_sharegpt,
    remove_columns=sharegpt_ds['train'].column_names,
    batched=False,
    num_proc=4,
)

In [None]:
# Load the indoqa question-answering dataset.
indoqa_ds = load_dataset("jakartaresearch/indoqa")


def preprocess_indoqa(ds):
    """Tokenise one indoqa example as a three-message chat.

    The example's ``context`` becomes the ``system`` message, ``question``
    the ``user`` message and ``answer`` the ``assistant`` message; each text
    is stripped of surrounding whitespace (which includes newlines).

    Returns a dict whose only key, ``'input_ids'``, holds the result of
    ``chat_tokenizer.apply_chat_template`` for those messages (called with
    ``return_tensors="pt"`` and with truncation and padding switched on).
    """
    chatML = [
        {'role': 'system', 'content': ds['context'].strip()},
        {'role': 'user', 'content': ds['question'].strip()},
        {'role': 'assistant', 'content': ds['answer'].strip()},
    ]
    return {
        'input_ids': chat_tokenizer.apply_chat_template(
            chatML,
            tokenize=True,
            add_generation_prompt=False,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
    }


# Drop every row that has a missing value in any column, since the
# preprocessing calls .strip() on the text fields. preserve_index=False stops
# the filtered DataFrame's index from being stored as an extra
# `__index_level_0__` column, so each split keeps exactly its original columns
# whether or not rows were dropped from it.
for split in indoqa_ds:
    indoqa_ds[split] = Dataset.from_pandas(
        pd.DataFrame(indoqa_ds[split]).dropna(),
        preserve_index=False,
    )

indoqa_chatML = indoqa_ds.map(
    preprocess_indoqa,
    batched=False,
    num_proc=4,
    remove_columns=indoqa_ds['train'].column_names,
)

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/466k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3309 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1104 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3250 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1084 [00:00<?, ? examples/s]

In [None]:
from datasets import concatenate_datasets

# Build the training corpus: the first `num_data_per_source` rows of the
# alpaca and evol train splits, plus the sharegpt train split and both
# indoqa splits in full.
num_data_per_source = 21000
ds_chatML = concatenate_datasets(
    [
        alpaca_chatML['train'].select(range(num_data_per_source)),
        evol_chatML['train'].select(range(num_data_per_source)),
        sharegpt_chatML['train'],
        indoqa_chatML['train'],
        indoqa_chatML['validation'],
    ]
)

In [None]:
# partial_ds_chatML = ds_chatML.select(range(50000))
# partial_ds_chatML

In [None]:
def group_texts(tokenized_ds, chunk_size=None):
    """Flatten a batch of tokenised examples and cut each row into chunks.

    Args:
        tokenized_ds: Batch mapping with an ``'input_ids'`` column. Each
            example in that column is expected to be a list of token-id rows
            (presumably the ``1 x L`` nesting left by tokenising with
            ``return_tensors="pt"`` -- confirm against the preprocessing).
        chunk_size: Maximum number of tokens per output row. When omitted,
            the module-level ``max_length`` is used.

    Returns:
        A dict with ``'input_ids'`` and ``'labels'``; both refer to the same
        list of chunks. A row longer than ``chunk_size`` yields several
        chunks (the last one may be shorter); an empty row yields none.

    Raises:
        ValueError: If the chunk size is not a positive integer.
    """
    if chunk_size is None:
        chunk_size = max_length
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Only 'input_ids' is consumed, so only that column is flattened; other
    # columns in the batch are ignored rather than forced through a list
    # concatenation they may not support.
    new_input_ids = [
        row[start:start + chunk_size]
        for example in tokenized_ds['input_ids']
        for row in example
        for start in range(0, len(row), chunk_size)
    ]
    return {
        'input_ids': new_input_ids,
        'labels': new_input_ids,
    }

# Chunk the tokenised rows batch-wise on 4 worker processes, then have the
# dataset return torch tensors when indexed.
lm_dataset = ds_chatML.map(
    group_texts,
    batched=True,
    num_proc=4,
)
lm_dataset.set_format(type="torch")
lm_dataset

Map (num_proc=4):   0%|          | 0/51894 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 51894
})

In [None]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Causal-LM collator (mlm=False): labels are derived from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=chat_tokenizer, mlm=False)


def collate_with_padding_mask(examples):
    """Collate `examples` and mask out pad tokens in the attention mask.

    The dataset rows only carry `input_ids`/`labels` and are already padded,
    so the collator has no per-token padding information and the attention
    mask it produces treats every position, including `<pad>`, as a real
    token. The mask is rebuilt here from the pad token id so the model does
    not attend to padding during training and evaluation.

    Args:
        examples: List of dataset rows as passed by the DataLoader.

    Returns:
        The collated batch with `attention_mask` set to 1 for non-pad
        positions of `input_ids` and 0 for pad positions.
    """
    batch = data_collator(examples)
    batch["attention_mask"] = (batch["input_ids"] != chat_tokenizer.pad_token_id).long()
    return batch


# Hold out 20% of the rows for evaluation; only the training loader shuffles.
lm_dataset = lm_dataset.train_test_split(test_size=0.2)
train_dataloader = DataLoader(
    lm_dataset["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=collate_with_padding_mask,
)
eval_dataloader = DataLoader(
    lm_dataset["test"],
    batch_size=8,
    collate_fn=collate_with_padding_mask,
)

# Load Model and Set Hyperparameters

In [None]:
from transformers import AutoModelForCausalLM, get_scheduler
from torch.optim import AdamW
import torch

# Load the base causal LM; the half-precision option is left disabled.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    # torch_dtype=torch.float16,
)

optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step per optimizer step: linear decay over all epochs,
# without warm-up.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1536)
    (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
          (dense): Linear(in_features=1536, out_features=1536, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  )
  (

# Pre-Evaluate Model (Qualitative)

In [None]:
from transformers import GenerationConfig

model.eval()  # switch to inference mode

# Text that opens the assistant turn in the chat template.
prefix_gen = '\n<s>assistant\n'

# Dataset rows are left-padded to a fixed length. Drop the pad ids before
# decoding so the prompt neither prints a wall of <pad> tokens nor feeds them
# to the model as if they were real context.
pad_id = chat_tokenizer.pad_token_id
sample_ids = [tok for tok in lm_dataset['test'][30]['input_ids'].tolist() if tok != pad_id]

# Keep everything up to the first assistant turn and re-open that turn so the
# model has to produce the answer itself.
messages = chat_tokenizer.decode(sample_ids).split(prefix_gen)
prompt = messages[0] + prefix_gen
print(prompt, "\n --------------")

# Pass the attention mask explicitly instead of letting generate() guess it.
inputs = chat_tokenizer(prompt, return_tensors="pt").to(device)
generated_text = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    generation_config=GenerationConfig(max_new_tokens=512),
    do_sample=False,
)
print(chat_tokenizer.decode(generated_text[0], skip_special_tokens=False))

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><s>user
Karen merencanakan kunjungan Natalnya dan dia ingin membeli baju baru seharga $85. Dia membutuhkan 1/3 dari uang yang telah dia tabung untuk tujuan itu. Berapa total anggaran tabungannya?
</s>
<s>assistant
 
 --------------
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

# Train Model

In [None]:
from tqdm.auto import tqdm

# One progress tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()  # switch to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the loss comes from the model output and is
        # back-propagated directly.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients for the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/15570 [00:00<?, ?it/s]

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
import datetime

# Timestamp the output directory so repeated runs do not overwrite each other.
now_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
finetuned_chatML_model_path = f"./saved_finetuned_chatML_model-{now_str}"

# Store the model weights and the tokenizer side by side in that directory.
model.save_pretrained(finetuned_chatML_model_path)
chat_tokenizer.save_pretrained(finetuned_chatML_model_path)

('./saved_finetuned_chatML_model-20240115-073054/tokenizer_config.json',
 './saved_finetuned_chatML_model-20240115-073054/special_tokens_map.json',
 './saved_finetuned_chatML_model-20240115-073054/tokenizer.json')

# Post-Evaluate Model (Qualitative)

In [None]:
model.eval()  # switch to inference mode

# System instruction plus a single user turn asking the bot to introduce itself.
messages = [
    {
        "role": "system",
        "content": "Anda adalah BaGoEs, Chatbot yang dikembangkan oleh Group of Expert. Jawab pertanyaan dengan maksimal dua kalimat.",
    },
    {"role": "user", "content": "Perkenalkan diri Anda!"},
]

# Render the chat with an open assistant turn and tokenise it in one go.
input_ids = chat_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
)
generated_text = model.generate(
    input_ids=input_ids.to(device),
    generation_config=GenerationConfig(max_new_tokens=200),
)
# Special tokens are kept so the turn boundaries stay visible in the output.
print(chat_tokenizer.decode(generated_text[0], skip_special_tokens=False))

<s>system
Anda adalah BaGoEs, Chatbot yang dikembangkan oleh Group of Expert. Jawab pertanyaan dengan maksimal dua kalimat.
</s>
<s>user
Perkenalkan diri Anda!
</s>
<s>assistant
Halo! Nama saya BaGoEs, dan saya adalah Chatbot yang dikembangkan oleh Group of Expert. Saya di sini untuk membantu menjawab pertanyaan Anda sebaik mungkin. Mari kita mulai?
</s>


In [None]:
model.eval()  # switch to inference mode

# System instruction plus a short greeting from the user.
messages = [
    {
        "role": "system",
        "content": "Namamu adalah BaGoEs, Chatbot yang dikembangkan oleh Group of Expert. Berikan jawaban pendek!.",
    },
    {"role": "user", "content": "Hai!"},
]

# Render the chat with an open assistant turn and tokenise it in one go.
input_ids = chat_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
)
generated_text = model.generate(
    input_ids=input_ids.to(device),
    generation_config=GenerationConfig(max_new_tokens=512),
)
# Special tokens are kept so the turn boundaries stay visible in the output.
print(chat_tokenizer.decode(generated_text[0], skip_special_tokens=False))

<s>system
Namamu adalah BaGoEs, Chatbot yang dikembangkan oleh Group of Expert. Berikan jawaban pendek!.
</s>
<s>user
Hai!
</s>
<s>assistant
Halo! Nama saya BaGoEs, dan saya adalah asisten AI yang dikembangkan oleh Group of Expert. Saya di sini untuk membantu Anda dengan pertanyaan apa pun yang Anda miliki. Mari kita mulai!
</s>


In [None]:
from transformers import GenerationConfig

model.eval()  # switch to inference mode

# Text that opens the assistant turn in the chat template.
prefix_gen = '\n<s>assistant\n'

# Dataset rows are left-padded to a fixed length. Drop the pad ids before
# decoding so the prompt neither prints a wall of <pad> tokens nor feeds them
# to the model as if they were real context.
pad_id = chat_tokenizer.pad_token_id
sample_ids = [tok for tok in lm_dataset['test'][30]['input_ids'].tolist() if tok != pad_id]

# Keep everything up to the first assistant turn and re-open that turn so the
# model has to produce the answer itself.
messages = chat_tokenizer.decode(sample_ids).split(prefix_gen)
prompt = messages[0] + prefix_gen
print(prompt, "\n --------------")

# Pass the attention mask explicitly instead of letting generate() guess it.
inputs = chat_tokenizer(prompt, return_tensors="pt").to(device)
generated_text = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    generation_config=GenerationConfig(max_new_tokens=512),
    do_sample=False,
)
print(chat_tokenizer.decode(generated_text[0], skip_special_tokens=False))

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><s>user
Karen merencanakan kunjungan Natalnya dan dia ingin membeli baju baru seharga $85. Dia membutuhkan 1/3 dari uang yang telah dia tabung untuk tujuan itu. Berapa total anggaran tabungannya?
</s>
<s>assistant
 
 --------------
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [None]:
! cp -r saved_* ./local-volume/.