In [None]:
! pip install -q -U bitsandbytes
! pip install -q -U datasets
! pip install -q -U git+https://github.com/huggingface/transformers.git
! pip install -q -U git+https://github.com/huggingface/peft.git
! pip install -q -U git+https://github.com/huggingface/accelerate.git
! pip install -q -U loralib
! pip install -q -U einops

In [2]:
import json
import os
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from pprint import pprint
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
  LoraConfig,
  PeftConfig,
  PeftModel,
  get_peft_model,
  prepare_model_for_kbit_training
)
from transformers import (
  AutoConfig,
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig
)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
MODEL_NAME = "vilm/vinallama-7b-chat"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map = "auto",
    trust_remote_code = True,
    quantization_config = bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
config = LoraConfig(
    r=16,
    lora_alpha =32,
    target_modules =[
        "q_proj",
        "up_proj",
        "o_proj",
        "k_proj",
        "down_proj",
        "gate_proj",
        "v_proj"
        ],
    lora_dropout =0.05,
    bias ="none",
    task_type ="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [5]:
data = load_dataset ("hllj/vi_grade_school_math_mcq")

Downloading readme:   0%|          | 0.00/2.95k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['grade', 'problems', 'title', 'id', 'url'],
        num_rows: 2733
    })
})

In [7]:
def generate_prompt(question, choices, explanation):
  return f"""
<| im_start|>system
Bạn là một chuyên gia về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.
<|im_start|>user
### Câu hỏi:
{ question }
### Các lựa chọn:
{ choices }
### Câu trả lời:

<|im_start|>assistant
{ explanation }
""".strip()

def generate_and_tokenize_prompt(question, choices, explanation):
  full_prompt = generate_prompt(question, choices, explanation)
  tokenized_full_prompt = tokenizer(
      full_prompt,
      padding = True,
      truncation = True
      )
  return tokenized_full_prompt

In [8]:
from datasets import Dataset

In [9]:
training_samples = []
for sample in data["train"]:
  for quest in sample["problems"]:
    choices = quest["choices"]
    explanation = quest["explanation"]. strip ()
    question = quest["question"]
    if explanation == "" or question == "" or choices == []:
      continue
    try:
      question = question.split ("\n \n")[1].strip ()
    except:
      continue
    choices = "\n".join ( choices )
    training_sample = generate_and_tokenize_prompt (
        question, choices, explanation
        )
    training_samples.append ( training_sample )

choices_data = Dataset.from_list ( training_samples )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
choices_data

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 9173
})

In [11]:
training_args = transformers.TrainingArguments (
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_train_epochs = 1,
    learning_rate = 2e-4,
    fp16 = True,
    save_total_limit = 3,
    logging_steps = 1,
    output_dir = "experiments",
    optim = "paged_adamw_8bit",
    lr_scheduler_type = "cosine",
    warmup_ratio = 0.05,
)

trainer = transformers.Trainer (
    model =model,
    train_dataset = choices_data,
    args = training_args,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm =False)
    )
model.config.use_cache = False
trainer.train()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,3.8753
2,3.6857
3,3.8619
4,3.4645
5,3.8444
6,3.9058
7,3.2973
8,3.3663
9,3.7636
10,3.3915


TrainOutput(global_step=2293, training_loss=0.5622106155244949, metrics={'train_runtime': 18269.3249, 'train_samples_per_second': 0.502, 'train_steps_per_second': 0.126, 'total_flos': 5.133391809778483e+16, 'train_loss': 0.5622106155244949, 'epoch': 1.0})

In [12]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [13]:
%%time
device = "cuda" if torch.cuda.is_available() else "cpu"

prompt = """
<| im_start|>system
Bạn là một chuyên gia về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.
<|im_start|>user
### Câu hỏi:
10% của 11,5m2 là:
### Các lựa chọn:
A. 10,15dm2
B. 1,5m2
C. 15,5m2
D. 1,15m2
### Câu trả lời:

<|im_start|>assistant
""".strip()

encoding = tokenizer(prompt, return_tensors = "pt").to(device)
with torch.inference_mode() :
  outputs = model.generate(input_ids = encoding.input_ids,
                              attention_mask = encoding.attention_mask,
                              generation_config = generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens = True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


<| im_start|>system
Bạn là một chuyên gia về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.
<|im_start|> user
### Câu hỏi:
10% của 11,5m2 là:
### Các lựa chọn:
A. 10,15dm2
B. 1,5m2
C. 15,5m2
D. 1,15m2
### Câu trả lời:

<|im_start|> assistant
Đáp án là D 
 10% của 11,5m2 là: 
 11,5 : 100 × 10 = 1,15 (m2) 
 Đáp số: 1,15m2 
 Chọn D. 
 Đáp án cần chọn là: D 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn D. 
 Chọn
CPU times: user 2min 2s, sys: 40.9 s, total: 2min 43s
Wall time: 2min 43s


In [21]:
!pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [34]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
trainer.push_to_hub("kiendt/vinallama-math-7b")

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

events.out.tfevents.1705996212.3cc9726dd11c.26.0:   0%|          | 0.00/365k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kiendt/experiments/commit/45ab2abb6d5b00a3f94fcf29159a1e494b0932b2', commit_message='kiendt/vinallama-math-7b', commit_description='', oid='45ab2abb6d5b00a3f94fcf29159a1e494b0932b2', pr_url=None, pr_revision=None, pr_num=None)