In [1]:
!pip install torch  --quiet

# # Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# #FlashAttention only supports Ampere GPUs or newer. #NEED A100 IN GOOGLE COLAB
!pip install -U transformers
# # !pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# # Uncomment only if you're using A100 GPU
# #!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet

# %pip install -U wandb



In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, DatasetDict
from trl import SFTTrainer, setup_chat_format

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
device = torch.device('cuda'if torch.cuda.is_available() else 'cpu')

In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HF_TOKEN")

login(token = hf_token)

wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B on SQL dataset', 
    job_type="training", 
    anonymous="allow"
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhuyhoangt2201[0m ([33mhuyhoangt2201-fpt-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111388302222167, max=1.0)…

In [32]:
base_model = "phamhai/Llama-3.2-1B-Instruct-Frog"
new_model = "llama-3.2-1b-sql_finetuned_billingual_3.0"

In [35]:
torch_dtype = torch.float16

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float32
    #attn_implementation=attn_implementation
)
tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=True)

In [36]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [37]:
dataset_train = load_dataset("huyhoangt2201/Jidouka3.2", split='train[:90%]')
dataset_val = load_dataset("huyhoangt2201/Jidouka3.2", split='train[-10%:]')
dataset = DatasetDict({
    'train': dataset_train,
    'validation': dataset_val
})
dataset.save_to_disk("completed_train_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/936 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/104 [00:00<?, ? examples/s]

In [67]:
dataset_train2 = load_dataset("huyhoangt2201/jidouka3.1", split='train[:90%]')
dataset_val2 = load_dataset("huyhoangt2201/jidouka3.1", split='train[-10%:]')
dataset2 = DatasetDict({
    'train': dataset_train2,
    'validation': dataset_val2
})

In [79]:
def format_context(sample):
    sample['context'] = prompt_template
    return sample

In [81]:
dataset_train2_2 = dataset_train2.map(format_context, batched=False)
dataset_val2_2 = dataset_val2.map(format_context, batched=False)

Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [86]:
dataset_train2_2['question'][1]

"Danh sách các cải tiến có tác giả là 'Tran Thi H' và số công việc áp dụng từ 10 đến 25."

In [38]:
dataset['train'][0]['context']

' \nYou are an SQL query assistant. Based on the table information below, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user\'s language.\n\nThe jidouka table contains the following columns:\n\nid: Row identifier (int)\ntên_cải_tiến: Name of the improvement (str)\nloại_hình_công_việc: Type of work that the improvement is intended to enhance (str) (e.g., database processing, data entry, workflow optimization, etc.)\ncông_cụ: Tool used to achieve the improvement (str) (e.g., Python, Excel, Visual Studio Code, etc.)\nmô_tả: Detailed description of the improvement (str) (e.g., each step of the improvement process)\nsản_phẩm: Output product of the improvement (str) (e.g., .csv file, .xlsx file, etc.)\ntác_giả: Contributor, company employee, or creator of the improvement (str)\nbộ_phận: Department of the author, usually referred to as "dc" (str) (e.g., dc1, dc2, dc3, dcd, souko, etc.)\nsố_giờ: N

In [10]:
def format_data_template(sample):
    chat = [
          {"role":"system", "content": sample['context']},
          {"role":"user", "content":sample['question']},
          {"role":"assistant","content":sample['answer']}
    ]
    return {
        "messages": tokenizer.apply_chat_template(chat, tokenize=False)
    }

In [None]:
def format_data_template_to_token(sample):
    chat = [
          {"role":"system", "content": sample['context']},
          {"role":"user", "content":sample['question']}
    ]
    sample['input_ids'] = tokenizer.apply_chat_template(chat, tokenize=True, padding=True, truncation=True, return_tensors='pt')
    sample['labels'] = tokenizer(sample['answer'], padding=True, truncation=True, return_tensors='pt').input_ids
    
    return sample
tokenized_dataset_train = dataset['train'].map(format_data_template_to_token, remove_columns=['context','question','answer'], batched=True)
tokenized_dataset_valid = dataset['validation'].map(format_data_template_to_token, remove_columns=['context','question', 'answer'], batched=True)

In [40]:
dataset_train = dataset['train'].map(format_data_template, remove_columns=['context','question','answer'])

In [39]:
dataset_valid = dataset['validation'].map(format_data_template, remove_columns=['context', 'question','answer'])

In [41]:
dataset_train['messages'][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 14 Nov 2024\n\nYou are an SQL query assistant. Based on the table information below, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user\'s language.\n\nThe jidouka table contains the following columns:\n\nid: Row identifier (int)\ntên_cải_tiến: Name of the improvement (str)\nloại_hình_công_việc: Type of work that the improvement is intended to enhance (str) (e.g., database processing, data entry, workflow optimization, etc.)\ncông_cụ: Tool used to achieve the improvement (str) (e.g., Python, Excel, Visual Studio Code, etc.)\nmô_tả: Detailed description of the improvement (str) (e.g., each step of the improvement process)\nsản_phẩm: Output product of the improvement (str) (e.g., .csv file, .xlsx file, etc.)\ntác_giả: Contributor, company employee, or creator of the improvement 

In [42]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [43]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    peft_config=peft_config,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    dataset_text_field='messages',
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [44]:
eot = "<|eot_id|>"
eot_id = tokenizer.convert_tokens_to_ids(eot)
tokenizer.pad_token = eot
tokenizer.pad_token_id = eot_id

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
wandb.finish()
model.config.use_cache = True

In [None]:
new_model = 'llama-3.2-1b-sql_finetuned_billingual_3.0_adapter'
new_model

In [None]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

In [None]:
new_model = 'huyhoangt2201/llama-3.2-1b-sql_finetuned_billingual_3.0_adapter'
base_model = 'phamhai/Llama-3.2-1B-Instruct-Frog'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

# base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
merge_model = PeftModel.from_pretrained(base_model_reload, new_model)

merge_model = merge_model.merge_and_unload()

In [None]:
new_model_merged = 'llama-3.2-1b-sql_finetuned_billingual_3.0_merged'

In [None]:
merge_model.save_pretrained(new_model_merged)
tokenizer.save_pretrained(new_model_merged)

In [31]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HF_TOKEN")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
merge_model.push_to_hub(new_model_merged, use_temp_dir=False)
tokenizer.push_to_hub(new_model_merged, use_temp_dir=False)

## new_model inference

In [28]:
new_model_name = 'huyhoangt2201/llama-3.2-1b-sql_finetuned_billingual_2.0_merged'

In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
device = torch.device('cuda')
model_path = new_model_name
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to('cuda')

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/928 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [30]:
prompt_template = """
You are an SQL query assistant. Based on the table information below, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in human language.

The jidouka table contains the following columns:
id: Row identifier (int)
tên_cải_tiến: Name of the improvement (str)
loại_hình_công_việc: Type of work that the improvement is intended to enhance (str) (e.g., database processing, data entry, workflow optimization, etc.)
công_cụ: Tool used to achieve the improvement (str) (e.g., Python, Excel, Visual Studio Code, etc.)
mô_tả: Detailed description of the improvement (str) (e.g., each step of the improvement process)
sản_phẩm: Output product of the improvement (str) (e.g., .csv file, .xlsx file, etc.)
tác_giả: Contributor, company employee, or creator of the improvement (str)
bộ_phận: Department of the author, usually referred to as "dc" (str) (e.g., dc1, dc2, dc3, dcd, souko, etc.)
số_giờ: Number of hours saved by applying the improvement (int)
số_công_việc_áp_dụng: Number of tasks in the company that the improvement has supported (int)
thời_điểm_ra_mắt: Launch date of the tool (str) (e.g., 2024-10-11, 2024-10-09, etc.)
thông_tin_thêm: Link to additional documentation (PowerPoint, video) on using the improvement or the improvement’s tool (str)
"""

In [31]:
from typing import List, Dict
class ContextAwareChatbot:
    def __init__(self,prompt, max_history: int = 5):
        self.model = model
        self.tokenizer = tokenizer 
        self.max_history = max_history
        self.conversation_history: List[Dict[str, str]] = []
        self.prompt=prompt
    def _build_prompt(self) -> str:
        # Build context from history

        return self.prompt

    def _clean_response(self, response: str) -> str:
        # Clean up the generated response
        response = response.split("Assistant:")[-1].strip()
        # Stop at any new "Human:" or "Assistant:" markers
        if "Human:" in response:
            response = response.split("Human:")[0].strip()
        return response

    def chat(self, user_input: str) -> str:
        # Generate the contextualized prompt
        prompt = self._build_prompt()

#         # Generate response
#         response = self.pipeline(
#             prompt,
#             return_full_text=False,
#             clean_up_tokenization_spaces=True
#         )[0]['generated_text']

#         # Clean the response
#         cleaned_response = self._clean_response(response)
        messages =[
            {'role':'system',
             'content':prompt}
            ,
            {'role':'user',
             'content':user_input}
        ]
        tokenized_chat = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors='pt').to('cuda')
        outputs = self.model.generate(tokenized_chat, max_new_tokens=256).to('cuda')
        bot_response = self.tokenizer.decode(outputs[0])
        bot_response = bot_response.split('<|start_header_id|>assistant<|end_header_id|>')
        bot_response = bot_response[1].strip()[:-10]
        # Update conversation history
        self.conversation_history.append({
            'human': user_input,
            'assistant': bot_response
        })

        return bot_response

    def get_history(self) -> List[Dict[str, str]]:
        return self.conversation_history

    def clear_history(self):
        self.conversation_history = []

# 4. Create chatbot instance
chatbot = ContextAwareChatbot(prompt_template)

# 5. Example usage function
def chat_session():
    print("Chatbot initialized. Type 'exit' to end the conversation, 'clear' to clear history.")

    while True:
        user_input = input("\nYou: ").strip()

        if user_input.lower() == 'exit':
            print("Goodbye!")
            break
        elif user_input.lower() == 'clear':
            chatbot.clear_history()
            print("Conversation history cleared!")
            continue

        response = chatbot.chat(user_input)
        print(f"\nAssistant: {response}")

# 6. Example of how to use
if __name__ == "__main__":
    chat_session()

Chatbot initialized. Type 'exit' to end the conversation, 'clear' to clear history.



You:  xin chào


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Assistant: Danh sách các cải tiến có số giờ tiết kiệm trên 5 và công cụ hỗ trợ là Excel? WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT * FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT * FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE số_giờ > 5 AND công_cụ LIKE LOWER('%Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE 



You:  How many contributors have worked on improvements?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: SELECT COUNT(DISTINCT tác_giả) FROM jidouka;



You:  List all improvements launched on a Friday in 2024.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: SELECT * FROM jidouka WHERE thời_điểm_ra_mắt IN (SELECT thời_điểm_ra_mắt FROM jidouka WHERE DAYOFWEEK(thời_điểm_ra_mắt) = 6); (str) (e.g., 2024-04-10, 2024-04-19, etc.)
SELECT thời_điểm_ra_mắt FROM jidouka WHERE thời_điểm_ra_mắt IN (SELECT thời_điểm_ra_mắt FROM jidouka WHERE DAYOFWEAR = '2024-04-10'); (str) (e.g., 2024-04-10, 2024-04-09, etc.)
SELECT thời_điểm_ra_mắt FROM jidouka WHERE thời_điểm_ra_mắt IN (SELECT thời_điểm_ra_mắt FROM jidouka WHERE thời_điểm_ra_mô_tả LIKE LOWER('%2024-04-10%'); (str) (e.g., 2024-04-10, 2024-04-09, etc.)
SELECT thời_điểm_ra_mắt FROM jidouka WHERE thời_điểm_ra_mô_tả LIK



You:  Find improvements from dcd related to workflow optimization.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWER('%dcd%') AND loại_hình_công_việc LIKE LOWER('%workflow optimization%'); SELECT * FROM jidouka WHERE bộ_phận LIKE LOWE



You:   relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user's language. The jidouka table contains the following columns: id: Row identifier (int) tên_cải_tiến: Name of the improvement (str) loại_hình_công_việc: Type of work that the improvement is intended to enhance (str) (e.g., database processing, data entry, workflow optimization, etc.) công_cụ: Tool used to achieve the improvement (str) (e.g., Python, Excel, Visual Studio Code, etc.) mô_tả: Detailed description of the improvement (str) (e.g., each step of the improvement process) sản_phẩm: Output product of the improvement (str) (e.g., .csv file, .xlsx file, etc.) tác_giả: Contributor, company employee, or creator of the improvement (str) bộ_phận: Department of the author, usually referred to as "dc" (str) (e.g., dc1, dc2, dc3, dcd, souko, etc.) số_giờ: Number of hours saved by applying the improvement (int) số_công_việc_áp_dụng: Number of tasks in the company that the im

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: Liệt kê các cải tiến có sản phẩm đầu ra là file txt và tiết kiệm ít nhất 4 giờ. Đụ ngôn nào có sản phẩm đầu ra là file txt và tiết kiệm ít nhất 4 giờ? NULL



You:  Excel có những công dụng gì?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: Excel có số công việc áp dụng lớn hơn 3. Nó có công cụ hỗ trợ là Python và có số công việc áp dụng trên 6.



You:  Tìm những cải tiến có sử dụng nhiều công cụ nhất (số lượng dấu phẩy trong cột công_cụ nhiều nhất)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Assistant: SELECT tên_cải_tiến FROM jidouka WHERE công_cụ ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%tổng số giờ tiết kiệm của các cải tiến có công cụ hỗ trợ là Excel%'); SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT 1; SELECT tên_cải_tiến FROM jidouka WHERE công_cụ LIKE LOWER('%Excel%') ORDER BY số_giờ DESC LIMIT


KeyboardInterrupt: Interrupted by user