## Import packages and model tokenizer

In [1]:
!pip install torch  --quiet

# # Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# #FlashAttention only supports Ampere GPUs or newer. #NEED A100 IN GOOGLE COLAB
!pip install -U transformers
# # !pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# # Uncomment only if you're using A100 GPU
# #!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet

# %pip install -U wandb



In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, DatasetDict
from trl import SFTTrainer, setup_chat_format, SFTConfig

In [3]:
base_model = "huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_210_records_merged"
new_model = "llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records"

In [4]:
torch_dtype = torch.float16

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,

    #attn_implementation=attn_implementation
)
tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=True)
tokenizer.padding_side = 'right' # to prevent warnings

config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [22]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [6]:
from datasets import load_dataset, DatasetDict

In [7]:
dataset_train = load_dataset("huyhoangt2201/fixed_errors_from_contextawareJidouka2", split='train[:95%]')
dataset_val = load_dataset("huyhoangt2201/fixed_errors_from_contextawareJidouka2", split='train[-5%:]')
dataset = DatasetDict({
    'train': dataset_train,
    'validation': dataset_val
})
dataset.save_to_disk("completed_train_dataset")

output84_fixed_errors.csv:   0%|          | 0.00/45.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/80 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

## process data

In [9]:
# system_prompt = """You are an SQL query assistant. Based on schema and context below, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user's language.

# Schema:
# +Table Author, columns=[AuthorId: int, AuthorName: nvarchar(255), DepartmentId int, GroupDCId int]
# +Table Department, columns=[DepartmentId: int, DepartmentName: nvarchar(255)]
# +Table GroupDC, columns=[GroupDCId: int, DepartmentId: int, GroupDCName nvarchar(255)]
# +Table Job, columns=[JobId: int, JobName: nvarchar(255)]
# +Table Tool, columns=[ToolId: int, ToolName: nvarchar(255), ToolDescription: text]
# +Table Jidouka, columns=[JidoukaId: bigint, ProductApply: nvarchar(255), ImprovementName: nvarchar(255), SoftwareUsing: nvarchar(255), Description: nvarchar(255), Video: text, DetailDocument: text, TotalJobApplied: int, TotalTimeSaved: int, DateCreate: datetime, JobId: int, AuthorId: int, DepartmentId: int, GroupDCId: int]
# +Table JidoukaTool, columns=[JidoukaId: bigint, ToolId: int]
# +Primary_keys=[Author.AuthorId, Department.DepartmentId, GroupDC.GroupDCId, Job.JobId, Tool.ToolId, Jidouka.JidoukaId]
# +Foreign_keys=[GroupDC.DepartmentId=Department.DepartmentId, Jidouka.JobId=Job.JobId, Jidouka.AuthorId=Author.AuthorId, Jidouka.DepartmentId=Department.DepartmentId, Jidouka.GroupDCId=GroupDC.GroupDCId, JidoukaTool.JidoukaId=Jidouka.JidoukaId, JidoukaTool.ToolId=Tool.ToolId, Author.DepartmentId=Department.DepartmentId, Author.GroupDCId=GroupDC.GroupDCId]

# Context:
# Previous user question: {previous_question}
# Previous answer: {previous_answer}
# Previous schema linking(Format: [Tables, Columns, Foreign keys, Possible cell values]): {schema_linking}
# """

In [10]:
# def format_context(sample):
#     sample['context'] = system_prompt.format(previous_question=sample['previous_question'], previous_answer=sample['previous_answer'], schema_linking=sample['schema_linking'])
    
#     return sample

In [8]:
system_prompt = """You are an SQL query assistant. Based on schema, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user's language.

Schema:
+Table Author, columns=[AuthorId: int, AuthorName: nvarchar(255), DepartmentId int, GroupDCId int]
+Table Department, columns=[DepartmentId: int, DepartmentName: nvarchar(255)]
+Table GroupDC, columns=[GroupDCId: int, DepartmentId: int, GroupDCName nvarchar(255)]
+Table Job, columns=[JobId: int, JobName: nvarchar(255)]
+Table Tool, columns=[ToolId: int, ToolName: nvarchar(255), ToolDescription: text]
+Table Jidouka, columns=[JidoukaId: bigint, ProductApply: nvarchar(255), ImprovementName: nvarchar(255), SoftwareUsing: nvarchar(255), Description: nvarchar(255), Video: text, DetailDocument: text, TotalJobApplied: int, TotalTimeSaved: int, DateCreate: datetime, JobId: int, AuthorId: int, DepartmentId: int, GroupDCId: int]
+Table JidoukaTool, columns=[JidoukaId: bigint, ToolId: int]
+Primary_keys=[Author.AuthorId, Department.DepartmentId, GroupDC.GroupDCId, Job.JobId, Tool.ToolId, Jidouka.JidoukaId]
+Foreign_keys=[GroupDC.DepartmentId=Department.DepartmentId, Jidouka.JobId=Job.JobId, Jidouka.AuthorId=Author.AuthorId, Jidouka.DepartmentId=Department.DepartmentId, Jidouka.GroupDCId=GroupDC.GroupDCId, JidoukaTool.JidoukaId=Jidouka.JidoukaId, JidoukaTool.ToolId=Tool.ToolId, Author.DepartmentId=Department.DepartmentId, Author.GroupDCId=GroupDC.GroupDCId]
"""

In [9]:
def format_context(sample):
    sample['context'] = system_prompt
    
    return sample

In [10]:
dataset_train2 = dataset_train.map(format_context)
dataset_val2 = dataset_val.map(format_context)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [11]:
dataset_train2

Dataset({
    features: ['previous_question', 'previous_answer', 'schema_linking', 'question', 'answer', 'context'],
    num_rows: 80
})

In [12]:
dataset_train3 = dataset_train2.shuffle(seed=42)
dataset_val3 = dataset_val2.shuffle(seed=42)

In [14]:
def format_data_template(sample):
    chat = [
          {"role":"system", "content": sample['context']},
          {"role":"user", "content":sample['previous_question']},
          {"role":"assistant","content":sample['previous_answer']}
    ]
    return {
        "messages": tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    }

In [15]:
train_set = dataset_train3.map(format_data_template, remove_columns=['context','question','answer','previous_question', 'previous_answer','schema_linking'])
val_set = dataset_val3.map(format_data_template, remove_columns=['context','question','answer','previous_question', 'previous_answer','schema_linking'])

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [16]:
train_set['messages'][1]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 27 Dec 2024\n\nYou are an SQL query assistant. Based on schema, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user's language.\n\nSchema:\n+Table Author, columns=[AuthorId: int, AuthorName: nvarchar(255), DepartmentId int, GroupDCId int]\n+Table Department, columns=[DepartmentId: int, DepartmentName: nvarchar(255)]\n+Table GroupDC, columns=[GroupDCId: int, DepartmentId: int, GroupDCName nvarchar(255)]\n+Table Job, columns=[JobId: int, JobName: nvarchar(255)]\n+Table Tool, columns=[ToolId: int, ToolName: nvarchar(255), ToolDescription: text]\n+Table Jidouka, columns=[JidoukaId: bigint, ProductApply: nvarchar(255), ImprovementName: nvarchar(255), SoftwareUsing: nvarchar(255), Description: nvarchar(255), Video: text, DetailDocument: text, TotalJobApplied: int, TotalTimeSaved: int

## Training

In [None]:


# training_arguments = TrainingArguments(
#     output_dir=new_model,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=4,
#     optim="adamw_8bit",
#     num_train_epochs=25,
#     eval_strategy="epoch",
#     eval_steps=0.2,
#     save_strategy='epoch',
#     logging_steps=1,
#     warmup_steps=10,
#     logging_strategy="steps",
#     learning_rate=2e-4,
#     fp16=False,
#     bf16=True,
#     group_by_length=True,
#     report_to="wandb",
#     load_best_model_at_end = True
# )

In [35]:
early_stopping_callback = EarlyStoppingCallback( 
    early_stopping_patience=5
)

sft_config = SFTConfig(
    output_dir=new_model,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="adamw_8bit",
    num_train_epochs=25,
    eval_strategy="epoch",
    eval_steps=0.2,
    dataset_text_field = 'messages',
    max_seq_length = 2048, 
    save_strategy='epoch',
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    group_by_length=True,
    #report_to="wandb",
    packing = False,
    load_best_model_at_end = True

)

In [36]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_set,
    eval_dataset = val_set,
    args = sft_config,
    peft_config = peft_config, 
    callbacks=[early_stopping_callback]
)

  trainer = SFTTrainer(


In [None]:
%%time

eot = "<|eot_id|>"
eot_id = tokenizer.convert_tokens_to_ids(eot)
tokenizer.pad_token = eot
tokenizer.pad_token_id = eot_id

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 9
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112713744442873, max=1.0…

Epoch,Training Loss,Validation Loss
1,0.0481,0.040687
2,0.0199,0.024582
3,0.0143,0.017012
4,0.0167,0.013846
5,0.0133,0.01373


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [25]:
adapter_model = new_model+'_adapter'
adapter_model

'llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_adapter'

In [26]:
trainer.model.save_pretrained(adapter_model)
trainer.model.push_to_hub(adapter_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_adapter/commit/009fd87f2bf7ac9699982f7d9351d3be7bb8ec6c', commit_message='Upload model', commit_description='', oid='009fd87f2bf7ac9699982f7d9351d3be7bb8ec6c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_adapter'), pr_revision=None, pr_num=None)

In [27]:
adapter_model = 'huyhoangt2201/' + adapter_model
base_model = 'huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_210_records_merged'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

# base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
merge_model = PeftModel.from_pretrained(base_model_reload, adapter_model)

merge_model = merge_model.merge_and_unload()

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

In [29]:
merged_model = new_model + '_merged'
merge_model.save_pretrained(merged_model)
tokenizer.save_pretrained(merged_model)

('llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_merged/tokenizer_config.json',
 'llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_merged/special_tokens_map.json',
 'llama-3.2-1b-sql_finetuned_multitableJidouka2_1.0_977_records_mix_fix_84_records_merged/tokenizer.json')

In [30]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HF_TOKEN")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
merge_model.push_to_hub(merged_model, use_temp_dir=False)
tokenizer.push_to_hub(merged_model, use_temp_dir=False)

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huyhoangt2201/Llama-3.2-1B-Instruct-Frog_fixed_84errors_merged/commit/540e4241b140ffa563ac5b615336e82417d62f08', commit_message='Upload tokenizer', commit_description='', oid='540e4241b140ffa563ac5b615336e82417d62f08', pr_url=None, repo_url=RepoUrl('https://huggingface.co/huyhoangt2201/Llama-3.2-1B-Instruct-Frog_fixed_84errors_merged', endpoint='https://huggingface.co', repo_type='model', repo_id='huyhoangt2201/Llama-3.2-1B-Instruct-Frog_fixed_84errors_merged'), pr_revision=None, pr_num=None)