In [None]:
!pip install torch  --quiet

# # Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# #FlashAttention only supports Ampere GPUs or newer. #NEED A100 IN GOOGLE COLAB
!pip install -U transformers
# # !pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# # Uncomment only if you're using A100 GPU
# #!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet

# %pip install -U wandb

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, DatasetDict
from trl import SFTTrainer, setup_chat_format

ModuleNotFoundError: No module named 'peft'

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Both tokens are read from Kaggle's secret store, so none is written into the notebook.
user_secrets = UserSecretsClient()

# --- Hugging Face Hub ---
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

# --- Weights & Biases ---
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)

# Open the W&B run for this session.
wandb_run_options = dict(
    project='Fine-tune Llama 3 8B on SQL dataset',
    job_type="training",
    anonymous="allow",
)
run = wandb.init(**wandb_run_options)

In [None]:
# Hub id of the pretrained checkpoint that gets fine-tuned.
base_model = 'phamhai/Llama-3.2-1B-Instruct-Frog'

# Name of the fine-tuned model; also used as the training output directory.
new_model = 'llama-3.2-1b-sql_finetuned_multitableJidouka_4.0'

In [None]:
# dtype in which the 4-bit layers perform their computations
torch_dtype = torch.float16

# QLoRA config: 4-bit NF4 weights with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the base model with the quantization config above.
# Note that `torch_dtype` passed here is float32 while the 4-bit compute dtype is float16.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float32,
    quantization_config=bnb_config,
    #attn_implementation=attn_implementation
)

# Matching (fast) tokenizer for the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

In [None]:
# LoRA config: rank-16 adapters on all attention and MLP projection layers.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Guard against re-running this cell: wrapping an already wrapped model would
# stack a second set of adapters on top of the first.
if not isinstance(model, PeftModel):
    # Recommended PEFT preprocessing step for k-bit quantized models before
    # adapters are attached (by default it also enables gradient checkpointing).
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

In [2]:
from datasets import load_dataset, DatasetDict

In [3]:
# Hub id of the question/answer dataset
dataset_repo = "huyhoangt2201/multitableJidouka2.0"

# The published `train` split is sliced by position:
# first 90% for training, last 10% for validation.
dataset_train = load_dataset(dataset_repo, split='train[:90%]')
dataset_val = load_dataset(dataset_repo, split='train[-10%:]')

# Bundle both slices and keep a copy on disk.
dataset = DatasetDict(train=dataset_train, validation=dataset_val)
dataset.save_to_disk("completed_train_dataset")

(…)records_english_multitableJidouka2.0.csv:   0%|          | 0.00/191k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/908 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

In [5]:
# System prompt shared by every example: an instruction followed by the DDL of
# the six tables (Job, Department, Author, Tool, Jidouka, JidoukaTool).
# NOTE: the literal is kept exactly as written because it is the text the model
# sees. It begins with a space and a newline, and the single-column tables keep
# a trailing comma before the closing parenthesis.
prompt_template = """ 
You are an SQL query assistant. Based on schema below, generate an SQL query to retrieve the relevant information for the user. If the user’s question is unrelated to the table, respond naturally in user's language.

Schema:
-- Table: Job
CREATE TABLE Job (
    Id INT PRIMARY KEY AUTO_INCREMENT,
    Job_name NVARCHAR(255) NOT NULL,
);

-- Table: Department
CREATE TABLE Department (
    Id INT PRIMARY KEY AUTO_INCREMENT,
    Department_name NVARCHAR(255) NOT NULL,
);

-- Table: Author
CREATE TABLE Author (
    Id INT PRIMARY KEY AUTO_INCREMENT,
    Author_name NVARCHAR(255) NOT NULL,
);

-- Table: Tool
CREATE TABLE Tool (
    Id INT PRIMARY KEY AUTO_INCREMENT,
    Tool_name NVARCHAR(255) NOT NULL,
);

-- Table: Jidouka
CREATE TABLE Jidouka (
    Id BIGINT PRIMARY KEY AUTO_INCREMENT,
    Improve_name NVARCHAR(255) NOT NULL,
    Job_id INT,
    Department_id INT,
    Author_id INT,
    Description NVARCHAR(255),
    Product_name NVARCHAR(255),
    Time INT,
    Applications INT,
    Release_date DATETIME,
    Other_info NVARCHAR(255),
    FOREIGN KEY (Job_id) REFERENCES Job(Id),
    FOREIGN KEY (Department_id) REFERENCES Department(Id),
    FOREIGN KEY (Author_id) REFERENCES Author(Id)
);

-- Table: JidoukaTool
CREATE TABLE JidoukaTool (
    Jidouka_id BIGINT,
    Tool_id INT,
    PRIMARY KEY (Jidouka_id, Tool_id),
    FOREIGN KEY (Jidouka_id) REFERENCES Jidouka(Id),
    FOREIGN KEY (Tool_id) REFERENCES Tool(Id)
);"""

In [6]:
def format_context(sample):
    """Attach the shared schema prompt to one dataset row.

    Args:
        sample: A single example (mapping); it is modified in place.

    Returns:
        The same ``sample`` object, now carrying ``prompt_template`` under
        the ``'context'`` key.
    """
    sample.update(context=prompt_template)
    return sample

In [8]:
# Add the `context` column to both splits, one row at a time.
dataset_train2, dataset_val2 = (
    split.map(format_context, batched=False)
    for split in (dataset_train, dataset_val)
)

Map:   0%|          | 0/908 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
def format_data_template(sample):
    """Render one example as a single chat-formatted training string.

    Args:
        sample: Mapping with ``'context'`` (used as the system message),
            ``'question'`` (user message) and ``'answer'`` (assistant message).

    Returns:
        dict: ``{"messages": text}`` where ``text`` is the conversation rendered
        through the tokenizer's chat template as a plain string (not tokenized).
    """
    chat = [
        {"role": "system", "content": sample['context']},
        {"role": "user", "content": sample['question']},
        {"role": "assistant", "content": sample['answer']},
    ]
    # The conversation already ends with the assistant's answer, so no generation
    # prompt must be appended. With add_generation_prompt=True every training
    # example would end with an extra, empty assistant header after the answer.
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"messages": rendered}

In [None]:
# After rendering, only the `messages` text is needed; the source columns are dropped.
raw_columns = ['context', 'question', 'answer']

train_set = dataset_train2.map(format_data_template, remove_columns=raw_columns)
test_set = dataset_val2.map(format_data_template, remove_columns=raw_columns)

In [None]:
# Stop training once the monitored metric has failed to improve for 5
# consecutive evaluations (evaluation runs once per epoch, see below).
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5
)

# Choose the mixed-precision mode the current GPU supports instead of
# hard-coding bf16, which is rejected on GPUs without bfloat16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step and device
    optim="adamw_8bit",
    num_train_epochs=20,
    learning_rate=2e-4,
    warmup_steps=10,
    # Evaluate and checkpoint once per epoch. The two strategies must match for
    # load_best_model_at_end. (`eval_steps` was dropped: it only applies to the
    # "steps" strategy and was ignored here.)
    eval_strategy="epoch",
    save_strategy='epoch',
    logging_strategy="steps",
    logging_steps=1,
    fp16=use_fp16,
    bf16=use_bf16,
    group_by_length=True,
    report_to="wandb",
    # Early stopping and best-checkpoint selection both follow the eval loss;
    # stated explicitly rather than relying on the default.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# Supervised fine-tuning trainer.
# - Trains on the chat-formatted splits (`train_set` / `test_set`), which are the
#   ones that contain the `messages` text column; the raw `dataset_train` has no
#   such column and `dataset_valid` was never defined.
# - `peft_config` is not passed: `model` was already wrapped with the LoRA
#   adapters by `get_peft_model` in an earlier cell.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_set,
    eval_dataset=test_set,
    dataset_text_field='messages',
    max_seq_length=2048,
    packing=False,
    args=training_arguments,
    callbacks=[early_stopping_callback],
)

In [None]:
%%time

eot = "<|eot_id|>"
eot_id = tokenizer.convert_tokens_to_ids(eot)
tokenizer.pad_token = eot
tokenizer.pad_token_id = eot_id

trainer.train()

In [None]:
# From here on `new_model` names the adapter artifact rather than the training output directory.
new_model = "llama-3.2-1b-sql_finetuned_multitableJidouka_4.0_adapter"
new_model

In [None]:
# Write the trained model to the local `new_model` directory, then upload it
# to the Hub under the same name.
finetuned_model = trainer.model
finetuned_model.save_pretrained(new_model)
finetuned_model.push_to_hub(new_model, use_temp_dir=False)

In [None]:
# Inputs for the merge step: the base checkpoint and the adapter repo on the Hub.
base_model = "phamhai/Llama-3.2-1B-Instruct-Frog"
new_model = "huyhoangt2201/llama-3.2-1b-sql_finetuned_multitableJidouka_4.0_adapter"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reload the base checkpoint in float16, this time without a quantization config.
reload_options = dict(
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
base_model_reload = AutoModelForCausalLM.from_pretrained(base_model, **reload_options)

# base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Load the adapter on top of the base model, then merge it in and drop the PEFT wrapper.
merge_model = PeftModel.from_pretrained(base_model_reload, new_model).merge_and_unload()

In [None]:
# Local directory (and later Hub repo name) for the merged model.
new_model_merged = "llama-3.2-1b-sql_finetuned_multitableJidouka_4.0_merged"

# Save model and tokenizer side by side in that directory.
for artifact in (merge_model, tokenizer):
    artifact.save_pretrained(new_model_merged)

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Log in to the Hugging Face Hub again, using the token held in Kaggle's secret store.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

In [None]:
# Upload the merged model first, then its tokenizer, to the same Hub repo.
for artifact in (merge_model, tokenizer):
    artifact.push_to_hub(new_model_merged, use_temp_dir=False)