In [1]:
import os
import torch 
from datasets import load_dataset 
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer, 
    BitsAndBytesConfig, 
    HfArgumentParser, 
    TrainingArguments,  
    pipeline,  
    logging,  
)
from peft import LoraConfig, PeftModel  
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token can be pasted using 'Right-Click'.
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\ENEY\.cache\huggingface\token
Login successful


In [2]:
train_dataset = load_dataset('json', data_files='./data/train_hf_last.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='./data/val_hf_last.jsonl', split='train')

In [3]:
from pprint import pprint
print(train_dataset[1]['inputs'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
### Instruction:
- Generate a SQL query to answer [QUESTION]Identify the explainability scores and associated AI algorithms, including the average explainability score.[/QUESTION]
- If you cannot answer the question with the available database schema, return 'I do not know'
- Remember that revenue is price multiplied by quantity
- Remember that cost is supply_price multiplied by quantity

<|eot_id|><|start_header_id|>user<|end_header_id|>
### Input:
[QUESTION]Identify the explainability scores and associated AI algorithms, including the average explainability score.[/QUESTION]


### Context:
This query will run on a database whose schema is represented in this string:
CREATE TABLE ai_algorithm (algorithm_id INT, algorithm_name VARCHAR(255)); CREATE TABLE explainability_score (score_id INT, algorithm_id INT, score DECIMAL(5, 4)); INSERT INTO ai_algorithm (algorithm_id, algorithm_name) VALUES (1, 'SHAP'); INSERT INTO ai_algorith

In [4]:
model_name = "meta-llama/Meta-Llama-3-8B"
new_model = "llama-3-8b-sql-synthetic_text_to_sql" 

In [5]:
lora_r = 64   
lora_alpha = 16 
lora_dropout = 0.1

In [6]:
use_4bit = True 
bnb_4bit_compute_dtype = "float16" 
bnb_4bit_quant_type = "nf4" 
use_nested_quant = False 

In [7]:
output_dir = "./results" 
num_train_epochs = 3
fp16 = False   
bf16 = True   
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 2
gradient_checkpointing = True  
max_grad_norm = 0.3 
learning_rate = 2e-6 
weight_decay = 0.001 
optim = "paged_adamw_32bit"  
lr_scheduler_type = "cosine"   
max_steps = -1
warmup_ratio = 0.03  
group_by_length = True   
save_steps = 0  
logging_steps = 100

In [8]:
max_seq_length = 400
packing = False  
device_map = {"": 0}  

In [9]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit, 
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype, 
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [10]:
from datetime import datetime
import wandb, os
wandb.login()

wandb_project = "llm-sql"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcrysis[0m ([33mchanghyun[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
run_name = new_model + "-" + wandb_project

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM", 
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    # logging_dir="./logs",        
    # save_strategy="steps",                      
    # evaluation_strategy="steps",
    # eval_steps=100,               
    # do_eval=True,
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" 
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="inputs",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

Loading checkpoint shards: 100%|██████████| 4/4 [01:01<00:00, 15.47s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
trainer.train()
trainer.model.save_pretrained(new_model)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 100/135000 [01:39<31:47:17,  1.18it/s]

{'loss': 2.4233, 'grad_norm': 0.71484375, 'learning_rate': 4.938271604938271e-08, 'epoch': 0.0}


  0%|          | 200/135000 [03:18<32:38:39,  1.15it/s]

{'loss': 2.426, 'grad_norm': 0.84375, 'learning_rate': 9.876543209876542e-08, 'epoch': 0.0}


  0%|          | 274/135000 [04:35<41:15:34,  1.10s/it]

In [14]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base_model, new_model) # LoRA 가중치를 가져와 기본 모델에 통합

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.68s/it]


In [15]:
model = model.merge_and_unload()

# 사전 훈련된 토크나이저를 다시 로드
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)  

# 토크나이저의 패딩 토큰을 종료 토큰(end-of-sentence token)과 동일하게 설정
tokenizer.pad_token = tokenizer.eos_token  

# 패딩을 시퀀스의 오른쪽에 적용
tokenizer.padding_side = "right" 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

[A[A
[A


model-00004-of-00004.safetensors:   0%|          | 16.4k/1.17G [00:00<2:54:38, 111kB/s]
[A


[A[A[A
model-00004-of-00004.safetensors:   0%|          | 4.70M/1.17G [00:00<01:02, 18.5MB/s] 


[A[A[A


model-00004-of-00004.safetensors:   1%|          | 6.54M/1.17G [00:00<02:29, 7.78MB/s]


model-00004-of-00004.safetensors:   1%|          | 7.72M/1.17G [00:00<02:37, 7.39MB/s]
model-00004-of-00004.safetensors:   1%|          | 8.68M/1.17G [00:01<02:39, 7.27MB/s]


[A[A[A
model-00004-of-00004.safetensors:   1%|          | 11.0M/1.17G [00:01<01:55, 10.0MB/s]
[A


[A[A[A


model-00004-of-00004.safetensors:   1%|          | 13.2M/1.17G [00:01<01:46, 10.8MB/s]
[A


model-00004-of-00004.safetensors:   1%|▏         | 16.0M/1.17G [00:01<02:13, 8.60MB/s]
[A


[A[A[A
model-00004-of-00004.safetensors:   2%|▏         | 19.8M/1.17G [00:02<01:55, 9.90MB/s]


[A[A[A
[A


[A[A[A


model-000

CommitInfo(commit_url='https://huggingface.co/Crysiss/llama-3-8b-sql2/commit/084e22438a7303955bbb4b7ab95e0310982a7e10', commit_message='Upload tokenizer', commit_description='', oid='084e22438a7303955bbb4b7ab95e0310982a7e10', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, 
    quantization_config=bnb_config, 
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.90s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
prompt = '''
### Instruction:
- If you cannot answer the question with the available database schema, return 'I do not know'
- Remember that revenue is price multiplied by quantity
- Remember that cost is supply_price multiplied by quantity
- Do not say an Explanation

### Input:
{question}

### Context:
This query will run on a database whose schema is represented in this string:
 CREATE TABLE emp (									
  EMPNO INT NOT NULL, -- number of employee					
  ENAME VARCHAR(10) NULL, -- Name of the employee	
  JOB VARCHAR(9) NOT NULL,  -- The employee's job	
  MGR DECIMAL(4,0) NULL,  -- direct supervisor's employee number 						
  HIREDATE DATE NULL,  -- Employee joining date			
  SAL DECIMAL(7,2) NULL, -- The employee's monthly salary
  COMM DECIMAL(7,2) NULL, -- Commissions					
  DEPTNO INT NULL, -- Department number				
  PRIMARY KEY (EMPNO, DEPTNO)
 );

 CREATE TABLE dept (
  DEPTNO INT , -- Department ID							
  DNAME VARCHAR(14) NOT NULL, -- The Department's Name   
  LOC VARCHAR(13) NOT NULL,  -- The Department's Location
  PRIMARY KEY (DEPTNO)
 );	

 CREATE TABLE salgrade (
  GRADE TINYINT , 			
  LOSAL SMALLINT NOT NULL,  
  HISAL SMALLINT NOT NULL, 
 );		

CREATE TABLE athletics(
id INT NOT NULL PRIMARY KEY, -- ID of the Athletics
name VARCHAR(100) NOT NULL -- The Athletics name
);

CREATE TABLE events(
id int NOT NULL PRIMARY KEY, -- ID of the Event
sport varchar(50) NOT NULL, 
event varchar(100) NOT NULL
);

CREATE TABLE teams(
id INT NOT NULL PRIMARY KEY, -- ID of the Team Nation
team VARCHAR(10) NOT NULL -- Name of the Team Nation
);

CREATE TABLE olympic_games(
id INT NOT NULL PRIMARY KEY, -- ID of the Olympic games
year INT NOT NULL, -- Olympic games Year
season VARCHAR(10) NOT NULL, -- Olympic Season(Summer or Winter)
city VARCHAR(50) NOT NULL -- Olympic host city
);

CREATE TABLE records(
id INT NOT NULL PRIMARY KEY, -- ID of the Records
athlete_id INT NOT NULL, -- ID of the Athlete
sex VARCHAR(5), -- Player's sex(M or F)
age INT NULL, -- Player's age at time of participation
weight DECIMAL(5,1) NULL, -- Player's weight at time of participation
height DECIMAL(5,1) NULL, -- Player's height at time of participation
game_id INT NOT NULL, -- ID of the Olympic games
team_id INT NOT NULL, -- ID of the Team Nation
event_id INT NOT NULL, -- ID of the Event
medal VARCHAR(10) NULL -- Medal(Gold, Silver, Bronze),
FOREIGN KEY(athlete_id) REFERENCES athletics(id),
FOREIGN KEY(game_id) REFERENCES olympic_games(id),
FOREIGN KEY(team_id) REFERENCES teams(id),
FOREIGN KEY(event_id) REFERENCES events(id),
);
													
-- emp.DEPTNO can be joined with dept.DEPTNO
-- athletes.id can be joined with records.athlete_id
-- olympic_games.id can be joined with records.game_id
-- teams.id can be joined with records.team_id
-- events.id can be joined with records.event_id


## Response:
Given the database schema, here is the SQL query that answers
'''

In [20]:
def generate_query(question):
    updated_prompt = prompt.format(question=question)
    inputs = tokenizer(updated_prompt, return_tensors="pt").to("cuda")
    generated_ids = base_model.generate(
        **inputs,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=400,
        do_sample=True,
        num_beams=1,
    )
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    return outputs

In [26]:
from pprint import pprint 
output = generate_query("View the events and winning teams that won silver medals in 2012")
pprint(output[0].split("```sql"))

['\n'
 '### Instruction:\n'
 '- If you cannot answer the question with the available database schema, '
 "return 'I do not know'\n"
 '- Remember that revenue is price multiplied by quantity\n'
 '- Remember that cost is supply_price multiplied by quantity\n'
 '- Do not say an Explanation\n'
 '\n'
 '### Input:\n'
 'View the events and winning teams that won silver medals in 2012\n'
 '\n'
 '### Context:\n'
 'This query will run on a database whose schema is represented in this '
 'string:\n'
 ' CREATE TABLE emp (\t\t\t\t\t\t\t\t\t\n'
 '  EMPNO INT NOT NULL, -- number of employee\t\t\t\t\t\n'
 '  ENAME VARCHAR(10) NULL, -- Name of the employee\t\n'
 "  JOB VARCHAR(9) NOT NULL,  -- The employee's job\t\n"
 "  MGR DECIMAL(4,0) NULL,  -- direct supervisor's employee "
 'number \t\t\t\t\t\t\n'
 '  HIREDATE DATE NULL,  -- Employee joining date\t\t\t\n'
 "  SAL DECIMAL(7,2) NULL, -- The employee's monthly salary\n"
 '  COMM DECIMAL(7,2) NULL, -- Commissions\t\t\t\t\t\n'
 '  DEPTNO INT NULL, -- D

In [6]:
import mysql.connector

In [7]:
def connect_db():
    db = mysql.connector.connect(
        host="101.101.210.141",
        user="study",
        passwd="study!@#$%",
        database="sample"
)
    return db

def mysql_query(database, query:str):
    cur = database.cursor() 
    cur.execute(query)
    result = cur.fetchall()

    return result

In [8]:
db = connect_db()

In [17]:
test = '''
SELECT DISTINCT e.name, r.medal
FROM records r
JOIN events e ON r.event_id = e.id
WHERE r.medal = 'Silver' AND e.sport = 'Summer' AND r.year = 2012;
'''

In [18]:
tables = mysql_query(db, test)
print(tables)

ProgrammingError: 1054 (42S22): Unknown column 'e.name' in 'field list'

In [6]:
dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(1000)) # Only use 1000 samples for quick demo

def format_chat_template(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)
dataset = dataset.train_test_split(test_size=0.01)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 990
    })
    test: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 10
    })
})

In [11]:
from pprint import pprint

pprint(dataset['train']['chosen'][3])

('<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n'
 '\n'
 'Do you think governments should invest more in promoting lesser-known '
 'tourist attractions?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n'
 '\n'
 "Thank you for the clarification! I'm happy to help you with your question.\n"
 'Promoting lesser-known tourist attractions can be a great way to support '
 'local economies and provide visitors with unique and authentic experiences. '
 'Many governments have already recognized the potential benefits of promoting '
 'these hidden gems, and have implemented various strategies to raise their '
 'profiles.\n'
 'Some of the ways governments can invest in promoting lesser-known tourist '
 'attractions include:\n'
 '1. Marketing campaigns: Governments can launch targeted marketing campaigns '
 'to showcase these attractions to a wider audience. This can be done through '
 'social media, online advertising, and travel trade shows.\n'
 '2. Infrastructure development: