In [1]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import mysql.connector
from tqdm.notebook import tqdm
import pandas as pd
import time

In [2]:
# Load the base checkpoint and its tokenizer through Unsloth's FastLanguageModel.
# NOTE(review): the two imports below repeat imports already made in the first cell.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048  # also reused for the ORPO trainer's length limits further down
dtype = None           # presumably lets Unsloth choose the dtype itself — TODO confirm
load_in_4bit = True    # forwarded unchanged to from_pretrained

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.628 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64, # set to the same value as r
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed passed to get_peft_model
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# Keep the tokenizer's special tokens under module-level names.
# NOTE(review): BOS_TOKEN is not referenced again in the visible notebook, and
# EOS_TOKEN is assigned a second time in the prompt-formatting cell.
BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token

In [5]:
# Preference data for ORPO training, stored as JSON Lines. format_prompt reads
# the "input", "context", "accept" and "reject" fields of each record.
dpo_data_path = '../dpo/data/university_orpo_data.jsonl'
# test_data_path = '../dpo/data/test_kor_data.jsonl'
# eval_data_path = '../data/welfare/test.jsonl'
# The whole file is requested as the "train" split.
dposet = load_dataset("json", data_files = dpo_data_path, split='train')
# testset = load_dataset("json", data_files = test_data_path, split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
prompt_1 = """### Task
Generate a SQL query to answer the following question:
`{}`
 
### Database Schema
This query will run on a database whose schema is represented in this string:
"{}"
An example of the SQL would be 'SELECT * FROM orders WHERE DATE(orderdate) = CURDATE() - INTERVAL 1 DAY AND status = 'COMPLETED'
 
### SQL
Given the database schema, here is the SQL query that answers `{}`:
[SQL]{}"""

# NOTE(review): the example query inside the prompt_1 template above contains an
# odd number of single quotes (the quote opened before SELECT is never closed) —
# confirm this is intentional before reusing the template.
# format_prompt appends EOS_TOKEN to both the chosen and the rejected answer.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def format_prompt(sample):
    """Add the prompt/chosen/rejected fields that ORPOTrainer expects.

    Reads "input", "accept", "reject" and "context" from ``sample``, fills the
    ``prompt_1`` template (the question is inserted twice and the trailing SQL
    slot is left empty), and appends ``EOS_TOKEN`` to both answers.

    The record is modified in place and the same object is returned.
    See: https://huggingface.co/docs/trl/main/en/orpo_trainer
    """
    question = sample["input"]
    preferred, dispreferred = sample["accept"], sample["reject"]
    schema = sample["context"]

    # Template slots, in order: question, schema, question again, empty completion.
    sample["prompt"] = prompt_1.format(question, schema, question, "")
    sample["chosen"] = preferred + EOS_TOKEN
    sample["rejected"] = dispreferred + EOS_TOKEN
    return sample
pass

# Apply format_prompt to every record, then drop the raw source columns so that
# only prompt/chosen/rejected remain.
dataset = dposet.map(format_prompt,)
dataset = dataset.remove_columns(['input', 'accept', 'reject', 'context'])

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 398
})

In [8]:
# One must patch the DPO Trainer first!
# The patch is applied here, before ORPOTrainer is imported and constructed in a
# later cell.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [9]:
from datetime import datetime
import os
import wandb

# Authenticate with Weights & Biases before any training run is created.
wandb.login()

model_name = "meta-llama/Meta-Llama-3-8B"
wandb_project = "llama-3-university-orpo"

# Export the project name only when one is actually configured.
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

project = "Qlora-4bit"
run_name = f"{model_name}-{project}"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcrysis[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
from trl import ORPOConfig, ORPOTrainer

# Build the ORPO trainer on the LoRA-wrapped model and the formatted
# prompt/chosen/rejected dataset.
orpo_trainer = ORPOTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = ORPOConfig(
        max_length = max_seq_length,
        # Prompt and completion are each capped at half of max_seq_length.
        max_prompt_length = max_seq_length//2,
        max_completion_length = max_seq_length//2,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 examples per optimizer step on one GPU
        beta = 0.1,
        logging_steps = 1, # log metrics at every step
        optim = "paged_adamw_8bit",
        lr_scheduler_type = "cosine",
        # NOTE(review): the recorded run reports roughly 20 epochs over the 398
        # examples for these 1000 steps.
        max_steps = 1000, # Change to num_train_epochs = 1 for full training runs
        # Use bf16 when the GPU supports it, otherwise fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        output_dir = "outputs",
    ),
)



Map:   0%|          | 0/398 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [11]:
orpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 398 | Num Epochs = 21
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 167,772,160


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,3.4881,-0.117686,-0.104276,0.125,-0.01341,-1.042759,-1.176861,-0.609302,-0.620683
2,3.5626,-0.129197,-0.1393,0.625,0.010102,-1.392997,-1.291972,-0.536433,-0.564575
3,2.7396,-0.129716,-0.123795,0.25,-0.005922,-1.237947,-1.297163,-0.718015,-0.712013
4,2.5804,-0.116239,-0.116031,0.5,-0.000208,-1.160309,-1.162388,-0.745354,-0.747169
5,2.7755,-0.087146,-0.087745,0.875,0.000598,-0.877446,-0.871465,-0.531392,-0.556878
6,2.4258,-0.059095,-0.063532,0.5,0.004437,-0.635319,-0.590946,-0.668179,-0.673061
7,2.5778,-0.086023,-0.078157,0.125,-0.007865,-0.781575,-0.860226,-0.608542,-0.61119
8,2.2507,-0.05993,-0.052915,0.25,-0.007014,-0.529154,-0.599297,-0.67358,-0.673067
9,2.5632,-0.073802,-0.074203,0.375,0.000401,-0.742032,-0.738023,-0.575004,-0.576256
10,2.5027,-0.075046,-0.075939,0.375,0.000893,-0.759389,-0.750459,-0.689449,-0.693705


TrainOutput(global_step=1000, training_loss=1.0758370063602924, metrics={'train_runtime': 5497.6411, 'train_samples_per_second': 1.455, 'train_steps_per_second': 0.182, 'total_flos': 0.0, 'train_loss': 1.0758370063602924, 'epoch': 20.100502512562816})

In [12]:
# model.save_pretrained("llama3-8B-welfare-rollback") # Local saving
# Upload the trained model to the Hugging Face Hub repository named below
# (the recorded output shows an adapter_model.safetensors upload).
model.push_to_hub("Crysiss/llama-3-8B-university-orpo-v0.1") # Online saving

README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Crysiss/llama-3-8B-university-orpo-v0.1


In [12]:
# Evaluation records; the evaluation loop reads their "input", "context" and
# "output" (reference SQL) fields.
eval_data_path = './data/test.jsonl'
evalset = load_dataset("json", data_files = eval_data_path, split='train')

In [13]:
evalset

Dataset({
    features: ['input', 'context', 'output'],
    num_rows: 136
})

In [14]:
def connect_db(host=None, user=None, passwd=None, database=None):
    """Open a MySQL connection used to execute generated and reference queries.

    Every argument is optional, so ``connect_db()`` keeps working as before.
    Values are resolved in this order: explicit argument, environment variable,
    built-in default.

    Args:
        host: Server address; falls back to ``MYSQL_HOST`` or the notebook's
            original host.
        user: Account name; falls back to ``MYSQL_USER`` or ``"study"``.
        passwd: Account password; falls back to ``MYSQL_PASSWORD``. When neither
            is set the password is requested interactively, so the secret is
            never stored in the notebook.
        database: Schema name; falls back to ``MYSQL_DATABASE`` or
            ``"seoul_welfare"`` (``"aihub_seoul_healthcare"`` was the
            alternative noted in the original cell).

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    # Local imports keep this cell runnable on its own.
    import getpass
    import os

    host = host or os.environ.get("MYSQL_HOST", "101.101.210.141")
    user = user or os.environ.get("MYSQL_USER", "study")
    database = database or os.environ.get("MYSQL_DATABASE", "seoul_welfare")

    if passwd is None:
        passwd = os.environ.get("MYSQL_PASSWORD")
    if passwd is None:
        # Prompt instead of embedding the credential in source control.
        passwd = getpass.getpass(f"MySQL password for {user}@{host}: ")

    return mysql.connector.connect(
        host=host,
        user=user,
        passwd=passwd,
        database=database,
    )

def mysql_query(database, query:str):
    """Execute ``query`` on ``database`` and return every row it yields.

    Args:
        database: An open connection object exposing ``reconnect()`` and
            ``cursor()``.
        query: SQL text to execute as-is.

    Returns:
        The value of ``cursor.fetchall()`` for the executed query.

    Raises:
        Whatever the driver raises for a failed reconnect or an invalid query;
        the cursor is still closed in that case.
    """
    # Re-establish the session before each query; presumably this guards against
    # the connection timing out between slow generation steps — TODO confirm.
    database.reconnect()
    cur = database.cursor()
    try:
        cur.execute(query)
        return cur.fetchall()
    finally:
        # Release the cursor even when execution fails, so repeated calls in the
        # evaluation loop do not accumulate open cursors.
        cur.close()

db = connect_db()

In [15]:
# Inference-time prompt with three positional slots: the question, the schema
# context, and the response (left empty when generating).
# NOTE(review): the training examples were formatted with `prompt_1`, not with
# this Alpaca-style template — confirm that using a different prompt for
# evaluation than for training is intended.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
- Generate a SQL query to answer

### Input:
{}

### context:
{}

### Response:
{}"""

In [16]:
# Per-example bookkeeping filled in by the evaluation loop.
# NOTE(review): re-running the loop cell without re-running this cell appends to
# the existing lists.
excute_result = []         # 1 when both queries ran without raising, else 0
query_cnt_result = []      # 1 when generated and reference queries return equally many rows
include_query_result = []  # 1 when any generated row also appears in the reference rows
cnt_query_result = []      # number of rows returned by the generated query (0 on failure)
inference_query = []       # generated SQL text, one entry per example
fail_check = []            # indices of examples whose evaluation raised
# Unsloth's for_inference call, made before model.generate is used below.
FastLanguageModel.for_inference(model) 

In [17]:
# Evaluate every example: generate SQL, run it and the reference query against
# MySQL, then record execution success, row-count agreement and row overlap.
# Every metric list receives exactly one entry per example, so index k in any
# list always refers to evalset[k].
for i in tqdm(range(len(evalset))):
    # Fetch the row once; indexing evalset['output'][i] would materialise the
    # whole column on every iteration.
    example = evalset[i]
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                example['input'],
                example['context'],
                "",  # response slot stays empty so the model completes it
            )
        ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 300, use_cache = True)
    result = tokenizer.batch_decode(outputs)
    # Keep the text after the response header, cut it at an explanation header
    # or the end-of-text token, and flatten it to a single line.
    result = result[0].split("### Response:")[-1].split("### Explanation:")[0].split("<|end_of_text|>")[0].replace("\n", "")
    torch.cuda.empty_cache()
    inference_query.append(result)

    try:
        infer_query_result = mysql_query(db, result)
        real_query_result = mysql_query(db, example['output'])
    except Exception as e:
        # A failed query scores 0 on every metric; appending to all four lists
        # keeps them the same length as inference_query.
        excute_result.append(0)
        cnt_query_result.append(0)
        query_cnt_result.append(0)
        include_query_result.append(0)
        print(i, e)
        fail_check.append(i)
    else:
        excute_result.append(1)
        cnt_query_result.append(len(infer_query_result))
        same_row_count = len(infer_query_result) == len(real_query_result)
        query_cnt_result.append(1 if same_row_count else 0)
        # Overlap check: at least one generated row also occurs in the reference rows.
        shares_row = any(row in real_query_result for row in infer_query_result)
        include_query_result.append(1 if shares_row else 0)

    # Pause between examples; presumably to avoid hammering the remote
    # database — TODO confirm it is still needed.
    time.sleep(1)
        
    

  0%|          | 0/136 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [18]:
# Report the share of examples whose queries executed, and the share whose
# generated query returned as many rows as the reference query.
print(
    f"쿼리 실행 확률 : {round(excute_result.count(1) / len(evalset), 2)}\n"
    f"쿼리 결과 수 같을 확률 : {round(query_cnt_result.count(1) / len(evalset), 2)}"
)

쿼리 실행 확률 : 1.0
쿼리 결과 수 같을 확률 : 0.78


In [34]:
# Spot-check one evaluation example by hand and print the generated SQL next to
# the reference SQL.
# Unlike the evaluation loop, this cell cuts the generation at the first ';' and
# re-appends it, rather than cutting at "### Explanation:".
j = 10
inputs = tokenizer(
    [
        alpaca_prompt.format(
            f"{evalset[j]['input']}", # input
            f"{evalset[j]['context']}",
            "", 
        )
    ], return_tensors = "pt").to("cuda")
# pprint(inputs)
outputs = model.generate(**inputs,
                         max_new_tokens = 300,
                        #  pad_token_id= tokenizer.eos_token_id,
                        #  eos_token_id= tokenizer.eos_token_id,
                         # NOTE(review): without return_dict_in_generate=True this
                         # flag likely has no visible effect on `outputs` — confirm.
                         output_scores=True,
                        #  logits_processor =[EosTokenRewardLogitsProcessor(eos_token_id=tokenizer.eos_token_id, max_length=300)],
                         use_cache = True)
result = tokenizer.batch_decode(outputs)
result = result[0].split("### Response:")[-1].split(";")[0].split("<|end_of_text|>")[0].replace("\n", "") + ";"
torch.cuda.empty_cache()
# print(f"Inference: {result}\nLabel: {evalset['output'][j]}")
print(f"Inference: {result}\nLabel: {evalset[j]['output']}")
# Disabled database comparison; `label` is not defined anywhere in the visible
# notebook, so it would need to be set before re-enabling these lines.
# infer_query_result = mysql_query(db, result)
# real_query_result = mysql_query(db, label)
# comp = any(item in real_query_result for item in infer_query_result)
# print(f"\ncomparison: {comp}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Inference: SELECT BASS_ADRES FROM TN_FCLTY_STTUS_INFO10073 WHERE CLTUR_EVENT_ETC_NM = '그래그래도서관';
Label: SELECT BASS_ADRES FROM TN_FCLTY_STTUS_INFO10073 WHERE CLTUR_EVENT_ETC_NM = '그래그래 도서관'


In [18]:
evalset[j]['input']

'그래그래 도서관의 기본 주소를 알려줘'