In [1]:
import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (LlamaForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from peft import LoraConfig
from trl import SFTTrainer
from bs4 import BeautifulSoup

2023-12-13 13:37:48.063317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-13 13:37:48.204370: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-13 13:37:48.220944: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-13 13:37:48.759063: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [4]:
# Model from HuggingFace hub
base_model = "kfkas/Llama-2-ko-7b-Chat"

# New instruction dataset
dataset = load_dataset("squad_kor_v2")

# Fine-tuned model
new_model = "llama-2-7b-ko-wikidata-QA"

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answer', 'url', 'raw_html'],
        num_rows: 83486
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answer', 'url', 'raw_html'],
        num_rows: 10165
    })
})

In [5]:
# Set Train/Test Sets
trainset = dataset['train'].to_pandas()
testset = dataset['validation'].to_pandas()

# Preprocessing
# Trainset
trainset['text'] = "<s>[INST] " + trainset['question'].astype(str) + "[/INST]" + trainset['answer'].apply(lambda x: x['text']).apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# Testset
testset['text'] = "<s>[INST] " + testset['question'].astype(str) + "[/INST]" + testset['answer'].apply(lambda x: x['text']).apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

temp_train = trainset['text'].reset_index()
temp_test = testset['text'].reset_index()

del temp_train['index']
del temp_test['index']

tds = Dataset.from_pandas(temp_train)
vds = Dataset.from_pandas(temp_test)

dataset = DatasetDict()

dataset['train'] = tds
dataset['validation'] = vds

print(dataset['train'])

  trainset['text'] = "<s>[INST] " + trainset['question'].astype(str) + "[/INST]" + trainset['answer'].apply(lambda x: x['text']).apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  trainset['text'] = "<s>[INST] " + trainset['question'].astype(str) + "[/INST]" + trainset['answer'].apply(lambda x: x['text']).apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  testset['text'] = "<s>[INST] " + testset['question'].astype(str) + "[/INST]" + testset['answer'].apply(lambda x: x['text']).apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


Dataset({
    features: ['text'],
    num_rows: 83486
})


In [6]:
# 4-bit quantization configuration
"""
4-bit quantization via QLoRA allows efficient finetuning of huge LLM models on consumer hardware while retaining high performance. This dramatically improves accessibility and usability for 
real-world applications.
QLoRA quantizes a pre-trained language model to 4 bits and freezes the parameters. A small number of trainable low-rank adapter layers are then added to the model.

During fine-tuning, gradients are backpropagated through the frozen 4-bit quanized model into only the Low-Rank Adapter layers. So, the entire pretrained model remains fixed at
4-bits while only the adapters are updated. Also, the 4-bit quantization does not hurt model performance.
"""

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = False,
)


In [7]:
# loading Llama 2 model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = quant_config,
    device_map = {"" : 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/5.21M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
peft_params = LoraConfig(
    lora_alpha = 16,
    lora_dropout = 0.1,
    r = 32,
    bias = "none",
    task_type = "CAUSAL_LM",
)

# Training Model config
training_params = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 1,
    optim = "paged_adamw_32bit",
    save_steps = 25,
    logging_steps = 25,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    fp16 = False,
    bf16 = False,
    max_grad_norm = 0.3,
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    lr_scheduler_type = "constant",
    report_to = "wandb"
)

In [9]:
# Model fine-tuning
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    peft_config = peft_params,
    dataset_text_field = "text",
    max_seq_length = 512,
    tokenizer = tokenizer,
    args = training_params,
    packing = False,
)

Map:   0%|          | 0/83486 [00:00<?, ? examples/s]

Map:   0%|          | 0/10165 [00:00<?, ? examples/s]

In [10]:
trainer.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mnova18[0m ([33mandlabyonsei[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(tru

  0%|          | 0/41743 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.2142, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.4234, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.3469, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.13, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.3052, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.252, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2845, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.0892, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.0099, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.0313, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.1509, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.053, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.0869, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.0305, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.211, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.0023, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.0869, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.117, 'learning_rate': 0.0002, 'ep

TrainOutput(global_step=41743, training_loss=1.9224704369305394, metrics={'train_runtime': 26663.7045, 'train_samples_per_second': 3.131, 'train_steps_per_second': 1.566, 'train_loss': 1.9224704369305394, 'epoch': 1.0})

In [14]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-2-7b-ko-wikidata-QA/tokenizer_config.json',
 'llama-2-7b-ko-wikidata-QA/special_tokens_map.json',
 'llama-2-7b-ko-wikidata-QA/tokenizer.json')

In [4]:
from transformers import LlamaForCausalLM, AutoTokenizer

# 저장된 모델과 토크나이저를 불러옵니다.
model = LlamaForCausalLM.from_pretrained('./llama-2-7b-ko-wikidata-QA')
tokenizer = AutoTokenizer.from_pretrained('./llama-2-7b-ko-wikidata-QA')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
logging.set_verbosity(logging.CRITICAL)

prompt = "레오나르도 다빈치는 누구야?"
pipe = pipeline(task = "text-generation", model = model, tokenizer = tokenizer, max_length = 200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] 레오나르도 다빈치는 누구야? [/INST]레오나르도다빈치(Leonardo da Vinci)는 1452년 4월 15일피렌체에서태어났다.그의아버지는피렌체의가죽공예가였다.다빈치는 1466년 14세때피렌체의안드레아델베르치(Andrea del Verrocchio)의화실에서도제수업을받았다.그의스승은피렌체의명문가문출신으로,다빈치보다 10년정도연상이었다.다빈치는스승의딸과결혼하였다.다빈치는 1472년 10세때피렌체의산로렌초성당(Basilica di San Lorenzo)에있는마르티노델리(Martino del


In [61]:
logging.set_verbosity(logging.CRITICAL)

prompt = "코로나는 어떻게 전염돼?"
pipe = pipeline(task = "text-generation", model = model, tokenizer = tokenizer, max_length = 200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

KeyboardInterrupt: 

In [11]:
logging.set_verbosity(logging.CRITICAL)

prompt = "코로나 감염 경로가 뭐야?"
pipe = pipeline(task = "text-generation", model = model, tokenizer = tokenizer, max_length = 50)
result = pipe(f"<s>[INST] {prompt} [/INST]")
# s = resultsplit('a')
print(result[0]['generated_text'])
print(result)

<s>[INST] 코로나 감염 경로가 뭐야? [/INST]중국에서온사람과접촉한경우,중국에서온사람과접촉한경우,중국에서온사람과접촉한경우,중국에서
[{'generated_text': '<s>[INST] 코로나 감염 경로가 뭐야? [/INST]중국에서온사람과접촉한경우,중국에서온사람과접촉한경우,중국에서온사람과접촉한경우,중국에서'}]
