In [1]:
# !pip install -U accelerate peft bitsandbytes transformers trl datasets

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6,"

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from trl import SFTTrainer

2024-07-23 08:36:52.520381: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
out_dir = "./saved_models/kogpt2_koalpaca"
model_name = "skt/kogpt2-base-v2"
batch_size = 8
num_train_epochs = 5
logging_steps = 500
bf16 = False
fp16 = True
context_length = 256
num_workers = 1
# num_workers = os.cpu_count()
gradient_accumulation_steps = 2
learning_rate = 0.0001



In [5]:
dataset = load_dataset('bingsu/ko_alpaca_data')

Downloading readme:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.49M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49620 [00:00<?, ? examples/s]

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 49620
    })
})


In [8]:
print(dataset['train']['instruction'][0])

건강을 유지하기 위한 세 가지 팁을 알려주세요.


In [9]:
print(dataset['train'][0])

{'instruction': '건강을 유지하기 위한 세 가지 팁을 알려주세요.', 'input': '', 'output': '세 가지 팁은 아침식사를 꼭 챙기며, 충분한 수면을 취하고, 적극적으로 운동을 하는 것입니다.'}


In [10]:
# Create train set and validation set
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(f"Train set size: {len(dataset_train)}, Valid set size: {len(dataset_valid)}")

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 47139
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2481
})


In [11]:
# Check dataset as alpaca prompt format
for i in range(3):
    print(dataset_train[i])
    print('****************')
    
    text = dataset_train[i]
    instruction = '### Instruction:\n' + text['instruction']
    inputs = '\n\n### Input:\n' + text['input']
    response = '\n\n### Response:\n' + text['output']
    
    final_text = instruction + inputs + response
    print(final_text)
    print('#'*50)

In [13]:
# Mapping function for dataset
def preprocess_function(examples):
    """
    Formatting function returning a list of processed strings.
    """
    texts = []
    
    for example in zip(examples['instruction'], examples['input'], examples['output']):
        instruction, input_text, output = example
        text = f"### 지시:\n{instruction}\n\n### 자료:\n{input_text}\n\n### 응답:\n{output}"
        texts.append(text)
    
    return texts

In [14]:
# Load model
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)

In [15]:
# Check model
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

In [17]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=True,
)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '</s>', 'eos_token':'</s>'})
    model.resize_token_embeddings(len(tokenizer))

In [19]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
    num_train_epochs=num_train_epochs,
    logging_strategy='steps',
    logging_steps=logging_steps,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    bf16=bf16,
    fp16=fp16,
    weight_decay=0.01,
    report_to='tensorboard',
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)



In [20]:
# Load trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    # packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/47139 [00:00<?, ? examples/s]

Map:   0%|          | 0/2481 [00:00<?, ? examples/s]

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [22]:
# Training
history = trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
500,2.5328,2.321936
1000,2.3813,2.258352
1500,2.2976,2.224798
2000,2.0362,2.203952
2500,2.066,2.184902
3000,2.0376,2.215594


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [23]:
# Save model
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

('./saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/tokenizer_config.json',
 './saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/special_tokens_map.json',
 './saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/vocab.json',
 './saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/merges.txt',
 './saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/added_tokens.json',
 './saved_models/kogpt2_koalpaca/kogpt2_koalpaca)/tokenizer.json')

## Inference

In [24]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

In [29]:
# Load newly trained model
model = AutoModelForCausalLM.from_pretrained("/home/sslunder13/project/06_instruction_tuning/saved_models/kogpt2_koalpaca/kogpt2_koalpaca")
tokenizer = AutoTokenizer.from_pretrained("/home/sslunder13/project/06_instruction_tuning/saved_models/kogpt2_koalpaca/kogpt2_koalpaca")

tokenizer.pad_token = tokenizer.eos_token

In [30]:
# Print only critical error messages
logging.set_verbosity(logging.CRITICAL)

In [31]:
# Inference using fine-tuned model
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=256)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [32]:
# Input prompt in alpaca format

prompt = """### 지시:
화자의 감정이 긍정적인지 부정적인지 알려줘.

### 자료:
기분이 최고야!

### 응답:
"""

In [34]:
# Check generated response
result = pipe(prompt)
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
