## Requirements

In [None]:
# install packages
%pip install -U bitsandbytes trl accelerate peft tqdm datasets transformers sentencepiece -q
%pip install apache_beam > /dev/null 2>&1 # for trivia or nq datasets, just in case

## 데이터 불러오기

In [None]:
import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from datasets import Dataset

In [None]:
# Process-wide environment configuration, applied in one call.
# 'api_key' looks like a placeholder — replace it with a real Hugging Face token.
# The two CUDA flags are debugging aids (presumably for clearer CUDA error
# reports); they can be dropped once training runs cleanly.
os.environ.update({
    'HF_TOKEN': 'api_key',
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_USE_CUDA_DSA': '1',
})

In [None]:
def formatting_prompts_func(example):
    """Render question/answer data into the training prompt format.

    Each pair becomes ``"### 질문: <question>\\n### 답변: <answer>"``.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries. The
            entries are either parallel sequences (a batch of rows) or a
            single string each (one row).

    Returns:
        A list of prompt strings for batched input, or a single prompt
        string when a single example was passed.
    """
    questions = example['question']
    answers = example['answer']

    # A single (non-batched) example carries plain strings. Iterating over a
    # string would silently produce one "prompt" per character, so handle it
    # explicitly.
    if isinstance(questions, str):
        return f"### 질문: {questions}\n### 답변: {answers}"

    return [
        f"### 질문: {question}\n### 답변: {answer}"
        for question, answer in zip(questions, answers)
    ]

In [None]:
# Load the training table from the working directory.
train = pd.read_csv('./train.csv')

In [None]:
# Column names holding the question variants and the reference answers.
q_cols = ['질문_1', '질문_2']
a_cols = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

# Flatten every row into all (question, answer) combinations:
# each question variant is paired with each of the answers, in column order.
questions = []
answers = []
for _, row in tqdm(train.iterrows(), total=len(train)):
    for q_col in q_cols:
        questions.extend([row[q_col]] * len(a_cols))
        answers.extend(row[a_col] for a_col in a_cols)

In [None]:
# Collect the flattened question/answer lists into a two-column frame
# ('question', 'answer').
dataset = pd.DataFrame(
    data={'question': questions, 'answer': answers}
)

In [None]:
# Convert the frame into a `datasets.Dataset`, rebinding the same name;
# this object is what the trainer receives as `train_dataset`.
dataset = Dataset.from_pandas(dataset)

## 훈련

In [None]:
import torch
import torch.nn.functional as F

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TextStreamer
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "heavytail/kullm-solar-S"
# Alternative base models that can be swapped in:
# model_name = 'google/gemma-2b'
# model_name = 'heavytail/kullm-mistral-S'


################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.1

# Names of the base-model submodules that receive LoRA adapters
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (resolved later via getattr(torch, ...))
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Directory for checkpoints, number of passes over the data, and the
# per-device batch size.
output_dir = "models/lora"
num_train_epochs = 3
batch_size = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Optimizer to use
optim = "paged_adamw_32bit"

# Device placement passed to from_pretrained. "auto" delegates placement to the
# loader; it does not pin the whole model to GPU 0.
device_map = "auto"

In [None]:
# Build the bitsandbytes quantization config used when loading the base model.
# The compute dtype is given as a string above and resolved to the torch dtype here.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Load base model with the quantization config.
# The download takes a long time (about 24GB).
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)
# Turn off `use_cache` on the model config for training.
model.config.use_cache = False
# NOTE(review): presumably a tensor-parallelism setting carried over from a
# LLaMA fine-tuning recipe — confirm it is relevant for this model.
model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding out of the labels
# may also mask the real EOS, so the model might not learn to stop — verify.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [None]:
# Build the LoRA adapter configuration from the QLoRA parameters defined above.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM", # generation task
)

- 만약 train을 하지 않았을 경우 아래 코드를 실행
- 그렇지 않다면 Inference 로 건너뛰기

In [None]:
# Set training parameters.
# With per_device_train_batch_size=batch_size (2) and gradient_accumulation_steps=8,
# each optimizer step covers 16 examples per device.
# save_total_limit=1 asks the trainer to keep a single checkpoint in output_dir.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=8,
    optim=optim,
    save_steps=100,
    logging_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    save_total_limit=1
)

In [None]:
# Set supervised fine-tuning parameters: the quantized base model is combined
# with the LoRA config, and training text is produced by formatting_prompts_func.
# NOTE(review): `tokenizer=` and `max_seq_length=` are presumably accepted only
# by some TRL releases — confirm against the installed version, since the
# install cell upgrades trl with `-U`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=256,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    args=training_arguments,
)

In [None]:
# Train model (runs the fine-tuning loop configured by training_arguments)
trainer.train()

In [None]:
# Save the LoRA adapter.
# The target directory is derived from the hub id, with "/" replaced by "-" so
# it is a single path component.
model_save_repo = f'./{model_name.replace("/","-")}'
# `save_pretrained` is a method of the (PEFT-wrapped) model, not of the trainer,
# so it is called on `trainer.model`.
trainer.model.save_pretrained(model_save_repo)

## Inference

In [None]:
# Load the test table; its '질문' column holds the questions to answer.
test = pd.read_csv('./test.csv')

In [None]:
# Streams generated text to stdout while generation runs; the prompt is skipped.
streamer = TextStreamer(tokenizer, skip_prompt=True)

def generate_answer(model, question):
    """Generate an answer for one question using the training prompt format.

    The question is wrapped as ``'### 질문: <question>\\n### 답변: '``, tokenized
    with the module-level ``tokenizer`` and passed to ``model.generate``.
    Tokens are streamed to stdout through the module-level ``streamer``, and a
    separator line is printed once generation finishes.

    Args:
        model: The causal language model to generate with.
        question: The question text.

    Returns:
        The decoded output sequence with special tokens removed. It still
        contains the prompt; ``extract_only_answer`` isolates the answer.
    """
    model.eval()
    reformat_question = f'### 질문: {question}\n### 답변: '
    # Follow the model's own device instead of assuming a CUDA device.
    inputs = tokenizer(reformat_question, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # NOTE(review): do_sample is not enabled, so temperature/top_k are
        # presumably ignored and decoding is greedy — confirm the intent.
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=444,
            repetition_penalty=1.2,
            temperature=0.9,
            top_k=10,
            streamer=streamer,
        )

    # `skip_prompt` is a TextStreamer option, not a decoding option, so it is
    # not passed to batch_decode.
    generated_answers = tokenizer.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print('--------------------------------------------------------------------------------------------------------')
    return generated_answers

In [None]:
# Smoke test: generate an answer for a single test question.
# NOTE(review): the variable is named `test_id_0` but the row at index 1 is
# used — confirm which one is intended.
test_id_0 = generate_answer(trainer.model, test['질문'][1])

In [None]:
# Generate an answer for every test question, in row order.
preds = []

for question in tqdm(test['질문'], total=len(test)):
    preds.append(generate_answer(trainer.model, question))

In [None]:
# Post-processing of the generated text
def extract_only_answer(text):
    """Return only the answer part of a generated ``질문/답변`` text.

    The text after the first ``'### 답변: '`` marker is taken, up to the next
    such marker. If the model went on to produce a further ``'### 질문:'``
    section, everything from there on is discarded and the kept part is
    printed. When the answer marker is missing, the whole text is treated as
    the answer instead of raising ``IndexError``.

    Args:
        text: Decoded model output, normally including the prompt.

    Returns:
        The answer with surrounding whitespace stripped.
    """
    text_split = text.split('### 답변: ')
    # With no marker, split() returns just [text]; fall back to it.
    answer_only = text_split[1] if len(text_split) > 1 else text_split[0]

    if '### 질문:' in answer_only:
        answer_only = answer_only.split('### 질문:')[0]
        print(answer_only)

    return answer_only.strip()

def remove_repetitions(text):
    """Drop repeated sentences from *text*, keeping first occurrences in order.

    Sentences are the pieces obtained by splitting on ``'. '``. Pieces are
    compared exactly, so a final piece ending in ``'.'`` differs from the same
    sentence without it.
    """
    # dict keys are unique and keep insertion order.
    first_occurrences = dict.fromkeys(text.split('. '))
    return '. '.join(first_occurrences)

def cut_to_last_dot(text):
    """Trim *text* so that it ends at its last ``'.'``.

    This removes a trailing unfinished sentence. A text with no ``'.'`` at
    all, including the empty string, is returned unchanged.

    Args:
        text: The string to trim.

    Returns:
        The text up to and including the last ``'.'``, or the original text
        if it contains none.
    """
    last_dot = text.rfind('.')
    if last_dot == -1:
        return text
    return text[:last_dot + 1]

In [None]:
# Clean every raw generation: isolate the answer, drop repeated sentences,
# then trim to the last full stop.
processed = []
for raw_text in preds:
    cleaned = extract_only_answer(raw_text)
    cleaned = remove_repetitions(cleaned)
    processed.append(cut_to_last_dot(cleaned))

In [None]:
# Keep a copy of the raw generations under `original`, then continue with the
# cleaned answers as `preds`.
# Running this cell twice overwrites `original` with the cleaned answers.
original = list(preds)
preds = list(processed)

In [None]:
# Dump the cleaned answers to a UTF-8 text file, separated by blank lines.
answers_text = '\n\n'.join(preds)
with open('./preds.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(answers_text)

## submission

In [None]:
# Extract an embedding vector (512-dimensional, per the original note) for the
# answer to every question in the test dataset.
# The evaluation embeds answers with 'distiluse-base-multilingual-cased-v1', so make sure the same model is used here.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the embedding model (distiluse-base-multilingual-cased-v1)
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Encode all generated answers into embedding vectors
pred_embeddings = emb_model.encode(preds)
pred_embeddings.shape

In [None]:
# Start from the submission template (sample_submission.csv).
submit = pd.read_csv('./sample_submission.csv')
# Insert the embedding vectors into the template: the first column is left
# untouched and every remaining column is overwritten.
submit.iloc[:,1:] = pred_embeddings
submit.head()

In [None]:
# Write the CSV file for the leaderboard submission (without the index column)
submit.to_csv('./submission_solar_newtrain_v2.csv', index=False)