### 필수 라이브러리 설치

In [None]:
# Install the fine-tuning stack at pinned versions (-q: quiet output, -U: upgrade to the requested version).
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U accelerate==0.27.2
# NOTE(review): wandb is the only package without a version pin — consider pinning it for reproducibility.
!pip3 install -q -U wandb

### 데이터 다운로드

In [None]:
# !wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000302/data/data.tar.gz
# !tar -xvf data.tar.gz
# !rm -rf data.tar.gz

### 허깅페이스 로그인

In [None]:
!huggingface-cli login --token

### WandB 로그인

In [None]:
# Log the Weights & Biases CLI in. No API key is passed on the command line here —
# presumably the command prompts for one or reuses stored credentials (confirm).
!wandb login 

### 라이브러리 임포트

In [None]:
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd
import wandb
import time
from tqdm import tqdm
import warnings

## 모델 불러오기

In [None]:
# LoRA
# Quantization: 4-bit NF4 weights with double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# LoRA: rank-16 adapters for causal-LM fine-tuning, no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    lora_dropout=0.05,
    lora_alpha=32,
    r=16,
)

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the base model is loaded.
torch.cuda.empty_cache()

In [None]:
model_id = "beomi/OPEN-SOLAR-KO-10.7B"

# Load the base model with the 4-bit quantization config and hand it straight
# to PEFT's k-bit training preparation.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb_config,
    )
)

# Tokenizer: the EOS token doubles as the padding token; padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

## 데이터 불러오기

In [None]:
# Read the three CSV splits in one pass; dev.csv becomes the validation frame.
train_df, valid_df, test_df = map(
    pd.read_csv,
    ('../data/train.csv', '../data/dev.csv', '../data/test.csv'),
)

In [None]:
# Preview the first rows of the train, validation and test frames, in that order.
for frame in (train_df, valid_df, test_df):
    display(frame.head())

In [None]:
# DataFrame을 Dataset으로 변환
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# DatasetDict 생성
dataset = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

In [None]:
# Display the DatasetDict as the cell output.
dataset

In [None]:
# 데이터 형태 확인
dataset['train'][0]

In [None]:
# Display the train split as the cell output.
dataset['train']

## 프롬프트 형태로 변환하기

In [None]:
def prompt_formatter(sample):
    """Render one labelled example as a complete training prompt.

    The text is wrapped in literal ``<s>`` / ``</s>`` markers and consists of
    an instruction paragraph, a ``### Dialogue:`` section and a
    ``### Summary:`` section holding the reference summary.

    Args:
        sample: Mapping with ``'dialogue'`` and ``'summary'`` entries.

    Returns:
        str: The formatted prompt.
    """
    # The three instruction sentences form a single line, separated by spaces.
    instruction = (
        "당신은 대화를 요약해주는 유능한 AI입니다. "
        "당신의 임무는 다음에 나오는 대화를 요약하는 것입니다. "
        "당신의 대답은 오직 제공된 대화에만 근거해야 합니다."
    )
    return (
        f"<s>### Instruction:\n{instruction}\n\n"
        f"### Dialogue:\n{sample['dialogue']}\n\n"
        f"### Summary:\n{sample['summary']}</s>"
    )

In [None]:
# Print the formatted prompt for the first training record as a visual check.
print(prompt_formatter(dataset['train'][0]))

## 모델 학습하기

### WandB 연동

In [None]:
# Open a W&B run; the run name ends with the current Unix time in whole seconds.
wandb.init(
    name=f"OPEN-SOLAR-KO-10.7B-{int(time.time())}",
    project='OPEN-SOLAR-KO-10.7B',
    entity='NLP-team3',
)

In [None]:
# Wrap the prepared base model with the LoRA adapters described by peft_config.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap an already wrapped model — confirm before re-executing.
model = get_peft_model(model, peft_config)

In [None]:
# Optimisation, precision, checkpointing and logging settings for the run.
args = TrainingArguments(
    output_dir="models",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4 x 2 = 8 sequences per optimizer step per device
    logging_steps=4,
    save_strategy="epoch",
    learning_rate=4e-4,  # alternative value noted by the author: 2e-4
    optim="paged_adamw_32bit",
    bf16=True,
    fp16=False,
    tf32=True,
    max_grad_norm=1.0,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    disable_tqdm=False,
    weight_decay=0.01,
    report_to='wandb',  # send training logs to Weights & Biases
)

trainer = SFTTrainer(
    model=model,
    # Fit on the training split. The original passed dataset['valid'] here, so
    # the model was tuned on the dev split only and train.csv was never used.
    train_dataset=dataset['train'],
    # Keep the dev split available to the trainer as held-out evaluation data.
    eval_dataset=dataset['valid'],
    max_seq_length=1024,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_formatter,
    args=args,
)

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# wandb 종료
wandb.finish()

In [None]:
# Local directory the trained adapter is written to.
ADAPTER_MODEL = "lora_adapter"

# trainer.model is the PEFT-wrapped model — presumably only the adapter weights
# and config are written here, not the base model (confirm).
trainer.model.save_pretrained(ADAPTER_MODEL)

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the base model is reloaded.
torch.cuda.empty_cache()

In [None]:
ADAPTER_MODEL = "lora_adapter"
model_id = "beomi/OPEN-SOLAR-KO-10.7B"

# Reload the base model in float16 (no quantization config this time).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Attach the saved LoRA adapter and fold it into the base weights right away.
model = PeftModel.from_pretrained(
    model,
    ADAPTER_MODEL,
    torch_dtype=torch.float16,
    device_map='auto',
).merge_and_unload()

In [None]:
# Tokenizer for the export: EOS token reused for padding, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Write the merged model first, then its tokenizer, into the same local directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('OPEN-SOLAR-KO-10.7B-sum')

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before uploading.
torch.cuda.empty_cache()

In [None]:
# Target repository, written as {your Hugging Face account}/{repository name}.
MODEL_SAVE_HUB_PATH = 'whybe-choi/OPEN-SOLAR-KO-10.7B-sum'
# Hugging Face write token. Leave it empty to rely on the credentials stored by
# `huggingface-cli login`; never commit a real token with the notebook.
HUGGINGFACE_AUTH_TOKEN = ''

# An empty placeholder becomes None so that the stored login is used instead of
# an empty string being sent as the token.
hub_token = HUGGINGFACE_AUTH_TOKEN or None

# `token` replaces the deprecated `use_auth_token` keyword.
model.push_to_hub(
   MODEL_SAVE_HUB_PATH,
   use_temp_dir=True,
   token=hub_token
)
tokenizer.push_to_hub(
   MODEL_SAVE_HUB_PATH,
   use_temp_dir=True,
   token=hub_token
)

## 추론

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the inference model is loaded.
torch.cuda.empty_cache()

In [None]:
# Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
FINETUNE_MODEL = "whybe-choi/OPEN-SOLAR-KO-10.7B-sum"

# Load the fine-tuned checkpoint in 4-bit, placed on the CUDA device.
finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL,
    quantization_config=bnb_config,
    device_map="cuda",
    low_cpu_mem_usage=True,
)

# The tokenizer comes from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(FINETUNE_MODEL)

In [None]:
# Reuse the EOS token for padding and pad on the right, as during training.
# NOTE(review): prompts are sent to the pipeline one at a time in this notebook;
# if generation is ever batched, check whether left padding is needed instead.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Text-generation pipeline over the fine-tuned model; each call may produce up to 192 new tokens.
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=192)

In [None]:
def generate_prompt(example):
    """Build one inference prompt per dialogue in ``example``.

    Each prompt carries the instruction paragraph and the ``### Dialogue:``
    section, and stops right after the ``### Summary:`` header so that the
    model writes the summary as its continuation.

    Args:
        example: Mapping-like object whose ``'dialogue'`` entry is a sequence
            of dialogue texts (a sliced batch or a whole dataset split).

    Returns:
        list[str]: One prompt per dialogue, in input order; an empty list
        when there are no dialogues.
    """
    # Fetch the column once. The original re-read example['dialogue'] on every
    # iteration, repeating the lookup for each row (costly when the lookup
    # itself rebuilds the column, as it may for a dataset split).
    dialogues = example['dialogue']

    # The three instruction sentences form a single line, separated by spaces.
    instruction = (
        "당신은 대화를 요약해주는 유능한 AI입니다. "
        "당신의 임무는 다음에 나오는 대화를 요약하는 것입니다. "
        "당신의 대답은 오직 제공된 대화에만 근거해야 합니다."
    )
    return [
        f"<s>### Instruction:\n{instruction}\n\n"
        f"### Dialogue:\n{dialogue}\n\n"
        f"### Summary:\n"
        for dialogue in dialogues
    ]

In [None]:
# Keep a handle on the test split and print the prompt built for its first row.
test_data = dataset['test']
print(generate_prompt(test_data[:1])[0])

In [None]:
prompt = generate_prompt(test_data[:1])[0]

# Low-temperature sampling settings for this trial generation.
sampling = dict(
    do_sample=True,
    temperature=0.1,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
outputs = pipe_finetuned(prompt, **sampling)

# Skip the first len(prompt) characters so that only the text after the prompt is printed.
summary = outputs[0]["generated_text"][len(prompt):]
print(summary)

In [None]:
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

submission = pd.read_csv("../data/sample_submission.csv")
prompts = generate_prompt(dataset['test'])

# Sampling settings shared by every generation call in the loop.
sampling = dict(
    do_sample=True,
    temperature=0.1,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
rule = "=" * 25

for idx, prompt in enumerate(tqdm(prompts)):
    outputs = pipe_finetuned(prompt, **sampling)
    # Skip the first len(prompt) characters to keep only the text after the prompt.
    summary = outputs[0]["generated_text"][len(prompt):]

    # Every 50th sample, print the dialogue next to its generated summary.
    if idx % 50 == 0:
        print(rule, "[ 대화 ]", rule)
        print(dataset['test'][idx]["dialogue"])
        print(rule, "[ 요약 ]", rule)
        print(summary)
        print()

    # Row idx of the submission frame receives the whitespace-trimmed summary.
    submission.loc[idx, 'summary'] = summary.strip()

In [None]:
# Write the filled-in submission frame to CSV without the index column.
submission.to_csv("../submission_solar.csv", index=False)