!pip install transformer==4.35 accelerate bitsandbytes==0.35 trl==0.4.7  !pip install peft==0.4

In [None]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import datetime
from datasets import Dataset

In [None]:
# Prefer the first CUDA GPU, but fall back to the CPU so the printed device is
# one that actually exists on this machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## 1. DATA LOAD

In [None]:
# Load the training data; the first CSV column becomes the row index.
train = pd.read_csv(
    'Data/train_final_0213.csv',
    index_col=0,
)
# Keep the frame as the last expression so the notebook renders it.
train
# train['question'].unique()[:10]

In [None]:
from datasets import Dataset

# Convert the question/answer columns of the training frame into a
# Hugging Face `Dataset`.
dataset = Dataset.from_pandas(
    train.loc[:, ['question', 'answer']]
)

In [None]:
# Echo the converted Dataset as the cell output for a quick visual check.
dataset

## 2. MODEL LOAD

In [None]:
# bitsandbytes quantization settings: 4-bit loading with the "nf4" quant type,
# float16 as the compute dtype, and double quantization switched off.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

In [None]:
# LoraConfig is used at the bottom of this cell, so it has to be imported here;
# otherwise running the notebook top to bottom raises NameError, because the
# only other import of it sits in a later cell.
from peft import LoraConfig

# Tokenizer and base model come from the same checkpoint; the model is loaded
# with the quantization settings built in `quant_config`.
tokenizer = AutoTokenizer.from_pretrained("LDCC/LDCC-SOLAR-10.7B")
model = AutoModelForCausalLM.from_pretrained(
    "LDCC/LDCC-SOLAR-10.7B",
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quant_config
)

# Config overrides applied before training: generation cache off,
# pretraining_tp set to 1.
model.config.use_cache = False
model.config.pretraining_tp = 1

# LoRA adapter settings handed to the trainer as `peft_config`.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Hyperparameters for the fine-tuning run: one epoch, learning rate 2e-5,
# batch size 8 per device for both train and eval, output in ./results_LDCC.
training_params = TrainingArguments(
    output_dir="./results_LDCC",
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
)

In [None]:
def formatting_prompts_func(example):
    """Render a batch of question/answer pairs as training prompts.

    Args:
        example: Mapping with parallel ``'question'`` and ``'answer'``
            sequences; entry ``i`` of each belongs to the same sample.

    Returns:
        A list with one ``"### Question: ...\\n ### Answer: ..."`` string per
        question, in input order.
    """
    return [
        f"### Question: {question}\n ### Answer: {example['answer'][idx]}"
        for idx, question in enumerate(example['question'])
    ]

In [None]:
from peft import LoraConfig

# LoRA adapter settings for the causal-LM task: rank 64, alpha 16,
# dropout 0.1, bias mode "none".
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
from transformers import Trainer

# Supervised fine-tuning trainer: the quantized base model plus the LoRA
# config, trained on `dataset` with each batch rendered to text by
# `formatting_prompts_func`. Packing is off; no tokenizer argument is passed.
trainer = SFTTrainer(
    model=model,
    args=training_params,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    peft_config=peft_params,
    packing=False,
)

    # peft_config = peft_params,
    # ,
    # 

In [None]:
# Start the fine-tuning run on the trainer built in the previous cell.
trainer.train()

In [None]:
# Save the trainer's model and tokenizer into the same ./results_LDCC directory.
# NOTE(review): the inference section below loads from './newmodel', not from
# this directory — confirm how that folder is produced.
trainer.model.save_pretrained('./results_LDCC')
trainer.tokenizer.save_pretrained('./results_LDCC')

## 4. INFERENCE

In [None]:
from rouge import Rouge
import pandas as pd
from tqdm import tqdm
import datetime
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, compatible with ``a`` for ``np.dot``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0`` when either vector has zero norm.
    """
    numerator = np.dot(a, b)
    length_a = np.linalg.norm(a)
    length_b = np.linalg.norm(b)
    # A zero-length vector has no direction; report 0 rather than divide by 0.
    if length_a == 0 or length_b == 0:
        return 0
    return numerator / (length_a * length_b)

In [None]:
# Rebind `tokenizer` and `model` to the checkpoint stored in ./newmodel for
# the inference cells that follow.
tokenizer = AutoTokenizer.from_pretrained('./newmodel')
model = AutoModelForCausalLM.from_pretrained(
    './newmodel',
    device_map='auto',
)

In [None]:
# Display the device reported by the loaded model.
model.device

In [None]:
# model.resize_token_embeddings(len(tokenizer))

# 
### 1) 130개 inference + submission csv→ 33개 추출 csv

In [None]:
# End-to-end run over the full test set: generate one answer per question,
# save the raw answers and the embedding-based submission file, then score the
# rows that also appear in the reference-answer file (ROUGE-L and embedding
# cosine similarity).

# TEST SET LOADING
start_time = datetime.datetime.now().strftime("%m-%d-%H")  # tag used in the output file names
now_time = datetime.datetime.now().strftime("%m-%d  %H:%M:%S")
s = datetime.datetime.now()  # start timestamp for the total-time report
print(':::: START -', now_time)
test = pd.read_csv('Data/test.csv')  # ,encoding='cp949'

# GENERATE
preds = []
for question_list in tqdm(test['질문']):
    # The system message (Korean) asks for a brief, summarized answer of about
    # three sentences; the question is sent as the user turn.
    conversation = [
        {'role': 'system', 'content': "요약해서 간략하게 3문장 내외로 대답합니다."},
        {'role': 'user', 'content': question_list},
    ]  # {'role': 'system', 'content': "It's a chatbot that only answers in Korean."},
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, use_cache=True, max_length=400)
    preds.append(outputs)


# DECODING
predict_out = []
for i in preds:
    full_text = tokenizer.decode(i[0])
    # Keep only the text from the '### Assistant:' marker onwards, then drop
    # the marker itself and the end-of-sequence token.
    # NOTE(review): str.find returns -1 when the marker is missing, in which
    # case this slice keeps just the last character of the decoded text.
    answer_start = full_text.find('### Assistant:')
    answer_only = full_text[answer_start:].strip().replace('### Assistant:\n', '').replace('</s>', '')
    predict_out.append(answer_only)


# MAKING ANSWER CSV
test_sample = pd.DataFrame(predict_out)

test_sample.to_csv('Answer/' + start_time + '_answer130.csv', encoding='utf-8-sig')
now_time = datetime.datetime.now().strftime("%m-%d  %H:%M:%S")
print(':::: SAVE - answer130-', now_time)


# MAKING SUBMISSION CSV

## Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
Submission_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

## Extract an embedding vector from every generated answer
pred_embeddings = Submission_model.encode(predict_out)
pred_embeddings.shape
submit = pd.read_csv('Data/sample_submission.csv')
## Insert the embedding vectors into the submission template (sample_submission.csv);
## every column after the first is overwritten
submit.iloc[:, 1:] = pred_embeddings
submit.head()
## Write the csv file for the leaderboard submission
submit.to_csv('Answer/' + start_time + '_submission.csv', index=False)
now_time = datetime.datetime.now().strftime("%m-%d  %H:%M:%S")
print(':::: SAVE - submission - ', now_time)

# Rouge eval
rouge = Rouge()
test['predict'] = predict_out
gpt_answwer = pd.read_csv('Data/dacon_llm_answer.csv')
# Join predictions to the reference answers on (id, question).
rou_df = pd.merge(test, gpt_answwer, on=['id', '질문'])

rouge_score = rouge.get_scores(rou_df['predict'], rou_df['GPT 답변'], avg=True)['rouge-l']
print(':::: EVAL_rouge_score - ', rouge_score)


# 33sentences cosine_score
sample_scores = []
# Iterate the merged frame: the original looped over an undefined `df`, which
# raised NameError before any score was computed.
for pred, gt in zip(rou_df['predict'], rou_df['GPT 답변']):
    # Convert the generated answer and the reference answer into embedding vectors
    pred_embed = Submission_model.encode(pred)
    gt_embed = Submission_model.encode(gt)
    sample_score = cosine_similarity(gt_embed, pred_embed)
    # Treat a cosine similarity below 0 as 0
    sample_score = max(sample_score, 0)
    sample_scores.append(sample_score)
print(':::: Cosine Similarity_mean score - ', np.mean(sample_scores))

rou_df['cos_score'] = sample_scores
rou_df.to_csv('Answer/' + start_time + '_answer33.csv', encoding='utf-8-sig')
f = datetime.datetime.now()
# Refresh the timestamp so the FINISH line shows the finish time rather than
# the time of the earlier submission save.
now_time = f.strftime("%m-%d  %H:%M:%S")
print(':::: FINISH  -', now_time, '  toal time : ', f - s)

### 2) 33개 추출 inference csv

In [None]:
# Run generation only on the questions in the reference-answer file and score
# them (ROUGE-L and embedding cosine similarity).
# NOTE(review): `Submission_model` is not created in this cell; it must already
# exist from the full-test-set cell above.

# TEST SET LOADING
start_time = datetime.datetime.now().strftime("%m-%d-%H")  # tag used in the output file name
now_time = datetime.datetime.now().strftime("%m-%d  %H:%M:%S")
s = datetime.datetime.now()  # start timestamp for the total-time report
print(':::: START -', now_time)
test = pd.read_csv('Data/dacon_llm_answer.csv')  # ,encoding='cp949'

# GENERATE
preds = []
for question_list in tqdm(test['질문']):
    # The system message (Korean) asks for a brief, summarized answer of about
    # three sentences; the question is sent as the user turn.
    conversation = [
        {'role': 'system', 'content': "요약해서 간략하게 3문장 내외로 대답합니다."},
        {'role': 'user', 'content': question_list},
    ]  # {'role': 'system', 'content': "It's a chatbot that only answers in Korean."},
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, use_cache=True, max_length=400)
    preds.append(outputs)


# DECODING
predict_out = []
for i in preds:
    full_text = tokenizer.decode(i[0])
    # Keep only the text from the '### Assistant:' marker onwards, then drop
    # the marker itself and the end-of-sequence token.
    # NOTE(review): str.find returns -1 when the marker is missing, in which
    # case this slice keeps just the last character of the decoded text.
    answer_start = full_text.find('### Assistant:')
    answer_only = full_text[answer_start:].strip().replace('### Assistant:\n', '').replace('</s>', '')
    predict_out.append(answer_only)


# Rouge eval
rouge = Rouge()
test['predict'] = predict_out
gpt_answwer = pd.read_csv('Data/dacon_llm_answer.csv')
# `test` was read from the same file as `gpt_answwer`, so merging the full
# frames duplicates every non-key column with _x/_y suffixes and the
# 'GPT 답변' lookup below raises KeyError. Bring only the keys and the
# predictions from `test`.
rou_df = pd.merge(test[['id', '질문', 'predict']], gpt_answwer, on=['id', '질문'])

rouge_score = rouge.get_scores(rou_df['predict'], rou_df['GPT 답변'], avg=True)['rouge-l']
print(':::: EVAL_rouge_score - ', rouge_score)


# 33sentences cosine_score
sample_scores = []
# Iterate the merged frame: the original looped over an undefined `df`, which
# raised NameError before any score was computed.
for pred, gt in zip(rou_df['predict'], rou_df['GPT 답변']):
    # Convert the generated answer and the reference answer into embedding vectors
    pred_embed = Submission_model.encode(pred)
    gt_embed = Submission_model.encode(gt)
    sample_score = cosine_similarity(gt_embed, pred_embed)
    # Treat a cosine similarity below 0 as 0
    sample_score = max(sample_score, 0)
    sample_scores.append(sample_score)
print(':::: Cosine Similarity_mean score - ', np.mean(sample_scores))

rou_df['cos_score'] = sample_scores
rou_df.to_csv('Answer/' + start_time + '_answer33.csv', encoding='utf-8-sig')
f = datetime.datetime.now()
# Refresh the timestamp so the FINISH line shows the finish time rather than
# repeating the START time.
now_time = f.strftime("%m-%d  %H:%M:%S")
print(':::: FINISH  -', now_time, '  toal time : ', f - s)

## 5. Submission

In [None]:
# Load a previously saved answer file and display its '0' column, which the
# next cell feeds to the embedding model.
data=pd.read_csv('24-02-20-16.csv')
data['0']

In [None]:
# Extract 512-dimensional embedding vectors from the answers to every question
# in the test set.
# The model used to extract embedding vectors for evaluation is
# 'distiluse-base-multilingual-cased-v1'; be sure to double-check this.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
Submission_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Extract an embedding vector from every generated answer.
# Pass a plain list instead of the pandas Series so that encoding does not
# depend on the Series' index labels.
pred_embeddings = Submission_model.encode(data['0'].tolist())
pred_embeddings.shape
submit = pd.read_csv('./sample_submission.csv')
# Insert the embedding vectors into the submission template
# (sample_submission.csv); every column after the first is overwritten
submit.iloc[:,1:] = pred_embeddings
submit.head()
# Write the csv file for the leaderboard submission
submit.to_csv('./24-02-21-10_submission.csv', index=False)

===
===
===
======

# Scratch generation with a system message only (no user turn). The Korean
# system text says: a friendly consultant with rich wallpapering knowledge who
# always answers only in Korean.
conversation = [
    {
        'role': 'system',
        'content': '친절하고 도배상식이 풍부한 상담원입니다. 언제나 한국어로만 대답합니다.',
    },
]

prompt = tokenizer.apply_chat_template(
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize on the model's device, generate, and print the decoded first sequence.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, use_cache=True, max_length=4096)
output_text = tokenizer.decode(outputs[0])
print(output_text)


for _, row in tqdm(data.iterrows()):
    input_text = row['question'] + tokenizer.eos_token + row['answer']
    input_ids = tokenizer.encode(input_text, return_tensors='pt')