In [1]:
import os
import json
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
from tqdm import tqdm

import torch
from transformers import (
    Trainer,
    TrainingArguments,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training, 
    LoraConfig, 
    TaskType,
    get_peft_model,
    PeftModelForCausalLM,
)

from trl import DataCollatorForCompletionOnlyLM, SFTTrainer, SFTConfig
from datasets import load_dataset, Dataset, DatasetDict
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


# 모델 정의

In [4]:
repo = "beomi/OPEN-SOLAR-KO-10.7B"

accelerater = Accelerator()

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype="auto",
        attn_implementation="eager",
        quantization_config=quantization_config
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    task_type=TaskType.CAUSAL_LM
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
tokenizer = AutoTokenizer.from_pretrained(repo)
model, tokenizer = accelerater.prepare(model, tokenizer)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|██████████| 8/8 [59:27<00:00, 445.95s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [00:42<00:00,  5.33s/it]


# 데이터셋 정의

In [23]:
train_dataset = load_dataset("csv", data_files="/home/jovyan/work/prj_data/open/train.csv")

def get_template(context, question, answer):
    return [
        {
            "role": "system",
            "content": "너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. Context에서 Answer가 될 수 있는 부분을 찾아서 그대로 적어줘. 단, Answer는 주관식이 아니라 단답형으로 적어야 해"
        },
        {
            "role": "user",
            "content": f"Context: {context}\nQuestion: {question}"
        },
        {
            "role": "assistant",
            "content": answer
        }
    ]
print(tokenizer.apply_chat_template(get_template('d', 'e', 'a'), tokenize=False))

INSTRUCTION_TEMPLATE = """
너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. Context에서 Answer가 될 수 있는 부분을 찾아서 그대로 적어줘. 단, Answer는 주관식이 아니라 단답형으로 적어야 해.

Context: {context}
Question: {question}"""

RESPONSE_TEMPLATE = "{answer}"

class QADataCollator(DataCollatorForLanguageModeling):
    def __init__(self, tokenizer, inst, resp, mlm=False):
        super().__init__(tokenizer=tokenizer, mlm=mlm)
    
    def __call__(self, examples):
        batch = []
        for example in examples:
            context = example['context']
            question = example['question']
            answer = example['answer']
            
            instruction = INSTRUCTION_TEMPLATE.format(context=context, question=question)
            response = RESPONSE_TEMPLATE.format(answer=answer)
            
            prompt = self.tokenizer.apply_chat_template([
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": response}
            ], tokenize=False)
            
            encoded = self.tokenizer.encode(prompt, truncation=True, max_length=512)
            batch.append(encoded)
        
        return self.tokenizer.pad(
            {"input_ids": batch},
            padding=True,
            return_tensors="pt",
        )
    
data_collator = QADataCollator(tokenizer=tokenizer,
                              inst = INSTRUCTION_TEMPLATE,
                              resp = RESPONSE_TEMPLATE)

<s>[INST] <<SYS>>
너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. Context에서 Answer가 될 수 있는 부분을 찾아서 그대로 적어줘. 단, Answer는 주관식이 아니라 단답형으로 적어야 해
<</SYS>>

Context: d
Question: e [/INST] a </s>


# 학습

In [6]:
import wandb
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingfac

True

In [26]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

args = SFTConfig(
    output_dir='my_model',
    eval_strategy='epoch',
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    warmup_ratio=0.1,
    do_train=True,
    do_eval=True,
    logging_strategy='steps',
    logging_dir='logs',
    logging_steps=1,
    save_steps=0.2,
    report_to="wandb",
    max_seq_length=4096
)

def formatting_func(example):
    return f"Context: {example['context']}\nQuestion: {example['question']}\nAnswer: {example['answer']}"

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    peft_config=lora_config,
    formatting_func=formatting_func
)

trainer.train(resume_from_checkpoint=False)

Map:   0%|          | 0/33716 [00:00<?, ? examples/s]


KeyError: None

---
---
---
---
---

In [2]:
TEST_fOLDER = '/home/jovyan/work/prj_data/open/test.csv'
MODEL = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
CHECK_POINT = "/home/jovyan/work/ai_chat_qa_task/code/huggingface/llama3/checkpoint-16858"
OUTPUT = "ensemble"

csv = pd.read_csv(TEST_fOLDER)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype="auto",
    attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = PeftModelForCausalLM.from_pretrained(base_model, CHECK_POINT)

accelerater = Accelerator()
model, tokenizer = accelerater.prepare(model, tokenizer)

model.merge_and_unload()
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, eos_token_id=tokenizer.eos_token_id, device=0)
print(pipe.device)

def get_prompt(data, guide):
    return f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. '{guide}'이 부분들을 참고해서 Answer를 간결하게 문장이 아니라 정확한 짧은 표현으로 말해줘.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {data['context']}
Question: {data['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

submission_dict = {}
guided = ["/home/jovyan/work/ai_chat_qa_task/code/submit/test_91.csv",
         "/home/jovyan/work/ai_chat_qa_task/code/submit/roBERT_0712.csv"
         ]
guidance = []
for i in range(len(guided)):
    guidance.append(pd.read_csv(guided[i]))
i = -1
for i, data in tqdm(csv.iterrows()):
    i += 1
    if i > 10:
        break
    refer = []
    for j in range(len(guided)):
        temp = guidance[j]
        guide = temp[temp['id'] == data['id']]['answer'].values[0]
        refer.append(guide)
    print(refer)
    prompt = get_prompt(data, refer)
    generated = pipe(prompt, num_return_sequences=1, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)[0]['generated_text']
    generated = generated[len(prompt):]
    submission_dict[data['id']] = generated
    print(f"ID: {data['id']} Question: {data['question']} Generated answer: {generated}")
    
df = pd.DataFrame(list(submission_dict.items()), columns=['id', 'answer'])
df.to_csv(f'{OUTPUT}.csv', index=False)

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  9.46it/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapane

cuda:0


0it [00:00, ?it/s]

['제주특별자치도 경제통상진흥원', '제주특별자치도 경제통상진흥원']


1it [00:03,  3.19s/it]

ID: TEST_0000 Question: 어떤 기관이 지역 중소기업과 소상공인들에게 큰 힘이 되는 통상진흥원인가요 Generated answer: 제주특별자치도 경제통상진흥원
['중국과 일본', '중국과 일본']


2it [00:05,  2.69s/it]

ID: TEST_0001 Question: 제주 경제통상진흥원이 어떤 해외 시장을 우선적으로 공략하고 있나요 Generated answer: 중국과 일본
['2015년 8월', '2015년 8월']


3it [00:07,  2.58s/it]

ID: TEST_0002 Question: 상해대표처는 어느 시점에 설립되었습니까 Generated answer: 2015년 8월
['도쿄 신주쿠', '도쿄 신주쿠']


4it [00:10,  2.53s/it]

ID: TEST_0003 Question: 동경통상대표부는 어느 지역에 설립되었습니까 Generated answer: 도쿄 신주쿠
['31억7200만원', '31억7200만원']


5it [00:12,  2.48s/it]

ID: TEST_0004 Question: 제주경제통상진흥원이 지난해에 도내에서 생산된 제품에 대한 일본 수출에 얼마를 지원했나요 Generated answer: 31억7200만원
['풋살경기장', '풋살경기장']


6it [00:14,  2.37s/it]

ID: TEST_0005 Question: 서경대 실용음악과 보컬전공의 대입 시험은 어디에서 진행되었나 Generated answer: 풋살경기장
['코로나19 예방', '신종 코로나바이러스 감염증 ( 코로나19 ) 를 예방']


7it [00:17,  2.30s/it]

ID: TEST_0006 Question: 천막 고사장으로 변경한 이유는 무엇인가 Generated answer: 코로나19 예방
['1710명', '1710명']


8it [00:19,  2.22s/it]

ID: TEST_0007 Question: 2021학년도 서경대 실용음악과 보컬 전공 수시에 몇 명이 지원했나 Generated answer: 1710명
['25명', '25명 이내']


9it [00:21,  2.18s/it]

ID: TEST_0008 Question: 하나의 시험 당 약 몇 명이 참가했나 Generated answer: 25명 이내
['거리두기를 위해 시험장소를 야외나 대규모 시설로 옮겼다', "영상 업로드'방식"]


10it [00:24,  2.42s/it]

ID: TEST_0009 Question: 기존과 같은 방식으로 대면 면접·실기시험을 치르는 학교들은 어떤 방식을 채택했나 Generated answer: 거리두기를 위해 시험장소를 야외나 대규모 시설로 옮겼다





In [3]:
print(prompt)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. '['제주특별자치도 경제통상진흥원', '제주특별자치도 경제통상진흥원']'를 참고해서 Answer를 간결하게 문장이 아니라 정확한 짧은 표현으로 말해줘.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: 신종 코로나바이러스 감염증(코로나19) 확산 속에 제주특별자치도 경제통상진흥원(원장 문관영)이 추진하는 해외 시장 진출 지원 사업이 지역 중소기업과 소상공인들에게 큰 힘이 되고 있다. 

제주경제통상진흥원은 우수한 제품을 생산하고도 판로 개척에 어려움을 겪는 도내 유망 중소기업을 위해 해외통상사무소(상해대표처, 동경통상대표부)를 통해 중국과 일본을 중심으로 온라인 판매, 비대면 마케팅 지원 사업을 꾸준히 펼치고 있다. 

제주경제통상진흥원 해외통상사무소는 신규 바이어 발굴, 지역 업체와 바이어 간 매칭 및 미팅 주선, 제주 상품 통관 및 수출 관련 절차 지원, 수출 성사 및 실적 관리 등 제주지역 수출업체의 해외지사 역할을 수행하면서 호응을 얻고 있다. 

‘알리바바’ 등 현지 온라인몰에서 제주관을 운영하고 대형 쇼핑몰에 제주 상품 전용매대를 설치ㆍ운영하고 인플우언서 마케팅, 온ㆍ오프라인 판촉 이벤트도 활발히 펼치면서 제주 기업의 자생력을 키우고 있다. 

경제통상진흥원은 현지 대형 쇼핑몰에 제주 상품 전용 매대를 설치ㆍ운영하며 제주를 홍보하는 데도 앞장 서고 있다. 

중국 상해대표처=중국 상하이시 장녕구에 있는 상해대표처(소장 정명구)는 2015년 8월 설립 이후 제주 상품의 중국 진출과 기업 활동 지원 업무를 맡고 있다. 

지난해에는 ‘위챗’을 통해 40개사 285품목, ‘왕홍’ 생방송을 통해 3개사 5품목을 홍보하고 광명마트, 세븐일레븐, 올레마트 등 오프라인 178개 매장에서 7개사 16품목을 입점 지원