<a href="https://colab.research.google.com/github/kiyuyeon/dacon_LLm/blob/master/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 단어 사전 기반 매핑 복원

## 1) Import

In [None]:
import pandas as pd

## 2) Data Load

In [None]:
# Read the training and test CSVs with the same encoding in a single pass.
train, test = (
    pd.read_csv(csv_path, encoding = 'utf-8-sig')
    for csv_path in ('./train.csv', './test.csv')
)

## 3) 단어 사전 생성

In [None]:
# Word-level lookup table built from the training pairs: the i-th whitespace-separated
# token of each 'input' is mapped to the i-th token of the matching 'output'.
match_dict = {}

for obfuscated_sentence, restored_sentence in zip(train['input'], train['output']):
    # zip() stops at the shorter token list; a token seen again later replaces
    # the mapping recorded earlier.
    match_dict.update(zip(obfuscated_sentence.split(), restored_sentence.split()))

## 4) 변환 적용

In [None]:
def replace_words(input_text, match_dict):
    """Replace every whitespace-separated token of ``input_text`` via ``match_dict``.

    Tokens that are not keys of ``match_dict`` are kept unchanged. The result is
    re-joined with single spaces, so runs of whitespace in the input collapse.
    """
    restored_tokens = []
    for token in input_text.split():
        restored_tokens.append(match_dict.get(token, token))
    return " ".join(restored_tokens)

In [None]:
# Apply the dictionary replacement to each test review, in row order.
converted_reviews = [replace_words(review, match_dict) for review in test['input']]

## 5) Submission

In [None]:
# Read the sample submission file; its 'output' column is filled in a later cell.
submission = pd.read_csv('./sample_submission.csv', encoding = 'utf-8-sig')

In [None]:
# Store the dictionary-restored reviews in the 'output' column.
submission['output'] = converted_reviews

In [None]:
# Write the submission to CSV without the index column, using the same encoding as the inputs.
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

# 2. LLM 활용 (Gemma)

## 1) Import

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## 2) Data Load

In [None]:
# Read the training and test CSVs with the same encoding in a single pass.
train, test = (
    pd.read_csv(csv_path, encoding = 'utf-8-sig')
    for csv_path in ('./train.csv', './test.csv')
)

In [None]:
# Format the training rows labelled 0..9 as "input / output" example strings.
samples = [
    f"input : {train['input'][i]} \n output : {train['output'][i]}"
    for i in range(10)
]

## 3) Model Load

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Checkpoint used for both the model and its tokenizer.
model_id = 'beomi/gemma-ko-7b'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

# Load the quantized model; the device map assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Load the tokenizer, reuse EOS as the padding token, and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

## 4) Inference

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer
)

# The instruction text is the same for every review, so it is built once
# instead of on every loop iteration.
system_prompt = (
    "Obfuscated Korean Review Restoration is a technique where a given obfuscated Korean sentence is restored to its original, clear, and natural form. "
    "The goal is to make the sentence easier to understand by reversing any unnecessary alterations or distortions. "
    "Obfuscated Korean relies on the fact that some words or letters might have been altered in ways that don’t affect the overall meaning. "
    "Given the following obfuscated Korean review, identify the errors in the altered text and restore it to its natural form.\n\n"
    "Example:\n"
    "Original: '이 프로그램은 매우 유용하고 효율적입니다.'\n"
    "Obfuscated: '이 프괒그램은 매뉘 유용하고 효힛적입니다.'"
)

restored_reviews = []

# Only the 'input' column is needed, so iterate over it directly.
for query in test['input']:
    # Plain-text prompt: instruction block, newline, then the review to restore.
    # strip() drops the trailing space, so the prompt ends with "output :".
    prompt = "\n".join([system_prompt, f"input : {query}, output : "]).strip()

    outputs = pipe(
        prompt,
        # Greedy decoding. temperature / top_p are sampling-only settings and are
        # not passed, because they contradict do_sample=False.
        do_sample=False,
        # Budget: the review's character count, never more than 50 new tokens.
        max_new_tokens=min(len(query), 50),
        eos_token_id=pipe.tokenizer.eos_token_id,
        # Return only the continuation. Searching the full text for "output : "
        # is unreliable because the stripped prompt ends with "output :" (no
        # trailing space), so the separator may never appear and the whole
        # prompt would leak into the result.
        return_full_text=False,
    )

    continuation = outputs[0]['generated_text']
    # Keep the first generated line only, so any extra "input : ... output : ..."
    # pairs the model goes on to produce are discarded.
    result = continuation.strip().split("\n", 1)[0].strip()

    restored_reviews.append(result)


Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [None]:
# The instruction text is the same for every review, so it is built once
# instead of on every loop iteration.
system_prompt = (
    "Obfuscated Korean Review Restoration is a technique where a given obfuscated Korean sentence is restored to its original, clear, and natural form. "
    "The goal is to make the sentence easier to understand by reversing any unnecessary alterations or distortions. "
    "Obfuscated Korean relies on the fact that some words or letters might have been altered in ways that don’t affect the overall meaning. "
    "Given the following obfuscated Korean review, identify the errors in the altered text and restore it to its natural form.\n\n"
    "Example:\n"
    "Original: '이 프로그램은 매우 유용하고 효율적입니다.'\n"
    "Obfuscated: '이 프괒그램은 매뉘 유용하고 효힛적입니다.'"
)

restored_reviews = []

# Only the 'input' column is needed, so iterate over it directly.
for query in test['input']:
    # Plain-text prompt: instruction block, newline, then the review to restore.
    # strip() drops the trailing space, so the prompt ends with "output :".
    prompt = "\n".join([system_prompt, f"input : {query}, output : "]).strip()

    outputs = pipe(
        prompt,
        # Greedy decoding. temperature / top_p are sampling-only settings and are
        # not passed, because they contradict do_sample=False.
        do_sample=False,
        # Budget: the review's character count, never more than 50 new tokens.
        max_new_tokens=min(len(query), 50),
        eos_token_id=pipe.tokenizer.eos_token_id,
        # Return only the continuation. Searching the full text for "output : "
        # is unreliable because the stripped prompt ends with "output :" (no
        # trailing space), so the separator may never appear and the whole
        # prompt would leak into the result.
        return_full_text=False,
    )

    continuation = outputs[0]['generated_text']
    # Keep the first generated line only, so any extra "input : ... output : ..."
    # pairs the model goes on to produce are discarded.
    result = continuation.strip().split("\n", 1)[0].strip()

    restored_reviews.append(result)

# Print the restored reviews, one per line.
for review in restored_reviews:
    print(review)



KeyboardInterrupt: 

## 5) Submission

In [None]:
# Read the sample submission file again for the LLM-based results.
submission = pd.read_csv('./sample_submission.csv', encoding = 'utf-8-sig')

In [None]:
# Put the LLM restorations into the submission before writing it; without this
# assignment the file would contain the unmodified sample submission.
# pandas raises ValueError here if inference did not produce one result per row.
submission['output'] = restored_reviews

# NOTE: this is the same path the dictionary-based section writes to, so that
# earlier file is overwritten.
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')