# 1. 단어 사전 기반 매핑 복원

## 1) Import 

In [4]:
import pandas as pd

## 2) Data Load

In [5]:
# Read the competition train and test CSVs (both 'utf-8-sig' encoded) in one pass.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (
        '/kaggle/input/daycon-review2/train.csv',
        '/kaggle/input/daycon-review2/test.csv',
    )
)

## 3) 단어 사전 생성

In [6]:
# Lookup table: obfuscated word -> restored word, learned from the training pairs.
match_dict = {}

for obfuscated, restored in zip(train['input'], train['output']):
    # Split both sentences on whitespace and pair the words by position.
    # zip() stops at the shorter sentence, and a later pair overwrites an
    # earlier entry that has the same obfuscated word.
    match_dict.update(zip(obfuscated.split(), restored.split()))

## 4) 변환 적용

In [7]:
def replace_words(input_text, match_dict):
    """Restore a sentence by looking up each whitespace-separated word.

    Words present in ``match_dict`` are replaced by their mapped value; all
    other words are kept unchanged. The words are re-joined with single spaces.
    """
    restored = []
    for token in input_text.split():
        if token in match_dict:
            restored.append(match_dict[token])
        else:
            restored.append(token)
    return " ".join(restored)

In [8]:
# Run the dictionary replacement over every test review and keep the results as a list.
converted_reviews = [replace_words(review, match_dict) for review in test['input']]

## 5) Submission

In [9]:
# Load the sample submission file; its 'output' column is overwritten in a later cell.
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [10]:
# Print the row count of the sample submission, then the number of converted
# reviews, so the two can be compared by eye before assignment.
for row_count in (len(submission), len(converted_reviews)):
    print(row_count)

1689
1689


In [11]:
# Put the dictionary-restored reviews into the submission's 'output' column.
submission['output'] = converted_reviews

In [12]:
# Write the submission to the working directory, without the index column, as 'utf-8-sig'.
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

# 2. LLM 활용 (Gemma)

## 1) Import 

In [13]:
import pandas as pd 
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## 2) Data Load

In [14]:
# Re-read the train and test CSVs for the LLM section.
train_csv = '/kaggle/input/daycon-review2/train.csv'
test_csv = '/kaggle/input/daycon-review2/test.csv'

train = pd.read_csv(train_csv, encoding='utf-8-sig')
test = pd.read_csv(test_csv, encoding='utf-8-sig')

In [15]:
# Few-shot examples: the first 10 training pairs, each formatted as one
# "input : ... / output : ..." string.
samples = [
    f"input : {train['input'][i]} \n output : {train['output'][i]}"
    for i in range(10)
]

## 3) Model Load

In [16]:
# Install or upgrade bitsandbytes. No version is pinned, so the installed
# release depends on when this cell runs.
!pip install -U bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


In [25]:
!pip install pandas 
!pip install -q U peft==0.8.2
!pip install -q U trl==0.7.10
!pip install -q U accelerate==0.27.1
!pip install -q U datasets==2.17.0
!pip install -U bitsandbytes
!pip install --upgrade transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.4/116.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages

In [26]:
# Upgrade trl to the newest available release.
# NOTE(review): the preceding install cell requests trl==0.7.10; this upgrade
# replaces that version (and pulls newer accelerate/datasets) — confirm which
# versions are actually intended.
!pip install --upgrade trl

Collecting trl
  Downloading trl-0.15.0-py3-none-any.whl.metadata (11 kB)
Collecting accelerate>=0.34.0 (from trl)
  Downloading accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Downloading trl-0.15.0-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.3/318.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading accelerate-1.3.0-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets, accelerate, trl
  Attempting uninstall: datasets
    Found existing installation: datasets 2.17.0
    Uninstalling datasets-2.17.0

In [27]:
import pandas as pd
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import Dataset
from peft import LoraConfig, get_peft_model

from transformers import TrainingArguments
from trl import SFTTrainer
from peft import PeftModel

In [28]:
# 4-bit NF4 quantization settings (bitsandbytes) with double quantization and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_id = 'beomi/gemma-ko-7b'
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Apply the quantization config; without this argument the config above
    # is ignored and the full-precision weights are loaded.
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    # NOTE(review): trust_remote_code executes code shipped with the model
    # repository — confirm it is actually required for this checkpoint.
    trust_remote_code=True,
)

# Load the tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Reload the data from the Kaggle input dataset, using the same location and
# encoding as the other data-loading cells in this notebook (the relative
# './train.csv' / './test.csv' paths are not where the dataset is mounted).
train = pd.read_csv('/kaggle/input/daycon-review2/train.csv', encoding='utf-8-sig')
test = pd.read_csv('/kaggle/input/daycon-review2/test.csv', encoding='utf-8-sig')

## 4) Inference

In [18]:
from datasets import Dataset


In [19]:
# Convert the pandas test DataFrame into a datasets.Dataset.
dataset = Dataset.from_pandas(test)


In [20]:
# Text-generation pipeline built on the model and tokenizer loaded above.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [29]:
restored_reviews = []

# Trial run: generate restorations for the first 10 test rows only.
for _, row in test.iloc[:10].iterrows():
    query = row['input']
    prompt = (
        "<start_of_turn> Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning.\n"
        f"Input: {query}\n"
        "<end_of_turn>\n"
        "<start_of_turn>Assistant:\n"
        "Output:"
    )

    # Sample one continuation; the new-token budget equals the character
    # length of the input review.
    generation = pipe(
        prompt,
        num_return_sequences=1,
        temperature=0.2,
        top_p=0.9,
        max_new_tokens=len(query),
        do_sample=True,
        eos_token_id=pipe.tokenizer.eos_token_id
    )
    full_text = generation[0]['generated_text']

    # Keep what follows the first "Output:" marker; if the marker is missing,
    # keep the whole generated text.
    marker = "Output:"
    marker_pos = full_text.find(marker)
    if marker_pos == -1:
        restored = full_text.strip()
    else:
        restored = full_text[marker_pos + len(marker):].strip()

    restored_reviews.append(restored)
    print(restored)

KeyboardInterrupt: 

In [21]:
# Convert the test data into a datasets.Dataset.
dataset = Dataset.from_pandas(test)

In [22]:
# Prompt construction function.
def create_prompt(example, examples=None):
    """Build the few-shot restoration prompt for one dataset row.

    Args:
        example: Mapping with an 'input' key holding the obfuscated review.
        examples: Optional list of few-shot example strings. When omitted,
            the notebook-level ``samples`` list is used.

    Returns:
        The instruction text, the few-shot examples as plain text, and the
        query, ending with "output : " so the model continues from there.
    """
    if examples is None:
        examples = samples
    query = example['input']
    # Join the examples as plain text separated by blank lines. Formatting the
    # list object directly would put its Python repr (brackets, quotes and
    # escaped newlines) into the prompt.
    examples_text = "\n\n".join(examples)
    system_prompt = (
        "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
        "Your task is to transform the given obfuscated Korean review into a clear, correct, "
        "and natural-sounding Korean review that reflects its original meaning. "
        "Below are examples of obfuscated Korean reviews and their restored forms:\n\n"
        f"Example, {examples_text}\n"
        "Spacing and word length in the output must be restored to the same as in the input. "
        "Do not provide any description. Print only in Korean."
    )
    return f"{system_prompt}\n\ninput : {query}, output : "


In [23]:
def _attach_prompt(example):
    """Return the 'prompt' column value for one dataset row."""
    return {"prompt": create_prompt(example)}


# Add a 'prompt' column holding the prompt built for every input row.
dataset = dataset.map(_attach_prompt)


Map:   0%|          | 0/1689 [00:00<?, ? examples/s]

In [None]:
from transformers import pipeline

# Reuse the model and tokenizer that are already loaded. Passing the model id
# string here would load a second copy of the 7B checkpoint and drop the
# tokenizer settings (pad token, padding side) configured earlier.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


In [None]:
# Generate for every prompt in the dataset with sampling enabled.
generation_kwargs = {
    "do_sample": True,
    "temperature": 0.2,
    "top_p": 0.9,
    "max_new_tokens": 100,  # adjust as needed
    "eos_token_id": tokenizer.eos_token_id,
}
outputs = pipe(dataset["prompt"], **generation_kwargs)


In [None]:
# Keep only the text after the last "output : " marker of each generation.
restored_reviews = []
for output in outputs:
    generated_text = output[0]['generated_text']
    restored_reviews.append(generated_text.rpartition("output : ")[2].strip())

# Load the sample submission, fill in the restored reviews and write it out.
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding='utf-8-sig')
submission['output'] = restored_reviews
submission.to_csv('/kaggle/working/baseline_submission.csv', index=False, encoding='utf-8-sig')

print("파일 저장 완료!")

## 5) Submission

In [None]:
# Load the sample submission file again for the LLM results.
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [None]:
# Pad the list in place with empty strings until it has one entry per
# submission row (nothing is added when it is already long enough).
missing_count = len(submission) - len(restored_reviews)
restored_reviews.extend('' for _ in range(missing_count))


In [None]:
# Print the number of restored reviews, then the number of submission rows.
for item_count in (len(restored_reviews), len(submission)):
    print(item_count)


In [None]:
# Put the restored reviews into the submission's 'output' column.
submission['output'] = restored_reviews

In [None]:
# Write the LLM submission to the Kaggle working directory, without the index, as 'utf-8-sig'.
submission.to_csv('/kaggle/working/baseline_submission2.csv', index = False, encoding = 'utf-8-sig')

In [None]:
# Load the saved submission CSV.
file_path = '/kaggle/working/baseline_submission2.csv'
df = pd.read_csv(file_path)

# Check the column names.
print(df.columns)

In [None]:
# Load the saved submission CSV.
file_path = '/kaggle/working/baseline_submission2.csv'
df = pd.read_csv(file_path)

# Replace missing values (NaN/None) in the 'output' column with empty strings.
df['output'] = df['output'].fillna('')  # NaN -> ''

In [None]:
# Keep 'output' values that are strings; replace every other value with ''.
df['output'] = [str(value) if isinstance(value, str) else '' for value in df['output']]

In [None]:
# Show the column names of the loaded submission frame.
print(df.columns)


In [None]:
# Pull the two text columns out of df as Python lists for the metric function.
# NOTE(review): df is read from the saved submission CSV; confirm that file
# really has an 'input' column and that it holds reference (ground-truth) text.
ground_truth = df['input'].tolist()  # column used as the reference text
restored_reviews = df['output'].tolist()  # column holding the restored text

In [None]:
# Word-level comparison function.
def word_level_metrics(ground_truth, restored_reviews):
    """Compute macro-averaged word-set precision, recall and F1.

    Each review is split on whitespace into a set of unique words. Reference
    and restored reviews are paired by position; if the two lists differ in
    length, the extra items of the longer list are ignored.

    Args:
        ground_truth: List of reference review strings.
        restored_reviews: List of restored review strings.

    Returns:
        Tuple ``(precision, recall, f1)``, each averaged over the review
        pairs. Returns ``(0.0, 0.0, 0.0)`` when there are no pairs.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for true_text, pred_text in zip(ground_truth, restored_reviews):
        true = set(true_text.split())
        pred = set(pred_text.split())
        common_words = true & pred

        # A score whose denominator would be zero is defined as 0.
        precision = len(common_words) / len(pred) if len(pred) > 0 else 0
        recall = len(common_words) / len(true) if len(true) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    # Empty input: return zeros instead of dividing by a zero pair count.
    pair_count = len(f1_list)
    if pair_count == 0:
        return 0.0, 0.0, 0.0

    precision = sum(precision_list) / pair_count
    recall = sum(recall_list) / pair_count
    f1 = sum(f1_list) / pair_count

    return precision, recall, f1

In [None]:
# Compute the metrics. word_level_metrics takes the reference texts first and
# the restored texts second; both arguments are required.
precision, recall, f1 = word_level_metrics(ground_truth, restored_reviews)

# Print the metrics.
print(f"정밀도: {precision:.4f}")
print(f"재현율: {recall:.4f}")
print(f"F1 점수: {f1:.4f}")

