# 1. 단어 사전 기반 매핑 복원

## 1) Import 

In [2]:
import pandas as pd

## 2) Data Load

In [3]:
train = pd.read_csv('/kaggle/input/daycon-review2/train.csv', encoding = 'utf-8-sig')
test = pd.read_csv('/kaggle/input/daycon-review2/test.csv', encoding = 'utf-8-sig')

## 3) 단어 사전 생성

In [4]:
match_dict = {}

for input_text, output_text in zip(train['input'], train['output']):
    input_words = input_text.split()
    output_words = output_text.split()  # input과 output 컬럼을 순차적으로 가져와서 단어 단위로 분리
    for iw, ow in zip(input_words, output_words):
        match_dict[iw] = ow  

## 4) 변환 적용

In [5]:
def replace_words(input_text, match_dict):
    words = input_text.split() 
    replaced_words = [match_dict.get(word, word) for word in words] 
    return " ".join(replaced_words)

In [6]:
converted_reviews = test['input'].apply(lambda x: replace_words(x, match_dict)).tolist()

#변환된 리뷰를 리스트로 저장

## 5) Submission

In [7]:
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [8]:
print(len(submission))  # sample_submission.csv의 행 개수
print(len(converted_reviews))  # 변환된 리뷰 개수

1689
1689


In [9]:
submission['output'] = converted_reviews

In [10]:
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

# 2. LLM 활용 (Gemma)

## 1) Import 

In [11]:
import pandas as pd 
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## 2) Data Load

In [12]:
train = pd.read_csv('/kaggle/input/daycon-review2/train.csv', encoding = 'utf-8-sig')
test = pd.read_csv('/kaggle/input/daycon-review2/test.csv', encoding = 'utf-8-sig')

In [13]:
samples = []

for i in range(10):
    sample = f"input : {train['input'][i]} \n output : {train['output'][i]}"
    samples.append(sample)

## 3) Model Load

In [20]:
!pip install transformers==4.36.2
!pip install bitsandbytes==0.41.1
!pip install torch==2.1.2


Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0

In [35]:
pip install -U bitsandbytes


Note: you may need to restart the kernel to use updated packages.


In [36]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
    
    
)
model_id = 'beomi/gemma-ko-7b'
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

#모델과 토크나이저를 로드한 후, pad_token을 eos_token으로 설정

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

## 4) Inference

In [38]:
from datasets import Dataset


In [39]:
# pandas DataFrame을 datasets.Dataset으로 변환
dataset = Dataset.from_pandas(test)


In [42]:
pipe = pipeline(
    task="text-generation",
    model=model, 
    tokenizer=tokenizer 
)

Device set to use cuda:0


In [43]:
restored_reviews = []

In [44]:
for index, row in test.iterrows():
    query = row['input']  
    
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
                "Your task is to transform the given obfuscated Korean review into a clear, correct, "
                "and natural-sounding Korean review that reflects its original meaning. "
                "Below are examples of obfuscated Korean reviews and their restored forms:\n\n"
                f"Example, {samples}"  
                "Spacing and word length in the output must be restored to the same as in the input. "
                "Do not provide any description. Print only in Korean."
            )
        },
        {
            "role": "user",
            "content": f"input : {query}, output : "
        },
    ]

In [45]:
prompt = "\n".join([m["content"] for m in messages]).strip()

In [46]:
outputs = pipe(
        prompt,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        max_new_tokens=len(query),
        eos_token_id=pipe.tokenizer.eos_token_id
    )

In [48]:
generated_text = outputs[0]['generated_text']
result = generated_text[len(prompt):].strip()
        

restored_reviews.append(result)

## 5) Submission

In [51]:
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [54]:
# 부족한 길이만큼 빈 값 추가
restored_reviews += [''] * (len(submission) - len(restored_reviews))


In [55]:
print(len(restored_reviews))  # restored_reviews의 길이 확인
print(len(submission))  # submission의 행 수 확인


1689
1689


In [56]:
submission['output'] = restored_reviews

In [59]:
submission.to_csv('/kaggle/working/baseline_submission.csv', index = False, encoding = 'utf-8-sig')

In [66]:
# CSV 파일 불러오기
file_path = '/kaggle/working/baseline_submission.csv'
df = pd.read_csv(file_path)

# 컬럼명 확인
print(df.columns)

Index(['ID', 'output'], dtype='object')


In [72]:
# CSV 파일 불러오기
file_path = '/kaggle/working/baseline_submission.csv'
df = pd.read_csv(file_path)

# 'restored_reviews' 컬럼에서 NaN이나 None을 빈 문자열로 처리
df['output'] = df['output'].fillna('')  # NaN을 빈 문자열로 대체

In [74]:
# 'restored_reviews'에 대해 문자열로 변환하여 처리
df['output'] = df['output'].apply(lambda x: str(x) if isinstance(x, str) else '')

In [75]:
# ground_truth와 restored_reviews 컬럼을 가져오기
#ground_truth = df['ground_truth'].tolist()  # 실제 텍스트가 포함된 컬럼명
restored_reviews = df['output'].tolist()  # 복원된 텍스트가 포함된 컬럼명

In [77]:
# 단어 수준으로 비교하기 위한 함수
def word_level_metrics(ground_truth, restored_reviews):
    ground_truth_words = [set(review.split()) for review in ground_truth]
    restored_words = [set(review.split()) for review in restored_reviews]
    
    precision_list = []
    recall_list = []
    f1_list = []
    
    for true, pred in zip(ground_truth_words, restored_words):
        common_words = true & pred
        precision = len(common_words) / len(pred) if len(pred) > 0 else 0
        recall = len(common_words) / len(true) if len(true) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    precision = sum(precision_list) / len(precision_list)
    recall = sum(recall_list) / len(recall_list)
    f1 = sum(f1_list) / len(f1_list)
    
    return precision, recall, f1

In [78]:
# 성능 지표 계산
precision, recall, f1 = word_level_metrics(ground_truth, restored_reviews)

# 성능 지표 출력
print(f"정밀도: {precision:.4f}")
print(f"재현율: {recall:.4f}")
print(f"F1 점수: {f1:.4f}")



정밀도: 0.0304
재현율: 0.5000
F1 점수: 0.0574
