# 1. 단어 사전 기반 매핑 복원

## 1) Import 

In [1]:
import pandas as pd

## 2) Data Load

In [2]:
train = pd.read_csv('/kaggle/input/daycon-review2/train.csv', encoding = 'utf-8-sig')
test = pd.read_csv('/kaggle/input/daycon-review2/test.csv', encoding = 'utf-8-sig')

## 3) 단어 사전 생성

In [3]:
match_dict = {}

for input_text, output_text in zip(train['input'], train['output']):
    input_words = input_text.split()
    output_words = output_text.split()  # input과 output 컬럼을 순차적으로 가져와서 단어 단위로 분리
    for iw, ow in zip(input_words, output_words):
        match_dict[iw] = ow  

## 4) 변환 적용

In [4]:
def replace_words(input_text, match_dict):
    words = input_text.split() 
    replaced_words = [match_dict.get(word, word) for word in words] 
    return " ".join(replaced_words)

In [5]:
converted_reviews = test['input'].apply(lambda x: replace_words(x, match_dict)).tolist()

#변환된 리뷰를 리스트로 저장

## 5) Submission

In [6]:
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [7]:
print(len(submission))  # sample_submission.csv의 행 개수
print(len(converted_reviews))  # 변환된 리뷰 개수

1689
1689


In [8]:
submission['output'] = converted_reviews

In [9]:
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

# 2. LLM 활용 (Gemma)

## 1) Import 

In [10]:
import pandas as pd 
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## 2) Data Load

In [11]:
train = pd.read_csv('/kaggle/input/daycon-review2/train.csv', encoding = 'utf-8-sig')
test = pd.read_csv('/kaggle/input/daycon-review2/test.csv', encoding = 'utf-8-sig')

In [12]:
samples = []

for i in range(10):
    sample = f"input : {train['input'][i]} \n output : {train['output'][i]}"
    samples.append(sample)

## 3) Model Load

In [18]:
!pip install -U bitsandbytes




In [19]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
    
    
)
model_id = 'beomi/gemma-ko-7b'
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

#모델과 토크나이저를 로드한 후, pad_token을 eos_token으로 설정

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

## 4) Inference

In [45]:
from datasets import Dataset


In [46]:
# pandas DataFrame을 datasets.Dataset으로 변환
dataset = Dataset.from_pandas(test)


In [47]:
pipe = pipeline(
    task="text-generation",
    model=model, 
    tokenizer=tokenizer 
)

Device set to use cuda:0


In [49]:
# 테스트 데이터를 datasets.Dataset으로 변환
dataset = Dataset.from_pandas(test)

In [50]:
# 프롬프트 생성 함수
def create_prompt(example):
    query = example['input']
    system_prompt = (
        "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
        "Your task is to transform the given obfuscated Korean review into a clear, correct, "
        "and natural-sounding Korean review that reflects its original meaning. "
        "Below are examples of obfuscated Korean reviews and their restored forms:\n\n"
        f"Example, {samples}\n"
        "Spacing and word length in the output must be restored to the same as in the input. "
        "Do not provide any description. Print only in Korean."
    )
    return f"{system_prompt}\n\ninput : {query}, output : "


In [51]:
# 모든 입력에 대해 프롬프트 생성
dataset = dataset.map(lambda example: {"prompt": create_prompt(example)})


Map:   0%|          | 0/1689 [00:00<?, ? examples/s]

In [None]:
# 배치 처리 수행
outputs = pipe(
    dataset["prompt"],
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
    max_new_tokens=256,  # 필요에 따라 조정
    eos_token_id=tokenizer.eos_token_id
)


In [None]:
# 결과 저장
restored_reviews = [output[0]['generated_text'].split("output : ")[-1].strip() for output in outputs]

# 제출 파일 로드
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding='utf-8-sig')

# 결과 저장
submission['output'] = restored_reviews

# 파일 저장
submission.to_csv('/kaggle/working/baseline_submission.csv', index=False, encoding='utf-8-sig')

print("파일 저장 완료!")

## 5) Submission

In [29]:
submission = pd.read_csv('/kaggle/input/daycon-review2/sample_submission.csv', encoding = 'utf-8-sig')

In [30]:
# 부족한 길이만큼 빈 값 추가
restored_reviews += [''] * (len(submission) - len(restored_reviews))


In [31]:
print(len(restored_reviews))  # restored_reviews의 길이 확인
print(len(submission))  # submission의 행 수 확인


1689
1689


In [32]:
submission['output'] = restored_reviews

In [33]:
submission.to_csv('/kaggle/working/baseline_submission2.csv', index = False, encoding = 'utf-8-sig')

In [34]:
# CSV 파일 불러오기
file_path = '/kaggle/working/baseline_submission2.csv'
df = pd.read_csv(file_path)

# 컬럼명 확인
print(df.columns)

Index(['ID', 'output'], dtype='object')


In [35]:
# CSV 파일 불러오기
file_path = '/kaggle/working/baseline_submission2.csv'
df = pd.read_csv(file_path)

# 'restored_reviews' 컬럼에서 NaN이나 None을 빈 문자열로 처리
df['output'] = df['output'].fillna('')  # NaN을 빈 문자열로 대체

In [36]:
# 'restored_reviews'에 대해 문자열로 변환하여 처리
df['output'] = df['output'].apply(lambda x: str(x) if isinstance(x, str) else '')

In [42]:
print(df.columns)


Index(['ID', 'output'], dtype='object')


In [41]:
# ground_truth와 restored_reviews 컬럼을 가져오기
ground_truth = df['input'].tolist()  # 실제 텍스트가 포함된 컬럼명
restored_reviews = df['output'].tolist()  # 복원된 텍스트가 포함된 컬럼명

KeyError: 'input'

In [38]:
# 단어 수준으로 비교하기 위한 함수
def word_level_metrics(ground_truth, restored_reviews):
    ground_truth_words = [set(review.split()) for review in ground_truth]
    restored_words = [set(review.split()) for review in restored_reviews]
    
    precision_list = []
    recall_list = []
    f1_list = []
    
    for true, pred in zip(ground_truth_words, restored_words):
        common_words = true & pred
        precision = len(common_words) / len(pred) if len(pred) > 0 else 0
        recall = len(common_words) / len(true) if len(true) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    precision = sum(precision_list) / len(precision_list)
    recall = sum(recall_list) / len(recall_list)
    f1 = sum(f1_list) / len(f1_list)
    
    return precision, recall, f1

In [40]:
# 성능 지표 계산
precision, recall, f1 = word_level_metrics(restored_reviews)

# 성능 지표 출력
print(f"정밀도: {precision:.4f}")
print(f"재현율: {recall:.4f}")
print(f"F1 점수: {f1:.4f}")



TypeError: word_level_metrics() missing 1 required positional argument: 'restored_reviews'