## 필요한 패키지 Import & Model Load

In [None]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, BitsAndBytesConfig
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import sentence_bleu

## Select the compute device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

## Path that holds both the model checkpoint and the tokenizer files.
## Change this to match your own local path.
model_path = 'checkpoints/checkpoint_151000'

## Load the model and the tokenizer from the same checkpoint directory.
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

## Move the model to the selected device. As the last bare expression of the
## cell, its return value (the model) is what the notebook displays as output.
model.to(device)

Using device: cpu


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), 

## Validation Data Load & Data Wrangling



In [None]:
## Load the validation DataFrame.
df_val = pd.read_csv("df_val.csv")

## Keep only the translation-task rows (dialect sentences) and drop rows that
## are missing either side of the standard/dialect sentence pair.
df_val = df_val.loc[df_val.isDialect == True]
df_val = df_val.dropna(subset=['standard_form', 'dialect_form'])

## Drop rows whose raw `form` text contains a '#' or '&' character.
## A single vectorised mask replaces the two attribute-assigned helper flags
## (attribute assignment does not create DataFrame columns and makes pandas
## emit a UserWarning). `astype(str)` keeps a missing `form` value from raising.
has_marker = df_val['form'].astype(str).str.contains(r'[#&]')
## `.copy()` so the column assignment below writes to an independent frame
## rather than to a slice of the filtered one.
df_val = df_val.loc[~has_marker].copy()

## Keep sentences whose standard form is 4 to 64 characters long (inclusive).
## A chained comparison such as `3 < series <= 64` raises
## "truth value of a Series is ambiguous"; `between` is the element-wise form.
df_val['form_len'] = df_val['standard_form'].astype(str).str.len()
df_val = df_val.loc[df_val['form_len'].between(4, 64)]

# 번역 진행

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from transformers import PreTrainedTokenizerFast
from transformers import pipeline

# Build the translation pipeline.
# Use the `device` selected when the model was loaded instead of a hard-coded
# GPU index 0, so the cell also runs on CPU-only machines.
translation_pipeline = pipeline(
    "translation_xx_to_yy",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=64,
)

# Prefix tokens prepended to each input sentence: "[제주]" goes in front of
# dialect input, "[표준]" in front of standard-language input.
jeju_token = "[제주]"
standard_token = "[표준]"

# Accumulators for the corpus-level BLEU computation.
# NOTE: the `*_predictions` lists are named after their INPUT side:
#   dialect_predictions  = output produced from dialect input
#   standard_predictions = output produced from standard-language input
dialect_targets = []
standard_targets = []
dialect_predictions = []
standard_predictions = []

for index, row in df_val.iterrows():
    dialect_form = row['dialect_form']
    standard_form = row['standard_form']

    # Jeju dialect -> standard language
    predicted_sentence_dialect = translation_pipeline(
        jeju_token + " " + dialect_form
    )[0]['translation_text']

    # Standard language -> Jeju dialect
    predicted_sentence_standard = translation_pipeline(
        standard_token + " " + standard_form
    )[0]['translation_text']

    # Tokenize references and predictions with the model's own tokenizer so
    # BLEU n-grams are counted over the same token units.
    dialect_form_tokens = tokenizer.tokenize(dialect_form)
    standard_form_tokens = tokenizer.tokenize(standard_form)
    predicted_tokens_dialect = tokenizer.tokenize(predicted_sentence_dialect)
    predicted_tokens_standard = tokenizer.tokenize(predicted_sentence_standard)

    # corpus_bleu expects a list of references per hypothesis, hence the extra
    # list wrapping on the target side.
    dialect_targets.append([dialect_form_tokens])
    standard_targets.append([standard_form_tokens])
    dialect_predictions.append(predicted_tokens_dialect)
    standard_predictions.append(predicted_tokens_standard)

# Corpus-level BLEU. Output produced from dialect input is scored against the
# standard-language references, and vice versa.
from_jeju_to_standard_bleu_score = corpus_bleu(standard_targets, dialect_predictions)
from_standard_to_jeju_bleu_score = corpus_bleu(dialect_targets, standard_predictions)
print(f"제주어 -> 표준어 번역 BLEU Score : {from_jeju_to_standard_bleu_score}")
print(f"표준어 -> 제주어 번역 BLEU Score : {from_standard_to_jeju_bleu_score}")

# 임의로 한 문장만 번역 Task를 수행해보고 싶다면?

In [None]:
from transformers import pipeline

# Build the translation pipeline.
# Use the `device` selected when the model was loaded instead of a hard-coded
# GPU index 0, so the cell also runs on CPU-only machines.
translation_pipeline = pipeline(
    "translation_xx_to_yy",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Prefix tokens prepended to each input sentence: "[제주]" goes in front of
# dialect input, "[표준]" in front of standard-language input.
jeju_token = "[제주]"
standard_token = "[표준]"

# Example: standard language -> Jeju dialect
input_sentence = "식사는 하셨어요? 날씨가 많이 더워졌죠?"  # sample input sentence
translated_sentence = translation_pipeline(standard_token + " " + input_sentence)[0]['translation_text']
print('input_sentence :', input_sentence)
print("표준어 -> 제주어:", translated_sentence)
print()

# Example: Jeju dialect -> standard language
input_sentence = "안녕하수꽈 혼저옵서예 뭐하맨?"  # sample input sentence
translated_sentence = translation_pipeline(jeju_token + " " + input_sentence)[0]['translation_text']
print('input_sentence :', input_sentence)
print("제주어 -> 표준어:", translated_sentence)