In [2]:
# Setup cell: install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a re-run may pull different releases;
# consider pinning and using `%pip` so the install targets the running kernel.
!pip install matplotlib scikit-learn
!pip install bert-score
!pip install evaluate
!pip install sacremoses sacrebleu

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [3]:
# 평균 문장 길이 계산 함수

def compute_avg_sentence_length(sentences):
    """
    Return the average number of whitespace-separated words per sentence.

    Parameters:
    - sentences: list of sentence strings

    Returns:
    - float: mean word count, or 0.0 when ``sentences`` is empty
    """
    if sentences:
        # str.split() with no argument splits on any run of whitespace.
        lengths = list(map(lambda text: len(text.split()), sentences))
        return sum(lengths) / len(lengths)
    return 0.0


In [4]:
# SARI 점수 계산 함수

import evaluate
# Load the SARI metric once at cell level; compute_sari() reuses this object
# on every call instead of loading it again.
sari_metric = evaluate.load("sari")

def compute_sari(originals, rewrites, references):
    """
    Compute the mean sentence-level SARI score.

    Each (source, prediction, reference) triple is scored separately with the
    cell-level ``sari_metric`` and the per-sentence scores are averaged.

    Parameters:
    - originals: source sentences
    - rewrites: model outputs, aligned one-to-one with ``originals``
    - references: gold simplifications, aligned one-to-one with ``originals``;
      each entry is either a single string or a list/tuple of alternative
      reference strings for that sentence

    Returns:
    - float: mean of the per-sentence ``'sari'`` values, or 0.0 for empty input

    Raises:
    - ValueError: if the three inputs do not have the same length
    """
    originals = list(originals)
    rewrites = list(rewrites)
    references = list(references)

    # zip() would silently drop unmatched items and average over the wrong
    # pairs, so misaligned inputs are rejected up front.
    if not (len(originals) == len(rewrites) == len(references)):
        raise ValueError(
            "originals, rewrites and references must have the same length "
            f"(got {len(originals)}, {len(rewrites)}, {len(references)})"
        )

    # Nothing to score: avoid dividing by zero below.
    if not originals:
        return 0.0

    scores = []
    for src, pred, ref in zip(originals, rewrites, references):
        # The metric expects a list of reference strings per prediction.
        ref_group = [ref] if isinstance(ref, str) else list(ref)
        result = sari_metric.compute(
            predictions=[pred],
            references=[ref_group],
            sources=[src]
        )
        scores.append(result['sari'])
    return sum(scores) / len(scores)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

In [5]:
# BERTScore 계산 함수

from bert_score import score

def compute_bertscore(originals, rewrites, model_type='bert-base-multilingual-cased'):
    """
    Compute the mean BERTScore F1 between rewrites and their originals.

    The rewrites are passed as candidates and the originals as references to
    ``bert_score.score`` (baseline rescaling disabled).

    Parameters:
    - originals: source sentences, used as references
    - rewrites: model outputs, used as candidates
    - model_type: name of the model forwarded to ``score``

    Returns:
    - float: mean F1 over all sentence pairs, or 0.0 when both inputs are empty
    """
    originals = list(originals)
    rewrites = list(rewrites)

    # With nothing to compare, skip the scoring call (and the model load)
    # and report 0.0 instead of averaging over an empty result.
    if not originals and not rewrites:
        return 0.0

    # score() returns a 3-tuple; only the last element (F1) is needed here.
    _, _, f1 = score(cands=rewrites, refs=originals, model_type=model_type, rescale_with_baseline=False)
    return float(f1.mean())


In [6]:
# 평가 결과 표 출력

def print_model_report(model_name, f1, avg_len, sari):
    """
    Print a short evaluation summary for one model.

    Parameters:
    - model_name: label shown in the report header
    - f1: BERTScore F1 (float), printed with 4 decimals
    - avg_len: average sentence length in words (float), printed with 2 decimals
    - sari: SARI score (float), printed with 2 decimals
    """
    rule = "──────────────────────────────"
    print(f"평가 결과 - {model_name}")
    print(rule)
    # The value is documented above as a BERTScore F1, so it is labelled as
    # such (previously "KoBERTScore"); padding keeps the value column aligned.
    print(f"BERTScore F1:    {f1:.4f}")
    print(f"SARI Score:      {sari:.2f}")
    print(f"평균 문장 길이:   {avg_len:.2f} 단어")
    print(rule)


In [7]:
# Original과 Rewrite 출력
def print_originals_and_rewrites(originals, rewrites):
    """
    Print each original sentence next to its rewrite, numbered from 1.

    Pairs are formed with zip(), so output stops at the shorter of the two
    inputs. A 40-character dashed line follows every pair.
    """
    divider = "-" * 40
    for number, (source, simplified) in enumerate(zip(originals, rewrites), start=1):
        print(
            f"[{number}] Original : {source}",
            f"[{number}] Rewrite  : {simplified}",
            divider,
            sep="\n",
        )


In [8]:
# 전체 시스템 실행 함수

def evaluate_model(originals, rewrites, references, model_name='MyModel'):
    """
    Run the full evaluation flow for one model, print a report, and return the scores.

    Parameters:
    - originals: list of original sentences
    - rewrites: list of sentences generated by the model
    - references: list of gold sentences; build it from the `simple_text`
      that corresponds to each `original_text`
    - model_name: model name shown in the printed report

    Returns:
    - dict: evaluation results with keys
        'bertscore'           -> BERTScore between originals and rewrites (float)
        'sari'                -> SARI score (float)
        'avg_sentence_length' -> average word count of the rewrites (float)

    Note: when this call is the last expression of a notebook cell, the
    returned dict is also displayed as the cell's output value.
    """
    bertscore = compute_bertscore(originals, rewrites)
    avg_len = compute_avg_sentence_length(rewrites)
    sari = compute_sari(originals, rewrites, references)

    print_model_report(model_name, bertscore, avg_len, sari)
    print_originals_and_rewrites(originals, rewrites)

    # The documented contract is to hand the metrics back to the caller,
    # not only to print them.
    return {
        'bertscore': bertscore,
        'sari': sari,
        'avg_sentence_length': avg_len,
    }


In [9]:
# Small smoke test: each tuple keeps one example's sentences side by side as
# (original, model rewrite, reference simplification).
examples = [
    (
        "금식 후에 수술을 진행합니다.",
        "아무것도 먹지 않은 후에 수술해요.",
        "아무것도 먹지 않으면 수술을 해요.",
    ),
    (
        "자가배뇨가 가능한지 확인하세요.",
        "혼자 소변을 볼 수 있는지 확인하세요.",
        "혼자 소변을 보는지 확인해요.",
    ),
]

# Split the triples back into the three parallel lists evaluate_model expects.
originals = [source for source, _, _ in examples]
rewrites = [output for _, output, _ in examples]
references = [gold for _, _, gold in examples]

evaluate_model(originals, rewrites, references, model_name="test")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

평가 결과 - test
──────────────────────────────
KoBERTScore:     0.8037
SARI Score:      63.33
평균 문장 길이:   5.50 단어
──────────────────────────────
[1] Original : 금식 후에 수술을 진행합니다.
[1] Rewrite  : 아무것도 먹지 않은 후에 수술해요.
----------------------------------------
[2] Original : 자가배뇨가 가능한지 확인하세요.
[2] Rewrite  : 혼자 소변을 볼 수 있는지 확인하세요.
----------------------------------------
