In [6]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

In [7]:
# Load the evaluation set; the evaluation cells read its "instruction" and
# "output" columns.
dataset = pd.read_parquet('train-00000-of-00001-1ae224438dce829b.parquet')

# Shared accumulator that evaluate_model() extends through `global results_df`.
results_df = pd.DataFrame(columns=["instruction", "reference", "generated", "bleu_score"])

# Preview the data. A bare expression is only rendered when it is the last
# statement of a cell, and head() avoids dumping the whole frame.
dataset.head()

In [8]:
def load_model_and_tokenizer(model_path):
    """Load a causal language model together with its tokenizer.

    Args:
        model_path: Value handed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple -- the model comes first.
    """
    # The tokenizer is loaded before the model, matching the tuple order here.
    loaders = (AutoTokenizer, AutoModelForCausalLM)
    tokenizer, model = (loader.from_pretrained(model_path) for loader in loaders)
    return model, tokenizer

In [9]:
def generate_response(model, tokenizer, prompt, max_length=256):
    """Generate a beam-search completion for ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` whose result has a
            ``sequences`` attribute (requested via ``return_dict_in_generate``).
        tokenizer: Callable that encodes ``prompt`` into a mapping containing
            ``input_ids`` and that provides ``decode``.
        prompt: Text to condition the generation on.
        max_length: Forwarded to ``generate`` as ``max_length``.
            NOTE(review): per the transformers documentation this bound counts
            prompt tokens as well as generated ones -- confirm 256 is enough
            for long instructions.

    Returns:
        The decoded continuation with special tokens skipped. For decoder-only
        models the echoed prompt tokens are removed before decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Keep inputs and weights on the same device when the objects support it;
    # otherwise generation fails as soon as the model is moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None and hasattr(inputs, "to"):
        inputs = inputs.to(device)

    # No progress bar here: generate() is a single blocking call, so a bar
    # could only be advanced after the work had already finished.
    # output_scores is not requested because the scores were never used.
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        return_dict_in_generate=True,
    )
    sequence = outputs.sequences[0]

    # Decoder-only models return prompt + continuation in one sequence. Drop
    # the prompt so the caller scores only what the model actually produced.
    config = getattr(model, "config", None)
    if not getattr(config, "is_encoder_decoder", False):
        prompt_len = len(inputs["input_ids"][0])
        sequence = sequence[prompt_len:]

    return tokenizer.decode(sequence, skip_special_tokens=True)

In [None]:
def evaluate_model(dataset, model, tokenizer):
    """Generate an answer per row and score it with sentence-level BLEU.

    NOTE(review): this notebook defines ``evaluate_model`` in three cells
    (BLEU, METEOR, ROUGE). A later definition replaces an earlier one, so the
    cell executed last decides which metric a call actually computes.

    Args:
        dataset: DataFrame with an "instruction" column (prompts) and an
            "output" column (reference answers).
        model: Forwarded to ``generate_response``.
        tokenizer: Forwarded to ``generate_response``.

    Side effects:
        Prints every sample, appends one row per sample to the global
        ``results_df`` and writes that frame to
        "EEVE-Korean-Instruct-10.8B-v1.0_evaluation.csv".
    """
    global results_df

    # Sentence-level BLEU drops to ~0 whenever one n-gram order has no match;
    # smoothing keeps short or partially matching answers comparable.
    smoothing = SmoothingFunction().method1

    records = []
    try:
        # Iterate over values instead of .at[idx, ...] so a non-default index
        # (for example after filtering) does not raise KeyError.
        for instruction, reference in zip(dataset["instruction"], dataset["output"]):
            generated_text = generate_response(model, tokenizer, instruction)

            # sentence_bleu expects a list of tokenized references and a
            # tokenized hypothesis. Raw strings would be iterated character by
            # character, which does not measure word n-gram overlap.
            # NOTE(review): whitespace tokens are coarse for Korean; consider a
            # morphological tokenizer if finer granularity is required.
            bleu_score = sentence_bleu(
                [reference.split()],
                generated_text.split(),
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=smoothing,
            )

            print(f"Input: {instruction}")
            print(f"Reference: {reference}")
            print(f"Generated: {generated_text}")
            print(f"BLEU Score: {bleu_score}\n")

            records.append({
                "instruction": instruction,
                "reference": reference,
                "generated": generated_text,
                "bleu_score": bleu_score,
            })
    finally:
        # Append once instead of concatenating on every row (quadratic), and
        # do it in `finally` so finished samples survive a mid-run failure.
        if records:
            results_df = pd.concat([results_df, pd.DataFrame(records)], ignore_index=True)

    # Save as a CSV file
    results_df.to_csv("EEVE-Korean-Instruct-10.8B-v1.0_evaluation.csv", index=False)

In [None]:
def evaluate_model(dataset, model, tokenizer):
    """Generate an answer per row and score it with METEOR.

    NOTE(review): this notebook defines ``evaluate_model`` in three cells
    (BLEU, METEOR, ROUGE). A later definition replaces an earlier one, so the
    cell executed last decides which metric a call actually computes.

    Args:
        dataset: DataFrame with an "instruction" column (prompts) and an
            "output" column (reference answers).
        model: Forwarded to ``generate_response``.
        tokenizer: Forwarded to ``generate_response``.

    Side effects:
        Prints every sample, appends one row per sample (with a
        "meteor_score" column) to the global ``results_df`` and writes that
        frame to "EEVE-Korean-Instruct-10.8B-v1.0_evaluation_meteor.csv".
    """
    global results_df

    records = []
    try:
        # Iterate over values instead of .at[idx, ...] so a non-default index
        # (for example after filtering) does not raise KeyError.
        for instruction, reference in zip(dataset["instruction"], dataset["output"]):
            generated_text = generate_response(model, tokenizer, instruction)

            # Compute the METEOR score. Current NLTK releases require
            # pre-tokenized references and hypothesis and raise TypeError when
            # handed plain strings.
            # NOTE(review): METEOR relies on the WordNet corpus
            # (nltk.download("wordnet")) and English-oriented stemming/synonym
            # matching -- confirm it is meaningful for this dataset's language.
            meteor_score_value = meteor_score([reference.split()], generated_text.split())

            print(f"Input: {instruction}")
            print(f"Reference: {reference}")
            print(f"Generated: {generated_text}")
            print(f"METEOR Score: {meteor_score_value}\n")

            records.append({
                "instruction": instruction,
                "reference": reference,
                "generated": generated_text,
                "meteor_score": meteor_score_value,
            })
    finally:
        # Append once instead of concatenating on every row (quadratic), and
        # do it in `finally` so finished samples survive a mid-run failure.
        if records:
            results_df = pd.concat([results_df, pd.DataFrame(records)], ignore_index=True)

    # Save as a CSV file
    results_df.to_csv("EEVE-Korean-Instruct-10.8B-v1.0_evaluation_meteor.csv", index=False)


In [None]:
def evaluate_model(dataset, model, tokenizer):
    """Generate an answer per row and score it with ROUGE-1, ROUGE-2 and ROUGE-L.

    NOTE(review): this notebook defines ``evaluate_model`` in three cells
    (BLEU, METEOR, ROUGE). A later definition replaces an earlier one, so the
    cell executed last decides which metric a call actually computes.

    Args:
        dataset: DataFrame with an "instruction" column (prompts) and an
            "output" column (reference answers).
        model: Forwarded to ``generate_response``.
        tokenizer: Forwarded to ``generate_response``.

    Side effects:
        Prints every sample, appends one row per sample to the global
        ``results_df`` (the three F-measures are stored together as one dict
        in the "rouge_scores" column) and writes that frame to
        "EEVE-Korean-Instruct-10.8B-v1.0_evaluation_rouge.csv".
    """
    global results_df

    # RougeScorer's first parameter is named `rouge_types`; the previous
    # `metrics=` keyword is not accepted and raised TypeError.
    # NOTE(review): the default rouge_score tokenizer is understood to keep
    # only lowercase ASCII letters and digits, which would discard Korean text
    # entirely -- verify, and pass a custom `tokenizer=` if that is the case.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    records = []
    try:
        # Iterate over values instead of .at[idx, ...] so a non-default index
        # (for example after filtering) does not raise KeyError.
        for instruction, reference in zip(dataset["instruction"], dataset["output"]):
            generated_text = generate_response(model, tokenizer, instruction)

            # Compute the ROUGE scores; score() takes (target, prediction).
            scores = scorer.score(reference, generated_text)

            # Collapse the three F-measures into a single dictionary.
            rouge_scores = {
                "rouge_1": scores["rouge1"].fmeasure,
                "rouge_2": scores["rouge2"].fmeasure,
                "rouge_L": scores["rougeL"].fmeasure
            }

            print(f"Input: {instruction}")
            print(f"Reference: {reference}")
            print(f"Generated: {generated_text}")
            print(f"ROUGE Scores: {rouge_scores}\n")

            records.append({
                "instruction": instruction,
                "reference": reference,
                "generated": generated_text,
                "rouge_scores": rouge_scores,  # all ROUGE values kept in one column
            })
    finally:
        # Append once instead of concatenating on every row (quadratic), and
        # do it in `finally` so finished samples survive a mid-run failure.
        if records:
            results_df = pd.concat([results_df, pd.DataFrame(records)], ignore_index=True)

    # Save as a CSV file
    results_df.to_csv("EEVE-Korean-Instruct-10.8B-v1.0_evaluation_rouge.csv", index=False)


In [None]:
# Entry-point cell: load the checkpoint and run whichever evaluate_model()
# definition was executed most recently.
if __name__ == "__main__":
    # Same file name that an earlier cell read into `dataset`; the evaluation
    # below uses that frame rather than re-reading this path.
    parquet_path = "train-00000-of-00001-1ae224438dce829b.parquet"
    model_path = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"

    # Go through the shared helper so the checkpoint id lives in one place
    # instead of being repeated as a string literal for each load.
    model, tokenizer = load_model_and_tokenizer(model_path)

    # Run the evaluation
    evaluate_model(dataset, model, tokenizer)