In [6]:
import pandas as pd
import ollama
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Evaluation data: a single parquet shard read from the working directory.
DATASET_PATH = 'train-00000-of-00001-1ae224438dce829b.parquet'
dataset = pd.read_parquet(DATASET_PATH)

# Empty accumulator for per-example results; evaluate_model appends to it.
RESULT_COLUMNS = [
    "instruction",
    "reference",
    "generated",
    "bleu_score",
    "meteor_score",
    "rouge_1",
    "rouge_2",
    "rouge_L",
]
results_df = pd.DataFrame(columns=RESULT_COLUMNS)


In [7]:
# Function to load Ollama model
def load_ollama_model(model_name):
    """Return the handle used to address an Ollama model.

    Nothing is loaded here: the value passed in is handed back unchanged, and
    callers use it as the ``model`` argument of later Ollama API calls.

    Args:
        model_name: Name (tag) of the Ollama model.

    Returns:
        The same ``model_name`` object that was passed in.
    """
    model_handle = model_name
    return model_handle

def generate_response_ollama(model_name, prompt, max_length=256):
    """Generate a single-turn reply from an Ollama model.

    Args:
        model_name: Name (tag) of the model to query.
        prompt: Text sent as the only user message of the chat.
        max_length: Upper bound on the number of generated tokens, forwarded
            to Ollama as the ``num_predict`` option. Pass ``None`` to use the
            model's own default.

    Returns:
        The text of the assistant's reply.
    """
    options = {"num_predict": max_length} if max_length is not None else None
    response = ollama.chat(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        options=options,
    )
    # The chat reply nests the generated text under message.content; there is
    # no top-level 'text' field, so indexing it would raise KeyError.
    return response["message"]["content"]

In [8]:
def calculate_bleu(reference: str, generated: str):
    """Compute sentence-level BLEU of ``generated`` against one ``reference``.

    Both texts are tokenized on whitespace. Smoothing (NLTK ``method1``) is
    applied because unsmoothed sentence BLEU drops to ~0 as soon as one
    n-gram order has no overlap, which is the common case for short answers.

    Args:
        reference: Ground-truth text.
        generated: Model output to score.

    Returns:
        The BLEU score as returned by ``sentence_bleu``; ``0.0`` when
        ``generated`` contains no tokens.
    """
    reference_tokens = reference.split()
    generated_tokens = generated.split()

    # An empty hypothesis has no n-grams to match; skip the NLTK call.
    if not generated_tokens:
        return 0.0

    smoothing = SmoothingFunction().method1
    return sentence_bleu(
        [reference_tokens],
        generated_tokens,
        smoothing_function=smoothing,
    )

def calculate_meteor(reference: str, generated: str):
    """Compute the METEOR score of ``generated`` against one ``reference``.

    NLTK's ``meteor_score`` expects pre-tokenized input (iterables of tokens)
    and rejects plain strings, so both texts are split on whitespace first,
    the same tokenization ``calculate_bleu`` uses.

    Args:
        reference: Ground-truth text.
        generated: Model output to score.

    Returns:
        The value returned by ``meteor_score`` for the tokenized pair.
    """
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return meteor_score([reference_tokens], generated_tokens)

def calculate_rouge(reference: str, generated: str):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for one text pair.

    Args:
        reference: Ground-truth text (the scoring target).
        generated: Model output to score (the prediction).

    Returns:
        A dict with keys ``"rouge_1"``, ``"rouge_2"`` and ``"rouge_L"``, each
        holding the ``fmeasure`` of the corresponding ROUGE variant.
    """
    # RougeScorer's first parameter is `rouge_types`; it has no `metrics`
    # keyword, so the types are passed positionally.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "rouge_1": scores["rouge1"].fmeasure,
        "rouge_2": scores["rouge2"].fmeasure,
        "rouge_L": scores["rougeL"].fmeasure,
    }

In [9]:
def evaluate_model(dataset, model_name):
    """Generate a response for every dataset row and score it.

    For each row, the ``instruction`` column is sent to the model and the
    reply is compared with the ``output`` column using BLEU, METEOR and
    ROUGE. Per-row details are printed, the rows are appended to the
    module-level ``results_df``, and the accumulated frame is written to
    ``evaluation_results.csv``.

    Args:
        dataset: DataFrame with ``instruction`` and ``output`` columns.
        model_name: Ollama model name passed to ``generate_response_ollama``.

    Returns:
        The updated ``results_df``.
    """
    global results_df
    columns = [
        "instruction", "reference", "generated", "bleu_score",
        "meteor_score", "rouge_1", "rouge_2", "rouge_L",
    ]

    # Iterate the two columns positionally so the frame's index labels do not
    # matter (a filtered or shuffled frame has no guaranteed 0..n-1 labels).
    pairs = zip(dataset["instruction"], dataset["output"])

    records = []
    for instruction, reference in tqdm(pairs, total=len(dataset), desc="Evaluating"):
        generated_text = generate_response_ollama(model_name, instruction)

        # Calculate BLEU, METEOR, and ROUGE scores
        bleu_score = calculate_bleu(reference, generated_text)
        meteor_score_value = calculate_meteor(reference, generated_text)
        rouge_scores = calculate_rouge(reference, generated_text)

        # Print the evaluation details
        print(f"Input: {instruction}")
        print(f"Reference: {reference}")
        print(f"Generated: {generated_text}")
        print(f"BLEU Score: {bleu_score}")
        print(f"METEOR Score: {meteor_score_value}")
        print(f"ROUGE Scores: {rouge_scores}\n")

        records.append({
            "instruction": instruction,
            "reference": reference,
            "generated": generated_text,
            "bleu_score": bleu_score,
            "meteor_score": meteor_score_value,
            "rouge_1": rouge_scores["rouge_1"],
            "rouge_2": rouge_scores["rouge_2"],
            "rouge_L": rouge_scores["rouge_L"],
        })

    # Build the new rows once instead of concatenating per iteration, and
    # avoid concatenating with an empty frame (pandas warns about that).
    if records:
        new_rows = pd.DataFrame(records, columns=columns)
        if results_df.empty:
            results_df = new_rows
        else:
            results_df = pd.concat([results_df, new_rows], ignore_index=True)

    # Save results to CSV
    results_df.to_csv("evaluation_results.csv", index=False)
    return results_df

In [None]:
if __name__ == "__main__":
    # Tag of the model to evaluate; it ends up as the `model` argument of
    # ollama.chat inside generate_response_ollama.
    model_name = "DUCChatbot5ep:latest"  # Ollama model name

    # load_ollama_model returns its argument unchanged, so `model` is the
    # same string as `model_name`.
    model = load_ollama_model(model_name)

    # Score every row of `dataset`; evaluate_model prints per-row details and
    # writes the accumulated results to evaluation_results.csv.
    evaluate_model(dataset, model)