In [None]:
from ollama import chat
from ollama import ChatResponse

def chat_response(input, model='gemma3n:e2b'):
    """Send one user message to an Ollama model and return its normalised reply.

    Args:
        input: Prompt text sent as the only ``user`` message. The name shadows
            the ``input`` builtin; it is kept so keyword callers keep working.
        model: Ollama model tag to query. Defaults to ``'gemma3n:e2b'``, the
            model this helper was previously hard-wired to.

    Returns:
        The reply text with surrounding whitespace removed and lower-cased.
        An empty string is returned when the reply carries no content.
    """
    response: ChatResponse = chat(model=model, messages=[
        {'role': 'user', 'content': input},
    ])

    # Guard against a reply whose content is None so callers see a failing
    # comparison rather than an AttributeError from None.strip().
    content = response['message']['content'] or ''
    return content.strip().lower()

In [None]:
###First test case
###Sentiment Analysis, Task Fidelity, Exact Match
def test_sentiment_002():
    assert chat_response(f"Classify this as 'positive', 'negative', 'neutral', or 'mixed': The movie is not good. Just return the emotion") == "negative"
    

# Run the notebook's tests through ipytest.
# NOTE(review): `ipytest` is only imported (and autoconfigured) in a later cell of
# this notebook, so on a fresh kernel that cell has to be executed before this one.
ipytest.run()
    
#print(response['message']['content'])

In [None]:
####Second test case
####Consistency, FAQ chatbot, Cosine similarity

from sentence_transformers import SentenceTransformer
import numpy as np

# Each entry groups several phrasings of one FAQ together with its answer text;
# the consistency check compares the model's replies within a group.
faq_variations = [
    {  # Edge case: Typos
        "questions": [
            "What's your return policy?",
            "How can I return an item?",
            "Wut's yur retrn polcy?",
        ],
        "answer": "Our return policy allows...",
    },
    {  # Edge case: Long, rambling question
        "questions": [
            "I bought something last week, and it's not really what I expected, so I was wondering if maybe I could possibly return it?",
            "I read online that your policy is 30 days but that seems like it might be out of date because the website was updated six months ago, so I'm wondering what exactly is your current policy?",
        ],
        "answer": "Our return policy allows...",
    },
    {  # Edge case: Irrelevant info
        "questions": [
            "I'm Jane's cousin, and she said you guys have great customer service. Can I return this?",
            "Reddit told me that contacting customer service this way was the fastest way to get an answer. I hope they're right! What is the return window for a jacket?",
        ],
        "answer": "Our return policy allows...",
    },
    # ... 47 more FAQs
]

def check_cosine_similarity(outputs):
    """Return the mean pairwise cosine similarity between the given texts.

    Args:
        outputs: Iterable of strings (e.g. model answers) to compare.

    Returns:
        Mean of the full n x n cosine-similarity matrix of the embedded texts.
        The matrix includes the diagonal (each text compared with itself).

    Raises:
        ValueError: If ``outputs`` contains no texts.
    """
    texts = list(outputs)
    if not texts:
        raise ValueError("check_cosine_similarity needs at least one output")

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = np.array([model.encode(text) for text in texts])
    print("Embeddings ready")

    # cos(i, j) = (e_i . e_j) / (|e_i| * |e_j|). The denominator has to be the
    # outer product of the row norms: multiplying the 1-D norm vector by its
    # own "transpose" (a no-op for 1-D arrays) divides column j by |e_j|**2.
    norms = np.linalg.norm(embeddings, axis=1)
    cosine_similarities = np.dot(embeddings, embeddings.T) / np.outer(norms, norms)
    print("Got similarity score")
    return np.mean(cosine_similarities)

def sim_score():
    """Score answer consistency for every FAQ group in ``faq_variations``.

    For each group, every question is sent to ``chat_response`` and the replies
    are handed to ``check_cosine_similarity``.

    Returns:
        A list with one similarity score per FAQ group, in input order.
    """
    def _score_group(questions):
        # Collect all replies for the group before scoring them together.
        answers = []
        for question in questions:
            answers.append(chat_response(question))
        print("Got chat response")
        return check_cosine_similarity(answers)

    return [_score_group(entry["questions"]) for entry in faq_variations]

import pytest  # also imported in a later cell; repeated here so this cell runs on a fresh kernel

# The scores are computed eagerly because the parametrize decorator needs
# concrete values at the moment the test function is defined.
score_list = sim_score()


@pytest.mark.parametrize("score", score_list)
def test_faq_002(score):
    """Each FAQ group's mean answer similarity must be greater than 0.7."""
    assert score > 0.7, f"FAQ answers are not consistent enough: similarity {score:.3f} is not above 0.7"

In [None]:
pip install -U sentence-transformers

In [None]:
pip install ipytest

In [None]:
# ipytest is what the `ipytest.run()` call in the first test cell relies on, so this
# cell has to be executed before that one on a fresh kernel.
import ipytest
ipytest.autoconfig()

In [None]:
import pytest

In [None]:
#####Third test case
####Summarization, Relevance, Coherence, ROUGE-L
from rouge import Rouge

# Each entry pairs an article text with the reference summary the model's
# summary is scored against.
articles = [
    {
        "text": "In a groundbreaking study, researchers at MIT...",
        "summary": "MIT scientists discover a new antibiotic...",
    },
    {  # Edge case: Multi-topic
        "text": "Jane Doe, a local hero, made headlines last week for saving... In city hall news, the budget... Meteorologists predict...",
        "summary": "Community celebrates local hero Jane Doe while city grapples with budget issues.",
    },
    {  # Edge case: Misleading title
        "text": "You won't believe what this celebrity did! ... extensive charity work ...",
        "summary": "Celebrity's extensive charity work surprises fans",
    },
    # ... 197 more articles
]

def evaluate_rouge_l(model_output, true_summary):
    """Return the ROUGE-L F1 score of a generated summary against a reference.

    Args:
        model_output: Candidate summary text (passed to ROUGE as the hypothesis).
        true_summary: Reference summary text.

    Returns:
        The ``'f'`` value of the ``'rouge-l'`` entry reported by
        ``Rouge.get_scores``, or ``0.0`` when ``model_output`` is empty or
        whitespace-only.
    """
    # The rouge package rejects an empty hypothesis with a ValueError. An empty
    # answer shares nothing with the reference, so score it as 0.0 instead of
    # aborting the whole evaluation run.
    if not model_output or not model_output.strip():
        return 0.0

    rouge = Rouge()
    scores = rouge.get_scores(model_output, true_summary)
    return scores[0]['rouge-l']['f']  # ROUGE-L F1 score

def get_relevance_score():
    """Summarise every article with the model and return the mean ROUGE-L F1.

    Each article text is sent to ``chat_response`` with a summarisation prompt;
    the reply is scored against the article's reference summary.

    Returns:
        The average ROUGE-L F1 score over all entries in ``articles``.
    """
    outputs = [chat_response(f"Summarize this article in 1-2 sentences:\n\n{article['text']}") for article in articles]

    # chat_response() lower-cases the model output, so the reference is
    # lower-cased too; otherwise capitalised words in the reference (e.g. "MIT")
    # can never match their lower-cased counterparts and the score is deflated.
    relevance_scores = [
        evaluate_rouge_l(output, article['summary'].lower())
        for output, article in zip(outputs, articles)
    ]
    av_score = sum(relevance_scores) / len(relevance_scores)
    print(f"Average ROUGE-L F1 Score: {av_score}")
    return av_score

def test_summary_002():
    av_score = get_relevance_score()
    assert av_score > 0.5


In [None]:
pip install rouge

In [None]:
####Fourth test case
####Tone and style, Customer service, LLM Based Likert scale, LLM as a Judge

# Each entry pairs a customer message with the tone the reply is rated for.
inquiries = [
    {  # Edge case: Angry customer
        "text": "This is the third time you've messed up my order. I want a refund NOW!",
        "tone": "empathetic",
    },
    {  # Edge case: Complex issue
        "text": "I tried resetting my password but then my account got locked...",
        "tone": "patient",
    },
    {  # Edge case: Compliment as complaint
        "text": "I can't believe how good your product is. It's ruined all others for me!",
        "tone": "professional",
    },
    # ... 97 more inquiries
]

def evaluate_likert(model_output, target_tone):
    """Ask a judge model to rate a customer-service reply for a given tone.

    Args:
        model_output: The reply text to be rated.
        target_tone: The tone the reply is rated for (e.g. ``"empathetic"``).

    Returns:
        The judge's reply text with surrounding whitespace removed. The prompt
        asks for a single number from 1 to 5.
    """
    # The continuation lines keep their four-space indent: it is part of the
    # prompt text that is sent to the judge.
    prompt_lines = [
        f"Rate this customer service response on a scale of 1-5 for being {target_tone}:",
        f"    <response>{model_output}</response>",
        f"    1: Not at all {target_tone}",
        f"    5: Perfectly {target_tone}",
        "    Output only the number.",
    ]
    tone_prompt = "\n".join(prompt_lines)

    # Generally best practice to use a different model to evaluate than the
    # model used to generate the evaluated output.
    judge_messages = [{'role': 'user', 'content': tone_prompt}]
    judge_reply: ChatResponse = chat(model='gpt-oss:20b', messages=judge_messages)

    rating_text = judge_reply['message']['content']
    return rating_text.strip()

def get_evaluations():
    """Answer every inquiry with the model and return the mean judged tone score.

    Each inquiry text is sent to ``chat_response``; the reply is then rated by
    ``evaluate_likert`` for the inquiry's target tone.

    Returns:
        The average of the numeric ratings over all entries in ``inquiries``.

    Raises:
        ValueError: If a judge reply contains no standalone digit from 1 to 5.
    """
    import re  # local import: only this function needs it

    outputs = [chat_response(f"Respond to this customer inquiry: {inquiry['text']}") for inquiry in inquiries]

    tone_scores = []
    for output, inquiry in zip(outputs, inquiries):
        # evaluate_likert returns the judge's reply as text; summing strings
        # raises TypeError, so extract the numeric rating first. The search
        # tolerates replies such as "4/5" or "Rating: 4".
        raw_rating = str(evaluate_likert(output, inquiry['tone']))
        match = re.search(r"\b[1-5]\b", raw_rating)
        if match is None:
            raise ValueError(f"Judge reply contains no 1-5 rating: {raw_rating!r}")
        tone_scores.append(int(match.group()))

    av_score = sum(tone_scores) / len(tone_scores)
    print(f"Average Tone Score: {av_score}")
    return av_score

def test_customer_enquiry_002():
    av_score = get_evaluations()
    assert av_score > 0.5

