In [31]:
# Import helper functions
import sys
sys.path.append('../')

import importlib
import json
import utils.claude_helpers
importlib.reload(utils.claude_helpers)
from utils.claude_helpers import add_user_message, add_assistant_message, chat, simple_chat

## Generate testing dataset

In [25]:
def generate_dataset():
    
    system = "Your task is to generate some dataset for prompt evaluation."
    prompt = """
    Generate a dataset for prompt evaluation. The dataset should be in a json format with two fields: 'title' and 'imdb_id'. 
    The dataset should contain 10 entries. 

    Respond with JSON. Keep your response concise and direct.

    for example 
    ```json
    {"title" : "top gun", "imdb_id" : "tt0092099"}

    """
    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json") ## or "```code"

    json_results = chat(messages, system= system, stop_sequences=["```"])
    return json.loads(json_results)

In [11]:
generate_dataset()

[{'title': 'The Shawshank Redemption', 'imdb_id': 'tt0111161'},
 {'title': 'The Godfather', 'imdb_id': 'tt0068646'},
 {'title': 'The Dark Knight', 'imdb_id': 'tt0468569'},
 {'title': 'Inception', 'imdb_id': 'tt1375666'},
 {'title': 'Forrest Gump', 'imdb_id': 'tt0109830'},
 {'title': 'The Matrix', 'imdb_id': 'tt0133093'},
 {'title': 'Pulp Fiction', 'imdb_id': 'tt0110912'},
 {'title': 'The Lord of the Rings: The Fellowship of the Ring',
  'imdb_id': 'tt0120737'},
 {'title': "Schindler's List", 'imdb_id': 'tt0108052'},
 {'title': 'The Silence of the Lambs', 'imdb_id': 'tt0102926'}]

In [13]:
with open("../dataset/dataset.json", "w") as f:
    json.dump(generate_dataset(), f, indent=2)

In [16]:
# Create dummy data to test each method
dummy_data =   {
    "title": "Forrest Gump",
    "imdb_id": "tt0109830"
  }


a = run_test_case(dummy_data)
a

":\nShawshank Redemption, Pulp Fiction, Schindler's List, Saving Private Ryan, The Shining"

## Running the Eval

In [41]:
prompt = """
Your task is to generate 5 movie or tv shows titles similar to the given title.
The titles should be just like a list of titles with comma separated only.

Respond with JSON. Keep your response concise and direct.

for example 
```json
{"out" : "Top Gun", "The Dark Knight", "The Godfather"}
"""

def run_prompt(prompt, data):
    messages = []
    add_user_message(messages, prompt + "the given title is " + data)
    add_assistant_message(messages, "```json")
    out = chat(messages, stop_sequences=["```"])
    return out

def llm_grade_output(test_input, output):
    grading_prompt = f"""
    You are an expert evaluator. Grade this AI response on a scale of 1-10 based on if the AI response titles are similar to the input title.

    TEST INPUT: {test_input}
    AI RESPONSE: {output}

    CRITERIA: How similar and relevant are the suggested titles to the input title?
    
    IMPORTANT: Respond with ONLY a single integer from 1-10. No explanations, no text, just the number.
    Examples of valid responses: 8, 3, 10
    Examples of invalid responses: "I give this an 8", "8/10", "eight"
    
    Your score:""".strip()
    
    try:
        grade_response = simple_chat(grading_prompt, max_tokens=5, temperature=0)
        
        # Extract number from response more robustly
        import re
        
        # First try to find any number in the response
        numbers = re.findall(r'\d+', grade_response.strip())
        
        if numbers:
            score = int(numbers[0])  # Take first number found
            # Ensure it's in valid range
            score = max(1, min(10, score))
            return score
        else:
            print(f"No number found in grade response: '{grade_response}'")
            return 5
            
    except Exception as e:
        print(f"Grading failed with error: {e}")
        return 5

def run_test_case(test_case):
    out = run_prompt(prompt, test_case["title"])
    
    # Use LLM to grade the output on a 1-10 scale
    score = llm_grade_output(test_case["title"], out)
    
    return {"test_case": test_case, "out": out, "score": score}

def run_eval():
    with open("../dataset/dataset.json", "r") as f:
        dataset = json.load(f)
    results = []
    for data in dataset:
        results.append(run_test_case(data))
    return results

def get_eval_metrics(eval_results):
    scores = [result["score"] for result in eval_results]
    return sum(scores) / len(scores)
    
eval_results = run_eval()
print(f"Average score: {get_eval_metrics(eval_results)}")
print(f"Individual results: {eval_results}")


Average score: 5.3
Individual results: [{'test_case': {'title': 'The Shawshank Redemption', 'imdb_id': 'tt0111161'}, 'out': '\n{"out": "The Alcatraz Escape", "The Redemption Saga", "Jailhouse Blues", "Locked Up Legends", "Shackled Souls"}\n', 'score': 7}, {'test_case': {'title': 'The Godfather', 'imdb_id': 'tt0068646'}, 'out': '\n{"out" : "The Sopranos, Breaking Bad, Game of Thrones, Stranger Things, House of Cards"}\n', 'score': 7}, {'test_case': {'title': 'The Dark Knight', 'imdb_id': 'tt0468569'}, 'out': '\n{"out": "The Avengers", "Guardians of the Galaxy", "Interstellar", "Inception", "Dunkirk"}\n', 'score': 2}, {'test_case': {'title': 'Inception', 'imdb_id': 'tt1375666'}, 'out': '\n{"out" : "Interstellar", "Memento", "The Matrix", "Shutter Island", "Minority Report"}\n', 'score': 7}, {'test_case': {'title': 'Forrest Gump', 'imdb_id': 'tt0109830'}, 'out': '\n{"out" : "Saving Private Ryan, Schindler\'s List, The Shawshank Redemption, Titanic, Jurassic Park"}\n', 'score': 2}, {'test_

In [37]:
eval_results

[{'test_case': {'title': 'The Shawshank Redemption', 'imdb_id': 'tt0111161'},
  'out': ':\n{\n  "out": "The Green Mile", "The Shawshank Redemption", "The Silence of the Lambs", "The Schindler\'s List", "The Forrest Gump"\n}',
  'score': 8},
 {'test_case': {'title': 'The Godfather', 'imdb_id': 'tt0068646'},
  'out': ':\n\n{\n  "out": "The Sopranos, Goodfellas, Scarface, Casino, The Untouchables"\n}',
  'score': 7},
 {'test_case': {'title': 'The Dark Knight', 'imdb_id': 'tt0468569'},
  'out': ':\n\n{\n  "out": "The Avengers", "Guardians of the Galaxy", "Captain America: The Winter Soldier", "Thor: Ragnarok", "Ant-Man and the Wasp"\n}',
  'score': 2},
 {'test_case': {'title': 'Inception', 'imdb_id': 'tt1375666'},
  'out': ':\n\n{\n  "out": "Interstellar", "Memento", "Shutter Island", "Minority Report", "The Matrix"\n}',
  'score': 6},
 {'test_case': {'title': 'Forrest Gump', 'imdb_id': 'tt0109830'},
  'out': ':\n\n{\n  "out": "Shawshank Redemption, Saving Private Ryan, Schindler\'s List, 

## Validate via code

In [33]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    try:
        json.loads(text.strip())
        return 10
    except json.JSONDecodeError:
        return 0


def validate_python(text):
    try:
        ast.parse(text.strip())
        return 10
    except SyntaxError:
        return 0


def validate_regex(text):
    try:
        re.compile(text.strip())
        return 10
    except re.error:
        return 0


def grade_syntax(response, test_case):
    format = test_case["format"]
    if format == "json":
        return validate_json(response)
    elif format == "python":
        return validate_python(response)
    else:
        return validate_regex(response)

def grade_syntax(response):
    return validate_json(response)


In [None]:
def run_test_case(test_case):
    out = run_prompt(prompt, test_case["title"])
    
    # Use LLM to grade the output on a 1-10 scale
    llm_score = llm_grade_output(test_case["title"], out)
    syntax_score = grade_syntax(out)
    
    score = (llm_score + syntax_score) / 2
    
    return {"test_case": test_case, "out": out, "score": score}

def run_eval():
    with open("../dataset/dataset.json", "r") as f:
        dataset = json.load(f)
    results = []
    for data in dataset:
        results.append(run_test_case(data))
    return results

def get_eval_metrics(eval_results):
    scores = [result["score"] for result in eval_results]
    return sum(scores) / len(scores)
    
eval_results = run_eval()
print(f"Average score: {get_eval_metrics(eval_results)}")
print(f"Individual results: {eval_results}")

Average score: 5.05
Individual results: [{'test_case': {'title': 'The Shawshank Redemption', 'imdb_id': 'tt0111161'}, 'out': '\n{"out" : "The Green Mile", "The Shawshank Redemption", "The Silence of the Lambs", "The Departed", "The Schindler\'s List"}\n', 'score': 3.5}, {'test_case': {'title': 'The Godfather', 'imdb_id': 'tt0068646'}, 'out': '\n{"out" : "The Sopranos, Breaking Bad, Game of Thrones, House of Cards, Peaky Blinders"}\n', 'score': 8.5}, {'test_case': {'title': 'The Dark Knight', 'imdb_id': 'tt0468569'}, 'out': '\n{"out": "The Avengers", "Inception", "Interstellar", "Jurassic Park", "The Matrix"}\n', 'score': 1.0}, {'test_case': {'title': 'Inception', 'imdb_id': 'tt1375666'}, 'out': '\n{"out" : "Interstellar", "Memento", "Shutter Island", "Minority Report", "Looper"}\n', 'score': 3.5}, {'test_case': {'title': 'Forrest Gump', 'imdb_id': 'tt0109830'}, 'out': '\n{"out": "Shawshank Redemption, Schindler\'s List, Saving Private Ryan, Catch Me If You Can, The Shining"}\n', 'score

In [43]:
eval_results

[{'test_case': {'title': 'The Shawshank Redemption', 'imdb_id': 'tt0111161'},
  'out': '\n{"out" : "The Green Mile", "The Shawshank Redemption", "The Silence of the Lambs", "The Departed", "The Schindler\'s List"}\n',
  'score': 3.5},
 {'test_case': {'title': 'The Godfather', 'imdb_id': 'tt0068646'},
  'out': '\n{"out" : "The Sopranos, Breaking Bad, Game of Thrones, House of Cards, Peaky Blinders"}\n',
  'score': 8.5},
 {'test_case': {'title': 'The Dark Knight', 'imdb_id': 'tt0468569'},
  'out': '\n{"out": "The Avengers", "Inception", "Interstellar", "Jurassic Park", "The Matrix"}\n',
  'score': 1.0},
 {'test_case': {'title': 'Inception', 'imdb_id': 'tt1375666'},
  'out': '\n{"out" : "Interstellar", "Memento", "Shutter Island", "Minority Report", "Looper"}\n',
  'score': 3.5},
 {'test_case': {'title': 'Forrest Gump', 'imdb_id': 'tt0109830'},
  'out': '\n{"out": "Shawshank Redemption, Schindler\'s List, Saving Private Ryan, Catch Me If You Can, The Shining"}\n',
  'score': 6.0},
 {'test