In [1]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import litellm
import os
import pandas as pd
import json
import re
import copy
import numpy as np
import requests

In [2]:
# set the env for LLMs
# The OpenRouter key is read from the process environment or from a local .env
# file instead of being written into the notebook, so the secret never ends up
# in version control or in a shared copy of this file.
from dotenv import load_dotenv

# load variables from a .env file if one is found
load_dotenv()

# fail early with a clear message instead of a provider-side auth error later
if not os.environ.get("OPENROUTER_API_KEY"):
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in your shell or add "
        "OPENROUTER_API_KEY=<your key> to a .env file next to this notebook."
    )

In [3]:
# smoke test: send one prompt to each model to confirm the API key works
messages = [
    {
        "role": "user",
        "content": "Write a founder-style tweet about launching a project that unexpectedly went viral.",
    }
]

# request settings shared by the three smoke-test calls below
smoke_test_params = {"messages": messages, "temperature": 0.7, "max_tokens": 50}

# 1. GPT-3.5 via OpenRouter
response_gpt = litellm.completion(
    model="openrouter/openai/gpt-3.5-turbo", **smoke_test_params
)

# 2. Cohere's Command R+ via OpenRouter
response_commandr = litellm.completion(
    model="openrouter/cohere/command-r-plus", **smoke_test_params
)

# 3. Mistral-7B via OpenRouter
response_mistral = litellm.completion(
    model="openrouter/mistralai/mistral-7b-instruct", **smoke_test_params
)

In [4]:
# helper that digs the generated text out of a raw completion response
def extract_message(response):
    """
    Return the generated text held in a chat-completion response.

    Args:
        response: Mapping-like object exposing
            ``response['choices'][i]['message']['content']``.

    Returns:
        The content itself when the response has exactly one choice;
        otherwise a list with the content of every choice (an empty list
        when there are no choices).
    """
    texts = []
    for choice in response['choices']:
        texts.append(choice['message']['content'])

    if len(texts) == 1:
        return texts[0]
    return texts

In [5]:
# show the text each model returned for the smoke-test prompt
for heading, reply in (
    ('\nGPT-3.5:\n', response_gpt),
    ('\nCOHERE:\n', response_commandr),
    ('\nMISTRAL:\n', response_mistral),
):
    print(heading, extract_message(reply))


GPT-3.5:
 Just launched our project and the response has been INSANE! 😱🚀 Thank you to everyone who has shown us love and support. Let's keep this momentum going! #viral #startuplife #grateful

COHERE:
 "Humbled and amazed by the incredible response to our project launch! We built something special and the world took notice. This is just the beginning - so much more to come. Thank you all for your support and belief in our vision. Let's

MISTRAL:
  🌐💥 Just dropped our little project into the digital world & whoa, it's taken off like a rocket! 🚀 gratitude to each one of you who've embraced our vision. Your support


In [6]:
# a wrapper to run a blocking function asynchronously
async def execute_async(task, *args, **kwargs):
    """
    Runs a synchronous (blocking) function in a worker thread and awaits it.

    Args:
        task: The blocking function to execute.
        *args: Positional arguments to pass to the task.
        **kwargs: Keyword arguments to pass to the task.

    Returns:
        The value returned by ``task(*args, **kwargs)``.

    Raises:
        Whatever ``task`` raises; the exception is re-raised at the await.

    Design choice:
        - each call gets its own single-worker ThreadPoolExecutor, so every
          coroutine awaited concurrently (e.g. via asyncio.gather) has a
          dedicated thread and the event loop itself is never blocked by
          the task.
        - asyncio.get_running_loop() is used because this code always runs
          inside a coroutine; it is the call the asyncio docs recommend
          over get_event_loop() in that situation.

    Tradeoff:
        - Threads use more memory than async tasks, but allow reuse of
          blocking code without modification.
    """
    loop = asyncio.get_running_loop()
    # only one callable is ever submitted, so one worker is all that is needed
    with ThreadPoolExecutor(max_workers=1) as executor:
        return await loop.run_in_executor(executor, lambda: task(*args, **kwargs))

# asynchronously run tasks in batches, so we can scale
async def batch_runner(task_fn, inputs, batch_size=1, **kwargs):
    """
    Executes a task function over a collection of inputs in asynchronous batches.

    Args:
        task_fn: The function to execute on each input (should be blocking).
            It is called as ``task_fn(messages=item, **kwargs)``.
        inputs: The inputs to process. Any iterable is accepted; it is
            copied into a list so it can be counted and sliced.
        batch_size: Number of tasks to run concurrently in each batch.
            Must be at least 1.
        **kwargs: Additional keyword arguments passed to each call of task_fn.

    Returns:
        A flat list with one result per input, in input order.

    Raises:
        ValueError: If batch_size is smaller than 1.
        Any exception raised by task_fn propagates out of the batch it
        occurred in; results from earlier batches are not returned.

    Design choice:
        - Batching caps how many calls are in flight at once, which avoids
          overwhelming system resources or APIs.
        - asyncio.gather runs the wrapped blocking calls of one batch
          concurrently; the next batch starts only after the whole batch
          has finished.

    Tradeoff:
        - too large a batch_size may overload CPU/network; too small will
          underutilize resources.
    """
    # a negative size would make range() empty and silently return nothing,
    # and zero makes range() raise an unrelated-looking error
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    # materialise once so generators and other iterables can be sliced
    inputs = list(inputs)
    output = []
    total = len(inputs)

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        print(f"Running batch {start + 1}-{end} of {total}")
        batch = inputs[start:end]
        batch_results = await asyncio.gather(*[
            execute_async(task_fn, messages=item, **kwargs) for item in batch
        ])
        output.extend(batch_results)

    return output

In [7]:
# let's load the benchmark questions
# The context manager closes the file handle once parsing is done, and the
# explicit UTF-8 encoding keeps the emoji in the reference answers readable
# regardless of the platform's default encoding.
with open('vibe_check.json', 'r', encoding='utf-8') as questions_file:
    vibe_questions = json.load(questions_file)

In [8]:
# wrap every benchmark question as a single-turn chat conversation:
# each entry is a list holding one dict with 'role' and 'content'
messages = []
for entry in vibe_questions:
    user_turn = {"role": "user", "content": entry['question']}
    messages.append([user_turn])

In [9]:
model_list = [
    "openrouter/openai/gpt-3.5-turbo",
    "openrouter/cohere/command-r-plus",
    "openrouter/mistralai/mistral-7b-instruct",
]

# dict to store all outputs
all_model_outputs = {}

for model_name in model_list:
    print(f"\n🔍 Evaluating model: {model_name}")

    # run the LLM in batches using async runner
    raw_outputs = await batch_runner(
        task_fn=litellm.completion,
        inputs=messages,
        batch_size=10,
        model=model_name,
        temperature=0,
        max_tokens=2048
    )

    # copy original questions to avoid mutation
    eval_data = copy.deepcopy(vibe_questions)

    # insert model answers
    for i, item in enumerate(eval_data):
        item["model_answer"] = extract_message(raw_outputs[i])
        item["score"] = ""  # we add a placeholder for future scoring

    # save results to JSON file
    filename = f"./outputs/answers-{model_name.split('/')[-1]}.json"
    pd.DataFrame(eval_data).set_index('index').to_json(filename, orient="index")

    # Store in dictionary for further use
    all_model_outputs[model_name] = pd.read_json(filename, orient='index')


🔍 Evaluating model: openrouter/openai/gpt-3.5-turbo
Running batch 1-10 of 10

🔍 Evaluating model: openrouter/cohere/command-r-plus
Running batch 1-10 of 10

🔍 Evaluating model: openrouter/mistralai/mistral-7b-instruct
Running batch 1-10 of 10


In [10]:
# all_model_outputs maps each model identifier to its DataFrame of answers;
# list the identifiers that were collected
model_keys = all_model_outputs.keys()
print(model_keys)

dict_keys(['openrouter/openai/gpt-3.5-turbo', 'openrouter/cohere/command-r-plus', 'openrouter/mistralai/mistral-7b-instruct'])


In [11]:
# preview the GPT-3.5 answers; the `score` column is still a placeholder
# here and is filled in by the judging step later on
gpt35_answers = all_model_outputs['openrouter/openai/gpt-3.5-turbo']
gpt35_answers.head()

Unnamed: 0,category,question,human_answer,model_answer,score
1,Tone Match,Rewrite this sentence in a Gen-Z tone: 'We are...,"'Bruh the servers dipped 💀 hold up, we’re fixi...","Sorry y'all, we're dealing with some tech issu...",
2,Politeness Check,"Your boss says, 'We’ll need that report by end...","'Absolutely, I’ll make sure it’s ready by then...","Of course, I will make sure to have the report...",
3,Social Media Vibe,Write a founder-style tweet announcing a side ...,'Launched a tiny tool over the weekend. No mar...,🚀 Exciting news! What started as a passion pro...,
4,Aesthetic Tone,Describe a living room with a brutalist vibe.,"'Exposed concrete walls, steel beams, sharp an...","The living room is spacious and minimalist, wi...",
5,Flirt vs Friendly,Someone texts you: 'You looked really good tod...,'Omg you noticed? 😳 maybe I should dress like ...,"Thank you, but I always look good 😉",


In [12]:
def format_eval_prompt(question, ideal_answer, generated_answer):
    """
    Build the prompt that asks a judge model to grade one candidate answer.

    Args:
        question: The original benchmark request shown to the model.
        ideal_answer: The reference (human) answer to compare against.
        generated_answer: The model answer being graded.

    Returns:
        A single string made of three parts separated by blank lines: the
        scoring guide, the request/reference/candidate texts, and the task
        instructions ending with an example ``{"score": 80}`` object.
    """
    rubric = (
        "VIBE CHECK SCORING GUIDE\n"
        "100%: Nailed the exact tone, vibe, and intent — no mismatch at all.\n"
        "80%: Captures the overall vibe with small tone or style mismatches.\n"
        "60%: Mostly matches the vibe, but noticeably off in phrasing or energy.\n"
        "40%: Some effort, but the vibe is clearly off or inconsistent.\n"
        "20%: Minimal vibe alignment — feels generic or mismatched.\n"
        "0%: Completely misses the vibe or uses the wrong tone entirely.\n"
    )
    exhibits = (
        f"ORIGINAL REQUEST (Vibe Prompt):\n{question}\n"
        "\n"
        f"REFERENCE RESPONSE (Perfect Vibe):\n{ideal_answer}\n"
        "\n"
        f"CANDIDATE RESPONSE:\n{generated_answer}\n"
    )
    task = (
        "TASK:\n"
        "Does the CANDIDATE RESPONSE match the vibe and intent of the REFERENCE RESPONSE? "
        "Use the VIBE CHECK SCORING GUIDE. Explain your reasoning, then return only a "
        "JSON object with the final score percentage like:\n"
        '{"score": 80}\n'
    )
    # every part already ends with a newline, so joining on "\n" yields the
    # blank line between parts
    return "\n".join((rubric, exhibits, task))


In [13]:
# pulls a json object from llm output string
def find_json_score(text):
    """
    Extract the judge's score object from free-form LLM output.

    The text is scanned for ``{...}`` spans. Percent signs are stripped
    from each span before parsing, so ``{"score": 80%}`` is accepted.

    Args:
        text: The raw text returned by the judge model.

    Returns:
        The first parsed JSON object that has a "score" key. If none has
        one, the first JSON object that parsed at all. ``None`` when the
        text is not a string or contains no parseable object.

    Notes:
        - Only objects are returned, never arrays, so callers can safely
          call ``.get("score")`` on any non-None result.
        - The non-greedy pattern stops at the first closing brace, so
          nested objects are not supported.
    """
    # a missing or multi-choice completion is not a string; nothing to parse
    if not isinstance(text, str):
        return None

    pattern = re.compile(r'\{.*?\}', re.DOTALL)
    first_object = None
    for block in pattern.findall(text):
        try:
            parsed = json.loads(block.replace('%', ''))
        except json.JSONDecodeError:
            continue
        if "score" in parsed:
            return parsed
        # remember the earliest object as a fallback
        if first_object is None:
            first_object = parsed
    return first_object
def query_judge_llm(messages: list, model='openrouter/openai/gpt-3.5-turbo', max_tokens=1000, temperature=0, n=1):
    """
    Ask the judge model for a completion of one chat conversation.

    This is a thin wrapper around ``litellm.completion`` that pins the
    judge's default model and sampling settings.

    Args:
        messages: A list of message dictionaries in chat format.
        model: The LLM identifier to use.
        max_tokens: Maximum tokens to generate.
        temperature: Sampling temperature.
        n: Number of completions to generate.

    Returns:
        The raw response object from ``litellm.completion``.
    """
    request = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "n": n,
    }
    return litellm.completion(**request)

In [14]:
from tqdm import tqdm

scored_outputs = {}

for model_id in model_list:
    print(f"\n🔍 Evaluating completions from: {model_id}")
    
    model_answers = all_model_outputs[model_id]['model_answer']
    prompts_to_score = []

    # design choice: create custom evaluation prompts per question
    # this allows the judge model to compare generated answers to human answers.
    # we assume model_answers[1] corresponds to q[0], so we offset index by +1.
    # Build evaluation prompts for each question
    for i, q in enumerate(vibe_questions):
        prompt = format_eval_prompt(
            question=q["question"],
            ideal_answer=q["human_answer"],
            generated_answer=model_answers[i + 1]  # assuming index starts at 1
        )
        prompts_to_score.append([{"role": "user", "content": prompt}])

    # use batch_runner for scalable, async evaluation
    # we run up to 30 prompts in parallel to improve throughput.
    # query_judge_llm is a blocking function like an openai call), so batch_runner handles threading.
    # Run eval LLM to judge quality
    evaluation_outputs = await batch_runner(
        task_fn=query_judge_llm,
        inputs=prompts_to_score,
        batch_size=30,
        model="openrouter/openai/gpt-3.5-turbo",  # judge model
        max_tokens=4000,
        temperature=0,
    )

    # Extract score from each response
    scores = []
    for response in evaluation_outputs:
        result_text = extract_message(response)
        score_obj = find_json_score(result_text)
        scores.append(score_obj.get("score") if score_obj else None)

    # store scores per model_id
    # this makes it easy to compare different models later.
    scored_outputs[model_id] = scores


🔍 Evaluating completions from: openrouter/openai/gpt-3.5-turbo
Running batch 1-10 of 10

🔍 Evaluating completions from: openrouter/cohere/command-r-plus
Running batch 1-10 of 10

🔍 Evaluating completions from: openrouter/mistralai/mistral-7b-instruct
Running batch 1-10 of 10


In [15]:
# display the judge scores: one list per model, in benchmark-question order
scored_outputs

{'openrouter/openai/gpt-3.5-turbo': [60, 100, 80, 80, 60, 80, 100, 80, 80, 80],
 'openrouter/cohere/command-r-plus': [60,
  80,
  60,
  80,
  80,
  80,
  100,
  80,
  100,
  60],
 'openrouter/mistralai/mistral-7b-instruct': [80,
  80,
  80,
  80,
  80,
  80,
  100,
  80,
  80,
  60]}

In [16]:
# post-processing step: write the judge scores back into each model's answers file
for model_id, scores in scored_outputs.items():
    # last path segment of the identifier, e.g. "gpt-3.5-turbo"
    short_name = model_id.rsplit('/', 1)[-1]
    answers_path = f'./outputs/answers-{short_name}.json'

    # load the saved answers and replace the placeholder score column;
    # scores are matched to rows by position
    scored_answers = pd.read_json(answers_path, orient='index').assign(score=scores)

    # overwrite the same file with the scored version
    scored_answers.to_json(answers_path, orient='index')

    print(f"✅ Updated scores for {short_name}")

✅ Updated scores for gpt-3.5-turbo
✅ Updated scores for command-r-plus
✅ Updated scores for mistral-7b-instruct


In [18]:
# note: these paths could be derived from the model list automatically,
# but for this assignment it's faster and clearer to spell them out.
model_files = {
    "Cohere": "outputs/answers-command-r-plus.json",
    "GPT-3.5": "outputs/answers-gpt-3.5-turbo.json",
    "Mistral": "outputs/answers-mistral-7b-instruct.json"
}

# divider printed after every sample
divider = "-" * 80

for display_name, answers_file in model_files.items():
    print(f"\n🔍 {display_name} — Sample of Evaluated Completions\n")

    # only the first 5 rows are shown to keep the output short
    preview = pd.read_json(answers_file, orient="index").head(5)

    # category and score fall back to 'N/A' when the column is missing
    for _, record in preview.iterrows():
        print(f"Category     : {record.get('category', 'N/A')}")
        print(f"Question     : {record['question']}")
        print(f"Human Answer : {record['human_answer']}")
        print(f"Model Answer : {record['model_answer']}")
        print(f"Score        : {record.get('score', 'N/A')}")
        print(divider)


🔍 Cohere — Sample of Evaluated Completions

Category     : Tone Match
Question     : Rewrite this sentence in a Gen-Z tone: 'We are currently experiencing technical difficulties.'
Human Answer : 'Bruh the servers dipped 💀 hold up, we’re fixing it.'
Model Answer : Oops, we're having some tech issues right now. Our bad!
Score        : 60
--------------------------------------------------------------------------------
Category     : Politeness Check
Question     : Your boss says, 'We’ll need that report by end of day.' Respond with a polite and professional tone.
Human Answer : 'Absolutely, I’ll make sure it’s ready by then. Let me know if you need anything else.'
Model Answer : Of course! I will have the report ready and on your desk before the end of the business day.
Score        : 80
--------------------------------------------------------------------------------
Category     : Social Media Vibe
Question     : Write a founder-style tweet announcing a side project that got unexpectedl