# Multi-Model Evaluation Pipeline

This notebook implements a comparative analysis pipeline that queries multiple LLM providers (OpenAI, Anthropic, Google, DeepSeek, Groq, Ollama) with the same prompt and uses a "Judge" model to rank the responses based on quality. This pattern demonstrates model interoperability and automated evaluation techniques.

In [None]:
# Import dependencies
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [None]:
# Load environment configuration (API keys) from a local .env file.
# override=True is passed so that, per python-dotenv's documented behaviour,
# values found in .env take precedence over variables already present in the
# process environment.
load_dotenv(override=True)

In [None]:
# Verify API Key Availability
# Map each provider's display name to the environment variable holding its key.
env_var_by_provider = {
    "OpenAI": 'OPENAI_API_KEY',
    "Anthropic": 'ANTHROPIC_API_KEY',
    "Google": 'GOOGLE_API_KEY',
    "DeepSeek": 'DEEPSEEK_API_KEY',
    "Groq": 'GROQ_API_KEY',
}
api_keys = {name: os.getenv(var) for name, var in env_var_by_provider.items()}

# Report only whether each key is present; the key values are never printed.
for name, value in api_keys.items():
    print(f"{name}: {'Configured' if value else 'Not Set'}")

In [None]:
# Generate Evaluation Question using a strong model
# The request is a single user message asking for the question text only.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
messages = [dict(role="user", content=request)]

# This client is reused by later cells under the name `openai`.
openai = OpenAI()
response = openai.chat.completions.create(messages=messages, model="gpt-4o")

# The first choice's message text becomes the shared evaluation question.
question = response.choices[0].message.content
print(f"Generated Question: {question}")

In [None]:
# Initialize result storage
# `competitors` and `answers` are parallel lists filled by the query cells below;
# re-running this cell empties both again.
competitors, answers = [], []

# Every competitor receives the generated question as a single user message.
messages = [dict(role="user", content=question)]

In [None]:
# Query OpenAI (GPT-4o-mini)
# Uses the `openai` client created in the question-generation cell.
model_name = "gpt-4o-mini"

completion = openai.chat.completions.create(messages=messages, model=model_name)
answer = completion.choices[0].message.content

# Render the answer, then record model and answer at matching list positions.
display(Markdown(f"**{model_name}**:\n{answer}"))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Query Anthropic (Claude 3.5 Sonnet)
# NOTE(review): this is a dated model snapshot; confirm it is still served before running.
model_name = "claude-3-5-sonnet-20240620"

if not os.getenv('ANTHROPIC_API_KEY'):
    # Without a key the provider is left out of the competition.
    print("Skipping Anthropic (Key not set)")
else:
    claude = Anthropic()
    # max_tokens is passed explicitly here; the other providers' calls set no such cap.
    response = claude.messages.create(
        max_tokens=1000,
        messages=messages,
        model=model_name,
    )
    # The answer text is read from the first content block of the reply.
    answer = response.content[0].text

    display(Markdown(f"**{model_name}**:\n{answer}"))
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# Query Google (Gemini 1.5 Flash)
# The OpenAI client is pointed at Google's endpoint through base_url.
# NOTE(review): confirm this model id is still served before running.
model_name = "gemini-1.5-flash"
google_api_key = os.getenv('GOOGLE_API_KEY')

if not google_api_key:
    print("Skipping Google (Key not set)")
else:
    gemini = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=google_api_key,
    )
    completion = gemini.chat.completions.create(messages=messages, model=model_name)
    answer = completion.choices[0].message.content

    display(Markdown(f"**{model_name}**:\n{answer}"))
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# Query DeepSeek
# The OpenAI client is reused with DeepSeek's base_url and API key.
model_name = "deepseek-chat"
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')

if deepseek_api_key:
    deepseek = OpenAI(base_url="https://api.deepseek.com/v1", api_key=deepseek_api_key)
    answer = (
        deepseek.chat.completions
        .create(model=model_name, messages=messages)
        .choices[0]
        .message.content
    )

    display(Markdown(f"**{model_name}**:\n{answer}"))
    competitors.append(model_name)
    answers.append(answer)
else:
    print("Skipping DeepSeek (Key not set)")

In [None]:
# Query Groq (Llama 3 70B)
# NOTE(review): confirm this model id is still served by Groq before running.
model_name = "llama3-70b-8192"

if not os.getenv('GROQ_API_KEY'):
    print("Skipping Groq (Key not set)")
else:
    # The OpenAI client is pointed at Groq's endpoint through base_url.
    groq = OpenAI(
        base_url="https://api.groq.com/openai/v1",
        api_key=os.getenv('GROQ_API_KEY'),
    )
    response = groq.chat.completions.create(messages=messages, model=model_name)
    answer = response.choices[0].message.content

    display(Markdown(f"**{model_name}**:\n{answer}"))
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# Query Local Ollama (Llama 3.2)
# Requires a local server started with 'ollama serve'. If the model has not been
# downloaded yet, run `ollama pull llama3.2` in a shell first.
try:
    # The api_key value is a placeholder string; the client is aimed at localhost.
    ollama = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1')
    model_name = "llama3.2"

    completion = ollama.chat.completions.create(messages=messages, model=model_name)
    answer = completion.choices[0].message.content

    display(Markdown(f"**{model_name}**:\n{answer}"))
    competitors.append(model_name)
    answers.append(answer)
except Exception as exc:
    # Best effort: any failure in the steps above just drops Ollama from the run.
    print(f"Skipping Ollama: {exc}")

In [None]:
# Aggregate Responses
# Concatenate every answer under a numbered heading. Numbers start at 1 and follow
# the order of `answers`, which is also the order of `competitors`.
together = "".join(
    f"# Response from competitor {number}\n\n" + text + "\n\n"
    for number, text in enumerate(answers, start=1)
)

print(f"Aggregated {len(answers)} responses for evaluation.")

In [None]:
# Construct Evaluation Prompt
# The prompt interpolates three values built in earlier cells: the number of
# competitors, the shared question, and `together` (all answers under numbered
# headings). Model names are not included, so the judge sees only the numbers.
# The doubled braces {{ }} are f-string escapes that produce literal braces in the
# JSON format example.
judge_prompt = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

# The whole prompt is sent to the judge as a single user message.
judge_messages = [{"role": "user", "content": judge_prompt}]

In [None]:
# Execute Evaluation (Judge: GPT-4o)
# The judge's reply is parsed as JSON by the next cell, so JSON mode is requested
# from the API instead of relying only on the prompt's "JSON only" instruction.
# JSON mode requires the prompt itself to mention JSON, which judge_prompt does.
openai = OpenAI()
response = openai.chat.completions.create(
    model="gpt-4o",
    messages=judge_messages,
    response_format={"type": "json_object"},
)

# Raw JSON text of the ranking, expected in the form {"results": [...]}.
results = response.choices[0].message.content
print(results)

In [None]:
# Parse and Display Rankings
def parse_ranking(raw, n_competitors):
    """Convert the judge's raw reply into zero-based competitor indices, best first.

    Args:
        raw: Text returned by the judge. It must contain a JSON object of the
            form {"results": [...]} whose entries are 1-based competitor numbers
            (integers or numeric strings).
        n_competitors: How many competitors were judged.

    Returns:
        A list of zero-based indices into the competitors list, in ranked order.

    Raises:
        ValueError: If the reply is empty, contains no JSON object, is malformed
            JSON, or holds an entry that is not a number in 1..n_competitors.
        KeyError: If the JSON object has no "results" key.
    """
    if not raw:
        raise ValueError("Judge returned an empty response")

    # The reply may arrive wrapped in a markdown fence or with stray text around
    # it, so only the outermost {...} span is parsed.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in judge response: {raw!r}")
    ranked = json.loads(raw[start:end + 1])["results"]

    indices = []
    for entry in ranked:
        number = int(entry)
        # Reject 0 and negative numbers explicitly: competitors[number - 1] would
        # otherwise wrap around and silently report the wrong model.
        if not 1 <= number <= n_competitors:
            raise ValueError(
                f"Judge returned competitor number {number}, expected 1..{n_competitors}"
            )
        indices.append(number - 1)
    return indices


ranks = parse_ranking(results, len(competitors))
for position, competitor_index in enumerate(ranks, start=1):
    print(f"Rank {position}: {competitors[competitor_index]}")