A lightweight, zero-dependency Python framework for scoring and comparing LLM prompt outputs.
pip install git+https://github.com/hoeberigs/prompteval.git

Evaluating LLM outputs shouldn't require heavy ML frameworks or paid APIs. prompteval gives you a fast, composable toolkit to score, compare, and report on prompt performance — all in pure Python.
- Zero core dependencies — runs anywhere Python runs
- 10+ built-in scorers — exact match, regex, JSON validation, sentiment, fluency, and more
- Composable — combine scorers with weights, or write your own in one line
- Model-agnostic — plug in OpenAI, Anthropic, or any LLM via simple runner adapters
- Compare across models — side-by-side benchmarking with variance analysis
- Multiple report formats — ASCII tables, Markdown, JSON export
from prompteval import Evaluation, ExactMatch, LengthRange
ev = Evaluation(scorers=[ExactMatch(case_sensitive=False), LengthRange(1, 50)])
result = ev.run(
prompt="What is the capital of France?",
output="Paris",
expected="paris",
)
print(result.passed) # True
print(result.avg_score)  # 1.0

from prompteval import EvalSuite, ExactMatch, MockRunner, Report
suite = (
EvalSuite("geography")
.add("Capital of France?", expected="Paris")
.add("Capital of Japan?", expected="Tokyo")
.add("Capital of Brazil?", expected="Brasília")
)
runner = MockRunner(responses={
"Capital of France?": "Paris",
"Capital of Japan?": "Tokyo",
"Capital of Brazil?": "São Paulo",
})
results = suite.run(scorers=[ExactMatch()], runner=runner)
print(Report(results).to_table())

+---+----------------------+------+------------+-------+---------+
| # | Prompt | Pass | ExactMatch | Avg | Latency |
+---+----------------------+------+------------+-------+---------+
| 1 | Capital of France? | PASS | 1.00 | 1.000 | 0ms |
| 2 | Capital of Japan? | PASS | 1.00 | 1.000 | 0ms |
| 3 | Capital of Brazil? | FAIL | 0.00 | 0.000 | 0ms |
+---+----------------------+------+------------+-------+---------+
Total: 3 cases | Pass rate: 67% | Avg score: 0.667
from prompteval import Comparator
comp = Comparator()
result = comp.compare({
"gpt-4o": results_gpt4o,
"claude-sonnet": results_claude,
})
print(result.to_table())

| Scorer | What it checks |
|---|---|
| ExactMatch | Output equals expected (case-insensitive option) |
| Contains | Output contains all specified substrings |
| RegexMatch | Output matches a regex pattern |
| LengthRange | Output length within min/max characters |
| JsonValid | Valid JSON with optional required keys |
| SentimentPositive | Positive sentiment via keyword heuristics |
| Fluency | Sentence structure, vocabulary richness |
| CustomScorer | Wrap any (output, expected) -> float function |
| CompositeScorer | Weighted combination of multiple scorers |
from prompteval import CustomScorer
def politeness(output, expected=None, **kwargs):
polite = {"please", "thank", "thanks", "kindly"}
words = set(output.lower().split())
return min(1.0, len(words & polite) / 2)
scorer = CustomScorer(politeness, name="politeness")

# OpenAI
from prompteval import OpenAIRunner
runner = OpenAIRunner(model="gpt-4o-mini", temperature=0)
# Anthropic
from prompteval import AnthropicRunner
runner = AnthropicRunner(model="claude-sonnet-4-20250514")
# Mock (for testing)
from prompteval import MockRunner
runner = MockRunner(responses={"prompt": "response"})

Install provider extras:
pip install "git+https://github.com/hoeberigs/prompteval.git#egg=prompteval[openai]" # OpenAI support
pip install "git+https://github.com/hoeberigs/prompteval.git#egg=prompteval[anthropic]" # Anthropic support
pip install "git+https://github.com/hoeberigs/prompteval.git#egg=prompteval[all]"        # Both

report = Report(results)
report.to_table() # ASCII table
report.to_markdown() # Markdown table
report.to_json("out.json") # JSON file
report.to_dict()       # Python dict

git clone https://github.com/hoeberigs/prompteval.git
cd prompteval
python -m venv .venv && source .venv/bin/activate
pip install -e ".[dev]"
pytest tests/ -v

MIT