In [1]:
from core import github_client

def build_evaluation_suite(repo_full_name: str, state: str = 'open', limit: int = 10):
    """Print a candidate list of PR numbers that can be pasted into a test suite.

    Iterates over the pull requests of ``repo_full_name`` in the given ``state``,
    skipping those authored by ``github-actions[bot]``, and prints each selected
    PR followed by an example ``{repo_full_name: [pr_numbers]}`` config.

    Args:
        repo_full_name: Repository in ``owner/name`` form, passed to
            ``github_client.get_repo``.
        state: PR state filter forwarded to ``repo.get_pulls``.
        limit: Maximum number of PRs to collect. Bot-authored PRs that are
            skipped do not count toward the limit.

    Returns:
        None. Results are printed only.
    """
    print(f"Building evaluation suite for {repo_full_name} with PRs in state {state}")
    repo = github_client.get_repo(repo_full_name)

    pr_numbers = []

    for pr in repo.get_pulls(state=state):
        # Cap on the number of PRs actually collected rather than on the
        # iteration index: the index also counts skipped bot PRs and, being
        # zero-based, let one PR too many through.
        if len(pr_numbers) >= limit:
            print(f"\tReached limit of {limit} PRs")
            break

        if pr.user.login == "github-actions[bot]":
            continue

        print(f"\tPR #{pr.number}, {pr.user.login}: {pr.title}")
        pr_numbers.append(pr.number)

    print("\tExample config: {}\n".format({
        repo_full_name: pr_numbers
    }))
    

# Example usage
# build_evaluation_suite("locustio/locust")
# build_evaluation_suite("pandas-dev/pandas")
# build_evaluation_suite("ktrnka/update-your-readme", state='closed', limit=10)


In [2]:
import difflib

def diff_readmes(original_readme: str, updated_readme: str):
    """Count the lines added and removed between two README texts.

    Both texts are split into lines and compared with ``difflib.unified_diff``.

    Args:
        original_readme: The README text before the change.
        updated_readme: The README text after the change.

    Returns:
        A ``(lines_added, lines_removed)`` tuple of ints. Identical inputs
        yield ``(0, 0)``.
    """
    diff = difflib.unified_diff(
        original_readme.splitlines(),
        updated_readme.splitlines(),
    )
    lines_added = 0
    lines_removed = 0
    in_hunk = False
    for line in diff:
        if line.startswith('@@'):
            # The '---' / '+++' file headers only appear before the first hunk
            # header, so everything from here on is diff content.
            in_hunk = True
        elif not in_hunk:
            continue
        elif line.startswith('+'):
            # Counted by position rather than by excluding a '+++' prefix, so an
            # added line whose text starts with '++' is still counted.
            lines_added += 1
        elif line.startswith('-'):
            # Likewise a removed markdown rule ('---') shows up as '----' and
            # must be counted as a removal.
            lines_removed += 1
    return lines_added, lines_removed

# Test: appending a single line should be reported as one added line and
# zero removed lines, i.e. the expected result is (1, 0).

diff_readmes("Hello\nWorld", "Hello\nWorld\nGoodbye")



(1, 0)

In [3]:
# Fixed evaluation sets, keyed by repository full name; each value is the list
# of pull request numbers to review for that repository.
# Medium suite: 11 PRs across two repositories.
medium_test_suite = {
    'locustio/locust': [2899, 2856, 2820, 2786],
    'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
}

# Small suite: two PRs from a single repository, for quicker runs.
small_test_suite = {
    'ktrnka/update-your-readme': [41, 40],
}

In [4]:
from typing import NamedTuple, Optional, Tuple
from core import ReadmeRecommendation, review_pull_request
from time import time

class SingleOutcome(NamedTuple):
    """Record of one pull request review attempt in the evaluation run."""
    # Recommendation returned by review_pull_request, or None when the review raised.
    result: Optional[ReadmeRecommendation]
    # The ValueError caught during the review, or None when it succeeded.
    error: Optional[ValueError]
    # Elapsed wall-clock seconds for this PR, measured with time().
    seconds: float
    # (lines_added, lines_removed) from diff_readmes when an update was recommended; otherwise None.
    diff: Optional[Tuple]

# Suite to evaluate in this run.
test_suite = medium_test_suite

# Maps (repo_name, pr_number) -> SingleOutcome. Kept at notebook scope so later
# cells can inspect individual results after the run.
outcomes = {}
for repo_name, pr_numbers in test_suite.items():
    print(f"Testing {repo_name}...")
    for pr_number in pr_numbers:
        print(f"\tTesting PR #{pr_number}...")

        # The timed region covers the GitHub lookups as well as the review itself.
        start_time = time()

        try:
            repo = github_client.get_repo(repo_name)
            pr = repo.get_pull(pr_number)

            # Get the base README (as of the PR's base commit) to diff against
            base_readme = repo.get_contents("README.md", ref=pr.base.sha).decoded_content.decode()

            result = review_pull_request(repo, pr)

            # Only compute a diff when the model actually proposes a new README
            diff_results = None
            if result.should_update:
                diff_results = diff_readmes(base_readme, result.updated_readme)

            outcomes[(repo_name, pr_number)] = SingleOutcome(result, None, time() - start_time, diff_results)
        except ValueError as e:
            # Record the failure and keep going so one bad PR does not abort the run
            outcomes[(repo_name, pr_number)] = SingleOutcome(None, e, time() - start_time, None)

# summarize the results
num_outcomes = len(outcomes)
num_failed = sum(1 for outcome in outcomes.values() if outcome.result is None)
total_runtime = sum(outcome.seconds for outcome in outcomes.values())
# Guard the ratios so an empty test suite reports zeros instead of raising ZeroDivisionError
percent_failed = num_failed / num_outcomes if num_outcomes else 0.0
mean_runtime = total_runtime / num_outcomes if num_outcomes else 0.0

print(f"""
Tested against {len(outcomes)} PRs in {len(test_suite)} repos.

{percent_failed:.0%} failed.
Total runtime: {total_runtime:.0f}s
Mean runtime per PR: {mean_runtime:.0f}s
""")

Testing locustio/locust...
	Testing PR #2899...
	Testing PR #2856...
	Testing PR #2820...
	Testing PR #2786...
Testing ktrnka/update-your-readme...
	Testing PR #50...
	Testing PR #49...
	Testing PR #46...
	Testing PR #44...
	Testing PR #43...
	Testing PR #41...
	Testing PR #40...

Tested against 11 PRs in 2 repos.

36% failed.
Total runtime: 99s
Mean runtime per PR: 9s



In [5]:
from pprint import pprint

# Review errors: list every PR whose review raised, with its runtime and the
# captured exception message.
for key, single in outcomes.items():
    if not single.error:
        continue
    print(f"{key}: {single.result} in {single.seconds:.0f}s")
    print(f"\tError: {single.error}")
# To dig into one failure in detail, e.g.:
# pprint(outcomes[('ktrnka/update-your-readme', 41)].error.json())

('locustio/locust', 2856): None in 12s
	Error: 1 validation error for ReadmeRecommendation
  Value error, updated_readme must be provided if should_update is True [type=value_error, input_value={'should_update': True, '...r direction for users.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/value_error
('locustio/locust', 2786): None in 12s
	Error: 1 validation error for ReadmeRecommendation
  Value error, updated_readme must be provided if should_update is True [type=value_error, input_value={'should_update': True, '...ion these new options.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/value_error
('ktrnka/update-your-readme', 50): None in 10s
	Error: 1 validation error for ReadmeRecommendation
  Value error, updated_readme must be provided if should_update is True [type=value_error, input_value={'should_update': True, '...tructure and features.'}, input_type=dict]
    For further information visit htt

In [8]:
from pprint import pprint

# Review non-errors: for every PR that produced a recommendation, show the
# decision and runtime; when an update was recommended, also show the reason,
# the diff counts and (if anything changed) the proposed README.
for key, single in outcomes.items():
    recommendation = single.result
    if not recommendation:
        continue

    print(f"""
# {key}
Should update? {recommendation.should_update} in {single.seconds:.0f}s
""")
    if not recommendation.should_update:
        continue

    print(f"""Reason: {recommendation.reason}\nDiff: +{single.diff[0]}, -{single.diff[1]}""")
    changed_lines = sum(single.diff)
    if changed_lines > 0:
        print(f"""Updated README: \n{recommendation.updated_readme}""")



# ('locustio/locust', 2899)
Should update? False in 4s


# ('locustio/locust', 2820)
Should update? False in 4s


# ('ktrnka/update-your-readme', 49)
Should update? True in 8s

Reason: The pull request includes a change to the core.py file, which is part of the project structure. This change should be reflected in the README to keep it up-to-date and accurate.
Diff: +0, -0

# ('ktrnka/update-your-readme', 46)
Should update? True in 10s

Reason: The existing README could be improved with some additional details and formatting to enhance readability and usability. The pull request changes indicate that the project structure and GitHub Actions workflows have been updated, which should be reflected in the README.
Diff: +0, -0

# ('ktrnka/update-your-readme', 44)
Should update? True in 8s

Reason: The README should be updated to reflect the new changes, including the addition of the `main.py` file in the `src` directory.
Diff: +2, -1
Updated README: 
# Update Your README

This project auto

# Monday afternoon

I learned the hard way that something about the ChatPromptTemplate was disabling the Python variables in the Human step,
so it was generating the README purely from the guidelines.

## After the fix:

    Tested against 11 PRs in 2 repos.

    36% failed.
    Total runtime: 91s
    Mean runtime per PR: 8s

The failed cases return should_update=True with no updated_readme. I saw in the raw output that one of the really bad subtractions just had "[rest of readme the same]" or some such at the bottom.

I'm going to re-run this to be sure:


# Dev log: Monday

Refactor:
- More controlled experiment (multiple repos, build a fixed set of PRs ahead of time)
- Track failure rate and execution time
- Hold onto any objects for ad hoc analysis after running

Test suites:

    medium_test_suite = {
        'locustio/locust': [2899, 2856, 2820, 2786],
        'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
    }

    small_test_suite = {
        'ktrnka/update-your-readme': [41, 40],
    }

## Medium test suite

### Baseline with Haiku, before removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 135s
    Mean runtime per PR: 12s

### After removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 129s
    Mean runtime per PR: 12s

It's slightly faster but not a lot. I'll keep the change though.

### Adding prompt caching

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 54s
    Mean runtime per PR: 5s

Notes
- It throws an annoying warning "extra_headers was transferred to model_kwargs" but that's what the docs show: https://api.python.langchain.com/en/latest/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html
- The speedup is wonderful! That's what I'd hoped for
- The 0% failure rate is surprising. It's possible that it's a result of needing to refactor to use the SystemMessage vs HumanMessage

I'm going to re-run this without any changes because I kind of don't even believe that we have no errors now:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 53s
    Mean runtime per PR: 5s

Huh

## Small test suite

### Baseline test

    Tested against 2 PRs in 1 repos.

    50% failed.
    Total runtime: 55s
    Mean runtime per PR: 28s

### With Claude 3 Haiku

    Tested against 2 PRs in 1 repos.

    0% failed.
    Total runtime: 22s
    Mean runtime per PR: 11s



# Dev log: Sunday

## Before prompt engineering, running on Locust
Counter({'ValidationError': 3, 'should_update': 1})

## Stronger guidance in the prompt itself, like the Pydantic field descriptions and how they're mentioned in the prompt itself
Counter({'ValidationError': 4})
Counter({'ValidationError': 4})
Counter({'ValidationError': 2, 'should_update': 1, 'no_update': 1})

## Retries
Counter({'ValidationError': 3, 'should_update': 1})

## Prompt updates, Pydantic model updates
Counter({'should_update': 3, 'ValueError': 1})

