In [1]:
from core import github_client

def build_evaluation_suite(repo_full_name: str, state: str = 'open', limit: int = 10):
    """
    Print and return a candidate test-suite entry for one repository.

    Walks the pull requests of ``repo_full_name`` that are in ``state``, skips
    those authored by ``github-actions[bot]``, and collects PR numbers until
    ``limit`` of them have been gathered.

    Args:
        repo_full_name: Repository identifier passed to ``github_client.get_repo``,
            e.g. "locustio/locust".
        state: Pull-request state filter forwarded to ``repo.get_pulls``.
        limit: Maximum number of PR numbers to collect. Values <= 0 collect nothing.

    Returns:
        A single-entry dict ``{repo_full_name: [pr_number, ...]}``, i.e. the same
        shape as the hand-written test suites in this notebook.
    """
    print(f"Building evaluation suite for {repo_full_name} with PRs in state {state}")
    repo = github_client.get_repo(repo_full_name)

    pr_numbers = []

    # Skip the listing call entirely when nothing may be collected.
    pulls = repo.get_pulls(state=state) if limit > 0 else ()

    for pr in pulls:
        if pr.user.login == "github-actions[bot]":
            continue

        print(f"\tPR #{pr.number}, {pr.user.login}: {pr.title}")
        pr_numbers.append(pr.number)

        # Count collected PRs rather than the iteration index: the index starts
        # at 0 and also advances for skipped bot PRs, which let the old check
        # overshoot the limit (or never fire when a bot PR sat on the boundary).
        if len(pr_numbers) >= limit:
            print(f"\tReached limit of {limit} PRs")
            break

    config = {
        repo_full_name: pr_numbers
    }
    print("\tExample config: {}\n".format(config))
    return config
    

# Example usage
# build_evaluation_suite("locustio/locust")
# build_evaluation_suite("pandas-dev/pandas")
# build_evaluation_suite("ktrnka/update-your-readme", state='closed', limit=10)


In [2]:
# Fixed evaluation sets: each maps an "owner/repo" name to the pull request
# numbers that are reviewed for it, so repeated runs see the same inputs.

# Two PRs from a single repo, both of which also appear in the medium suite.
small_test_suite = {
    "ktrnka/update-your-readme": [41, 40],
}

# Eleven PRs spread over two repos.
medium_test_suite = {
    "locustio/locust": [2899, 2856, 2820, 2786],
    "ktrnka/update-your-readme": [50, 49, 46, 44, 43, 41, 40],
}

In [4]:
from typing import NamedTuple, Optional
from core import ReadmeRecommendation, review_pull_request
from time import time

class SingleOutcome(NamedTuple):
    """Outcome of running the reviewer against a single pull request."""
    # Value returned by review_pull_request; None when the run raised ValueError.
    result: Optional[ReadmeRecommendation]
    # The ValueError caught during the run; None when the run completed.
    error: Optional[ValueError]
    # Elapsed time.time() seconds for this PR, measured by the evaluation loop.
    seconds: float

# Suite to evaluate; swap in small_test_suite for a quicker run.
test_suite = medium_test_suite

# Maps (repo_name, pr_number) -> SingleOutcome. Kept at module level so the
# individual results can be inspected after the cell has run.
outcomes = {}
for repo_name, pr_numbers in test_suite.items():
    print(f"Testing {repo_name}...")
    for pr_number in pr_numbers:
        print(f"\tTesting PR #{pr_number}...")

        # The timed span covers the GitHub fetches as well as the review call.
        start_time = time()

        try:
            repo = github_client.get_repo(repo_name)
            pr = repo.get_pull(pr_number)
            result = review_pull_request(repo, pr)
            outcomes[(repo_name, pr_number)] = SingleOutcome(result, None, time() - start_time)
        except ValueError as e:
            # Only ValueError is recorded as a failed outcome; any other
            # exception still propagates and aborts the run.
            outcomes[(repo_name, pr_number)] = SingleOutcome(None, e, time() - start_time)

# summarize the results
num_outcomes = len(outcomes)
num_failed = sum(1 for outcome in outcomes.values() if outcome.result is None)
total_runtime = sum(outcome.seconds for outcome in outcomes.values())

# Guard the divisions: an empty suite (or one whose repos list no PRs) would
# otherwise raise ZeroDivisionError instead of printing a summary.
percent_failed = num_failed / num_outcomes if num_outcomes else 0.0
mean_runtime = total_runtime / num_outcomes if num_outcomes else 0.0

print(f"""
Tested against {len(outcomes)} PRs in {len(test_suite)} repos.

{percent_failed:.0%} failed.
Total runtime: {total_runtime:.0f}s
Mean runtime per PR: {mean_runtime:.0f}s
""")

Testing locustio/locust...
	Testing PR #2899...
	Testing PR #2856...
	Testing PR #2820...
	Testing PR #2786...
Testing ktrnka/update-your-readme...
	Testing PR #50...
	Testing PR #49...
	Testing PR #46...
	Testing PR #44...
	Testing PR #43...
	Testing PR #41...
	Testing PR #40...

Tested against 11 PRs in 2 repos.

0% failed.
Total runtime: 57s
Mean runtime per PR: 5s



In [4]:
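            # Ad hoc inspection (needs `import difflib`): print a unified diff between
            # the repo's current README and the README proposed in `result`.
            # diff = difflib.unified_diff(
            #     repo.get_readme().decoded_content.decode("utf-8").splitlines(),
            #     result.updated_readme.splitlines(),
            # )
            # for line in diff:
            #     print(line)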

# Dev log: Monday

Refactor:
- More controlled experiment (multiple repos, build a fixed set of PRs ahead of time)
- Track failure rate and execution time
- Hold onto any objects for ad hoc analysis after running

Test suites:

    medium_test_suite = {
        'locustio/locust': [2899, 2856, 2820, 2786],
        'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
    }

    small_test_suite = {
        'ktrnka/update-your-readme': [41, 40],
    }

## Medium test suite

### Baseline with Haiku, before removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 135s
    Mean runtime per PR: 12s

### After removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 129s
    Mean runtime per PR: 12s

It's slightly faster but not a lot. I'll keep the change though.

### Adding prompt caching

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 54s
    Mean runtime per PR: 5s

Notes
- It throws an annoying warning "extra_headers was transferred to model_kwargs" but that's what the docs show: https://api.python.langchain.com/en/latest/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html
- The speedup is wonderful! That's what I'd hoped for
- The 0% failure rate is surprising. It's possibly a side effect of the refactor that prompt caching required, which split the prompt into a SystemMessage and a HumanMessage

I'm going to re-run this without any changes because I can hardly believe that we have no errors now:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 53s
    Mean runtime per PR: 5s

Huh

## Small test suite

### Baseline test

    Tested against 2 PRs in 1 repos.

    50% failed.
    Total runtime: 55s
    Mean runtime per PR: 28s

### With Claude 3 Haiku

    Tested against 2 PRs in 1 repos.

    0% failed.
    Total runtime: 22s
    Mean runtime per PR: 11s



# Dev log: Sunday

## Before prompt engineering, running on Locust
Counter({'ValidationError': 3, 'should_update': 1})

## Stronger guidance in the prompt, e.g. the Pydantic field descriptions and how the prompt refers to them
Counter({'ValidationError': 4})
Counter({'ValidationError': 4})
Counter({'ValidationError': 2, 'should_update': 1, 'no_update': 1})

## Retries
Counter({'ValidationError': 3, 'should_update': 1})

## Prompt updates, Pydantic model updates
Counter({'should_update': 3, 'ValueError': 1})

