In [21]:
from core import github_client
from pprint import pprint

def build_evaluation_suite(repo_full_name: str, state: str = 'open', limit: int = 10):
    print(f"Building evaluation suite for {repo_full_name} with PRs in state {state}")
    repo = github_client.get_repo(repo_full_name)

    pr_numbers = []
    
    for i, pr in enumerate(repo.get_pulls(state=state)):
        if pr.user.login == "github-actions[bot]":
            continue

        print(f"\tPR #{pr.number}, {pr.user.login}: {pr.title}")
        pr_numbers.append(pr.number)

        if i >= limit:
            print(f"\tReached limit of {limit} PRs")
            break

    print("\tExample config: {}\n".format({
        repo_full_name: pr_numbers
    }))
    

# Example usage
build_evaluation_suite("locustio/locust")
# build_evaluation_suite("pandas-dev/pandas")
build_evaluation_suite("ktrnka/update-your-readme", state='closed', limit=10)


Building evaluation suite for locustio/locust with PRs in state open
	PR #2899, fletelli42: Add polling mechanism in tests
	PR #2856, plaindocs: Light refactor, questions to address, and alt-text
	PR #2820, bakhtos: Handle task weights efficiently in `TaskSet`s
	PR #2786, bakhtos: CLI argument for selecting `UserClasses` to run
	Example config: {'locustio/locust': [2899, 2856, 2820, 2786]}

Building evaluation suite for ktrnka/update-your-readme with PRs in state closed
	PR #50, martilar: Remove main.py
	PR #49, martilar: Try commiting change as contributor
	PR #46, ktrnka: Debugging, checkout into enlistment
	PR #44, martilar: Add nonsense to README
	PR #43, martilar: Add option to supply user-feedback
	PR #41, ktrnka: Making it more reliable against validation errors
	PR #40, martilar: Add option to supply user-feedback
	Reached limit of 10 PRs
	Example config: {'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40]}



In [22]:
test_suite = {
    'locustio/locust': [2899, 2856, 2820, 2786],
    'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
}

In [23]:
for repo, pr_numbers in test_suite.items():
    print(f"Testing {repo}...")
    for pr_number in pr_numbers:
        print(f"\tTesting PR #{pr_number}...")
        # pr = github_client.get_repo(repo).get_pull(pr_number)
        # pprint(pr.get_files())
        # print("\n\n")

Testing locustio/locust...
	Testing PR #2899..
	Testing PR #2856..
	Testing PR #2820..
	Testing PR #2786..
Testing ktrnka/update-your-readme...
	Testing PR #50..
	Testing PR #49..
	Testing PR #46..
	Testing PR #44..
	Testing PR #43..
	Testing PR #41..
	Testing PR #40..


In [3]:
from typing import Counter
from core import *
from pydantic import ValidationError
from pprint import pprint

repo = github_client.get_repo("locustio/locust")

import difflib

stats = Counter()
for i, pr in enumerate(repo.get_pulls(state='open')):
    if i > 10:
        print("Stopping after 10 PRs")
        break

    try:
        result = review_pull_request(repo, pr, tries_remaining=1)
        print(f"PR #{pr.number}: {result}")

        if result.should_update:
            stats["should_update"] += 1

            print(f"""

                  
CHANGES FOR PR #{pr.number}:

Reason:
{result.reason}

Updated README (diff):
""")
            
            diff = difflib.unified_diff(
                repo.get_readme().decoded_content.decode("utf-8").splitlines(),
                result.updated_readme.splitlines(),
            )
            for line in diff:
                print(line)

        else:
            stats["no_update"] += 1
    except ValidationError as e:
        stats["ValidationError"] += 1
        pprint(e.json())
    except ValueError as e:
        stats["ValueError"] += 1
        pprint(e)

pprint(stats)

KeyboardInterrupt: 

In [2]:
# Dev log

## Before prompt engineering, running on Locust
Counter({'ValidationError': 3, 'should_update': 1})

## Stronger guidance in the prompt itself, like the Pydantic field descriptions and how they're mentioned in the prompt itself
Counter({'ValidationError': 4})
Counter({'ValidationError': 4})
Counter({'ValidationError': 2, 'should_update': 1, 'no_update': 1})

## Retries
Counter({'ValidationError': 3, 'should_update': 1})

## Prompt updates, Pydantic model updates
Counter({'should_update': 3, 'ValueError': 1})



Counter({'should_update': 3, 'ValueError': 1})