In [1]:
from core import github_client

def build_evaluation_suite(repo_full_name: str, state: str = 'open', limit: int = 10):
    """Print a candidate list of PR numbers for an evaluation suite.

    Walks the pull requests of ``repo_full_name`` in the given ``state``,
    skips PRs opened by ``github-actions[bot]``, and prints an example
    config dict mapping the repo name to the collected PR numbers.

    Args:
        repo_full_name: Repository name in ``owner/repo`` form.
        state: PR state passed through to ``repo.get_pulls``.
        limit: Maximum number of PR numbers to collect.
    """
    print(f"Building evaluation suite for {repo_full_name} with PRs in state {state}")
    repo = github_client.get_repo(repo_full_name)

    pr_numbers = []

    for pr in repo.get_pulls(state=state):
        # Count collected PRs rather than the enumeration index: skipped bot
        # PRs must not consume the budget, and exactly `limit` PRs are kept.
        if len(pr_numbers) >= limit:
            print(f"\tReached limit of {limit} PRs")
            break

        if pr.user.login == "github-actions[bot]":
            continue

        print(f"\tPR #{pr.number}, {pr.user.login}: {pr.title}")
        pr_numbers.append(pr.number)

    print("\tExample config: {}\n".format({
        repo_full_name: pr_numbers
    }))
    

# Example usage
# build_evaluation_suite("locustio/locust")
# build_evaluation_suite("pandas-dev/pandas")
# build_evaluation_suite("ktrnka/update-your-readme", state='closed', limit=10)


                extra_headers was transferred to model_kwargs.
                Please confirm that extra_headers is what you intended.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
import difflib
from typing import NamedTuple

class ReadmeDiff(NamedTuple):
    """Line counts describing how an updated README differs from the original."""
    added: int    # lines present only in the updated README
    removed: int  # lines present only in the original README

def diff_readmes(original_readme: str, updated_readme: str) -> ReadmeDiff:
    """Count the lines added and removed between two README texts.

    The counts come straight from ``difflib.SequenceMatcher`` opcodes instead
    of from the text of a unified diff. Filtering unified-diff output by the
    ``+++``/``---`` prefixes drops real content lines that themselves begin
    with ``--`` or ``++`` (for example a removed ``--verbose`` line is
    rendered as ``---verbose``).

    Args:
        original_readme: README text before the change.
        updated_readme: README text after the change.

    Returns:
        A ``ReadmeDiff`` with the number of added and removed lines.
    """
    original_lines = original_readme.splitlines()
    updated_lines = updated_readme.splitlines()
    matcher = difflib.SequenceMatcher(None, original_lines, updated_lines)

    lines_added = 0
    lines_removed = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # "replace" contributes to both counts; "equal" contributes to neither.
        if tag in ("replace", "delete"):
            lines_removed += i2 - i1
        if tag in ("replace", "insert"):
            lines_added += j2 - j1
    return ReadmeDiff(lines_added, lines_removed)

# Test: compare against the expected counts. A bare `assert diff_readmes(...)`
# can never fail because a two-field NamedTuple is always truthy.

assert diff_readmes("Hello\nWorld", "Hello\nWorld\nGoodbye") == ReadmeDiff(added=1, removed=0)


from typing import Iterable, Hashable, List


def iterate_ngrams(tokens: List[Hashable], n: int) -> Iterable[tuple]:
    """Yield every contiguous run of ``n`` tokens as a tuple, left to right.

    Nothing is yielded when ``tokens`` holds fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    start = 0
    while start < window_count:
        yield tuple(tokens[start : start + n])
        start += 1


def test_iterate_ngrams():
    """Bigrams over four tokens are the three adjacent pairs, in order."""
    expected_bigrams = [("a", "b"), ("b", "c"), ("c", "d")]
    actual_bigrams = list(iterate_ngrams(["a", "b", "c", "d"], 2))
    assert actual_bigrams == expected_bigrams

import re
def tokenize(text: str) -> List[str]:
    """Split ``text`` into runs of word characters.

    NOTE: This is a very, very basic tokenizer for very basic tasks.

    Uses ``re.findall(r"\\w+")`` rather than ``re.split(r"\\W+")`` so that text
    starting or ending with punctuation/whitespace (or empty text) does not
    produce empty-string tokens, which would otherwise leak into n-grams.

    Args:
        text: The text to tokenize.

    Returns:
        The word tokens in order of appearance; an empty list if there are none.
    """
    return re.findall(r"\w+", text)


def test_tokenize():
    """Whitespace- and comma-separated inputs yield the same three tokens."""
    expected_tokens = ["a", "b", "c"]
    for text in ("a b c", "a, b, c"):
        assert tokenize(text) == expected_tokens


def extractive_fraction(summary: str, source: str, n: int = 4) -> float:
    """Fraction of the summary's distinct n-grams that also occur in the source.

    Args:
        summary: Text whose n-grams are being checked.
        source: Text the summary is compared against.
        n: N-gram length in tokens.

    Returns:
        A value in ``[0.0, 1.0]``. Returns ``0.0`` when the summary has fewer
        than ``n`` tokens (no n-grams), instead of dividing by zero.
    """
    summary_ngrams = set(iterate_ngrams(tokenize(summary), n))
    if not summary_ngrams:
        # Nothing to compare: an empty or very short summary shares no n-grams.
        return 0.0
    source_ngrams = set(iterate_ngrams(tokenize(source), n))
    return len(summary_ngrams & source_ngrams) / len(summary_ngrams)


def test_extractive_fraction():
    """A verbatim prefix is fully extractive; dropping one token is not."""
    alphabet_source = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

    # Every 4-gram of a contiguous prefix appears in the source.
    verbatim_summary = "a b c d e f g h i j k"
    assert extractive_fraction(verbatim_summary, alphabet_source) == 1.0

    # Omitting "f" creates 4-grams that never occur in the source.
    gapped_summary = "a b c d e g h i j k"
    assert extractive_fraction(gapped_summary, alphabet_source) < 1.0


In [3]:
# Fixed evaluation sets: repository full name -> PR numbers to review.
medium_test_suite = {
    "locustio/locust": [2899, 2856, 2820, 2786],
    "ktrnka/update-your-readme": [50, 49, 46, 44, 43, 41, 40],
}

# Two-PR subset of the medium suite.
small_test_suite = {"ktrnka/update-your-readme": [41, 40]}

In [4]:
from typing import NamedTuple, Optional, Tuple
from core import ReadmeRecommendation, review_pull_request
from time import time

class SingleOutcome(NamedTuple):
    """Result of running the automated README review against one pull request."""
    # Recommendation returned by review_pull_request; None when a ValueError was caught.
    result: Optional[ReadmeRecommendation]
    # The ValueError caught during the review; None when the review succeeded.
    error: Optional[ValueError]
    # Wall-clock seconds from the start of the attempt until the outcome was recorded.
    seconds: float
    # Output of diff_readmes(base, updated); only set when result.should_update is true.
    diff: Optional[Tuple]
    # Output of extractive_fraction(updated, base); only set when result.should_update is true.
    extractive_ngram_fraction: Optional[float]

# Select which fixed PR set to evaluate.
test_suite = medium_test_suite

# Maps (repo_name, pr_number) -> SingleOutcome; kept around for ad hoc analysis in later cells.
outcomes = {}
for repo_name, pr_numbers in test_suite.items():
    print(f"Testing {repo_name}...")
    for pr_number in pr_numbers:
        print(f"\tTesting PR #{pr_number}...")


        # The timer covers the GitHub fetches as well as the review itself.
        start_time = time()

        try:
            repo = github_client.get_repo(repo_name)
            pr = repo.get_pull(pr_number)

            # Get the base README
            base_readme = repo.get_contents("README.md", ref=pr.base.sha).decoded_content.decode()

            result = review_pull_request(repo, pr)

            # Quality metrics are only computed when an update was recommended.
            diff_results = None
            extractive_ngram_fraction = None
            if result.should_update:
                diff_results = diff_readmes(base_readme, result.updated_readme)
                extractive_ngram_fraction = extractive_fraction(result.updated_readme, base_readme)

            outcomes[(repo_name, pr_number)] = SingleOutcome(result, None, time() - start_time, diff_results, extractive_ngram_fraction)
        except ValueError as e:
            # Only ValueError is recorded as a failed outcome; any other exception propagates and stops the run.
            outcomes[(repo_name, pr_number)] = SingleOutcome(None, e, time() - start_time, None, None)

# Summarize the run: failure rate plus total and mean wall-clock time.
outcome_count = len(outcomes)
failed_count = sum(1 for outcome in outcomes.values() if outcome.result is None)
percent_failed = failed_count / outcome_count

total_runtime = 0
for outcome in outcomes.values():
    total_runtime += outcome.seconds
mean_runtime = total_runtime / outcome_count

print(f"""
Tested against {outcome_count} PRs in {len(test_suite)} repos.

{percent_failed:.0%} failed.
Total runtime: {total_runtime:.0f}s
Mean runtime per PR: {mean_runtime:.0f}s
""")

Testing locustio/locust...
	Testing PR #2899...
	Testing PR #2856...
	Testing PR #2820...
	Testing PR #2786...
Testing ktrnka/update-your-readme...
	Testing PR #50...
	Testing PR #49...
	Testing PR #46...
	Testing PR #44...
	Testing PR #43...
	Testing PR #41...
	Testing PR #40...

Tested against 11 PRs in 2 repos.

0% failed.
Total runtime: 253s
Mean runtime per PR: 23s



In [5]:
# Review errors: print every failed outcome with its error messages.
# `errors` stays the list of failed outcomes; per-error details use their own
# name so the list is not overwritten inside the loop.
errors = [outcome for outcome in outcomes.values() if outcome.error is not None]

for outcome_id, outcome in outcomes.items():
    if outcome.error is None:
        continue

    # outcome.result is always None for failures, so report the error type instead.
    print(f"{outcome_id}: {type(outcome.error).__name__} in {outcome.seconds:.0f}s")

    # Only pydantic's ValidationError exposes .errors(); a plain ValueError
    # falls back to its string form rather than raising AttributeError.
    list_error_details = getattr(outcome.error, "errors", None)
    if callable(list_error_details):
        for error_detail in list_error_details():
            print(f"\tError: {error_detail['msg']}")
    else:
        print(f"\tError: {outcome.error}")

    # Keep the last failing outcome around for ad hoc inspection.
    example_error_outcome = outcome


In [6]:
# pydantic_core._pydantic_core.ValidationError

# dir(outcome.error)
# outcome.error.errors()

In [7]:
# Quality checks over the PRs where the model recommended a README update.
should_updates = [outcome for outcome in outcomes.values() if outcome.result is not None and outcome.result.should_update]

update_count = len(should_updates)

# "No changes": should_update was set but the README diff is empty.
no_change_count = sum(1 for outcome in should_updates if sum(outcome.diff) == 0)

# "Bad extractive": outside the open interval (0.75, 1). Too low suggests a
# rewrite unrelated to the input; exactly 1 means nothing new was added.
# Same boundary as the per-PR review icons.
bad_extractive_count = sum(
    1 for outcome in should_updates
    if not 0.75 < outcome.extractive_ngram_fraction < 1
)

# Guard the denominator so a run with no should_update results still reports.
percent_bad_diff = no_change_count / update_count if update_count else 0.0
percent_bad_extractive = bad_extractive_count / update_count if update_count else 0.0

print(f"""
{percent_bad_diff:.0%} of PRs with should_update had no changes.
{percent_bad_extractive:.0%} of PRs with should_update had bad extractive ngram percent.
""")



0% of PRs with should_update had no changes.
9% of PRs with should_update had bad extractive ngram percent.



In [8]:
good_icon = "✅"
bad_icon = "❌"


# Review non-errors: one report per PR that produced a recommendation.
for outcome_id, outcome in outcomes.items():
    recommendation = outcome.result
    if not recommendation:
        continue

    print(f"""
# {outcome_id}
Automated review took {outcome.seconds:.0f}s
Should update? {recommendation.should_update}
""")

    if not recommendation.should_update:
        continue

    # A recommended update should change at least one line of the README.
    readme_changed = sum(outcome.diff) > 0
    diff_icon = good_icon if readme_changed else bad_icon

    # The update should mostly reuse the base README without being identical to it.
    fraction_ok = 0.75 < outcome.extractive_ngram_fraction < 1.
    extractive_icon = good_icon if fraction_ok else bad_icon

    print(f"""Reason: {recommendation.reason}
Diff: +{outcome.diff.added}, -{outcome.diff.removed} {diff_icon}
Extractive n-gram fraction: {outcome.extractive_ngram_fraction:.1%} {extractive_icon}
""")

    # if sum(outcome.diff) > 0:
    #     print(f"""Updated README: \n{outcome.result.updated_readme}""")



# ('locustio/locust', 2899)
Automated review took 36s
Should update? True

Reason: The pull request introduces significant changes to the testing infrastructure, particularly the implementation of polling mechanisms to improve test reliability. This is an important development that should be reflected in the README to highlight Locust's commitment to robust testing and continuous improvement.
Diff: +5, -1 ✅
Extractive n-gram fraction: 94.1% ✅


# ('locustio/locust', 2856)
Automated review took 21s
Should update? True

Reason: The README should be updated to improve clarity, add more detailed information about features, and include guidance on getting started. The current README is good but can be enhanced to provide a better overview of Locust's capabilities and usage.
Diff: +51, -47 ✅
Extractive n-gram fraction: 45.5% ❌


# ('locustio/locust', 2820)
Automated review took 35s
Should update? True

Reason: The README should be updated to include information about the recent changes in h

# Monday afternoon

I learned the hard way that something about the ChatPromptTemplate was disabling the Python variables in the Human step,
so it was generating the readme purely from guidelines

## Trying to see if Sonnet fixes some of the quality issues

Baseline (Haiku):

    20% of PRs with should_update had no changes.
    40% of PRs with should_update had bad extractive ngram percent.

Sonnet:

    0% failed.
    Total runtime: 253s
    Mean runtime per PR: 23s (slightly faster than the original non-cached results)

    0% of PRs with should_update had no changes.
    9% of PRs with should_update had bad extractive ngram percent.

The one bad case looked like it did a more extensive README rewrite


## Increased the max_tokens to 4096 (Haiku's max)

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 105s
    Mean runtime per PR: 10s

Run 2:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 94s
    Mean runtime per PR: 9s

Ok that did it. Now I'll move on to the next wave of issues:

- should_update = True but it outputs the exact README as the input

## 1 try only, and lowering temperature

Temp 0

    55% failed.
    Total runtime: 97s
    Mean runtime per PR: 9s

Oof

Temp 0.3, which some summarization folks like:

    45% failed.
    Total runtime: 102s
    Mean runtime per PR: 9s

Eh, I'm going to dial back the summarization stuff

## After the fix:

    Tested against 11 PRs in 2 repos.

    36% failed.
    Total runtime: 91s
    Mean runtime per PR: 8s

The failed cases are doing should_update=True and updated_readme = nothing. I saw in the raw output that one of the really bad subtractions just had [rest of readme the same] or some such at the bottom.

I've done some re-runs and it tends to be 27%-36% error rate which is worse than before (18% just before prompt caching)

I attempted to manually specify the input variables but they're completely ignored. I also tried wrapping the human message in an array like the system message but that also failed.

After some more experiments I think I figured out why the results are different than before:
- Previously I defaulted tries_remaining to 1 but that actually meant it could try and retry
- I set that to 1, and re-ran twice and got 27% failure and 18%, so I think that brings it closer to previous results

Also, I added more instrumentation on the output. This makes it easy to catch cases where it generates a README unrelated to the input repo. It also led me to find many cases in which it said should_update=True and regenerated the EXACT input README, which is another red flag to consider.

# Dev log: Monday

Refactor:
- More controlled experiment (multiple repos, build a fixed set of PRs ahead of time)
- Track failure rate and execution time
- Hold onto any objects for ad hoc analysis after running

Test suites:

    medium_test_suite = {
        'locustio/locust': [2899, 2856, 2820, 2786],
        'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
    }

    small_test_suite = {
        'ktrnka/update-your-readme': [41, 40],
    }

## Medium test suite

### Baseline with Haiku, before removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 135s
    Mean runtime per PR: 12s

### After removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 129s
    Mean runtime per PR: 12s

It's slightly faster but not a lot. I'll keep the change though.

### Adding prompt caching

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 54s
    Mean runtime per PR: 5s

Notes
- It throws an annoying warning "extra_headers was transferred to model_kwargs" but that's what the docs show: https://api.python.langchain.com/en/latest/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html
- The speedup is wonderful! That's what I'd hoped for
- The 0% failure rate is surprising. It's possible that it's a result of needing to refactor to use the SystemMessage vs HumanMessage

I'm going to re-run this without any changes because I can hardly believe that we have no errors now:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 53s
    Mean runtime per PR: 5s

Huh

## Small test suite

### Baseline test

    Tested against 2 PRs in 1 repos.

    50% failed.
    Total runtime: 55s
    Mean runtime per PR: 28s

### With Claude 3 Haiku

    Tested against 2 PRs in 1 repos.

    0% failed.
    Total runtime: 22s
    Mean runtime per PR: 11s



# Dev log: Sunday

## Before prompt engineering, running on Locust
Counter({'ValidationError': 3, 'should_update': 1})

## Stronger guidance in the prompt itself, like the Pydantic field descriptions and how they're mentioned in the prompt itself
Counter({'ValidationError': 4})
Counter({'ValidationError': 4})
Counter({'ValidationError': 2, 'should_update': 1, 'no_update': 1})

## Retries
Counter({'ValidationError': 3, 'should_update': 1})

## Prompt updates, Pydantic model updates
Counter({'should_update': 3, 'ValueError': 1})

