In [1]:
from main import github_client

def build_evaluation_suite(repo_full_name: str, state: str = 'open', limit: int = 10):
    """Helper to identify PRs in a repository that can be used for evaluation."""
    print(f"Building evaluation suite for {repo_full_name} with PRs in state {state}")
    repo = github_client.get_repo(repo_full_name)

    pr_numbers = []
    
    for i, pr in enumerate(repo.get_pulls(state=state)):
        if pr.user.login == "github-actions[bot]":
            continue

        print(f"\tPR #{pr.number}, {pr.user.login}: {pr.title}")
        pr_numbers.append(pr.number)

        if i >= limit:
            print(f"\tReached limit of {limit} PRs")
            break

    print("\tExample config: {}\n".format({
        repo_full_name: pr_numbers
    }))
    

# Example usage
# build_evaluation_suite("locustio/locust")
# build_evaluation_suite("pandas-dev/pandas")
# build_evaluation_suite("ktrnka/update-your-readme", state='closed', limit=10)

build_evaluation_suite("ktrnka/company-detective", state='closed', limit=10)


Building evaluation suite for ktrnka/company-detective with PRs in state closed
	PR #14, ktrnka: Exclude a network-based test
	PR #12, ktrnka: Bring Crunchbase back
	PR #9, ktrnka: Dynamic rebuild cadence
	PR #7, ktrnka: Move the generic article scraping into utils
	PR #5, ktrnka: Re-organize the repo / cleanup
	Reached limit of 10 PRs
	Example config: {'ktrnka/company-detective': [14, 12, 9, 7, 5]}



In [2]:
import difflib
from typing import NamedTuple

class ReadmeDiff(NamedTuple):
    added: int
    removed: int

def diff_readmes(original_readme: str, updated_readme: str) -> ReadmeDiff:
    diff = difflib.unified_diff(
        original_readme.splitlines(),
        updated_readme.splitlines(),
    )
    lines_added = 0
    lines_removed = 0
    for line in diff:
        if line.startswith('+') and not line.startswith('+++'):
            lines_added += 1
        elif line.startswith('-') and not line.startswith('---'):
            lines_removed += 1
    return ReadmeDiff(lines_added, lines_removed)

# Test

assert diff_readmes("Hello\nWorld", "Hello\nWorld\nGoodbye")


from typing import Iterable, Hashable, List


def iterate_ngrams(tokens: List[Hashable], n: int) -> Iterable[tuple]:
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])


def test_iterate_ngrams():
    assert list(iterate_ngrams(["a", "b", "c", "d"], 2)) == [
        ("a", "b"),
        ("b", "c"),
        ("c", "d"),
    ]

import re
def tokenize(text: str) -> List[str]:
    # NOTE: This is a very, very basic tokenizer for very basic tasks.
    return re.split(r"\W+", text)


def test_tokenize():
    assert tokenize("a b c") == ["a", "b", "c"]
    assert tokenize("a, b, c") == ["a", "b", "c"]


def jaccard_similarity(text_a: str, text_b: str, n: int = 4):
    summary_ngrams = set(iterate_ngrams(tokenize(text_a), n))
    source_ngrams = set(iterate_ngrams(tokenize(text_b), n))

    intersection_size = len(summary_ngrams & source_ngrams)
    union_size = len(summary_ngrams | source_ngrams)
    
    return intersection_size / union_size if union_size != 0 else 0


example_source = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
example_summary = "a b c d e f g h i j k"


jaccard_similarity(example_summary, example_source), jaccard_similarity(example_summary, example_source, 2)

(0.34782608695652173, 0.4)

In [3]:
medium_test_suite = {
    'locustio/locust': [2899, 2856, 2820, 2786],
    'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
    'ktrnka/company-detective': [14, 12, 9, 7, 5],
}

small_test_suite = {'ktrnka/company-detective': [14, 12, 9, 7, 5]}

In [4]:
from typing import NamedTuple, Optional, Tuple
from main import ReadmeRecommendation, review_pull_request
from time import time

class SingleOutcome(NamedTuple):
    result: Optional[ReadmeRecommendation]
    error: Optional[ValueError]
    seconds: float
    diff: Optional[Tuple]
    similarity: Optional[float]
    base_readme: Optional[str] = None

test_suite = small_test_suite

outcomes = {}
for repo_name, pr_numbers in test_suite.items():
    print(f"Testing {repo_name}...")
    for pr_number in pr_numbers:
        print(f"\tTesting PR #{pr_number}...")


        start_time = time()

        try:
            repo = github_client.get_repo(repo_name)
            pr = repo.get_pull(pr_number)

            # Get the base README
            base_readme = repo.get_contents("README.md", ref=pr.base.sha).decoded_content.decode()

            result = review_pull_request(repo, pr, use_base_readme=True)

            diff_results = None
            similarity = None
            if result.should_update:
                diff_results = diff_readmes(base_readme, result.updated_readme)
                similarity = jaccard_similarity(result.updated_readme, base_readme)

            outcomes[(repo_name, pr_number)] = SingleOutcome(result, None, time() - start_time, diff_results, similarity, base_readme)
        except ValueError as e:
            outcomes[(repo_name, pr_number)] = SingleOutcome(None, e, time() - start_time, None, None, base_readme)

# summarize the results
percent_failed = len([outcome for outcome in outcomes.values() if outcome.result is None]) / len(outcomes)
total_runtime = sum(outcome.seconds for outcome in outcomes.values())
mean_runtime = total_runtime / len(outcomes)

print(f"""
Tested against {len(outcomes)} PRs in {len(test_suite)} repos.

{percent_failed:.0%} failed.
Total runtime: {total_runtime:.0f}s
Mean runtime per PR: {mean_runtime:.0f}s
""")

Testing ktrnka/company-detective...
	Testing PR #14...
	Testing PR #12...
	Testing PR #9...
	Testing PR #7...
	Testing PR #5...

Tested against 5 PRs in 1 repos.

0% failed.
Total runtime: 73s
Mean runtime per PR: 15s



In [5]:
# Review errors
errors = [outcome for outcome in outcomes.values() if outcome.error is not None]

for outcome_id, outcome in outcomes.items():
    if outcome.error:
        print(f"{outcome_id}: {outcome.result} in {outcome.seconds:.0f}s")
        errors = outcome.error.errors()
        for error in errors:
            print(f"\tError: {error['msg']}")
        example_error_outcome = outcome
        # print(f"\tError: {outcome.error.msg}")


In [6]:
# pydantic_core._pydantic_core.ValidationError

# dir(outcome.error)
# outcome.error.errors()

In [7]:
should_updates = [outcome for outcome in outcomes.values() if outcome.result is not None and outcome.result.should_update]

lines_changed = [sum(outcome.diff) for outcome in should_updates]
similarities = [outcome.similarity for outcome in should_updates]

percent_bad_diff = len([outcome for outcome in should_updates if outcome.diff == (0, 0)]) / len(should_updates)
mean_similarity = sum(similarities) / len(similarities)
mean_changes = sum(lines_changed) / len(lines_changed)

percent_lengthened = len([outcome for outcome in should_updates if outcome.diff[0] > outcome.diff[1]]) / len(should_updates)

print(f"""
{percent_bad_diff:.0%} of PRs with should_update had no changes.
    Lines changed: {lines_changed} (avg: {mean_changes:.1f})
Percent of READMEs that were lengthened: {percent_lengthened:.0%}
Mean similarity: {mean_similarity:.1%}
""")



0% of PRs with should_update had no changes.
    Lines changed: [10, 12, 18, 25] (avg: 16.2)
Percent of READMEs that were lengthened: 100%
Mean similarity: 64.2%



In [13]:
good_icon = "✅"
bad_icon = "❌"


# Review non-errors
for outcome_id, outcome in outcomes.items():
    if outcome.result:
        print(f"""
# {outcome_id}
Automated review took {outcome.seconds:.0f}s
Should update? {outcome.result.should_update}
Reason: {outcome.result.reason}
""")
        if outcome.result.should_update:
            diff_review = good_icon if sum(outcome.diff) > 0 else bad_icon
            similarity_review = good_icon if 0.75 < outcome.similarity < 1. else bad_icon
            print(f"""
Diff: +{outcome.diff.added}, -{outcome.diff.removed} {diff_review}
Jaccard coef of 4grams: {outcome.similarity:.1%} {similarity_review}
""")

            # if sum(outcome.diff) > 0:
            #     print(f"""Updated README: \n{outcome.result.updated_readme}""")



# ('ktrnka/company-detective', 14)
Automated review took 14s
Should update? True
Reason: The pull request introduces several changes that should be reflected in the README, including updates to API key handling, additional data sources, and testing information. These updates will provide users with more accurate and comprehensive information about the project.


Diff: +8, -2 ✅
Jaccard coef of 4grams: 76.9% ✅


# ('ktrnka/company-detective', 12)
Automated review took 17s
Should update? True
Reason: The pull request introduces significant changes to the project, including the addition of Crunchbase as a new data source and updates to the Glassdoor scraping functionality. These changes should be reflected in the README to provide users with up-to-date information about the project's capabilities and dependencies.


Diff: +10, -2 ✅
Jaccard coef of 4grams: 77.4% ✅


# ('ktrnka/company-detective', 9)
Automated review took 19s
Should update? True
Reason: The pull request introduces a signifi

In [9]:
outcomes.keys()

dict_keys([('ktrnka/company-detective', 14), ('ktrnka/company-detective', 12), ('ktrnka/company-detective', 9), ('ktrnka/company-detective', 7), ('ktrnka/company-detective', 5)])

In [15]:
# show one as a diff
example = ('ktrnka/company-detective', 14)

outcome = outcomes[example]

import difflib
diff = difflib.unified_diff(
    outcome.base_readme.splitlines(),
    outcome.result.updated_readme.splitlines(),
)
for line in diff:
    print(line)


--- 

+++ 

@@ -6,7 +6,7 @@

 
 ## Features
 
-- Aggregates information from multiple sources including Crunchbase, Glassdoor, news articles, and company websites
+- Aggregates information from multiple sources including Crunchbase, Glassdoor, news articles, company websites, and Reddit
 - Utilizes AI to summarize and analyze data
 - Provides a unified summary of company information
 - Dynamic rebuild cadence for up-to-date information
@@ -27,8 +27,9 @@

 - AWS
 - Langsmith (Optional)
 - Crunchbase (via Scrapfly)
+- Airtable
 
-Ensure you have obtained the necessary API keys before proceeding with the setup.
+Ensure you have obtained the necessary API keys before proceeding with the setup. The project is designed to handle missing API keys gracefully, but functionality may be limited without them.
 
 ## Installation
 
@@ -62,10 +63,15 @@

 - Glassdoor: Offers employee reviews and sentiment analysis.
 - News Articles: Gathers recent news about the company.
 - Company Website: Extracts i

# Monday afternoon

I learned the hard way that something about the ChatPromptTemplate was disabling the Python variables in the Human step,
so it was generating the readme purely from guidelines

## Checking for "addition bias"

    0% of PRs with should_update had no changes.
        Lines changed: [13, 90, 18, 10, 5, 15, 3, 6, 10, 6]
    Percent of READMEs that were lengthened: 100%
    Mean similarity: 82.6%

So 100% of the suggestions here were additions

## Trying Jaccard coef

Using Jaccard coefficient to assess the extensiveness of the change is useful BUT I have to get used to it and calibrate towards a "healthy" range.

Updated summary stats (Haiku):

    9% of PRs with should_update had no changes.
        Lines changed: [15, 76, 87, 14, 9, 0, 8, 3, 19, 73, 50]
    Mean similarity: 68.4%

Same thing but with Sonnet:

    0% of PRs with should_update had no changes.
        Lines changed: [13, 90, 18, 10, 5, 15, 3, 6, 10, 6]
    Mean similarity: 82.6%

Ok that's a good sign.

## Trying to see if Sonnet fixes some of the quality issues

Baseline (Haiku):

    20% of PRs with should_update had no changes.
    40% of PRs with should_update had bad extractive ngram percent.

Sonnet:

    0% failed.
    Total runtime: 253s
    Mean runtime per PR: 23s (slightly faster than the original non-cached results)

    0% of PRs with should_update had no changes.
    9% of PRs with should_update had bad extractive ngram percent.

The one bad case looked like it did a more extensive README rewrite


## Increased the max_tokens to 4096 (Haiku's max)

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 105s
    Mean runtime per PR: 10s

Run 2:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 94s
    Mean runtime per PR: 9s

Ok that did it. Now I'll move on to the next wave of issues:

- should_update = True but it outputs the exact README as the input

## 1 try only, and lowering temperature

Temp 0

    55% failed.
    Total runtime: 97s
    Mean runtime per PR: 9s

Oof

Temp 0.3, which some summarization folks like:

    45% failed.
    Total runtime: 102s
    Mean runtime per PR: 9s

Eh, I'm going to dial back the summarization stuff

## After the fix:

    Tested against 11 PRs in 2 repos.

    36% failed.
    Total runtime: 91s
    Mean runtime per PR: 8s

The failed cases are doing should_update=True and updated_readme = nothing. I saw in the raw output that one of the really bad subtractions just had [rest of readme the same] or some such at the bottom.

I've done some re-runs and it tends to be 27%-36% error rate which is worse than before (18% just before prompt caching)

I attempted to manually specify the input variables but they're completely ignored. I also tried wrapping the human message in an array like the system message but that also failed.

After some more experiments I think I figured out why the results are different than before:
- Previously I defaulted tries_remaining to 1 but that actually meant it could try and retry
- I set that to 1, and re-ran twice and got 27% failure and 18%, so I think that brings it closer to previous results

Also I added more instrumentation on the output. This makes it easy to catch cases where it generates a README unrelated to the input repo. It also led me to find many cases in which it said should_update=True and regenerated the EXACT input README which is another red-flag to consider

# Dev log: Monday

Refactor:
- More controlled experiment (multiple repos, build a fixed set of PRs ahead of time)
- Track failure rate and execution time
- Hold onto any objects to adhoc analysis after running

Test suites:

    medium_test_suite = {
        'locustio/locust': [2899, 2856, 2820, 2786],
        'ktrnka/update-your-readme': [50, 49, 46, 44, 43, 41, 40],
    }

    small_test_suite = {
        'ktrnka/update-your-readme': [41, 40],
    }

## Medium test suite

### Baseline with Haiku, before removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 135s
    Mean runtime per PR: 12s

### After removing the directory tree

    Tested against 11 PRs in 2 repos.

    18% failed.
    Total runtime: 129s
    Mean runtime per PR: 12s

It's slightly faster but not a lot. I'll keep the change though.

### Adding prompt caching

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 54s
    Mean runtime per PR: 5s

Notes
- It throws an annoying warning "extra_headers was transferred to model_kwargs" but that's what the docs show: https://api.python.langchain.com/en/latest/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html
- The speedup is wonderful! That's what I'd hoped for
- The 0% failure rate is surprising. It's possible that it's a result of needing to refactor to use the SystemMessage vs HumanMessage

I'm going to re-run this without any changes cause I kind of don't even believe that we have no errors now:

    Tested against 11 PRs in 2 repos.

    0% failed.
    Total runtime: 53s
    Mean runtime per PR: 5s

Huh

## Small test suite

### Baseline test

    Tested against 2 PRs in 1 repos.

    50% failed.
    Total runtime: 55s
    Mean runtime per PR: 28s

### With Claude 3 Haiku

    Tested against 2 PRs in 1 repos.

    0% failed.
    Total runtime: 22s
    Mean runtime per PR: 11s



# Dev log: Sunday

## Before prompt engineering, running on Locust
Counter({'ValidationError': 3, 'should_update': 1})

## Stronger guidance in the prompt itself, like the Pydantic field descriptions and how they're mentioned in the prompt itself
Counter({'ValidationError': 4})
Counter({'ValidationError': 4})
Counter({'ValidationError': 2, 'should_update': 1, 'no_update': 1})

## Retries
Counter({'ValidationError': 3, 'should_update': 1})

## Prompt updates, Pydantic model updates
Counter({'should_update': 3, 'ValueError': 1})

