Abbreviate benchmark results posted to pull requests #6124

Merged · 3 commits · Jun 8, 2021

Changes from 1 commit
306 changes: 245 additions & 61 deletions build_tools/android/post_benchmarks_as_pr_comment.py
@@ -9,10 +9,12 @@
This script is meant to be used by Buildkite for automation. It requires the
following environment variables to be set:

- BUILDKITE_BUILD_NUMBER: the build number of current Buildkite build.
- BUILDKITE_BUILD_URL: the link to the current Buildkite build.
- BUILDKITE_COMMIT: the pull request HEAD commit.
- BUILDKITE_PULL_REQUEST: the current pull request number.
- GITHUB_TOKEN: personal access token to authenticate against GitHub API.
- GITHUB_TOKEN: personal access token to authenticate against GitHub API;
it should have "public_repo" and "gist" scope.

if --query-base is toggled on, then it additionally requires:

@@ -36,14 +38,28 @@
import requests
import markdown_strings as md

from typing import Any, Dict, Sequence, Tuple, Union
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple, Union

from common.benchmark_description import BenchmarkResults, get_output

GITHUB_GIST_API_PREFIX = "https://api.github.com/gists"
GITHUB_IREE_API_PREFIX = "https://api.github.com/repos/google/iree"
GITHUB_IREE_REPO_PREFIX = "https://github.com/google/iree"
# TODO: Replace this with a bot account, probably iree-github-actions-bot.
GITHUB_USER = "antiagainst"
IREE_PROJECT_ID = 'IREE'
SIMILAR_BENCHMARK_THRESHOLD = 0.05
TABLE_SIZE_CUT = 3
THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
RESULT_EMPHASIS_THRESHOLD = 0.05


def get_required_env_var(var: str) -> str:
"""Gets the value for a required environment variable."""
value = os.getenv(var, None)
if value is None:
raise RuntimeError(f'Missing environment variable "{var}"')
return value
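For example, running this outside Buildkite without the variable exported fails fast (a minimal sketch):

commit = get_required_env_var("BUILDKITE_COMMIT")
# -> RuntimeError: Missing environment variable "BUILDKITE_COMMIT" (if unset)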


def get_git_commit_hash(commit: str, verbose: bool = False) -> str:
@@ -61,12 +77,13 @@ def get_git_total_commit_count(commit: str, verbose: bool = False) -> int:
return int(count)


def get_required_env_var(var: str) -> str:
"""Gets the value for a required environment variable."""
value = os.getenv(var, None)
if value is None:
raise RuntimeError(f'Missing environment variable "{var}"')
return value
def get_origin_tree_top_commit(verbose: bool = False) -> str:
"""Returns the top of the tree commit for the origin base branch."""
base_branch = get_required_env_var("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
get_output(['git', 'fetch', '--prune', '--', 'origin', base_branch],
cwd=THIS_DIRECTORY,
verbose=verbose)
return get_git_commit_hash(f'origin/{base_branch}', verbose)


def get_from_dashboard(url: str,
@@ -84,19 +101,32 @@ def get_from_dashboard(url: str,
raise requests.RequestException(
f'Failed to get from dashboard server with status code {code}')

return response.json()
data = response.json()
if verbose:
print(f'Queried base benchmark data: {data}')
return data


@dataclass
class AggregateBenchmarkLatency:
"""An object for describing aggregate latency numbers for a benchmark."""
mean_time: int
median_time: int
stddev_time: int
# The average latency time for the base commit to compare against.
base_mean_time: Optional[int] = None
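For instance, a hypothetical benchmark measured at 12/11/2 ms can be captured as:

latency = AggregateBenchmarkLatency(mean_time=12, median_time=11, stddev_time=2)
latency.base_mean_time = 10  # hypothetical; back-filled only when --query-base is on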


def aggregate_all_benchmarks(
benchmark_files: Sequence[str]) -> Sequence[Tuple[Union[str, int]]]:
benchmark_files: Sequence[str]) -> Dict[str, AggregateBenchmarkLatency]:
"""Aggregates all benchmarks in the given files.

Args:
- benchmark_files: A list of JSON files, each can be decoded as a
BenchmarkResults.

Returns:
- A list of (name, mean-latency, median-latency, stddev-latency) tuples.
- A dict of benchmark names to AggregateBenchmarkLatency numbers.
"""

pr_commit = get_required_env_var("BUILDKITE_COMMIT")
@@ -123,9 +153,10 @@ def aggregate_all_benchmarks(
median_time = file_results.get_aggregate_time(benchmark_index, "median")
stddev_time = file_results.get_aggregate_time(benchmark_index, "stddev")

aggregate_results[name] = (mean_time, median_time, stddev_time)
aggregate_results[name] = AggregateBenchmarkLatency(
mean_time, median_time, stddev_time)

return sorted([(k,) + v for k, v in aggregate_results.items()])
return aggregate_results
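The returned mapping might then look like this (file and benchmark names hypothetical):

results = aggregate_all_benchmarks(["pixel4-benchmarks.json"])
# {"MobileNetV2 (fp32) big-core": AggregateBenchmarkLatency(
#      mean_time=12, median_time=11, stddev_time=2, base_mean_time=None)}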


def query_base_benchmark_results(commit,
@@ -138,68 +169,217 @@
return get_from_dashboard(f'{url}/apis/getBuild', payload, verbose=verbose)


def get_comparsion_against_base(pr_means: Sequence[int],
base_means: Sequence[int]) -> Sequence[str]:
"""Returns a tuple of strings comparsing mean latency numbers."""
comparisions = []
def add_header_and_get_markdown_table(names: Tuple[str],
means: Tuple[Any],
medians: Tuple[int],
stddevs: Tuple[int],
size_cut: Optional[int] = None) -> str:
"""Generates a markdown table with proper headers for benchmarks.

for pr, base in zip(pr_means, base_means):
if base is None:
comparisions.append(str(pr))
continue
Args:
- size_cut: If not None, only show the top N results for each table.
"""
total_size = len(names)
if size_cut is not None:
names = names[0:size_cut]
means = means[0:size_cut]
medians = medians[0:size_cut]
stddevs = stddevs[0:size_cut]

diff = abs(pr - base) / base
if pr > base:
percent = "{:.2%}".format(diff)
direction = "↑"
if diff > RESULT_EMPHASIS_THRESHOLD:
direction += ", 🚩"
elif pr < base:
percent = "{:.2%}".format(diff)
direction = "↓"
if diff > RESULT_EMPHASIS_THRESHOLD:
direction += ", 🎉"
else:
percent = "{:.0%}".format(diff)
direction = ""
names = ("Benchmark Name",) + names
means = ("Average Latency (ms)",) + means
medians = ("Median Latency (ms)",) + medians
stddevs = ("Latency Standard Deviation (ms)",) + stddevs

table_str = md.table([names, means, medians, stddevs])
if size_cut is not None and size_cut < total_size:
table_str += "\n\n"
table_str += md.italics(
f"[Top {size_cut} out of {total_size} benchmark results showed]")
return table_str

comparisions.append(f"{pr} (vs. {base}, {percent}{direction})")

return tuple(comparisions)
def sort_benchmarks_and_get_table(benchmarks: Dict[str,
AggregateBenchmarkLatency],
size_cut: Optional[int] = None):
"""Sorts all benchmarks according to the improvement/regression ratio and
returns a markdown table for it.

Args:
- size_cut: If not None, only show the top N results for each table.
"""
sorted_benchmarks = []
for k, v in benchmarks.items():
ratio = abs(v.mean_time - v.base_mean_time) / v.base_mean_time
sorted_benchmarks.append((k, (v.mean_time, v.base_mean_time, ratio),
v.median_time, v.stddev_time))
# Sort according to the ratio, in descending order.
sorted_benchmarks.sort(key=lambda benchmark: benchmark[1][2], reverse=True)

# Split each field into its own tuple in preparation for the markdown table.
names, means, medians, stddevs = zip(*sorted_benchmarks)

# Turn the tuple about means into a string representation.
str_means = []
for pr, base, ratio in means:
direction = "↑" if pr > base else "↓"
str_means.append(f"{pr} (vs. {base}, {ratio:.2%}{direction})")
str_means = tuple(str_means)

return add_header_and_get_markdown_table(names, str_means, medians, stddevs,
size_cut)
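A worked example of the ratio and the rendered mean cell (numbers hypothetical):

pr, base = 12, 10              # mean_time vs. base_mean_time, in ms
ratio = abs(pr - base) / base  # 0.2
cell = f"{pr} (vs. {base}, {ratio:.2%}{'↑' if pr > base else '↓'})"
# cell == "12 (vs. 10, 20.00%↑)"; larger ratios sort to the top of the table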


def categorize_benchmarks_into_tables(benchmarks: Dict[
str, AggregateBenchmarkLatency],
similar_threshold: float,
size_cut: Optional[int] = None) -> str:
"""Splits benchmarks into regressed/improved/similar/raw categories and
returns their markdown tables.

Args:
- similar_threshold: the threshold under which a benchmark will be
considered similar to its base commit.
- size_cut: If not None, only show the top N results for each table.
"""
regressed, improved, similar, raw = {}, {}, {}, {}

for name, results in benchmarks.items():
# If there is no information about the base result, we cannot analyze.
if results.base_mean_time is None:
raw[name] = results
continue

current = results.mean_time
base = results.base_mean_time
ratio = abs(current - base) / base
if ratio <= similar_threshold:
similar[name] = results
elif current > base:
regressed[name] = results
else:
improved[name] = results

tables = []
if regressed:
tables.append(md.header("Regressed Benchmarks 🚩", 3))
tables.append(sort_benchmarks_and_get_table(regressed, size_cut))
if improved:
tables.append(md.header("Improved Benchmarks 🎉", 3))
tables.append(sort_benchmarks_and_get_table(improved, size_cut))
# If we want to abbreviate, similar results won't be interesting.
if similar and size_cut is None:
tables.append(md.header("Similar Benchmarks", 3))
similar_list = [(k, v.mean_time, v.median_time, v.stddev_time)
for k, v in similar.items()]
names, means, medians, stddevs = zip(*similar_list)
tables.append(
add_header_and_get_markdown_table(names=names,
means=means,
medians=medians,
stddevs=stddevs,
size_cut=size_cut))
if raw:
tables.append(md.header("Similar Benchmarks", 3))
raw_list = [
(k, v.mean_time, v.median_time, v.stddev_time) for k, v in raw.items()
]
names, means, medians, stddevs = zip(*raw_list)
tables.append(
add_header_and_get_markdown_table(names=names,
means=means,
medians=medians,
stddevs=stddevs,
size_cut=size_cut))
return "\n\n".join(tables)


def get_benchmark_result_markdown(benchmark_files: Sequence[str],
query_base: bool,
verbose: bool = False) -> str:
"""Gets markdown summary of all benchmarks in the given files."""
verbose: bool = False) -> Tuple[str, str]:
"""Gets the full/abbreviated markdown summary of all benchmarks in files."""
all_benchmarks = aggregate_all_benchmarks(benchmark_files)
names, means, medians, stddevs = zip(*all_benchmarks)

build_url = get_required_env_var("BUILDKITE_BUILD_URL")
pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")
pr_commit = get_required_env_var("BUILDKITE_COMMIT")
pr_commit = md.link(pr_commit,
f"{GITHUB_IREE_REPO_PREFIX}/commit/{pr_commit}")
if query_base:
base_branch = get_required_env_var("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
commit = get_git_commit_hash(base_branch, verbose)
base_benchmarks = query_base_benchmark_results(commit, verbose)
base_means = [base_benchmarks.get(v) for v in names]
means = get_comparsion_against_base(means, base_means)
commit_info = f"@ commit {pr_commit} (vs. base {commit})"
# Update the aggregate benchmarks with base numbers.
base_commit = get_origin_tree_top_commit(verbose)
base_benchmarks = query_base_benchmark_results(base_commit, verbose)
for bench in base_benchmarks:
if bench in all_benchmarks:
all_benchmarks[bench].base_mean_time = base_benchmarks[bench]
base_commit = md.link(base_commit,
f"{GITHUB_IREE_REPO_PREFIX}/commit/{base_commit}")
commit_info = f"@ commit {pr_commit} (vs. base {base_commit})"
else:
commit_info = f"@ commit {pr_commit}"

names = ("Benchmark Name",) + names
means = ("Average Latency (ms)",) + means
medians = ("Median Latency (ms)",) + medians
stddevs = ("Latency Standard Deviation (ms)",) + stddevs
pr_info = md.link("Pull request",
f"{GITHUB_IREE_REPO_PREFIX}/pull/{pr_number}")
buildkite_info = md.link("Buildkite build", build_url)

# Compose the full benchmark tables.
full_table = [md.header("Full Benchmark Summary", 2)]
full_table.append(md.unordered_list([commit_info, pr_info, buildkite_info]))
full_table.append(
categorize_benchmarks_into_tables(all_benchmarks,
SIMILAR_BENCHMARK_THRESHOLD))

# Compose the abbreviated benchmark tables.
abbr_table = [md.header("Abbreviated Benchmark Summary", 2)]
abbr_table.append(commit_info)
abbr_table.append(
categorize_benchmarks_into_tables(all_benchmarks,
SIMILAR_BENCHMARK_THRESHOLD,
TABLE_SIZE_CUT))
abbr_table.append("For more information:")
# We don't know the Gist URL until it is actually created. Use a placeholder
# for now and replace it later.
full_result_info = md.link("Full benchmark result tables",
"<<placeholder-link>>")
abbr_table.append(md.unordered_list([full_result_info, buildkite_info]))

return "\n\n".join(full_table), "\n\n".join(abbr_table)


def post_to_gist(filename: str, content: str, verbose: bool = False):
"""Posts the given content to a new GitHub Gist and returns the URL to it."""
api_token = get_required_env_var('GITHUB_TOKEN')
headers = {
"Accept": "application/vnd.github.v3+json",
"Authorization": f"token {api_token}",
}
payload = json.dumps({
"public": True,
"files": {
filename: {
"content": content
}
}
})

api_endpoint = GITHUB_GIST_API_PREFIX
response = requests.post(api_endpoint, data=payload, headers=headers)
if response.status_code != 201:
raise requests.RequestException(
f"Failed to comment on GitHub; error code: {response.status_code}")

response = response.json()
if verbose:
print(response)

header = md.header("Benchmark results", 3)
benchmark_table = md.table([names, means, medians, stddevs])
link = "See more details on " + md.link("Buildkite", build_url)
if response["truncated"]:
raise requests.RequestException(f"Content too large and gotten truncated")

return "\n\n".join([header, commit_info, benchmark_table, link])
gist_id = response["id"]
return f"https://gist.github.com/{GITHUB_USER}/{gist_id}"


def comment_on_pr(content):
def comment_on_pr(content, verbose: bool = False):
"""Posts the given content as comments to the current pull request."""
pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")
# Buildkite sets this to "false" if not running on a PR:
@@ -254,14 +434,18 @@ def check_file_path(path):


def main(args):
benchmarks_md = get_benchmark_result_markdown(args.benchmark_files,
query_base=args.query_base,
verbose=args.verbose)
full_md, abbr_md = get_benchmark_result_markdown(args.benchmark_files,
query_base=args.query_base,
verbose=args.verbose)

if args.dry_run:
print(benchmarks_md)
print(full_md, "\n\n", abbr_md)
else:
comment_on_pr(benchmarks_md)
build_number = get_required_env_var("BUILDKITE_BUILD_NUMBER")
filename = f"iree-full-benchmark-result-{build_number}.md"
gist_url = post_to_gist(filename, full_md, args.verbose)
abbr_md = abbr_md.replace("<<placeholder-link>>", gist_url)
comment_on_pr(abbr_md, args.verbose)


if __name__ == "__main__":