Format codebase
This commit formats all .py files in the codebase
with the black Python formatter and adds the black
package to the environment.yml file.

Issue princeton-nlp#195 Black Format Codebase
hminsky2002 committed Apr 17, 2024
1 parent 799f07e commit a906e25
Showing 25 changed files with 1,066 additions and 578 deletions.
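
Note: the exact command used is not recorded in this commit, but a whole-repository reformat like this is typically produced by running the black CLI from the repository root, roughly:

    black .          # rewrite all .py files under the current directory in place
    black --check .  # verify formatting without modifying any files

Both invocations are standard black usage; any extra flags or CI integration would be assumptions beyond what this commit shows.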
2 changes: 2 additions & 0 deletions config/commands/_split_string.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import sys
 
+
 def print_flake8_output(input_string, show_line_numbers=False):
     for value in input_string.split("\n"):
         parts = value.split()
@@ -10,6 +11,7 @@ def print_flake8_output(input_string, show_line_numbers=False):
             line_nums = ":".join(parts[0].split(":")[1:])
             print(f"- {line_nums} {' '.join(parts[1:])}")
 
+
 if __name__ == "__main__":
     lint_output = sys.argv[1]
     print_flake8_output(lint_output)
1 change: 1 addition & 0 deletions environment.yml
@@ -20,3 +20,4 @@ dependencies:
   - together
   - ollama
   - rich-argparse
+  - black
95 changes: 64 additions & 31 deletions evaluation/aggregate_results.py
@@ -5,6 +5,7 @@
 import os
 import pandas as pd
 import warnings
+
 warnings.filterwarnings("ignore")
 
 from pathlib import Path
@@ -79,25 +80,29 @@ def convert_experiments_to_rows(folder_name, runs_max):
         if len(folder_data) != 8:
             # TODO: This might be too strict?
             continue
-        temperature = float(folder_data[3][len("t-"):].strip())
-        top_p = float(folder_data[4][len("p-"):].strip())
-        cost = float(folder_data[5][len("c-"):].strip())
+        temperature = float(folder_data[3][len("t-") :].strip())
+        top_p = float(folder_data[4][len("p-") :].strip())
+        cost = float(folder_data[5][len("c-") :].strip())
         install = "Y" if folder_data[6].strip() == "install-1" else "N"
 
         # Parse out run number
         run = folder_data[-1]
         if "run" not in run:
             continue
 
         try:
             if "run-" in run:
-                run = int(run.split("run-")[-1].split("-")[0].replace("_", "").strip())
+                run = int(
+                    run.split("run-")[-1].split("-")[0].replace("_", "").strip()
+                )
             else:
-                run = int(run.split("run")[-1].split("-")[0].replace("_", "").strip())
+                run = int(
+                    run.split("run")[-1].split("-")[0].replace("_", "").strip()
+                )
         except Exception as e:
             print(run)
             raise e
 
         if runs_max is not None and run > runs_max:
             continue
 
@@ -108,11 +113,15 @@ def convert_experiments_to_rows(folder_name, runs_max):
 
         # Extract resolved ids (to calculate pass@k)
         resolved_ids = []
-        if "resolved" in results_data and isinstance(results_data["resolved"], list):
+        if "resolved" in results_data and isinstance(
+            results_data["resolved"], list
+        ):
             resolved_ids = results_data["resolved"]
-        elif "counts" in results_data and isinstance(results_data["counts"]["resolved"], list):
+        elif "counts" in results_data and isinstance(
+            results_data["counts"]["resolved"], list
+        ):
             resolved_ids = results_data["counts"]["resolved"]
 
         # Extract instance costs from trajectories
         costs_overall = []
         costs_success = []
@@ -156,10 +165,7 @@ def convert_experiments_to_rows(folder_name, runs_max):
 
 def get_results_df(folder_name, runs_max):
     rows = convert_experiments_to_rows(folder_name, runs_max)
-    return (
-        pd.DataFrame(rows, columns=COLUMNS)
-        .sort_values(by=COLUMNS[:8])
-    )
+    return pd.DataFrame(rows, columns=COLUMNS).sort_values(by=COLUMNS[:8])
 
 
 def get_results_csv(folder_name):
@@ -169,12 +175,29 @@ def get_results_csv(folder_name):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Aggregate results from experiments")
-    parser.add_argument("--folder", type=str, help="Folder containing experiment results", default="../trajectories")
-    parser.add_argument("--model", nargs='+', type=str, help="Model(s) to filter results by.")
-    parser.add_argument("--dataset", nargs='+', type=str, help="Dataset to filter results by.")
-    parser.add_argument("--setup", nargs='+', type=str, help="Setup to filter results by.")
-    parser.add_argument("--runs_min", type=int, help="Minimum number of runs that experiment should have been run for.")
-    parser.add_argument("--runs_max", type=int, help="Maximum number of runs taken into account")
+    parser.add_argument(
+        "--folder",
+        type=str,
+        help="Folder containing experiment results",
+        default="../trajectories",
+    )
+    parser.add_argument(
+        "--model", nargs="+", type=str, help="Model(s) to filter results by."
+    )
+    parser.add_argument(
+        "--dataset", nargs="+", type=str, help="Dataset to filter results by."
+    )
+    parser.add_argument(
+        "--setup", nargs="+", type=str, help="Setup to filter results by."
+    )
+    parser.add_argument(
+        "--runs_min",
+        type=int,
+        help="Minimum number of runs that experiment should have been run for.",
+    )
+    parser.add_argument(
+        "--runs_max", type=int, help="Maximum number of runs taken into account"
+    )
     args = parser.parse_args()
 
     df = get_results_df(args.folder, args.runs_max)
@@ -188,10 +211,18 @@ def get_results_csv(folder_name):
                 "Generated": "mean",
                 "Applied": "mean",
                 "Resolved": "mean",
-                "Resolved IDs": lambda x: len(set([item for sublist in x for item in sublist])),
-                "Costs Success": lambda x: np.mean([item for sublist in x for item in sublist]),
-                "Costs Failure": lambda x: np.mean([item for sublist in x for item in sublist]),
-                "Costs Overall": lambda x: np.mean([item for sublist in x for item in sublist]),
+                "Resolved IDs": lambda x: len(
+                    set([item for sublist in x for item in sublist])
+                ),
+                "Costs Success": lambda x: np.mean(
+                    [item for sublist in x for item in sublist]
+                ),
+                "Costs Failure": lambda x: np.mean(
+                    [item for sublist in x for item in sublist]
+                ),
+                "Costs Overall": lambda x: np.mean(
+                    [item for sublist in x for item in sublist]
+                ),
             }
         )
         .round(2)
@@ -201,19 +232,21 @@ def get_results_csv(folder_name):
 
     # Filtering
    if args.model:
-        grouped_data = grouped_data[grouped_data['Model'].isin(args.model)]
+        grouped_data = grouped_data[grouped_data["Model"].isin(args.model)]
    if args.dataset:
-        grouped_data = grouped_data[grouped_data['Dataset'].isin(args.dataset)]
+        grouped_data = grouped_data[grouped_data["Dataset"].isin(args.dataset)]
    if args.setup:
-        grouped_data = grouped_data[grouped_data['Setup'].isin(args.setup)]
+        grouped_data = grouped_data[grouped_data["Setup"].isin(args.setup)]
    if args.runs_min:
-        grouped_data = grouped_data[grouped_data['Run'] >= args.runs_min]
+        grouped_data = grouped_data[grouped_data["Run"] >= args.runs_min]
 
     print(f"Total experiments run: {grouped_data.shape[0]}")
-    grouped_data_sorted = grouped_data.sort_values(by=['Dataset', 'Resolved'], ascending=[True, False])
+    grouped_data_sorted = grouped_data.sort_values(
+        by=["Dataset", "Resolved"], ascending=[True, False]
+    )
     pd.set_option("display.max_rows", None)
-    grouped = grouped_data_sorted.groupby('Dataset')
+    grouped = grouped_data_sorted.groupby("Dataset")
 
     for name, group in grouped:
-        print(f'\n-----------------\nDataset: {name}\n-----------------')
+        print(f"\n-----------------\nDataset: {name}\n-----------------")
         print(group.to_string(index=False))
40 changes: 29 additions & 11 deletions evaluation/evaluation.py
@@ -22,13 +22,26 @@
 from unidiff import PatchSet
 
 
-def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, timeout, verbose, conda_link, log_suffix, num_processes):
+def main(
+    predictions_path,
+    log_dir,
+    swe_bench_tasks,
+    testbed,
+    skip_existing,
+    timeout,
+    verbose,
+    conda_link,
+    log_suffix,
+    num_processes,
+):
     # Check if paths exist
     if not os.path.exists(predictions_path):
         raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")
     eval_refs = get_eval_refs(swe_bench_tasks)
     for k, v in eval_refs.items():
-        eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}
+        eval_refs[k] = {
+            key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]
+        }
 
     # Change model_name_or_patch field to directory name for all predictions
     directory = os.path.dirname(predictions_path)
@@ -65,7 +78,7 @@ def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, tim
             verbose=verbose,
             conda_link=conda_link,
             log_suffix=log_suffix,
-            num_processes=num_processes
+            num_processes=num_processes,
         )
         print("✅ Finished evaluation")
     except Exception as e:
@@ -91,9 +104,13 @@ def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, tim
         scorecard["stats"]["traj_action_dist"] = dict(
             Counter(
                 [
-                    entry["action"].strip().split()[0]
-                    if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
-                    else None
+                    (
+                        entry["action"].strip().split()[0]
+                        if entry["role"] == "assistant"
+                        and "action" in entry
+                        and len(entry["action"]) > 0
+                        else None
+                    )
                     for entry in traj_data["history"]
                 ]
             )
@@ -144,7 +161,7 @@ def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, tim
             "success": {
                 "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
                 "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
-            }
+            },
         }
         resolution_status = get_resolution_status(report)
         scorecard["statuses"].append(resolution_status)
@@ -218,13 +235,14 @@ def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, tim
         "--verbose", action="store_true", help="(Optional) Verbose mode"
     )
     parser.add_argument(
-        "--conda_link", default=None, type=str, help="(Optional) URL to conda installation to use"
+        "--conda_link",
+        default=None,
+        type=str,
+        help="(Optional) URL to conda installation to use",
     )
     parser.add_argument(
         "--log_suffix", default=None, type=str, help="(Optional) Log suffix"
    )
-    parser.add_argument(
-        "--num_processes", default=-1, type=int, help="Num processes"
-    )
+    parser.add_argument("--num_processes", default=-1, type=int, help="Num processes")
     args = parser.parse_args()
     main(**vars(args))
[Diffs for the remaining 21 changed files are not shown here.]
