Merge pull request #198 from kiudee/197_drawrate

Let CLI output draw rate of the matches

kiudee committed Mar 27, 2022
2 parents 72f6a05 + a98bfff, commit 72bf37b
Showing 4 changed files with 38 additions and 21 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
@@ -13,6 +13,7 @@ Local tuner
 - Add support for ``engineX_restart`` (default "auto") which allows one to set
   the restart mode used by cutechess (#95).
 - Add depth-based time control using ``engineX_depth`` (#95).
+- Log the estimated draw rate of the current match (#197).
 
 
 0.9.2 (2022-03-13)
12 changes: 8 additions & 4 deletions tests/test_local.py
@@ -27,11 +27,12 @@ def test_parse_experiment_result():
 Elo difference: -31.4 +/- 57.1, LOS: 13.9 %, DrawRatio: 31.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.0)
     assert_almost_equal(error, 0.887797821633887)
+    assert_almost_equal(draw_rate, 1 / 4)
 
     # Test cutechess 1.2.0 output:
     teststr = """Started game 1 of 4 (engine1 vs engine2)
@@ -52,11 +53,12 @@ def test_parse_experiment_result():
 Elo difference: -88.7 +/- nan, LOS: 28.2 %, DrawRatio: 25.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.38764005203222596)
     assert_almost_equal(error, 0.6255020676255081)
+    assert_almost_equal(draw_rate, 1.5 / 5)
 
     teststr = """Indexing opening suite...
 Started game 1 of 40 (engine1 vs engine2)
@@ -90,11 +92,12 @@ def test_parse_experiment_result():
 Finished game 10 (engine2 vs engine1): 1/2-1/2 {Draw by adjudication}
 Score of engine1 vs engine2: 10 - 0 - 0 [0.450] 10
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, -2.7958800173440745)
     assert_almost_equal(error, 1.9952678343378125)
+    assert_almost_equal(draw_rate, 1 / 8)
 
     # Test if the result is correct in case the order of finished games is not linear.
     # This can happen with concurrency > 1
@@ -116,11 +119,12 @@ def test_parse_experiment_result():
 Elo difference: -88.7 +/- nan, LOS: 28.2 %, DrawRatio: 25.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.38764005203222596)
     assert_almost_equal(error, 0.6255020676255081)
+    assert_almost_equal(draw_rate, 1.5 / 5)
 
 
 def test_reduce_ranges():
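The expected values follow from the smoothed estimator added to parse_experiment_result in tune/local.py below: with n game pairs of which d are double draws and h contain exactly one drawn game, the estimate is (d + 0.5·h + 1) / (n + 3). The asserted 1.5 / 5, for instance, corresponds to two game pairs where one pair contains a single draw: (0.5 + 1) / (2 + 3). (The pair breakdowns are inferred from the asserted values, since the test strings are truncated here.)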
3 changes: 2 additions & 1 deletion tune/cli.py
@@ -502,10 +502,11 @@ def local( # noqa: C901
     root_logger.info(f"Experiment finished ({difference}s elapsed).")
 
     # Parse cutechess-cli output and report results (Elo and standard deviation):
-    score, error_variance = parse_experiment_result(out_exp, **settings)
+    score, error_variance, draw_rate = parse_experiment_result(out_exp, **settings)
     root_logger.info(
         "Got Elo: {} +- {}".format(-score * 100, np.sqrt(error_variance) * 100)
     )
+    root_logger.info("Estimated draw rate: {:.2%}".format(draw_rate))
 
     # Update model with the new data:
     root_logger.info("Updating model")
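For reference, the {:.2%} format spec renders the fraction as a percentage with two decimal places. A quick sketch of the resulting log line (assuming a draw_rate of 0.3; not part of the diff):

print("Estimated draw rate: {:.2%}".format(0.3))
# Estimated draw rate: 30.00%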
43 changes: 27 additions & 16 deletions tune/local.py
@@ -9,6 +9,7 @@
 from typing import (
     Any,
     Callable,
+    Iterable,
     Iterator,
     List,
     Optional,
@@ -53,7 +54,7 @@
 LOGGER = "ChessTuner"
 
 
-def elo_to_prob(elo, k=4.0):
+def elo_to_prob(elo: np.ndarray, k: float = 4.0) -> np.ndarray:
     """Convert an Elo score (logit space) to a probability.
 
     Parameters
@@ -76,10 +77,10 @@ def elo_to_prob(elo, k=4.0):
     """
     if k <= 0:
         raise ValueError("k must be positive")
-    return 1 / (1 + np.power(10, -elo / k))
+    return np.atleast_1d(1 / (1 + np.power(10, -elo / k)))
 
 
-def prob_to_elo(p, k=4.0):
+def prob_to_elo(p: np.ndarray, k: float = 4.0) -> np.ndarray:
     """Convert a win probability to an Elo score (logit space).
 
     Parameters
@@ -102,12 +103,12 @@ def prob_to_elo(p, k=4.0):
     """
     if k <= 0:
         raise ValueError("k must be positive")
-    return k * np.log10(-p / (p - 1))
+    return np.atleast_1d(k * np.log10(-p / (p - 1)))
 
 
 def counts_to_penta(
     counts: np.ndarray,
-    prior_counts: Optional[np.ndarray] = None,
+    prior_counts: Optional[Iterable[float]] = None,
     n_dirichlet_samples: int = 1000000,
     score_scale: float = 4.0,
     random_state: Union[int, RandomState, None] = None,
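Both conversion helpers are now wrapped in np.atleast_1d, so scalar inputs come back as 1-d arrays. A minimal round-trip sketch, assuming the two functions are importable from tune.local as defined above:

import numpy as np

from tune.local import elo_to_prob, prob_to_elo

p = elo_to_prob(np.array([0.0, 1.0]))  # array([0.5, 0.64006475]) with k=4
np.testing.assert_allclose(prob_to_elo(p), [0.0, 1.0])  # round-trip recovers Elo
print(elo_to_prob(0.0))  # scalar input now yields array([0.5]), not a bare float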
@@ -142,11 +143,13 @@ def counts_to_penta(
     """
     if prior_counts is None:
         prior_counts = np.array([0.14, 0.19, 0.34, 0.19, 0.14]) * 2.5
-    elif len(prior_counts) != 5:
-        raise ValueError("Argument prior_counts should contain 5 elements.")
+    else:
+        prior_counts = np.array(prior_counts)
+        if len(prior_counts) != 5:
+            raise ValueError("Argument prior_counts should contain 5 elements.")
     dist = dirichlet(alpha=counts + prior_counts)
     scores = [0.0, 0.25, 0.5, 0.75, 1.0]
-    score = prob_to_elo(dist.mean().dot(scores), k=score_scale)
+    score = float(prob_to_elo(dist.mean().dot(scores), k=score_scale))
     error = prob_to_elo(
         dist.rvs(n_dirichlet_samples, random_state=random_state).dot(scores),
         k=score_scale,
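With the new coercion, prior_counts may be any iterable of five floats rather than an ndarray only. A hypothetical usage sketch (the prior below merely restates the default, [0.14, 0.19, 0.34, 0.19, 0.14] * 2.5):

import numpy as np

from tune.local import counts_to_penta

# Pentanomial game-pair counts in the order [WW, WD, WL/DD, LD, LL]:
counts = np.array([0, 1, 3, 1, 0])
score, error = counts_to_penta(
    counts=counts,
    prior_counts=[0.35, 0.475, 0.85, 0.475, 0.35],  # a plain list now works
    n_dirichlet_samples=100000,
    random_state=0,
)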
@@ -985,13 +988,13 @@ def check_log_for_errors(cutechess_output: List[str],) -> None:
 
 
 def parse_experiment_result(
-    outstr,
-    prior_counts=None,
-    n_dirichlet_samples=1000000,
-    score_scale=4.0,
-    random_state=None,
-    **kwargs,
-):
+    outstr: str,
+    prior_counts: Optional[Sequence[float]] = None,
+    n_dirichlet_samples: int = 1000000,
+    score_scale: float = 4.0,
+    random_state: Union[int, RandomState, None] = None,
+    **kwargs: Any,
+) -> Tuple[float, float, float]:
     """Parse cutechess-cli result output to extract mean score and error.
 
     Here we use a simple pentanomial model to exploit paired openings.
@@ -1034,6 +1037,8 @@ def parse_experiment_result(
     error : float
         Estimated standard error of the score. Estimated by repeated draws
         from a Dirichlet distribution.
+    draw_rate : float
+        Estimated draw rate of the match.
     """
     wdl_strings = re.findall(r"Score of.*:\s*([0-9]+\s-\s[0-9]+\s-\s[0-9]+)", outstr)
     array = np.array(
@@ -1048,6 +1053,7 @@ def parse_experiment_result(
     diffs = diffs[np.argsort(finished)]
 
     counts = {"WW": 0, "WD": 0, "WL/DD": 0, "LD": 0, "LL": 0}
+    DD = 0  # Track DD separately to compute draw rate
     for i in range(0, len(diffs) - 1, 2):
         match = diffs[i] + diffs[i + 1]
         if match[0] == 2:
@@ -1061,17 +1067,22 @@ def parse_experiment_result(
             counts["LD"] += 1
         elif match[2] == 2:
             counts["WL/DD"] += 1
+            DD += 1
         else:
             counts["LL"] += 1
     counts_array = np.array(list(counts.values()))
-    return counts_to_penta(
+    score, error = counts_to_penta(
         counts=counts_array,
         prior_counts=prior_counts,
         n_dirichlet_samples=n_dirichlet_samples,
         score_scale=score_scale,
         random_state=random_state,
         **kwargs,
     )
+    draw_rate = (DD + 0.5 * counts["WD"] + 0.5 * counts["LD"] + 1.0) / (
+        counts_array.sum() + 3.0
+    )
+    return score, error, draw_rate
 
 
 def update_model(
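The added estimator is a smoothed draw frequency over game pairs: double draws count fully, single-draw pairs count half, and one pseudo drawn pair out of three pseudo-pairs is mixed in so short matches do not produce extreme rates. A standalone sketch checking the arithmetic against the expectations in tests/test_local.py above:

def smoothed_draw_rate(dd: int, wd: int, ld: int, n_pairs: int) -> float:
    # (full draws + half draws + 1 pseudo-draw) / (game pairs + 3 pseudo-pairs)
    return (dd + 0.5 * wd + 0.5 * ld + 1.0) / (n_pairs + 3.0)

assert smoothed_draw_rate(0, 0, 0, n_pairs=1) == 1 / 4    # 2 decisive games
assert smoothed_draw_rate(0, 1, 0, n_pairs=2) == 1.5 / 5  # one pair holds one draw
assert smoothed_draw_rate(0, 0, 0, n_pairs=5) == 1 / 8    # 10 decisive games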
