Merge pull request #198 from kiudee/197_drawrate

Let CLI output draw rate of the matches

kiudee committed Mar 27, 2022
2 parents 72f6a05 + a98bfff, commit 72bf37b
Showing 4 changed files with 38 additions and 21 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
@@ -13,6 +13,7 @@ Local tuner
 - Add support for ``engineX_restart`` (default "auto") which allows one to set
   the restart mode used by cutechess (#95).
 - Add depth-based time control using ``engineX_depth`` (#95).
+- Log the estimated draw rate of the current match (#197).
 
 
 0.9.2 (2022-03-13)
12 changes: 8 additions & 4 deletions tests/test_local.py
@@ -27,11 +27,12 @@ def test_parse_experiment_result():
 Elo difference: -31.4 +/- 57.1, LOS: 13.9 %, DrawRatio: 31.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.0)
     assert_almost_equal(error, 0.887797821633887)
+    assert_almost_equal(draw_rate, 1 / 4)
 
     # Test cutechess 1.2.0 output:
     teststr = """Started game 1 of 4 (engine1 vs engine2)
@@ -52,11 +53,12 @@ def test_parse_experiment_result():
 Elo difference: -88.7 +/- nan, LOS: 28.2 %, DrawRatio: 25.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.38764005203222596)
     assert_almost_equal(error, 0.6255020676255081)
+    assert_almost_equal(draw_rate, 1.5 / 5)
 
     teststr = """Indexing opening suite...
 Started game 1 of 40 (engine1 vs engine2)
@@ -90,11 +92,12 @@ def test_parse_experiment_result():
 Finished game 10 (engine2 vs engine1): 1/2-1/2 {Draw by adjudication}
 Score of engine1 vs engine2: 10 - 0 - 0 [0.450] 10
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, -2.7958800173440745)
     assert_almost_equal(error, 1.9952678343378125)
+    assert_almost_equal(draw_rate, 1 / 8)
 
     # Test if the result is correct in case the order of finished games is not linear.
     # This can happen with concurrency > 1
@@ -116,11 +119,12 @@ def test_parse_experiment_result():
 Elo difference: -88.7 +/- nan, LOS: 28.2 %, DrawRatio: 25.0 %
 Finished match
 """
-    score, error = parse_experiment_result(
+    score, error, draw_rate = parse_experiment_result(
         teststr, n_dirichlet_samples=1000, random_state=0
     )
     assert_almost_equal(score, 0.38764005203222596)
     assert_almost_equal(error, 0.6255020676255081)
+    assert_almost_equal(draw_rate, 1.5 / 5)
 
 
 def test_reduce_ranges():
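The expected values follow from the smoothed estimator added to parse_experiment_result in tune/local.py below: with n game pairs of which d are double draws and h contain exactly one drawn game, the estimate is (d + 0.5·h + 1) / (n + 3). The asserted 1.5 / 5, for instance, corresponds to two game pairs where one pair contains a single draw: (0.5 + 1) / (2 + 3). (The pair breakdowns are inferred from the asserted values, since the test strings are truncated here.)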
3 changes: 2 additions & 1 deletion tune/cli.py
@@ -502,10 +502,11 @@ def local( # noqa: C901
     root_logger.info(f"Experiment finished ({difference}s elapsed).")
 
     # Parse cutechess-cli output and report results (Elo and standard deviation):
-    score, error_variance = parse_experiment_result(out_exp, **settings)
+    score, error_variance, draw_rate = parse_experiment_result(out_exp, **settings)
     root_logger.info(
         "Got Elo: {} +- {}".format(-score * 100, np.sqrt(error_variance) * 100)
     )
+    root_logger.info("Estimated draw rate: {:.2%}".format(draw_rate))
 
     # Update model with the new data:
     root_logger.info("Updating model")
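For reference, the {:.2%} format spec renders the fraction as a percentage with two decimal places. A quick sketch of the resulting log line (assuming a draw_rate of 0.3; not part of the diff):

print("Estimated draw rate: {:.2%}".format(0.3))
# Estimated draw rate: 30.00%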
43 changes: 27 additions & 16 deletions tune/local.py
@@ -9,6 +9,7 @@
 from typing import (
     Any,
     Callable,
+    Iterable,
     Iterator,
     List,
     Optional,
@@ -53,7 +54,7 @@
 LOGGER = "ChessTuner"
 
 
-def elo_to_prob(elo, k=4.0):
+def elo_to_prob(elo: np.ndarray, k: float = 4.0) -> np.ndarray:
     """Convert an Elo score (logit space) to a probability.
 
     Parameters
@@ -76,10 +77,10 @@ def elo_to_prob(elo, k=4.0):
     """
     if k <= 0:
         raise ValueError("k must be positive")
-    return 1 / (1 + np.power(10, -elo / k))
+    return np.atleast_1d(1 / (1 + np.power(10, -elo / k)))
 
 
-def prob_to_elo(p, k=4.0):
+def prob_to_elo(p: np.ndarray, k: float = 4.0) -> np.ndarray:
     """Convert a win probability to an Elo score (logit space).
 
     Parameters
@@ -102,12 +103,12 @@ def prob_to_elo(p, k=4.0):
     """
     if k <= 0:
         raise ValueError("k must be positive")
-    return k * np.log10(-p / (p - 1))
+    return np.atleast_1d(k * np.log10(-p / (p - 1)))
 
 
 def counts_to_penta(
     counts: np.ndarray,
-    prior_counts: Optional[np.ndarray] = None,
+    prior_counts: Optional[Iterable[float]] = None,
     n_dirichlet_samples: int = 1000000,
     score_scale: float = 4.0,
     random_state: Union[int, RandomState, None] = None,
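Both conversion helpers are now wrapped in np.atleast_1d, so scalar inputs come back as 1-d arrays. A minimal round-trip sketch, assuming the two functions are importable from tune.local as defined above:

import numpy as np

from tune.local import elo_to_prob, prob_to_elo

p = elo_to_prob(np.array([0.0, 1.0]))  # array([0.5, 0.64006475]) with k=4
np.testing.assert_allclose(prob_to_elo(p), [0.0, 1.0])  # round-trip recovers Elo
print(elo_to_prob(0.0))  # scalar input now yields array([0.5]), not a bare float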
@@ -142,11 +143,13 @@ def counts_to_penta(
     """
     if prior_counts is None:
         prior_counts = np.array([0.14, 0.19, 0.34, 0.19, 0.14]) * 2.5
-    elif len(prior_counts) != 5:
-        raise ValueError("Argument prior_counts should contain 5 elements.")
+    else:
+        prior_counts = np.array(prior_counts)
+        if len(prior_counts) != 5:
+            raise ValueError("Argument prior_counts should contain 5 elements.")
     dist = dirichlet(alpha=counts + prior_counts)
     scores = [0.0, 0.25, 0.5, 0.75, 1.0]
-    score = prob_to_elo(dist.mean().dot(scores), k=score_scale)
+    score = float(prob_to_elo(dist.mean().dot(scores), k=score_scale))
     error = prob_to_elo(
         dist.rvs(n_dirichlet_samples, random_state=random_state).dot(scores),
         k=score_scale,
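With the new coercion, prior_counts may be any iterable of five floats rather than an ndarray only. A hypothetical usage sketch (the prior below merely restates the default, [0.14, 0.19, 0.34, 0.19, 0.14] * 2.5):

import numpy as np

from tune.local import counts_to_penta

# Pentanomial game-pair counts in the order [WW, WD, WL/DD, LD, LL]:
counts = np.array([0, 1, 3, 1, 0])
score, error = counts_to_penta(
    counts=counts,
    prior_counts=[0.35, 0.475, 0.85, 0.475, 0.35],  # a plain list now works
    n_dirichlet_samples=100000,
    random_state=0,
)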
@@ -985,13 +988,13 @@ def check_log_for_errors(cutechess_output: List[str],) -> None:
 
 
 def parse_experiment_result(
-    outstr,
-    prior_counts=None,
-    n_dirichlet_samples=1000000,
-    score_scale=4.0,
-    random_state=None,
-    **kwargs,
-):
+    outstr: str,
+    prior_counts: Optional[Sequence[float]] = None,
+    n_dirichlet_samples: int = 1000000,
+    score_scale: float = 4.0,
+    random_state: Union[int, RandomState, None] = None,
+    **kwargs: Any,
+) -> Tuple[float, float, float]:
     """Parse cutechess-cli result output to extract mean score and error.
 
     Here we use a simple pentanomial model to exploit paired openings.
@@ -1034,6 +1037,8 @@ def parse_experiment_result(
     error : float
         Estimated standard error of the score. Estimated by repeated draws
         from a Dirichlet distribution.
+    draw_rate : float
+        Estimated draw rate of the match.
     """
     wdl_strings = re.findall(r"Score of.*:\s*([0-9]+\s-\s[0-9]+\s-\s[0-9]+)", outstr)
     array = np.array(
@@ -1048,6 +1053,7 @@ def parse_experiment_result(
     diffs = diffs[np.argsort(finished)]
 
     counts = {"WW": 0, "WD": 0, "WL/DD": 0, "LD": 0, "LL": 0}
+    DD = 0  # Track DD separately to compute draw rate
     for i in range(0, len(diffs) - 1, 2):
         match = diffs[i] + diffs[i + 1]
         if match[0] == 2:
@@ -1061,17 +1067,22 @@ def parse_experiment_result(
             counts["LD"] += 1
         elif match[2] == 2:
             counts["WL/DD"] += 1
+            DD += 1
         else:
             counts["LL"] += 1
     counts_array = np.array(list(counts.values()))
-    return counts_to_penta(
+    score, error = counts_to_penta(
         counts=counts_array,
         prior_counts=prior_counts,
         n_dirichlet_samples=n_dirichlet_samples,
         score_scale=score_scale,
         random_state=random_state,
         **kwargs,
     )
+    draw_rate = (DD + 0.5 * counts["WD"] + 0.5 * counts["LD"] + 1.0) / (
+        counts_array.sum() + 3.0
+    )
+    return score, error, draw_rate
 
 
 def update_model(
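The added estimator is a smoothed draw frequency over game pairs: double draws count fully, single-draw pairs count half, and one pseudo drawn pair out of three pseudo-pairs is mixed in so short matches do not produce extreme rates. A standalone sketch checking the arithmetic against the expectations in tests/test_local.py above:

def smoothed_draw_rate(dd: int, wd: int, ld: int, n_pairs: int) -> float:
    # (full draws + half draws + 1 pseudo-draw) / (game pairs + 3 pseudo-pairs)
    return (dd + 0.5 * wd + 0.5 * ld + 1.0) / (n_pairs + 3.0)

assert smoothed_draw_rate(0, 0, 0, n_pairs=1) == 1 / 4    # 2 decisive games
assert smoothed_draw_rate(0, 1, 0, n_pairs=2) == 1.5 / 5  # one pair holds one draw
assert smoothed_draw_rate(0, 0, 0, n_pairs=5) == 1 / 8    # 10 decisive games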
