In [64]:
import chess
import chess.pgn
import chess.engine
import io
import re
import os
import sys
from io import StringIO
import subprocess
import numpy as np
from contextlib import contextmanager

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
from typing import List, Dict, Tuple, Generator

In [3]:
import pyperclip as pc

# Python chess basics
https://python-chess.readthedocs.io/en/latest/

# Sample PGN Data

In [4]:
test_pgn = '''
[Event "Rated Bullet game"]
[Site "https://lichess.org/69CbaD8f"]
[Date "2021.10.07"]
[White "Cubigami"]
[Black "JoinedToday"]
[Result "1-0"]
[UTCDate "2021.10.07"]
[UTCTime "05:16:00"]
[WhiteElo "1930"]
[BlackElo "1912"]
[WhiteRatingDiff "+5"]
[BlackRatingDiff "-6"]
[Variant "Standard"]
[TimeControl "60+0"]
[ECO "B01"]
[Opening "Scandinavian Defense: Mieses-Kotroc Variation"]
[Termination "Normal"]
[Annotator "lichess.org"]

1. e4 { [%eval 0.24] [%clk 0:01:00] } 1... d5?! { (0.24 → 0.82) Inaccuracy. e5 was best. } { [%eval 0.82] [%clk 0:01:00] } (1... e5 2. Nf3 Nc6 3. Bb5 Nf6 4. O-O Nxe4 5. Re1 Nd6 6. Nxe5) 2. exd5 { [%eval 0.36] [%clk 0:01:00] } 2... Qxd5 { [%eval 0.66] [%clk 0:01:00] } { B01 Scandinavian Defense: Mieses-Kotroc Variation } 3. Nc3 { [%eval 0.46] [%clk 0:01:00] } 3... Qe6+?! { (0.46 → 1.12) Inaccuracy. Qa5 was best. } { [%eval 1.12] [%clk 0:01:00] } (3... Qa5 4. Nf3 Nf6 5. d4 Bf5 6. Ne5 c6 7. Bf4 Nbd7 8. Nc4) 4. Be2 { [%eval 0.94] [%clk 0:00:59] } 4... Qg6? { (0.94 → 2.18) Mistake. Nf6 was best. } { [%eval 2.18] [%clk 0:01:00] } (4... Nf6 5. d4 Qd6 6. Nf3 e6 7. O-O Be7 8. Nb5 Qd8 9. c4) 5. Nf3 { [%eval 1.51] [%clk 0:00:58] } 5... Nc6?? { (1.51 → 4.52) Blunder. Qxg2 was best. } { [%eval 4.52] [%clk 0:00:59] } (5... Qxg2 6. Rg1 Qh3 7. d4 Nf6 8. Rg3 Qf5 9. Ne5 c6 10. Rf3) 6. O-O?? { (4.52 → 0.37) Blunder. Nb5 was best. } { [%eval 0.37] [%clk 0:00:57] } (6. Nb5 Kd8 7. d4 a6 8. d5 axb5 9. dxc6+ Ke8 10. Ne5 Qe6) 6... h5?? { (0.37 → 5.60) Blunder. Bh3 was best. } { [%eval 5.6] [%clk 0:00:59] } (6... Bh3 7. Ne1 Bf5 8. Nd5 O-O-O 9. Ne3 Nf6 10. Nf3 Be4 11. d3) 7. h4?? { (5.60 → 0.08) Blunder. Nb5 was best. } { [%eval 0.08] [%clk 0:00:56] } (7. Nb5 Bh3) 7... a6?? { (0.08 → 2.63) Blunder. Bh3 was best. } { [%eval 2.63] [%clk 0:00:58] } (7... Bh3 8. Ng5 Bf5 9. Bb5 f6 10. Nf3 e6 11. Nd4 Ne7 12. Qf3 Kd7 13. Re1 a6 14. Ba4) 8. Bd3?? { (2.63 → -0.22) Blunder. d4 was best. } { [%eval -0.22] [%clk 0:00:53] } (8. d4) 8... Qf6?? { (-0.22 → 1.46) Blunder. Bf5 was best. } { [%eval 1.46] [%clk 0:00:56] } (8... Bf5) 9. Ne4 { [%eval 1.24] [%clk 0:00:51] } 9... Qe6?! { (1.24 → 2.16) Inaccuracy. Qg6 was best. } { [%eval 2.16] [%clk 0:00:54] } (9... Qg6 10. Ng3) 10. Nfg5? { (2.16 → 0.77) Mistake. Neg5 was best. } { [%eval 0.77] [%clk 0:00:51] } (10. Neg5 Qd6 11. Bc4 Nh6 12. c3 Bf5 13. Qb3 O-O-O 14. Bxf7 e5 15. Be6+ Bxe6 16. Qxe6+ Qxe6) 10... Qd7?? { (0.77 → 8.62) Blunder. Qd5 was best. } { [%eval 8.62] [%clk 0:00:52] } (10... Qd5 11. Nc3 Qd8 12. Bc4 e6 13. Re1 Be7 14. d3 Nh6 15. Qxh5 g6 16. Qd1 Nf5 17. g3) 11. Qf3?? { (8.62 → -0.59) Blunder. Nxf7 was best. } { [%eval -0.59] [%clk 0:00:50] } (11. Nxf7) 11... f6?? { (-0.59 → 8.00) Blunder. Ne5 was best. } { [%eval 8.0] [%clk 0:00:50] } (11... Ne5 12. Qf4 f6 13. Be2 Nc6 14. Nf3 e5 15. Qe3 Nge7 16. Qb3 Qd5 17. d3 Qxb3 18. axb3) 12. Nc5 { [%eval 7.68] [%clk 0:00:47] } 12... Qd8? { (7.68 → Mate in 1) Checkmate is now unavoidable. Ne5 was best. } { [%eval #1] [%clk 0:00:49] } (12... Ne5 13. Bg6+ Nxg6 14. Nxd7 Bxd7 15. Qxb7 Rd8 16. Qe4 Nxh4 17. Ne6 Bxe6 18. Qxe6 Rd6 19. Qc4) 13. Nge6? { (Mate in 1 → 8.51) Lost forced checkmate sequence. Bg6# was best. } { [%eval 8.51] [%clk 0:00:45] } (13. Bg6#) 13... Bxe6 { [%eval 8.53] [%clk 0:00:43] } 14. Nxe6 { [%eval 8.78] [%clk 0:00:45] } 14... Qd7? { (8.78 → Mate in 1) Checkmate is now unavoidable. Ne5 was best. } { [%eval #1] [%clk 0:00:43] } (14... Ne5 15. Qxb7) 15. Bg6# { [%clk 0:00:45] } { White wins by checkmate. } 1-0'''

carlson_draw = '''
[Event "Rated Blitz game"]
[Site "https://lichess.org/RzGPtofJ"]
[Date "2021.08.19"]
[White "Feokl1995"]
[Black "DrNykterstein"]
[Result "1/2-1/2"]
[UTCDate "2021.08.19"]
[UTCTime "22:35:41"]
[WhiteElo "2998"]
[BlackElo "3141"]
[WhiteRatingDiff "+2"]
[BlackRatingDiff "-2"]
[WhiteTitle "GM"]
[BlackTitle "GM"]
[Variant "Standard"]
[TimeControl "180+0"]
[ECO "B00"]
[Opening "Nimzowitsch Defense: Williams Variation"]
[Termination "Normal"]
[Annotator "lichess.org"]

1. e4 { [%eval 0.24] [%clk 0:03:00] } 1... Nc6 { [%eval 0.44] [%clk 0:03:00] } 2. Nf3 { [%eval 0.24] [%clk 0:02:58] } 2... d6 { [%eval 0.71] [%clk 0:02:59] } { B00 Nimzowitsch Defense: Williams Variation } 3. d4 { [%eval 0.72] [%clk 0:02:57] } 3... Nf6 { [%eval 0.89] [%clk 0:02:58] } 4. Nc3 { [%eval 0.54] [%clk 0:02:56] } 4... g6 { [%eval 0.96] [%clk 0:02:58] } 5. h3?! { (0.96 → 0.42) Inaccuracy. Be3 was best. } { [%eval 0.42] [%clk 0:02:55] } (5. Be3 Bg7 6. Qd2 O-O 7. d5 Nb8 8. Bh6 c6 9. h3 b5) 5... Bg7 { [%eval 0.67] [%clk 0:02:57] } 6. Be2 { [%eval 0.5] [%clk 0:02:55] } 6... O-O { [%eval 0.5] [%clk 0:02:57] } 7. O-O { [%eval 0.34] [%clk 0:02:54] } 7... a6 { [%eval 0.4] [%clk 0:02:55] } 8. a4 { [%eval 0.53] [%clk 0:02:53] } 8... b6 { [%eval 0.62] [%clk 0:02:50] } 9. Be3 { [%eval 0.39] [%clk 0:02:52] } 9... e6 { [%eval 0.6] [%clk 0:02:47] } 10. e5 { [%eval 0.84] [%clk 0:02:41] } 10... dxe5 { [%eval 0.56] [%clk 0:02:46] } 11. Nxe5 { [%eval 0.57] [%clk 0:02:40] } 11... Bb7 { [%eval 0.73] [%clk 0:02:43] } 12. Bf3 { [%eval 0.81] [%clk 0:02:38] } 12... Nd5 { [%eval 0.91] [%clk 0:02:42] } 13. Nxd5 { [%eval 0.85] [%clk 0:02:36] } 13... exd5 { [%eval 0.75] [%clk 0:02:42] } 14. Nxc6 { [%eval 0.81] [%clk 0:02:31] } 14... Bxc6 { [%eval 0.87] [%clk 0:02:42] } 15. c3 { [%eval 0.75] [%clk 0:02:30] } 15... a5 { [%eval 1.03] [%clk 0:02:41] } 16. Bf4 { [%eval 0.6] [%clk 0:02:29] } 16... Qd7 { [%eval 0.88] [%clk 0:02:40] } 17. Re1 { [%eval 0.75] [%clk 0:02:28] } 17... Rfe8 { [%eval 0.79] [%clk 0:02:39] } 18. b3 { [%eval 0.36] [%clk 0:02:26] } 18... Rxe1+ { [%eval 0.35] [%clk 0:02:37] } 19. Qxe1 { [%eval 0.45] [%clk 0:02:26] } 19... Re8 { [%eval 0.42] [%clk 0:02:36] } 20. Qd2 { [%eval 0.55] [%clk 0:02:26] } 20... Bf8 { [%eval 0.36] [%clk 0:02:36] } 21. Re1 { [%eval 0.28] [%clk 0:02:19] } 21... Rxe1+ { [%eval 0.29] [%clk 0:02:35] } 22. Qxe1 { [%eval 0.45] [%clk 0:02:19] } 22... Bd6 { [%eval 0.25] [%clk 0:02:34] } 23. Qe3 { [%eval 0.22] [%clk 0:02:17] } 23... f6 { [%eval 0.42] [%clk 0:02:12] } 24. Bg4 { [%eval 0.14] [%clk 0:02:12] } 24... Qd8 { [%eval 0.63] [%clk 0:02:09] } 25. Qe6+ { [%eval 0.44] [%clk 0:02:10] } 25... Kg7 { [%eval 0.47] [%clk 0:02:07] } 26. Bxd6 { [%eval 0.4] [%clk 0:02:08] } 26... Qxd6 { [%eval 0.43] [%clk 0:02:07] } 27. Qxd6 { [%eval 0.42] [%clk 0:02:06] } 27... cxd6 { [%eval 0.43] [%clk 0:02:07] } 28. Bf3 { [%eval 0.45] [%clk 0:02:02] } 28... f5 { [%eval 0.43] [%clk 0:02:06] } 29. h4 { [%eval 0.21] [%clk 0:01:55] } 29... Kf6 { [%eval 0.21] [%clk 0:02:04] } 30. g3 { [%eval 0.26] [%clk 0:01:54] } 30... h6 { [%eval 0.36] [%clk 0:02:03] } 31. Be2 { [%eval 0.14] [%clk 0:01:51] } 31... Bd7 { [%eval 0.3] [%clk 0:01:59] } 32. f4 { [%eval 0.35] [%clk 0:01:50] } 32... h5 { [%eval 0.36] [%clk 0:01:58] } 33. Kf2 { [%eval 0.35] [%clk 0:01:50] } 33... Be6 { [%eval 0.34] [%clk 0:01:57] } 34. Ke3 { [%eval 0.34] [%clk 0:01:48] } 34... Ke7 { [%eval 0.29] [%clk 0:01:57] } 35. Kd2 { [%eval 0.21] [%clk 0:01:48] } 35... Kd8 { [%eval 0.34] [%clk 0:01:57] } 36. Bb5 { [%eval 0.32] [%clk 0:01:46] } 36... Bf7 { [%eval 0.19] [%clk 0:01:55] } 37. Kd3 { [%eval 0.19] [%clk 0:01:44] } 37... Kc7 { [%eval 0.26] [%clk 0:01:55] } 38. Ke3 { [%eval 0.39] [%clk 0:01:39] } 38... Kd8 { [%eval 0.35] [%clk 0:01:54] } 39. Kd2 { [%eval 0.52] [%clk 0:01:39] } 39... Kc7 { [%eval 0.36] [%clk 0:01:53] } 40. Kc2 { [%eval 0.23] [%clk 0:01:38] } 40... Kd8 { [%eval 0.3] [%clk 0:01:53] } 41. Kb2 { [%eval 0.25] [%clk 0:01:37] } 41... Kc7 { [%eval 0.27] [%clk 0:01:53] } 42. Kc2 { [%eval 0.34] [%clk 0:01:37] } 42... Kd8 { [%eval 0.17] [%clk 0:01:52] } 43. Kd2 { [%eval 0.33] [%clk 0:01:36] } 43... Kc7 { [%eval 0.17] [%clk 0:01:52] } { The game is a draw. } 1/2-1/2
'''

# General Helper Functions

In [5]:
def generate_games(pgn_filename: str, max_games: int = 1_000_000) -> Generator[str, None, None]:
    """ Take a PGN filename and return a generator that loops over the first num_games games. """
    with open(pgn_filename) as file:
        num_games_yielded = 0
        while True:
            if num_games_yielded > max_games:
                return

            game = chess.pgn.read_game(file)

            if game is not None:
                yield game
                num_games_yielded += 1
            else:
                # No more games in the pgn
                return


# Doesn't work because "Annotator" not always in game.headers
# def is_analyzed_by_lichess(game: chess.pgn.Game) -> bool:
#     return 'Annotator' in game.headers and game.headers['Annotator'] == 'lichess.org'


EVAL_REGEX_PAT = '\[%eval ([+-]?(?:[0-9]*[.])?[0-9]+|#-?[0-9]+)]'
def get_score_from_comment(comment: str) -> float:
    """ Return the pawn-score in the comment if it contains "%eval". If "%eval" not in comment, raises ValueError. """
    if '%eval' not in comment:
        raise ValueError

    # Get score part using regex
    evals = re.findall(EVAL_REGEX_PAT, comment)
    assert len(evals) == 1, f'error: "%eval" tag does not appear exactly once in move.comment = {comment}'
    score = evals[0]

    # Convert to float
    try:
        score = float(score)
    except ValueError:
        # Only explanation for ValueError should be that %eval's score is a checkmate, ex. '#-5' or '#9'
        # Convert to centipawn-score with mate_score, then /100 to get pawn-score
        # https://python-chess.readthedocs.io/en/latest/engine.html?highlight=mate_score#chess.engine.Score.score
        assert '#' in score
        score = chess.engine.Mate(int(score.lstrip('#'))).score(mate_score=10_000) / 100.0
    return score


CLK_REGEX_PAT = '\[%clk (.*?)]'
def get_time_remaining_from_comment(comment: str) -> int:
    """
    Return the time remaining in seconds given by the "%clk" tag in the comment (always appears as an int).
    If "%clk" not in comment, raise ValueError.
    """
    if '%clk' not in comment:
        raise ValueError

    # Get eval part using regex
    clk = re.findall(CLK_REGEX_PAT, comment)
    assert len(clk) == 1, f'error: "%clk" tag does not appear exactly once in move.comment = {comment}'
    clk = clk[0]

    parts = clk.split(':')
    assert len(parts) == 3
    seconds_remaining = int(parts[0]) * 3600 \
                     + int(parts[1]) * 60 \
                     + int(parts[2])
    return seconds_remaining

def get_cpl(from_score: float, to_score: float, after_move_by: chess.Color) -> int:
    """ Get centipawn loss (CPL) based on scores of consecutive moves and side to move. """
    return int((to_score - from_score) * 100) * ((-1) ** (not after_move_by))


def parse_time_control(time_control: str) -> Tuple[int, int]:
    """ Convert a time control string like "300+3" . """
    if time_control == '-':
        # Correspondence game (ex. 3 days per move)
        # Assume the longest time control of 120+120
        return 7200, 120

    x = time_control.split('+')
    return int(x[0]), int(x[1])

# Engine Functions

In [6]:
@contextmanager
def start_engine_process() -> Generator[subprocess.Popen, None, None]:
    # Open the exe using Popen
    p = subprocess.Popen("./stockfish",
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

    # Send commands to this open process by using stdin.write(*command*)
    send_commands(p, 'isready\n')

    try:
        yield p
    finally:
        p.terminate()


def send_commands(proc: subprocess.Popen, *commands: str) -> None:
    if not commands:
        return

    for cmd in commands:
        proc.stdin.write(cmd.encode())

    proc.stdin.flush()


def analyze_static(p: subprocess.Popen, fen: str) -> str:
    """ Return the full string output of Stockfish's static evaluation for the given FEN. """

    send_commands(p, f'position fen {fen}\n', 'eval\n')

    # Get as many str lines as were output
    lines = []
    while True:
        line = p.stdout.readline().decode().strip()
        lines.append(line.strip('\n\r'))

        # Static eval does not run on positions where either king is in check
        if 'none (in check)' in line or 'Final evaluation' in line:
            break

    return '\n'.join(lines)

def vectorize_contributing_terms(static_analysis_output: str) -> np.ndarray:
    """ Return a 1x68 list from the "Contributing terms..." table given by the static analysis. """
    # Regex tester
    # https://regex101.com/r/OWV1ee/4
    return np.array([float(i)
                     for tup in re.findall('([-]?(?:[0-9]*[.])?[0-9]+) +([-]?(?:[0-9]*[.])?[0-9]+)',
                                           static_analysis_output)
                     for i in tup])

In [7]:
''' Testing '''
with start_engine_process() as p:
    # # In-check position shouldn't give static eval
    # print(analyze_static(p, 'r3r3/pppkb1pp/8/n2nN3/8/8/PPPP1PPP/RNB1K2R b KQ - 2 12'))
    # print('========================')

    # Should give static eval
    output = analyze_static(p, 'r1bk1bnr/1pp2ppp/p1n2q2/3N2N1/6B1/8/PPP2PPP/R1BQK2R b KQ - 0 12')
    # pc.copy(output)
    print(output)
    print('========================')

    # Get all floats from the first table
    print(vectorize_contributing_terms(output))
    # print(analyze_static(p, 'r1bk1bnr/1pp2ppp/p1n2q2/3N2N1/6B1/8/PPP2PPP/R1BQK2R b KQ - 0 12'))

Stockfish 271021 by the Stockfish developers (see AUTHORS file)
readyok
info string NNUE evaluation using nn-13406b1dcbe0.nnue enabled

Contributing terms for the classical eval:
+------------+-------------+-------------+-------------+
|    Term    |    White    |    Black    |    Total    |
|            |   MG    EG  |   MG    EG  |   MG    EG  |
+------------+-------------+-------------+-------------+
|   Material |  ----  ---- |  ----  ---- |  0.80  0.49 |
|  Imbalance |  ----  ---- |  ----  ---- |  0.00  0.00 |
|      Pawns |  0.29 -0.06 |  0.38 -0.05 | -0.09 -0.01 |
|    Knights | -0.31 -0.35 | -0.11 -0.20 | -0.20 -0.14 |
|    Bishops | -0.13 -0.45 | -0.09 -0.79 | -0.04  0.35 |
|      Rooks | -0.26 -0.06 | -0.53 -0.12 |  0.26  0.06 |
|     Queens |  0.00  0.00 |  0.00  0.00 |  0.00  0.00 |
|   Mobility |  0.83  1.05 |  0.69  0.90 |  0.14  0.14 |
|King safety | -0.71 -0.40 | -3.30 -0.51 |  2.60  0.11 |
|    Threats |  1.41  1.99 |  1.21  1.00 |  0.21  0.99 |
|     Passed |  0.00  0

# Extracting Game Data

In [15]:
def vectorize_game(proc: subprocess.Popen, game: chess.pgn.Game) -> np.ndarray:
    """
    Return a (#plies)x(#features + 1 = 75) array. For each ply (half-move) in the game, the following
    features are extracted:
        Game-specific features:
        - Player ELO rating
        - Opponent ELO rating
        - Rating diff (Opponent ELO - Player ELO)
        - Clock starting time (seconds)
        - Clock increment (seconds)
        Move-specific features:
        - Ply (# half-moves into the game)
        - Time remaining at given move (seconds)
        - Static eval features:
            - 68 floats from the "Contributing terms ..." table
            - (TODO) 64 features from the "NNUE-derived piece values" table
                - Not sure how to deal with squares without pieces, filling with 0s might not make sense
        - Board state features:
            - (TODO) 256 features: 64 squares with one-hot encoded piece types (12 piece types can be
              represented with 4 bits, so 4 x 64 = 256)
    The final column is the target column holding the centipawn loss (CPL) of the player's move in the position.
    """

    data = np.empty((0, 75), dtype=np.float64)

    # Set up board and Game's move nodes
    board = game.board()
    mainline_nodes: List[chess.pgn.ChildNode] = list(game.mainline())
    if not mainline_nodes:
        return data
    if '%eval' not in mainline_nodes[0].comment:
        return data

    ''' Add row to `data` for each move '''
    # Game-specific features (constant while iterating over moves in the game)
    h = game.headers
    try:
        clock_start, clock_inc = parse_time_control(h['TimeControl'])
    except ValueError as e:
        print(f'test: pgn: {str(game)}')
        raise e
    game_features_white = np.array([h['WhiteElo'], h['BlackElo'], h['BlackRatingDiff'], clock_start, clock_inc])
    game_features_black = np.array([h['BlackElo'], h['WhiteElo'], h['WhiteRatingDiff'], clock_start, clock_inc])

    # Add row of features for all moves in the Game
    for i, node in enumerate(mainline_nodes):
        # Can't get static eval while in check
        if board.is_check():
            board.push(node.move)
            # print('test: skipping move (in check)')
            continue

        move: chess.Move = node.move
        comment: str = node.comment
        try:
            next_comment = mainline_nodes[i+1].comment
        except IndexError:
            # No next move -> game is over, this move has no cpl value
            break

        # Get centipawn loss (CPL) before gathering other data to make sure it's possible
        try:
            score = get_score_from_comment(comment)
            next_score = get_score_from_comment(next_comment)
        except ValueError:
            # If for some reason there is no "%eval" in either comment,
            # continue because it is needed to calculate CPL. This should be
            # because the next_move would end the game with checkmate/stalemate
            # assert any((s in next_comment for s in ('wins', 'draw'))), f'next_comment: {next_comment}'
            continue
        cpl = get_cpl(score, next_score, board.turn)

        # Move-specific features
        try:
            time_remaining = get_time_remaining_from_comment(comment)
        except ValueError:
            time_remaining = clock_start
        static_eval_features = vectorize_contributing_terms(analyze_static(proc, board.fen()))

        # Add row
        # move_features = np.concatenate((np.array([time_remaining]), static_eval_features, np.array([cpl])), 0)
        if board.turn == chess.WHITE:
            all_features = np.array([*game_features_white, time_remaining, *static_eval_features, cpl])
            # all_features = np.concatenate((game_features_white, move_features), 0)
        else:
            all_features = np.array([*game_features_black, time_remaining, *static_eval_features, cpl])
            # all_features = np.concatenate((game_features_black, move_features), 0)

        assert all_features.shape == (75,), f'error: all_features: {all_features}'
        data = np.vstack((data, all_features))

        # Play next move
        board.push(move)

    return data

In [16]:
''' Testing '''
with start_engine_process() as p:
    test = vectorize_game(p, chess.pgn.read_game(io.StringIO(carlson_draw)))
test

array([['2998', '3141', '-2', ..., '0.0', '0.0', '20'],
       ['3141', '2998', '+2', ..., '1.0', '0.88', '20'],
       ['2998', '3141', '-2', ..., '0.35', '0.51', '47'],
       ...,
       ['2998', '3141', '-2', ..., '2.19', '1.27', '-17'],
       ['3141', '2998', '+2', ..., '1.29', '1.43', '-16'],
       ['2998', '3141', '-2', ..., '1.57', '1.76', '-16']], dtype='<U32')

In [83]:
def write_to_csv(output_filename: str, data: np.array) -> None:
    # TODO doesn't actually work yet
    # Looked at these:
    # https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
    # https://stackoverflow.com/a/27980725/7304977
    while True:
        try:
            with open(output_filename, 'wb') as f:
                np.savetxt(f, data, delimiter=',')
            break
        except PermissionError:
            input(f'error: PermissionError. Please close {output_filename} and press enter: ')


def prepare_game_data(pgn_filename: str, max_datapoints: int = 1_000_000) -> np.ndarray:
    data = np.empty((0, 75), dtype=np.float64)

    with start_engine_process() as proc:
        for game in generate_games(pgn_filename):
            data = np.concatenate((data, vectorize_game(proc, game)), 0)

            if data.shape[0] > max_datapoints:
                break

    return data[:max_datapoints,:]

In [136]:
data = prepare_game_data('data/lichess_db_standard_rated_2015-09.pgn', max_datapoints=10_000)

data

array([['1860', '1953', '+9', ..., '0.0', '0.0', '0'],
       ['1953', '1860', '-9', ..., '1.0', '0.88', '-9'],
       ['1860', '1953', '+9', ..., '0.8', '0.63', '-7'],
       ...,
       ['1459', '1763', '+4', ..., '0.19', '0.24', '-19'],
       ['1763', '1459', '-4', ..., '0.21', '0.0', '0'],
       ['1459', '1763', '+4', ..., '-1.44', '-1.54', '50']], dtype='<U32')

# Machine Learning

Split the data into a target and feature datasets. 

In [131]:
features = []
targets = []

for i in data:
    x = i[:-1]

    # Cast the string type into a numpy float type
    x = x.astype(np.float64)
    features.append(x)

    y = i[-1]

    y = y.astype(np.float64)
    targets.append(y)

# Rebuild each array as a numpy array    
X = np.array(X)
Y = np.array(Y)

# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)


reg = RandomForestRegressor()

# Fit our training data using a random forest regression technique
reg = reg.fit(X = X_train, y = y_train)


In [145]:
# Predict the values of our testing set using the model
y_pred = reg.predict(X=X_test)


# Check to see how many results are within the set range
count = 0
for i in range(0, len(y_pred)):
    if y_pred[i] < y_test[i] + 10:
        if y_pred[i] > y_test[i] - 10:
            count += 1
    
print(count)
print(len(y_pred))

588
4500


Comparing the output from our predicted values to what they should be.

In [139]:
for i in range(0, len(y_test)):
    print(y_pred[i], y_test[i])
    
# len(data[0])

152.66 -18.0
86.42 8.0
12.95 17.0
1415.37 -6.0
10.82 23.0
41.83 61.0
272.08 -2.0
51.42 -12.0
124.23 782.0
19.0 2.0
148.05 0.0
56.35 2.0
2873.08 8219.0
112.28 0.0
55.69 36.0
43.82 38.0
1270.67 1800.0
69.55 8.0
27.31 5.0
65.4 130.0
415.2 17.0
427.2 180.0
271.21 285.0
356.53 -6.0
209.81 31.0
69.05 -28.0
15.46 4.0
40.31 47.0
30.74 0.0
1238.14 1943.0
50.84 0.0
304.07 33.0
601.88 0.0
8.535 32.0
86.97 127.0
579.98 327.0
27.63 4.0
35.59 20.0
272.7 238.0
2436.38 8833.0
429.24 180.0
67.38 185.0
43.1 8.0
6.58 -7.0
286.34 35.0
920.41 72.0
27.45 4.0
33.15 25.0
70.08 22.0
26.09 62.0
16.99 24.0
25.04 -8.0
186.47 119.0
90.02 116.0
15.69 0.0
138.27 -2.0
12.45 32.0
40.95 7.0
40.95 13.0
306.04 98.0
65.98 8.0
47.69 16.0
8.88 24.0
178.7 0.0
34.76 7.0
36.86 16.0
96.83 9.0
75.6 102.0
1.88 -10.0
462.13 26.0
76.64 48.0
39.73 13.0
1713.7 201.0
1700.25 -19.0
211.38 52.0
41.97 -5.0
869.78 9850.0
55.7 -29.0
59.89 87.0
45.96 54.0
23.38 15.0
22.17 17.0
842.5 264.0
32.61 7.0
20.18 206.0
170.42 0.0
34.5 19.0
15.47 111

42.64 -7.0
415.88 1.0
3.67 -10.0
9.81 4.0
198.91 8.0
882.23 -3.0
185.33 94.0
109.56 321.0
75.97 40.0
5.82 -10.0
13.28 0.0
48.45 7.0
142.58 218.0
386.32 -14.0
33.91 124.0
1554.01 8.0
28.34 -18.0
178.16 900.0
54.51 182.0
604.72 16.0
303.79 304.0
595.08 0.0
1337.39 41.0
67.75 -2.0
2485.87 32.0
120.55 101.0
8.79 15.0
1033.44 1583.0
50.66 9.0
177.92 209.0
11.61 -4.0
213.99 524.0
100.05 118.0
114.54 6.0
8.42 5.0
554.2 1184.0
12.33 21.0
436.5 92.0
934.09 333.0
184.38 10.0
234.28 0.0
31.58 -6.0
6.84 -7.0
464.67 0.0
101.52 17.0
293.76 272.0
24.483125 0.0
224.53 -223.0
54.25 100.0
2659.84 3.0
120.65 7.0
271.29 7483.0
26.23 -3.0
54.46 43.0
223.17 200.0
52.42 -2.0
27.77 94.0
92.51 62.0
234.24 75.0
920.94 35.0
187.35 -542.0
14.23 17.0
28.18 19.0
41.5 56.0
12.32 0.0
24.38 -3.0
71.8 74.0
214.49 2.0
35.36 42.0
515.8566666666667 -10.0
20.86 26.0
11.81 59.0
131.9 20.0
791.41 -4.0
30.52 -9.0
571.75 66.0
40.6 8.0
728.0066666666665 240.0
79.12 27.0
8.63 19.0
27.93 35.0
1241.68 1121.0
178.0 93.0
77.59 -6.0


12.43 6.0
246.49 -9.0
203.07 0.0
175.94 21.0
39.66 40.0
62.12 0.0
191.1 18.0
15.74 3.0
64.26 3.0
17.85 21.0
21.23 14.0
62.89 87.0
33.74 -18.0
189.24 10.0
333.52 206.0
364.09815561631353 92.0
529.69 -1.0
254.67 8.0
570.17 14.0
66.28 0.0
162.5975 32.0
6.01 1.0
447.1 -37.0
109.58 -12.0
12.94 1.0
64.24 154.0
16.39 10.0
12.85 3.0
526.52 -8.0
188.93454545454543 0.0
289.52 -25.0
539.84 514.0
183.28125 22.0
78.27 9.0
1380.92 4216.0
1905.28 0.0
697.83 -89.0
134.7433333333333 0.0
276.45 16.0
502.41 101.0
359.74 18.0
194.6 7.0
804.79 22.0
222.84 73.0
63.24 -7.0
63.85 51.0
159.75 -11.0
48.46 99.0
29.64 7.0
297.95 3.0
70.15 131.0
28.29 30.0
54.38 45.0
105.86 33.0
15.26 35.0
70.66 13.0
281.5 -2.0
57.41 2.0
78.75 -9.0
58.18 3.0
151.44 14.0
1184.88 0.0
24.55 27.0
293.81 2.0
9.42 3.0
307.51000000000005 66.0
25.51 53.0
751.15 -10.0
84.07 63.0
511.72 69.0
48.19 84.0
516.49 1703.0
3439.97 -87.0
580.03 322.0
1832.3 20.0
17.61 4.0
46.29 154.0
184.57 3.0
0.19725682650682647 0.0
56.1 -10.0
1869.45 4720.0
24.3

8.52 110.0
14.59 3.0
166.32 98.0
378.91 33.0
63.68 97.0
44.12 -5.0
35.46 25.0
38.28 11.0
190.67 43.0
33.0 108.0
67.95 132.0
42.71 96.0
69.11 -17.0
1509.62 319.0
144.33 186.0
28.88 -7.0
62.28 -8.0
-0.5371583536862174 -1.0
16.51 -7.0
14.54 0.0
249.57 316.0
276.21 63.0
1491.27 2.0
0.4 3.0
66.46 2.0
240.57 28.0
61.39 111.0
30.28 0.0
636.49 1105.0
370.31 108.0
91.03 -10.0
154.00571428571428 -11.0
7.28 0.0
489.99 129.0
46.16 32.0
14.04 11.0
191.04 -22.0
32.28 79.0
440.53 218.0
52.59 48.0
357.03 594.0
32.12 174.0
1179.51 -15.0
649.86 -134.0
128.87 128.0
34.41 -3.0
174.88 233.0
484.15 223.0
46.26 30.0
101.63 29.0
13.0 14.0
172.96 53.0
296.34 321.0
65.19 -2.0
21.25 8.0
79.64 146.0
46.92 -10.0
-0.1715797220032178 0.0
523.6 -2.0
2228.28 -38.0
202.52 12.0
96.99 83.0
350.56 -25.0
19.0 28.0
7.5 -1.0
193.0 18.0
8.15 -3.0
147.26 10.0
199.11 34.0
21.7 19.0
26.03 37.0
59.12 10.0
118.73 15.0
34.3 1089.0
36.78 2.0
148.24 78.0
230.96 22.0
243.22 44.0
451.15 32.0
581.7 76.0
52.95 17.0
1518.12 -11.0
21.09 2.

423.58 105.0
420.39 41.0
20.42 4.0
326.19 0.0
53.43 32.0
938.01 4003.0
213.65 4.0
12.61 15.0
116.59 28.0
137.16 67.0
142.33 0.0
13.32 3.0
413.09 562.0
25.7 45.0
129.43 193.0
48.98 3.0
46.08 0.0
12.27 -8.0
107.68 157.0
24.67 3.0
53.25 45.0
10.77 7.0
52.63 52.0
23.843333333333334 0.0
26.74 15.0
24.98 -10.0
635.67 -13.0
326.32 37.0
1332.07 -4.0
565.44 74.0
55.742 -8.0
7.28 11.0
888.695 -51.0
15.88 35.0
53.2 2.0
491.03 -93.0
24.04 3.0
249.26 -18.0
2.85 -2.0
27.94 6.0
22.46 -4.0
165.13 14.0
9.47 36.0
353.66 486.0
16.36 1.0
26.4 60.0
30.15 30.0
72.81 20.0
717.13 0.0
234.44 405.0
75.18 -18.0
15.06 3.0
18.77 -1.0
38.37 13.0
164.76 288.0
44.26 130.0
450.62 19.0
114.84 -4.0
146.52 8.0
52.155 6.0
835.97 600.0
119.7 40.0
21.18 -16.0
39.85 -10.0
104.07 -14.0
306.65 0.0
1433.12 100.0
37.22 -6.0
83.54 3.0
63.04 504.0
18.71 21.0
60.69 13.0
7.53 36.0
121.76 544.0
-33.08482539682539 1.0
55.71 0.0
159.33 96.0
105.12 24.0
1166.57 0.0
10.64 -7.0
386.18 -7.0
678.48 -6.0
97.34 27.0
62.04 -10.0
6178.3 1.0
23.

56.75 -38.0
362.18666666666667 -1.0
164.86 33.0
27.96 -4.0
1624.09 4488.0
83.07 39.0
35.93 0.0
58.14 184.0
588.96 200.0
43.45 41.0
668.8208333333333 67.0
294.71 347.0
22.53 13.0
133.34 27.0
73.86 25.0
31.16 6.0
15.81 17.0
10.27 46.0
35.64 14.0
1804.62 29.0
13.14 -5.0
9.35 17.0
-171.49 0.0
179.73 16.0
160.1 -3.0
30.48 -4.0
480.82 -5.0
59.77 86.0
12.78 -4.0
18.55 -13.0
195.86 836.0
17.2 12.0
14.38 20.0
9.5 1.0
22.38 -1.0
66.34 46.0
16.0 -16.0
9.34 3.0
-28.64 -75.0
37.32 79.0
4853.33 -1.0
42.96 27.0
30.27 67.0
13.81 26.0
27.54 168.0
1026.07 -9.0
742.03 -20.0
19.81 62.0
9.41 6.0
732.03 -1.0
139.5 12.0
232.06 -2.0
36.83 23.0
105.59 0.0
35.27 1.0
41.91666666666666 297.0
75.55 145.0
45.57 -42.0
684.69 16.0
262.85 97.0
7.71 0.0
5.89 1.0
77.39 -18.0
40.8 144.0
26.7 -2.0
2078.81 8404.0
15.85 -12.0
663.89 0.0
501.0 8017.0
310.24 50.0
369.87 14.0
17.5 26.0
41.01 54.0
11.39 7.0
303.18 0.0
336.27 23.0
64.16 -9.0
170.27 188.0
21.44 1.0
7.62 -1.0
549.07 18.0
12.5 28.0
735.0 -74.0
396.71 378.0
38.98 4.