In [279]:
import chess
import chess.pgn
import chess.engine
import io
import re
import os
import sys
from io import StringIO
import subprocess
import numpy as np
from contextlib import contextmanager

In [280]:
from typing import List, Dict, Tuple, Generator

In [281]:
import pyperclip as pc

# Python chess basics
https://python-chess.readthedocs.io/en/latest/

# Sample PGN Data

In [282]:
test_pgn = '''
[Event "Rated Bullet game"]
[Site "https://lichess.org/69CbaD8f"]
[Date "2021.10.07"]
[White "Cubigami"]
[Black "JoinedToday"]
[Result "1-0"]
[UTCDate "2021.10.07"]
[UTCTime "05:16:00"]
[WhiteElo "1930"]
[BlackElo "1912"]
[WhiteRatingDiff "+5"]
[BlackRatingDiff "-6"]
[Variant "Standard"]
[TimeControl "60+0"]
[ECO "B01"]
[Opening "Scandinavian Defense: Mieses-Kotroc Variation"]
[Termination "Normal"]
[Annotator "lichess.org"]

1. e4 { [%eval 0.24] [%clk 0:01:00] } 1... d5?! { (0.24 → 0.82) Inaccuracy. e5 was best. } { [%eval 0.82] [%clk 0:01:00] } (1... e5 2. Nf3 Nc6 3. Bb5 Nf6 4. O-O Nxe4 5. Re1 Nd6 6. Nxe5) 2. exd5 { [%eval 0.36] [%clk 0:01:00] } 2... Qxd5 { [%eval 0.66] [%clk 0:01:00] } { B01 Scandinavian Defense: Mieses-Kotroc Variation } 3. Nc3 { [%eval 0.46] [%clk 0:01:00] } 3... Qe6+?! { (0.46 → 1.12) Inaccuracy. Qa5 was best. } { [%eval 1.12] [%clk 0:01:00] } (3... Qa5 4. Nf3 Nf6 5. d4 Bf5 6. Ne5 c6 7. Bf4 Nbd7 8. Nc4) 4. Be2 { [%eval 0.94] [%clk 0:00:59] } 4... Qg6? { (0.94 → 2.18) Mistake. Nf6 was best. } { [%eval 2.18] [%clk 0:01:00] } (4... Nf6 5. d4 Qd6 6. Nf3 e6 7. O-O Be7 8. Nb5 Qd8 9. c4) 5. Nf3 { [%eval 1.51] [%clk 0:00:58] } 5... Nc6?? { (1.51 → 4.52) Blunder. Qxg2 was best. } { [%eval 4.52] [%clk 0:00:59] } (5... Qxg2 6. Rg1 Qh3 7. d4 Nf6 8. Rg3 Qf5 9. Ne5 c6 10. Rf3) 6. O-O?? { (4.52 → 0.37) Blunder. Nb5 was best. } { [%eval 0.37] [%clk 0:00:57] } (6. Nb5 Kd8 7. d4 a6 8. d5 axb5 9. dxc6+ Ke8 10. Ne5 Qe6) 6... h5?? { (0.37 → 5.60) Blunder. Bh3 was best. } { [%eval 5.6] [%clk 0:00:59] } (6... Bh3 7. Ne1 Bf5 8. Nd5 O-O-O 9. Ne3 Nf6 10. Nf3 Be4 11. d3) 7. h4?? { (5.60 → 0.08) Blunder. Nb5 was best. } { [%eval 0.08] [%clk 0:00:56] } (7. Nb5 Bh3) 7... a6?? { (0.08 → 2.63) Blunder. Bh3 was best. } { [%eval 2.63] [%clk 0:00:58] } (7... Bh3 8. Ng5 Bf5 9. Bb5 f6 10. Nf3 e6 11. Nd4 Ne7 12. Qf3 Kd7 13. Re1 a6 14. Ba4) 8. Bd3?? { (2.63 → -0.22) Blunder. d4 was best. } { [%eval -0.22] [%clk 0:00:53] } (8. d4) 8... Qf6?? { (-0.22 → 1.46) Blunder. Bf5 was best. } { [%eval 1.46] [%clk 0:00:56] } (8... Bf5) 9. Ne4 { [%eval 1.24] [%clk 0:00:51] } 9... Qe6?! { (1.24 → 2.16) Inaccuracy. Qg6 was best. } { [%eval 2.16] [%clk 0:00:54] } (9... Qg6 10. Ng3) 10. Nfg5? { (2.16 → 0.77) Mistake. Neg5 was best. } { [%eval 0.77] [%clk 0:00:51] } (10. Neg5 Qd6 11. Bc4 Nh6 12. c3 Bf5 13. Qb3 O-O-O 14. Bxf7 e5 15. Be6+ Bxe6 16. Qxe6+ Qxe6) 10... Qd7?? { (0.77 → 8.62) Blunder. Qd5 was best. } { [%eval 8.62] [%clk 0:00:52] } (10... Qd5 11. Nc3 Qd8 12. Bc4 e6 13. Re1 Be7 14. d3 Nh6 15. Qxh5 g6 16. Qd1 Nf5 17. g3) 11. Qf3?? { (8.62 → -0.59) Blunder. Nxf7 was best. } { [%eval -0.59] [%clk 0:00:50] } (11. Nxf7) 11... f6?? { (-0.59 → 8.00) Blunder. Ne5 was best. } { [%eval 8.0] [%clk 0:00:50] } (11... Ne5 12. Qf4 f6 13. Be2 Nc6 14. Nf3 e5 15. Qe3 Nge7 16. Qb3 Qd5 17. d3 Qxb3 18. axb3) 12. Nc5 { [%eval 7.68] [%clk 0:00:47] } 12... Qd8? { (7.68 → Mate in 1) Checkmate is now unavoidable. Ne5 was best. } { [%eval #1] [%clk 0:00:49] } (12... Ne5 13. Bg6+ Nxg6 14. Nxd7 Bxd7 15. Qxb7 Rd8 16. Qe4 Nxh4 17. Ne6 Bxe6 18. Qxe6 Rd6 19. Qc4) 13. Nge6? { (Mate in 1 → 8.51) Lost forced checkmate sequence. Bg6# was best. } { [%eval 8.51] [%clk 0:00:45] } (13. Bg6#) 13... Bxe6 { [%eval 8.53] [%clk 0:00:43] } 14. Nxe6 { [%eval 8.78] [%clk 0:00:45] } 14... Qd7? { (8.78 → Mate in 1) Checkmate is now unavoidable. Ne5 was best. } { [%eval #1] [%clk 0:00:43] } (14... Ne5 15. Qxb7) 15. Bg6# { [%clk 0:00:45] } { White wins by checkmate. } 1-0'''

carlson_draw = '''
[Event "Rated Blitz game"]
[Site "https://lichess.org/RzGPtofJ"]
[Date "2021.08.19"]
[White "Feokl1995"]
[Black "DrNykterstein"]
[Result "1/2-1/2"]
[UTCDate "2021.08.19"]
[UTCTime "22:35:41"]
[WhiteElo "2998"]
[BlackElo "3141"]
[WhiteRatingDiff "+2"]
[BlackRatingDiff "-2"]
[WhiteTitle "GM"]
[BlackTitle "GM"]
[Variant "Standard"]
[TimeControl "180+0"]
[ECO "B00"]
[Opening "Nimzowitsch Defense: Williams Variation"]
[Termination "Normal"]
[Annotator "lichess.org"]

1. e4 { [%eval 0.24] [%clk 0:03:00] } 1... Nc6 { [%eval 0.44] [%clk 0:03:00] } 2. Nf3 { [%eval 0.24] [%clk 0:02:58] } 2... d6 { [%eval 0.71] [%clk 0:02:59] } { B00 Nimzowitsch Defense: Williams Variation } 3. d4 { [%eval 0.72] [%clk 0:02:57] } 3... Nf6 { [%eval 0.89] [%clk 0:02:58] } 4. Nc3 { [%eval 0.54] [%clk 0:02:56] } 4... g6 { [%eval 0.96] [%clk 0:02:58] } 5. h3?! { (0.96 → 0.42) Inaccuracy. Be3 was best. } { [%eval 0.42] [%clk 0:02:55] } (5. Be3 Bg7 6. Qd2 O-O 7. d5 Nb8 8. Bh6 c6 9. h3 b5) 5... Bg7 { [%eval 0.67] [%clk 0:02:57] } 6. Be2 { [%eval 0.5] [%clk 0:02:55] } 6... O-O { [%eval 0.5] [%clk 0:02:57] } 7. O-O { [%eval 0.34] [%clk 0:02:54] } 7... a6 { [%eval 0.4] [%clk 0:02:55] } 8. a4 { [%eval 0.53] [%clk 0:02:53] } 8... b6 { [%eval 0.62] [%clk 0:02:50] } 9. Be3 { [%eval 0.39] [%clk 0:02:52] } 9... e6 { [%eval 0.6] [%clk 0:02:47] } 10. e5 { [%eval 0.84] [%clk 0:02:41] } 10... dxe5 { [%eval 0.56] [%clk 0:02:46] } 11. Nxe5 { [%eval 0.57] [%clk 0:02:40] } 11... Bb7 { [%eval 0.73] [%clk 0:02:43] } 12. Bf3 { [%eval 0.81] [%clk 0:02:38] } 12... Nd5 { [%eval 0.91] [%clk 0:02:42] } 13. Nxd5 { [%eval 0.85] [%clk 0:02:36] } 13... exd5 { [%eval 0.75] [%clk 0:02:42] } 14. Nxc6 { [%eval 0.81] [%clk 0:02:31] } 14... Bxc6 { [%eval 0.87] [%clk 0:02:42] } 15. c3 { [%eval 0.75] [%clk 0:02:30] } 15... a5 { [%eval 1.03] [%clk 0:02:41] } 16. Bf4 { [%eval 0.6] [%clk 0:02:29] } 16... Qd7 { [%eval 0.88] [%clk 0:02:40] } 17. Re1 { [%eval 0.75] [%clk 0:02:28] } 17... Rfe8 { [%eval 0.79] [%clk 0:02:39] } 18. b3 { [%eval 0.36] [%clk 0:02:26] } 18... Rxe1+ { [%eval 0.35] [%clk 0:02:37] } 19. Qxe1 { [%eval 0.45] [%clk 0:02:26] } 19... Re8 { [%eval 0.42] [%clk 0:02:36] } 20. Qd2 { [%eval 0.55] [%clk 0:02:26] } 20... Bf8 { [%eval 0.36] [%clk 0:02:36] } 21. Re1 { [%eval 0.28] [%clk 0:02:19] } 21... Rxe1+ { [%eval 0.29] [%clk 0:02:35] } 22. Qxe1 { [%eval 0.45] [%clk 0:02:19] } 22... Bd6 { [%eval 0.25] [%clk 0:02:34] } 23. Qe3 { [%eval 0.22] [%clk 0:02:17] } 23... f6 { [%eval 0.42] [%clk 0:02:12] } 24. Bg4 { [%eval 0.14] [%clk 0:02:12] } 24... Qd8 { [%eval 0.63] [%clk 0:02:09] } 25. Qe6+ { [%eval 0.44] [%clk 0:02:10] } 25... Kg7 { [%eval 0.47] [%clk 0:02:07] } 26. Bxd6 { [%eval 0.4] [%clk 0:02:08] } 26... Qxd6 { [%eval 0.43] [%clk 0:02:07] } 27. Qxd6 { [%eval 0.42] [%clk 0:02:06] } 27... cxd6 { [%eval 0.43] [%clk 0:02:07] } 28. Bf3 { [%eval 0.45] [%clk 0:02:02] } 28... f5 { [%eval 0.43] [%clk 0:02:06] } 29. h4 { [%eval 0.21] [%clk 0:01:55] } 29... Kf6 { [%eval 0.21] [%clk 0:02:04] } 30. g3 { [%eval 0.26] [%clk 0:01:54] } 30... h6 { [%eval 0.36] [%clk 0:02:03] } 31. Be2 { [%eval 0.14] [%clk 0:01:51] } 31... Bd7 { [%eval 0.3] [%clk 0:01:59] } 32. f4 { [%eval 0.35] [%clk 0:01:50] } 32... h5 { [%eval 0.36] [%clk 0:01:58] } 33. Kf2 { [%eval 0.35] [%clk 0:01:50] } 33... Be6 { [%eval 0.34] [%clk 0:01:57] } 34. Ke3 { [%eval 0.34] [%clk 0:01:48] } 34... Ke7 { [%eval 0.29] [%clk 0:01:57] } 35. Kd2 { [%eval 0.21] [%clk 0:01:48] } 35... Kd8 { [%eval 0.34] [%clk 0:01:57] } 36. Bb5 { [%eval 0.32] [%clk 0:01:46] } 36... Bf7 { [%eval 0.19] [%clk 0:01:55] } 37. Kd3 { [%eval 0.19] [%clk 0:01:44] } 37... Kc7 { [%eval 0.26] [%clk 0:01:55] } 38. Ke3 { [%eval 0.39] [%clk 0:01:39] } 38... Kd8 { [%eval 0.35] [%clk 0:01:54] } 39. Kd2 { [%eval 0.52] [%clk 0:01:39] } 39... Kc7 { [%eval 0.36] [%clk 0:01:53] } 40. Kc2 { [%eval 0.23] [%clk 0:01:38] } 40... Kd8 { [%eval 0.3] [%clk 0:01:53] } 41. Kb2 { [%eval 0.25] [%clk 0:01:37] } 41... Kc7 { [%eval 0.27] [%clk 0:01:53] } 42. Kc2 { [%eval 0.34] [%clk 0:01:37] } 42... Kd8 { [%eval 0.17] [%clk 0:01:52] } 43. Kd2 { [%eval 0.33] [%clk 0:01:36] } 43... Kc7 { [%eval 0.17] [%clk 0:01:52] } { The game is a draw. } 1/2-1/2
'''

# General Helper Functions

In [310]:
def generate_games(pgn_filename: str, max_games: int = 1_000_000) -> Generator[str, None, None]:
    """ Take a PGN filename and return a generator that loops over the first num_games games. """
    with open(pgn_filename) as file:
        num_games_yielded = 0
        while True:
            if num_games_yielded > max_games:
                return

            game = chess.pgn.read_game(file)

            if game is not None:
                yield game
                num_games_yielded += 1
            else:
                # No more games in the pgn
                return


# Doesn't work because "Annotator" not always in game.headers
# def is_analyzed_by_lichess(game: chess.pgn.Game) -> bool:
#     return 'Annotator' in game.headers and game.headers['Annotator'] == 'lichess.org'


EVAL_REGEX_PAT = '\[%eval ([+-]?(?:[0-9]*[.])?[0-9]+|#-?[0-9]+)]'
def get_score_from_comment(comment: str) -> float:
    """ Return the pawn-score in the comment if it contains "%eval". If "%eval" not in comment, raises ValueError. """
    if '%eval' not in comment:
        raise ValueError

    # Get score part using regex
    evals = re.findall(EVAL_REGEX_PAT, comment)
    assert len(evals) == 1, f'error: "%eval" tag does not appear exactly once in move.comment = {comment}'
    score = evals[0]

    # Convert to float
    try:
        score = float(score)
    except ValueError:
        # Only explanation for ValueError should be that %eval's score is a checkmate, ex. '#-5' or '#9'
        # Convert to centipawn-score with mate_score, then /100 to get pawn-score
        # https://python-chess.readthedocs.io/en/latest/engine.html?highlight=mate_score#chess.engine.Score.score
        assert '#' in score
        score = chess.engine.Mate(int(score.lstrip('#'))).score(mate_score=10_000) / 100.0
    return score


CLK_REGEX_PAT = '\[%clk (.*?)]'
def get_time_remaining_from_comment(comment: str) -> int:
    """
    Return the time remaining in seconds given by the "%clk" tag in the comment (always appears as an int).
    If "%clk" not in comment, raise ValueError.
    """
    if '%clk' not in comment:
        raise ValueError

    # Get eval part using regex
    clk = re.findall(CLK_REGEX_PAT, comment)
    assert len(clk) == 1, f'error: "%clk" tag does not appear exactly once in move.comment = {comment}'
    clk = clk[0]

    parts = clk.split(':')
    assert len(parts) == 3
    seconds_remaining = int(parts[0]) * 3600 \
                     + int(parts[1]) * 60 \
                     + int(parts[2])
    return seconds_remaining

def get_cpl(from_score: float, to_score: float, after_move_by: chess.Color) -> int:
    """ Get centipawn loss (CPL) based on scores of consecutive moves and side to move. """
    return int((to_score - from_score) * 100) * ((-1) ** (not after_move_by))


def parse_time_control(time_control: str) -> Tuple[int, int]:
    """ Convert a time control string like "300+3" . """
    if time_control == '-':
        # Correspondence game (ex. 3 days per move)
        # Assume the longest time control of 120+120
        return 7200, 120

    x = time_control.split('+')
    return int(x[0]), int(x[1])

# Engine Functions

In [284]:
@contextmanager
def start_engine_process() -> Generator[subprocess.Popen, None, None]:
    # Open the exe using Popen
    p = subprocess.Popen("./stockfish_14_x64_avx2.exe",
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

    # Send commands to this open process by using stdin.write(*command*)
    send_commands(p, 'isready\n')

    try:
        yield p
    finally:
        p.terminate()


def send_commands(proc: subprocess.Popen, *commands: str) -> None:
    if not commands:
        return

    for cmd in commands:
        proc.stdin.write(cmd.encode())

    proc.stdin.flush()


def analyze_static(p: subprocess.Popen, fen: str) -> str:
    """ Return the full string output of Stockfish's static evaluation for the given FEN. """

    send_commands(p, f'position fen {fen}\n', 'eval\n')

    # Get as many str lines as were output
    lines = []
    while True:
        line = p.stdout.readline().decode().strip()
        lines.append(line.strip('\n\r'))

        # Static eval does not run on positions where either king is in check
        if 'none (in check)' in line or 'Final evaluation' in line:
            break

    return '\n'.join(lines)

def vectorize_contributing_terms(static_analysis_output: str) -> np.ndarray:
    """ Return a 1x68 list from the "Contributing terms..." table given by the static analysis. """
    # Regex tester
    # https://regex101.com/r/OWV1ee/4
    return np.array([float(i)
                     for tup in re.findall('([-]?(?:[0-9]*[.])?[0-9]+) +([-]?(?:[0-9]*[.])?[0-9]+)',
                                           static_analysis_output)
                     for i in tup])

In [285]:
''' Testing '''
with start_engine_process() as p:
    # # In-check position shouldn't give static eval
    # print(analyze_static(p, 'r3r3/pppkb1pp/8/n2nN3/8/8/PPPP1PPP/RNB1K2R b KQ - 2 12'))
    # print('========================')

    # Should give static eval
    output = analyze_static(p, 'r1bk1bnr/1pp2ppp/p1n2q2/3N2N1/6B1/8/PPP2PPP/R1BQK2R b KQ - 0 12')
    # pc.copy(output)
    print(output)
    print('========================')

    # Get all floats from the first table
    print(vectorize_contributing_terms(output))
    # print(analyze_static(p, 'r1bk1bnr/1pp2ppp/p1n2q2/3N2N1/6B1/8/PPP2PPP/R1BQK2R b KQ - 0 12'))

Stockfish 14 by the Stockfish developers (see AUTHORS file)
readyok
info string NNUE evaluation using nn-3475407dc199.nnue enabled

Contributing terms for the classical eval:
+------------+-------------+-------------+-------------+
|    Term    |    White    |    Black    |    Total    |
|            |   MG    EG  |   MG    EG  |   MG    EG  |
+------------+-------------+-------------+-------------+
|   Material |  ----  ---- |  ----  ---- |  0.80  0.49 |
|  Imbalance |  ----  ---- |  ----  ---- |  0.00  0.00 |
|      Pawns |  0.29 -0.06 |  0.38 -0.05 | -0.09 -0.01 |
|    Knights | -0.31 -0.35 | -0.11 -0.20 | -0.20 -0.14 |
|    Bishops | -0.13 -0.45 | -0.09 -0.79 | -0.04  0.35 |
|      Rooks | -0.26 -0.06 | -0.53 -0.12 |  0.26  0.06 |
|     Queens |  0.00  0.00 |  0.00  0.00 |  0.00  0.00 |
|   Mobility |  0.83  1.05 |  0.69  0.90 |  0.14  0.14 |
|King safety | -0.71 -0.40 | -3.30 -0.51 |  2.60  0.11 |
|    Threats |  1.41  1.99 |  1.21  1.00 |  0.21  0.99 |
|     Passed |  0.00  0.00 

# Extracting Game Data

In [308]:
def vectorize_game(proc: subprocess.Popen, game: chess.pgn.Game) -> np.ndarray:
    """
    Return a (#plies)x(#features + 1 = 75) array. For each ply (half-move) in the game, the following
    features are extracted:
        Game-specific features:
        - Player ELO rating
        - Opponent ELO rating
        - Rating diff (Opponent ELO - Player ELO)
        - Clock starting time (seconds)
        - Clock increment (seconds)
        Move-specific features:
        - Ply (# half-moves into the game)
        - Time remaining at given move (seconds)
        - Static eval features:
            - 68 floats from the "Contributing terms ..." table
            - (TODO) 64 features from the "NNUE-derived piece values" table
                - Not sure how to deal with squares without pieces, filling with 0s might not make sense
        - Board state features:
            - (TODO) 256 features: 64 squares with one-hot encoded piece types (12 piece types can be
              represented with 4 bits, so 4 x 64 = 256)
    The final column is the target column holding the centipawn loss (CPL) of the player's move in the position.
    """

    data = np.empty((0, 75), dtype=np.float)

    # Set up board and Game's move nodes
    board = game.board()
    mainline_nodes: List[chess.pgn.ChildNode] = list(game.mainline())
    if not mainline_nodes:
        return data
    if '%eval' not in mainline_nodes[0].comment:
        return data

    ''' Add row to `data` for each move '''
    # Game-specific features (constant while iterating over moves in the game)
    h = game.headers
    try:
        clock_start, clock_inc = parse_time_control(h['TimeControl'])
    except ValueError as e:
        print(f'test: pgn: {str(game)}')
        raise e
    game_features_white = np.array([h['WhiteElo'], h['BlackElo'], h['BlackRatingDiff'], clock_start, clock_inc])
    game_features_black = np.array([h['BlackElo'], h['WhiteElo'], h['WhiteRatingDiff'], clock_start, clock_inc])

    # Add row of features for all moves in the Game
    for i, node in enumerate(mainline_nodes):
        # Can't get static eval while in check
        if board.is_check():
            board.push(node.move)
            # print('test: skipping move (in check)')
            continue

        move: chess.Move = node.move
        comment: str = node.comment
        try:
            next_comment = mainline_nodes[i+1].comment
        except IndexError:
            # No next move -> game is over, this move has no cpl value
            break

        # Get centipawn loss (CPL) before gathering other data to make sure it's possible
        try:
            score = get_score_from_comment(comment)
            next_score = get_score_from_comment(next_comment)
        except ValueError:
            # If for some reason there is no "%eval" in either comment,
            # continue because it is needed to calculate CPL. This should be
            # because the next_move would end the game with checkmate/stalemate
            # assert any((s in next_comment for s in ('wins', 'draw'))), f'next_comment: {next_comment}'
            continue
        cpl = get_cpl(score, next_score, board.turn)

        # Move-specific features
        try:
            time_remaining = get_time_remaining_from_comment(comment)
        except ValueError:
            time_remaining = clock_start
        static_eval_features = vectorize_contributing_terms(analyze_static(proc, board.fen()))

        # Add row
        # move_features = np.concatenate((np.array([time_remaining]), static_eval_features, np.array([cpl])), 0)
        if board.turn == chess.WHITE:
            all_features = np.array([*game_features_white, time_remaining, *static_eval_features, cpl])
            # all_features = np.concatenate((game_features_white, move_features), 0)
        else:
            all_features = np.array([*game_features_black, time_remaining, *static_eval_features, cpl])
            # all_features = np.concatenate((game_features_black, move_features), 0)

        assert all_features.shape == (75,), f'error: all_features: {all_features}'
        data = np.vstack((data, all_features))

        # Play next move
        board.push(move)

    return data

In [287]:
''' Testing '''
with start_engine_process() as p:
    test = vectorize_game(p, chess.pgn.read_game(io.StringIO(carlson_draw)))
test

array([['2998', '3141', '-2', ..., '0.0', '0.0', '20'],
       ['3141', '2998', '+2', ..., '1.0', '0.88', '20'],
       ['2998', '3141', '-2', ..., '0.35', '0.51', '47'],
       ...,
       ['2998', '3141', '-2', ..., '2.19', '1.27', '-17'],
       ['3141', '2998', '+2', ..., '1.29', '1.43', '-16'],
       ['2998', '3141', '-2', ..., '1.57', '1.76', '-16']], dtype='<U32')

In [341]:
def write_to_csv(output_filename: str, data: np.array) -> None:
    # TODO doesn't actually work yet
    # Looked at these:
    # https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
    # https://stackoverflow.com/a/27980725/7304977
    while True:
        try:
            with open(output_filename, 'wb') as f:
                np.savetxt(f, data, delimiter=',')
            break
        except PermissionError:
            input(f'error: PermissionError. Please close {output_filename} and press enter: ')


def prepare_game_data(pgn_filename: str, max_datapoints: int = 1_000_000) -> np.ndarray:
    data = np.empty((0, 75), dtype=np.float)

    with start_engine_process() as proc:
        for game in generate_games(pgn_filename):
            data = np.concatenate((data, vectorize_game(proc, game)), 0)

            if data.shape[0] > max_datapoints:
                break

    return data[:max_datapoints,:]

In [342]:
data = prepare_game_data('data/lichess_db_standard_rated_2015-09.pgn/lichess_db_standard_rated_2015-09.pgn', max_datapoints=10_000)

data

array([['1860', '1953', '+9', ..., '0.0', '0.0', '0'],
       ['1953', '1860', '-9', ..., '1.0', '0.88', '-9'],
       ['1860', '1953', '+9', ..., '0.8', '0.63', '-7'],
       ...,
       ['1459', '1763', '+4', ..., '0.19', '0.24', '-19'],
       ['1763', '1459', '-4', ..., '0.21', '0.0', '0'],
       ['1459', '1763', '+4', ..., '-1.44', '-1.54', '50']], dtype='<U32')

# Machine Learning