In [6]:
import chess
import chess.pgn
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Raw PGN files are expected in "data/raw" next to the current working
# directory's parent (i.e. <cwd>/../data/raw).
data_dir = Path.cwd().parent / 'data/raw'

# Path.glob() yields entries in arbitrary, filesystem-dependent order. Sort
# them so that the subset of games kept under MAX_GAMES is the same on every
# run and on every machine.
pgn_files = sorted(data_dir.glob("*.pgn"))
if not pgn_files:
    raise FileNotFoundError(f"No PGN files found in {data_dir}")

# Module-level accumulators filled by parse_games_from_file().
all_games_data = []   # one feature dict per position, across all games
elo_list = []         # one Elo figure per parsed game
states_per_game = []  # number of recorded positions per parsed game

MAX_GAMES = 10000  # upper bound on the number of games parsed in total
games_count = 0    # games parsed so far, across all files

def _castling_legal(board, uci):
    """Return 1 if *uci* is a castling move that is legal on *board*, else 0."""
    move = chess.Move.from_uci(uci)
    # is_legal() alone is not enough: a rook or queen standing on e1 may
    # legally move to g1/c1 as well, which would be misreported as castling.
    return int(board.is_castling(move) and board.is_legal(move))


def _extract_states(game):
    """Return one feature dict per mainline move of *game*.

    Every dict describes the position *before* the move is played, together
    with the move itself in SAN.
    """
    board = game.board()
    states = []
    for move in game.mainline_moves():
        states.append({
            "mover": 1 if board.turn else 0,
            "move_made": board.san(move),
            "en_passant": int(board.has_legal_en_passant()),
            # Legality is evaluated for the side to move, so the two flags
            # belonging to the opponent are always 0 in a given position.
            "white_castle_left_legal": _castling_legal(board, "e1c1"),
            "white_castle_right_legal": _castling_legal(board, "e1g1"),
            "black_castle_left_legal": _castling_legal(board, "e8c8"),
            "black_castle_right_legal": _castling_legal(board, "e8g8"),
            "fifty_move_draw": int(board.can_claim_fifty_moves()),
        })
        board.push(move)
    return states


def _parse_elo(headers, key):
    """Return the integer rating stored under *key*, or None if unusable."""
    try:
        return int(headers.get(key, ""))
    except (TypeError, ValueError):
        # Header missing or not numeric (e.g. "?" or an empty string).
        return None


def _mean_elo(headers):
    """Return the mean of the known player ratings in *headers*.

    Ratings that are missing or unparseable are left out of the mean rather
    than being counted as 0, so one unknown rating no longer halves (or
    zeroes) the figure for the game. When neither rating is known the result
    is 0.0, which keeps ``elo_list`` aligned with ``states_per_game``.
    """
    ratings = [_parse_elo(headers, key) for key in ("WhiteElo", "BlackElo")]
    known = [rating for rating in ratings if rating is not None]
    return sum(known) / len(known) if known else 0.0


def parse_games_from_file(pgn_file):
    """Parse games from *pgn_file* into the module-level accumulators.

    Games are read one after another until the file is exhausted or the
    global ``games_count`` reaches ``MAX_GAMES``. For every game read:

    * its per-position feature dicts are appended to ``all_games_data``,
    * its Elo figure (see ``_mean_elo``) is appended to ``elo_list``,
    * its number of recorded positions is appended to ``states_per_game``,
    * ``games_count`` is incremented.

    Args:
        pgn_file: Path of the PGN file to read; opened as UTF-8 text.

    Returns:
        None. All results are stored in the module-level lists and counter.
    """
    global games_count
    with open(pgn_file, encoding='utf-8') as f:
        # The loop condition enforces the cap, so no extra check is needed
        # after the counter is incremented.
        while games_count < MAX_GAMES:
            game = chess.pgn.read_game(f)
            if game is None:
                break  # no more games in this file

            game_data = _extract_states(game)

            elo_list.append(_mean_elo(game.headers))
            states_per_game.append(len(game_data))
            all_games_data.extend(game_data)
            games_count += 1

# Feed the PGN files to the parser one by one, stopping as soon as the
# global game cap has been reached.
for file in pgn_files:
    if games_count < MAX_GAMES:
        parse_games_from_file(file)
    else:
        break

# One row per recorded position, across all parsed games.
df = pd.DataFrame(all_games_data)

# Summary statistics of the parsed dataset.
total_states = df.shape[0]
num_games = len(states_per_game)
avg_elo = np.mean(elo_list)
avg_states_per_game = np.mean(states_per_game)

print(f"Number of games: {num_games}")
print(f"Average states per game: {avg_states_per_game:.2f}")
print(f"Total states in dataset: {total_states}")
print(f"Average ELO in games: {avg_elo:.2f}")

def _save_histogram(values, title, xlabel, color, filename):
    """Draw a 30-bin histogram of *values* and save it to *filename*.

    The figure is closed after saving, so nothing is left open (or shown)
    once the function returns.

    Args:
        values: Sequence of numbers to bin.
        title: Plot title.
        xlabel: Label of the x axis; the y axis is always "Number of Games".
        color: Fill colour of the bars.
        filename: Output path handed to ``plt.savefig``.
    """
    plt.figure(figsize=(6, 5))
    plt.hist(values, bins=30, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Number of Games")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()  # Close figure to free memory


# ELO histogram
_save_histogram(
    elo_list, "Average ELO per Game", "ELO", 'skyblue', "elo_histogram.pdf"
)

# States per game histogram
_save_histogram(
    states_per_game,
    "States per Game",
    "States",
    'salmon',
    "states_per_game_histogram.pdf",
)

# NOTE: the former trailing plt.tight_layout() / plt.show() calls were removed.
# Both figures are already closed at this point, so those calls only created
# and displayed a new, empty figure.


Number of games: 10000
Average states per game: 87.27
Total states in dataset: 872684
Average ELO in games: 2541.97


<Figure size 640x480 with 0 Axes>