# Preprocessing for opening analysis

In [1]:
import chess.pgn
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
from datetime import datetime 

## Function definitions

In [2]:
def load_games(filename,forbidden_players,verbose=False):
    """Parse the games of a PGN file, skipping games whose Black player is excluded.

    The file is read in two passes: a header-only scan records the file
    offset of every game to keep, then each recorded offset is revisited
    and that game is parsed in full with ``chess.pgn.read_game``.

    Parameters
    ----------
    filename : str
        Path of the PGN file to read.
    forbidden_players : iterable of player names
        A game is skipped when its "Black" header (``"?"`` when the header
        is missing) is contained in this collection. Only the Black
        player is checked.
    verbose : bool, optional
        When true, print the running count after every 1000 parsed games.

    Returns
    -------
    list
        The parsed games, in file order.
    """
    # Build a hash set once so the per-game membership test does not scan
    # the whole exclusion list. Fall back to the caller's container when
    # its elements cannot be hashed.
    try:
        excluded = frozenset(forbidden_players)
    except TypeError:
        excluded = forbidden_players

    games = []
    with open(filename) as pgn:
        # Pass 1: headers only, remembering where each wanted game starts.
        offsets = []
        while True:
            offset = pgn.tell()
            headers = chess.pgn.read_headers(pgn)
            if headers is None:
                break
            if headers.get("Black", "?") not in excluded:
                offsets.append(offset)

        # Pass 2: fully parse just the games that survived the filter.
        for offset in offsets:
            pgn.seek(offset)
            game = chess.pgn.read_game(pgn)
            if game is None:
                break
            games.append(game)
            if verbose and len(games) % 1000 == 0:
                print(len(games))
    return games

In [3]:
def load_all_headers(filename,verbose=False):
    """Collect the header block of every game in a PGN file.

    Only ``chess.pgn.read_headers`` is called, so the games' moves are not
    returned. Headers are gathered in file order until ``read_headers``
    reports the end of the file by returning ``None``.

    Parameters
    ----------
    filename : str
        Path of the PGN file to read.
    verbose : bool, optional
        When true, print the running count after every 1000 headers.

    Returns
    -------
    list
        One headers object per game.
    """
    collected = []
    with open(filename) as handle:
        current = chess.pgn.read_headers(handle)
        while current is not None:
            collected.append(current)
            if verbose and len(collected) % 1000 == 0:
                print(len(collected))
            current = chess.pgn.read_headers(handle)
    return collected

In [4]:
# Set the directory for the data
# Every input and output path in this notebook is built by plain string
# concatenation (direc + '<file name>'), so the value must end with a
# path separator.
# NOTE(review): absolute, machine-specific path (presumably a mounted
# volume) -- adjust before running on another machine.
direc = '/Volumes/ikuperwajs/'

## Queen's Gambit

In [40]:
# Load the Queen's Gambit games for each month and write one dataframe per month.
# j = 9..14 maps to the months 2020-10 .. 2021-03 (year = 2020 + j//12, month = j%12 + 1).
#
# NOTE(review): the Queen's Gambit, Sicilian Defense and King's Gambit cells
# all write to the same 'df_YYYY_MM.csv' names, so each cell overwrites the
# output of the one run before it. The file name is left unchanged here
# because the readers of these files are not visible from this notebook.
result_scores = {'1-0' : 1, '1/2-1/2' : 1/2, '0-1' : 0, '*' : np.nan}
for j in range(9,15):
    print(j)
    year, month = 2020 + j//12, j%12 + 1
    games = load_games(direc + 'QG_games_{:d}_{:02d}.txt'.format(year, month),[])

    whiteplayers, blackplayers = [], []
    game_start_times, results = [], []
    fourthmoves, fourthmove_rts = [], []
    whiteelos, blackelos = [], []
    for game in games:
        tags = game.headers
        # Walk the main line once per game instead of rebuilding the list
        # for every column.
        nodes = list(game.mainline())

        whiteplayers.append(tags['White'])
        blackplayers.append(tags['Black'])
        # UTCDate/UTCTime are UTC; parse with an explicit +00:00 offset so
        # timestamp() does not interpret the value as local time.
        game_start_times.append(
            datetime.fromisoformat(tags['UTCDate'].replace('.','-') + ' ' + tags['UTCTime'] + '+00:00').timestamp())
        results.append(result_scores[tags['Result']])
        whiteelos.append(int(tags['WhiteElo']))
        blackelos.append(int(tags['BlackElo']))

        if len(nodes) >= 4:
            # Fourth ply of the game (index 3 of the main line).
            fourthmoves.append(nodes[3].move.uci())
            # Difference between the clock annotations after ply 2 and
            # after ply 4 -- presumably the time spent on the fourth ply
            # (any increment is not accounted for). NaN when a clock
            # annotation is missing.
            clock_before, clock_after = nodes[1].clock(), nodes[3].clock()
            if clock_before is None or clock_after is None:
                fourthmove_rts.append(np.nan)
            else:
                fourthmove_rts.append(clock_before - clock_after)
        else:
            fourthmoves.append('')
            fourthmove_rts.append(np.nan)

    whiteplayers = np.array(whiteplayers)
    blackplayers = np.array(blackplayers)

    # Number of earlier games in this file that the current Black player
    # played as White / as Black. The counters are created anew for every
    # month, so the counts are per monthly file.
    d_black = defaultdict(int)
    d_white = defaultdict(int)
    num_white_games = np.zeros(len(games),dtype=int)
    num_black_games = np.zeros(len(games),dtype=int)
    for i,(white,black) in enumerate(zip(whiteplayers,blackplayers)):
        num_white_games[i] = d_white[black]
        num_black_games[i] = d_black[black]
        d_black[black] += 1
        d_white[white] += 1

    pd.DataFrame({'black': blackplayers, 'white' : whiteplayers, 'start_time' : np.array(game_start_times), 'fourthmove' : np.array(fourthmoves),
                  'rt' : np.array(fourthmove_rts), 'result' : np.array(results), 'blackelo' : np.array(blackelos), 'whiteelo' : np.array(whiteelos), 'num_black_games' : num_black_games, 'num_white_games' : num_white_games
                 }).to_csv(direc + 'df_{:d}_{:02d}.csv'.format(year, month))

13
14


In [21]:
# Load the header files and create a pickle file with the users
# For each of the 2020 months 3..9: read every game header of the Queen's
# Gambit file, gather the names from both colours, and pickle the array of
# distinct names returned by np.unique.
for month in range(3,10):
    print(month)
    source_path = direc + 'QG_games_2020_0' + str(month) + '.txt'
    target_path = direc + 'QG_players_2020_0' + str(month) + '.pkl'
    headers = load_all_headers(source_path)
    names = []
    for colour in ('White', 'Black'):
        names.extend(h[colour] for h in headers)
    players = np.unique(names)
    with open(target_path, 'wb') as sink:
        pickle.dump(players, sink)

## Sicilian Defense

In [36]:
# Load the Sicilian Defense games for each month and write one dataframe per month.
# j = 9..14 maps to the months 2020-10 .. 2021-03 (year = 2020 + j//12, month = j%12 + 1).
#
# NOTE(review): the Queen's Gambit, Sicilian Defense and King's Gambit cells
# all write to the same 'df_YYYY_MM.csv' names, so each cell overwrites the
# output of the one run before it. The file name is left unchanged here
# because the readers of these files are not visible from this notebook.
result_scores = {'1-0' : 1, '1/2-1/2' : 1/2, '0-1' : 0, '*' : np.nan}
for j in range(9,15):
    print(j)
    year, month = 2020 + j//12, j%12 + 1
    games = load_games(direc + 'SD_games_{:d}_{:02d}.txt'.format(year, month),[])

    whiteplayers, blackplayers = [], []
    game_start_times, results = [], []
    thirdmoves, thirdmove_rts = [], []
    whiteelos, blackelos = [], []
    for game in games:
        tags = game.headers
        # Walk the main line once per game instead of rebuilding the list
        # for every column.
        nodes = list(game.mainline())

        whiteplayers.append(tags['White'])
        blackplayers.append(tags['Black'])
        # UTCDate/UTCTime are UTC; parse with an explicit +00:00 offset so
        # timestamp() does not interpret the value as local time.
        game_start_times.append(
            datetime.fromisoformat(tags['UTCDate'].replace('.','-') + ' ' + tags['UTCTime'] + '+00:00').timestamp())
        results.append(result_scores[tags['Result']])
        whiteelos.append(int(tags['WhiteElo']))
        blackelos.append(int(tags['BlackElo']))

        if len(nodes) >= 3:
            # Third ply of the game (index 2 of the main line).
            thirdmoves.append(nodes[2].move.uci())
            # Difference between the clock annotations after ply 1 and
            # after ply 3 -- presumably the time spent on the third ply
            # (any increment is not accounted for). NaN when a clock
            # annotation is missing.
            clock_before, clock_after = nodes[0].clock(), nodes[2].clock()
            if clock_before is None or clock_after is None:
                thirdmove_rts.append(np.nan)
            else:
                thirdmove_rts.append(clock_before - clock_after)
        else:
            thirdmoves.append('')
            thirdmove_rts.append(np.nan)

    whiteplayers = np.array(whiteplayers)
    blackplayers = np.array(blackplayers)

    # Number of earlier games in this file that the current Black player
    # played as White / as Black. The counters are created anew for every
    # month, so the counts are per monthly file.
    d_black = defaultdict(int)
    d_white = defaultdict(int)
    num_white_games = np.zeros(len(games),dtype=int)
    num_black_games = np.zeros(len(games),dtype=int)
    for i,(white,black) in enumerate(zip(whiteplayers,blackplayers)):
        num_white_games[i] = d_white[black]
        num_black_games[i] = d_black[black]
        d_black[black] += 1
        d_white[white] += 1

    pd.DataFrame({'black': blackplayers, 'white' : whiteplayers, 'start_time' : np.array(game_start_times), 'thirdmove' : np.array(thirdmoves),
                  'rt' : np.array(thirdmove_rts), 'result' : np.array(results), 'blackelo' : np.array(blackelos), 'whiteelo' : np.array(whiteelos), 'num_black_games' : num_black_games, 'num_white_games' : num_white_games
                 }).to_csv(direc + 'df_{:d}_{:02d}.csv'.format(year, month))

9
10
11
12
13
14


In [32]:
# Load the header files and create a pickle file with the users
# For each of the 2020 months 3..9: read every game header of the Sicilian
# Defense file, gather the names from both colours, and pickle the array of
# distinct names returned by np.unique.
for month in range(3,10):
    print(month)
    source_path = direc + 'SD_games_2020_0' + str(month) + '.txt'
    target_path = direc + 'SD_players_2020_0' + str(month) + '.pkl'
    headers = load_all_headers(source_path)
    names = []
    for colour in ('White', 'Black'):
        names.extend(h[colour] for h in headers)
    players = np.unique(names)
    with open(target_path, 'wb') as sink:
        pickle.dump(players, sink)

3
4
5
6
7
8
9


## King's Gambit

In [5]:
# Load the King's Gambit games for each month and write one dataframe per month.
# j = 9..14 maps to the months 2020-10 .. 2021-03 (year = 2020 + j//12, month = j%12 + 1).
#
# NOTE(review): the Queen's Gambit, Sicilian Defense and King's Gambit cells
# all write to the same 'df_YYYY_MM.csv' names, so each cell overwrites the
# output of the one run before it. The file name is left unchanged here
# because the readers of these files are not visible from this notebook.
result_scores = {'1-0' : 1, '1/2-1/2' : 1/2, '0-1' : 0, '*' : np.nan}
for j in range(9,15):
    print(j)
    year, month = 2020 + j//12, j%12 + 1
    games = load_games(direc + 'KG_games_{:d}_{:02d}.txt'.format(year, month),[])

    whiteplayers, blackplayers = [], []
    game_start_times, results = [], []
    fourthmoves, fourthmove_rts = [], []
    whiteelos, blackelos = [], []
    for game in games:
        tags = game.headers
        # Walk the main line once per game instead of rebuilding the list
        # for every column.
        nodes = list(game.mainline())

        whiteplayers.append(tags['White'])
        blackplayers.append(tags['Black'])
        # UTCDate/UTCTime are UTC; parse with an explicit +00:00 offset so
        # timestamp() does not interpret the value as local time.
        game_start_times.append(
            datetime.fromisoformat(tags['UTCDate'].replace('.','-') + ' ' + tags['UTCTime'] + '+00:00').timestamp())
        results.append(result_scores[tags['Result']])
        whiteelos.append(int(tags['WhiteElo']))
        blackelos.append(int(tags['BlackElo']))

        if len(nodes) >= 4:
            # Fourth ply of the game (index 3 of the main line).
            fourthmoves.append(nodes[3].move.uci())
            # Difference between the clock annotations after ply 2 and
            # after ply 4 -- presumably the time spent on the fourth ply
            # (any increment is not accounted for). NaN when a clock
            # annotation is missing.
            clock_before, clock_after = nodes[1].clock(), nodes[3].clock()
            if clock_before is None or clock_after is None:
                fourthmove_rts.append(np.nan)
            else:
                fourthmove_rts.append(clock_before - clock_after)
        else:
            fourthmoves.append('')
            fourthmove_rts.append(np.nan)

    whiteplayers = np.array(whiteplayers)
    blackplayers = np.array(blackplayers)

    # Number of earlier games in this file that the current Black player
    # played as White / as Black. The counters are created anew for every
    # month, so the counts are per monthly file.
    d_black = defaultdict(int)
    d_white = defaultdict(int)
    num_white_games = np.zeros(len(games),dtype=int)
    num_black_games = np.zeros(len(games),dtype=int)
    for i,(white,black) in enumerate(zip(whiteplayers,blackplayers)):
        num_white_games[i] = d_white[black]
        num_black_games[i] = d_black[black]
        d_black[black] += 1
        d_white[white] += 1

    pd.DataFrame({'black': blackplayers, 'white' : whiteplayers, 'start_time' : np.array(game_start_times), 'fourthmove' : np.array(fourthmoves),
                  'rt' : np.array(fourthmove_rts), 'result' : np.array(results), 'blackelo' : np.array(blackelos), 'whiteelo' : np.array(whiteelos), 'num_black_games' : num_black_games, 'num_white_games' : num_white_games
                 }).to_csv(direc + 'df_{:d}_{:02d}.csv'.format(year, month))

9
10
11
12
13
14


In [6]:
# Load the header files and create a pickle file with the users
# For each of the 2020 months 3..9: read every game header of the King's
# Gambit file, gather the names from both colours, and pickle the array of
# distinct names returned by np.unique.
for month in range(3,10):
    print(month)
    source_path = direc + 'KG_games_2020_0' + str(month) + '.txt'
    target_path = direc + 'KG_players_2020_0' + str(month) + '.pkl'
    headers = load_all_headers(source_path)
    names = []
    for colour in ('White', 'Black'):
        names.extend(h[colour] for h in headers)
    players = np.unique(names)
    with open(target_path, 'wb') as sink:
        pickle.dump(players, sink)

3
4
5
6
7
8
9
