In [19]:
import os
import sys
import chess.pgn as pychess
from chess.pgn import read_game
from chess import square_name
from chess import SQUARE_NAMES, BB_SQUARES
import ChessFuncs
from ChessFuncs import end_board
import bz2
import time
from io import StringIO
from multiprocessing import Process
import time
from time import sleep
import time

# MultiParsing Chess PGN Files
The code below parses multiple PGN files in parallel, for roughly a 1.5-3x speed increase over parsing them one at a time.
Try to parse no more files at once than there are CPU cores available.

In [None]:
# Back-of-the-envelope estimate of the total run time, in days.
# NOTE(review): 409700297 / 121332 looks like (total games) / (games in a timed
# sample), and 7 looks like the minutes that sample took - confirm the source
# of these figures.
multiplier = round(409700297 / 121332)
mins = multiplier * 7
hours, days = mins / 60, mins / 60 / 24
days_to_run = round(days, 1)
print(days_to_run)

In [6]:
# Results file that the parsing functions append to.
out_path = "test-results.txt"
# Every entry of the current working directory whose name contains "lichess".
pgns = list(filter(lambda name: "lichess" in name, os.listdir()))
pgns

['lichess_db_standard_rated_2014-01.pgn',
 'lichess_db_standard_rated_2014-02.pgn',
 'lichess_db_standard_rated_2014-03.pgn',
 'lichess_db_standard_rated_2014-04.pgn']

In [20]:
#the thing which will run simultaneously
def main(pgn_file, out_path):
    '''
    Takes in one long/concatenated PGN file and appends the losing king's
    square to an output file for every game that ended in checkmate.

    pgn_file: path of the PGN file to read. Each game is expected to begin
        with an [Event ...] tag and to keep its movetext on a single line
        starting with "1. ".
    out_path: path of the results file. It is opened in append mode and gets
        one line per checkmate holding the integer returned by board.king()
        for the side that lost.

    Games that are not a decisive checkmate produce no output. Header blocks
    that never reach a "1. " line (games without moves) are discarded when
    the next [Event ...] tag starts, so they cannot hide the game after them.
    '''

    def end_board(parsed_pgn_file):
        '''Moves the py-chess board obj to the final move.'''
        current_board = parsed_pgn_file.board()
        # Use mainline_moves() when the installed python-chess provides it,
        # otherwise fall back to the older main_line() spelling.
        mainline = getattr(parsed_pgn_file, "mainline_moves", None)
        if mainline is None:
            mainline = parsed_pgn_file.main_line
        for move in mainline():
            current_board.push(move)

        return current_board

    with open(pgn_file) as bigfile:
        game_lines = []
        then = time.time()
        for line in bigfile:

            #Every 20 minutes, sleep for 2 minutes
            elapsed = time.time() - then
            if elapsed > 1200:
                print("Sleeping for 2 minutes")
                sleep(120)
                then = time.time()

            if line.startswith("[Event "):
                # A new game begins: drop whatever is still buffered. Leftovers
                # come from a game without a "1. " line; keeping them would make
                # read_game() return that stale game instead of this one.
                game_lines = [line]
            elif line.startswith("1. "):
                # The movetext line completes the game: parse it right away.
                game_lines.append(line)
                game = read_game(StringIO("".join(game_lines)))
                game_lines = []

                result = game.headers["Result"]
                board = end_board(game)

                if result in ("1-0", "0-1") and board.is_checkmate():
                    # Colour 0 is BLACK (loses a "1-0" game), colour 1 is WHITE
                    # (loses a "0-1" game).
                    king_num = board.king(0) if result == "1-0" else board.king(1)
                    # Re-opened in append mode for every result, so each line
                    # is flushed to disk as soon as it is written.
                    with open(out_path, "a") as results:
                        results.write("%i\n" % king_num)
                else:
                    pass # if you want to save other data

            else:
                game_lines.append(line)

In [21]:
def single_parse(pgn_file_list, out_path):
    '''
    Sequential baseline for the multiprocessing comparison.

    Hands each long/concatenated pgn file in pgn_file_list to main(), one
    after another, with out_path passed along to every call.

    Prints the elapsed wall-clock seconds (labelled "single") and returns them.
    '''
    began = time.time()
    for current_file in pgn_file_list:
        main(current_file, out_path)
    finished = time.time()

    processtime = finished - began
    print("single",processtime)
    return processtime

In [22]:
def multi_parse(pgn_file_list, out_path):
    '''
    Takes in a list of long/concatenated pgn files.
    Starts one multiprocessing Process per file, each running
    main(pgn_file, out_path), so multiple pgn files are read at once and
    written to the same output file.

    Blocks until every started process has finished, then prints the elapsed
    wall-clock seconds (labelled "multi") and returns them.

    TO DO:
        stagger multiprocess by number of cores
        e.g. 8 files with 2 cores runs 4 times
        e.g. 8 files with 4 cores runs 2 times
    '''
    start = time.time()

    process_list = [
        Process(target=main, args=(pgn_file, out_path))
        for pgn_file in pgn_file_list
    ]
    for process in process_list:
        process.start()
    # Join each process in turn. Joining only the last one created would let
    # the timer stop (and the function return) while other workers still run.
    for process in process_list:
        process.join()

    processtime = time.time() - start
    print("multi",processtime)
    return processtime


In [None]:
if __name__ == "__main__":
    # Parse every discovered PGN file in parallel, appending to out_path.
    # A single-vs-multi timing comparison was previously sketched here as
    #   ChessFuncs.speed_increase(single_parse(pgns, out_path_s), multi_parse(pgns, out_path_m))
    # NOTE(review): out_path_s / out_path_m are not defined in the visible cells.
    multi_parse(pgn_file_list=pgns, out_path=out_path)