# Convert pgn to df
#### Saved as .feather for ease of use between partners
#### Dropped all Dates, Titles, and Rounds, as they won't be relevant

In [1]:
import chess.pgn
import io
import math
import pandas as pd
import importlib
import logging
import os
import pyarrow
import time
import datetime

# Source PGN dump and the Feather file the converted games are written to
pgn_path = '../lichess_db_standard_rated_2016-09.pgn'
feather_path = '../lichess_db_standard_rated_2016-09.feather'

# Send progress messages to process.log, stamped with the wall-clock time
logging.basicConfig(
    filename='process.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%H:%M:%S',
)

def drop_columns_if_exists(drop_from_df, columns_to_drop):
    """Remove the named columns from a DataFrame, ignoring any that are absent.

    The frame is modified in place and the same object is returned.

    Args:
        drop_from_df: DataFrame to remove columns from.
        columns_to_drop: Iterable of column labels to remove when present.

    Returns:
        ``drop_from_df`` itself, after the removal.
    """
    # errors="ignore" makes pandas skip labels that are not in the frame.
    unwanted = list(columns_to_drop)
    drop_from_df.drop(columns=unwanted, errors="ignore", inplace=True)
    return drop_from_df

def skip_games(pgn, starting_game):
    """Advance ``pgn`` so the next read starts at game number ``starting_game``.

    Games are counted from 0 by their ``[Event `` header lines. On return the
    stream is positioned at the start of the target game's ``[Event `` line.
    If the stream holds fewer games, it is left at end-of-file. A negative
    ``starting_game`` leaves the stream untouched.

    Args:
        pgn: Seekable text stream, read with ``readline()``.
        starting_game: Zero-based index of the first game to keep.
    """
    games_seen = -1  # index of the most recent game header read so far
    line_start = None
    while games_seen < starting_game:
        # A text-mode tell() value is an opaque cookie: it can only be handed
        # back to seek(), not adjusted by len(line) (characters are not bytes,
        # and newline translation changes line lengths). So remember the
        # position *before* reading the line. tell() is slow on text files,
        # so only sample it once the next header would be the target game.
        if games_seen == starting_game - 1:
            line_start = pgn.tell()

        line = pgn.readline()
        if not line:
            return  # EOF: fewer games in the stream than requested

        if line.startswith('[Event '):  # Typically indicates start of a new game
            games_seen += 1
            if games_seen == starting_game:
                # Rewind to the start of this game's header line.
                pgn.seek(line_start)
                return

# Function to read a PGN file and extract data
def read_pgn(pgn_path, starting_game, num_games):
    """Parse up to ``num_games`` games from a PGN file into a DataFrame.

    Reading starts at game number ``starting_game`` (zero-based) and stops
    early if the file runs out of games. Progress is written to the log
    about once per percent of ``num_games``.

    Args:
        pgn_path: Path of the PGN file to read.
        starting_game: Zero-based index of the first game to convert.
        num_games: Maximum number of games to convert.

    Returns:
        A DataFrame with one row per game: one column per PGN header found,
        plus a ``Moves`` column holding the mainline as text. The ``Round``,
        ``Date``, ``UTCDate``, ``UTCTime``, ``WhiteTitle`` and ``BlackTitle``
        columns are removed when present.
    """
    logging.info(f"Starting read_pgn for {pgn_path} at {starting_game} for {num_games} games")

    # Log about once per percent. Clamp to 1 so batches of fewer than 100
    # games do not evaluate `i % 0`.
    log_every = max(1, num_games // 100)
    games_data = []

    # NOTE(review): the file is opened with the platform default encoding;
    # consider an explicit encoding if the dump is known to be UTF-8.
    with open(pgn_path) as pgn:

        # Skip to first game to extract
        skip_games(pgn, starting_game)

        start_time = time.time()

        # Convert num_games games from pgn to df
        for i in range(num_games):

            # Log progress
            if i % log_every == 0:
                elapsed_time = time.time() - start_time
                # Derive percent and ETA from the game count itself so they
                # stay within 0-100% and non-negative for any num_games.
                percent_done = i * 100 // num_games
                est_time_left = elapsed_time / i * (num_games - i) if i else 0

                percent_done_formatted = f"{percent_done:02d}"
                elapsed_time_formatted = str(datetime.timedelta(seconds=int(elapsed_time)))
                est_time_left_formatted = str(datetime.timedelta(seconds=int(est_time_left)))
                logging.info(f"Completed: {percent_done_formatted}%, Elapsed: {elapsed_time_formatted}, Est Time Left: {est_time_left_formatted}")

            # Read next game; None means the file has no more games
            game = chess.pgn.read_game(pgn)
            if game is None:
                break

            # Extract and append data from the game
            extracted_data = dict(game.headers)
            extracted_data['Moves'] = str(game.mainline())
            games_data.append(extracted_data)

    logging.info(f"Done with read_pgn for {pgn_path} at {starting_game} for {num_games} games")

    games_df = pd.DataFrame(games_data)
    drop_columns_if_exists(games_df, ['Round', 'Date', 'UTCDate', 'UTCTime', 'WhiteTitle', 'BlackTitle'])

    return games_df

def append_feather(pgn_path, feather_path, starting_game, num_games):
    """Convert a batch of PGN games and append it to a Feather file.

    The games already stored at ``feather_path`` (if the file exists) are
    loaded, the newly converted batch is appended, and the result is written
    back to ``feather_path``.

    Args:
        pgn_path: Path of the PGN file to read.
        feather_path: Path of the Feather file to create or extend.
        starting_game: Zero-based index of the first game to convert.
        num_games: Maximum number of games to convert in this batch.

    Returns:
        Total number of games stored in the Feather file after the append.
    """
    new_df = read_pgn(pgn_path, starting_game, num_games)

    if os.path.exists(feather_path):
        feather_df = pd.read_feather(feather_path)
        existing_count = len(feather_df)
        combined_df = pd.concat([feather_df, new_df], ignore_index=True)
    else:
        # Nothing saved yet: the batch is the whole dataset, so there is no
        # need to concatenate it with an empty placeholder frame.
        existing_count = 0
        combined_df = new_df

    # Write to a temporary file and swap it into place, so an interruption
    # mid-write cannot destroy the batches that were already saved.
    tmp_path = os.fspath(feather_path) + '.tmp'
    try:
        combined_df.to_feather(tmp_path)
        os.replace(tmp_path, feather_path)
    finally:
        # Only still present if the write or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    logging.info(f"Added {len(new_df)} games to {existing_count} games already saved")
    return len(combined_df)

#### Converting all ~7M games takes about 3 hours, and my computer kept going to sleep and failing partway through.
#### So I convert 1M games per call; each call appends to the feather file, effectively creating a checkpoint.

In [2]:
# Batch 1: start at game index 0 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 0, 1000000)

1000000

In [4]:
# Batch 2: start at game index 1,000,000 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 1000000, 1000000)

2000000

In [6]:
# Batch 3: start at game index 2,000,000 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 2000000, 1000000)

3000000

In [7]:
# Batch 4: start at game index 3,000,000 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 3000000, 1000000)

4000000

In [8]:
# Batch 5: start at game index 4,000,000 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 4000000, 1000000)

5000000

In [9]:
# Batch 6: start at game index 5,000,000 and convert up to 1,000,000 games.
append_feather(pgn_path, feather_path, 5000000, 1000000)

6000000

In [10]:
# Batch 7: start at game index 6,000,000 and convert up to 1,000,000 games.
# The recorded total below (6,813,113) shows the file ended before the
# full batch was read.
append_feather(pgn_path, feather_path, 6000000, 1000000)

6813113