In [None]:
## Which PGN File To Train
# max_games: upper bound on the number of games parsed from the PGN text (passed as num_games below).
max_games = 500000 
# asset_dir: title of the Google Drive folder that holds the PGN file and receives the uploaded pickles.
asset_dir = 'asset'
# file_name: title of the PGN file inside that folder; the part before the first '.' prefixes the cache-file names.
file_name = '2023_tc_500000_games.pgn'
########################################################
# `fold` selects which games this run processes: a later cell keeps only games whose index i satisfies i % 5 == fold.
# `X` is a placeholder; unless X is defined elsewhere, this cell raises NameError until it is replaced by an integer.
fold = X #### PLEASE CHANGE TO 2 3 4 WHEN RUNNING ######
########################################################

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import io
import chess
import chess.pgn
import numpy as np
import pandas as pd
import os
import pickle
from tqdm import tqdm
from chess_class import ChessGame, ChessMove

In [None]:
# Authenticate against Google Drive with PyDrive, reusing locally saved credentials when they exist.
gauth = GoogleAuth()
# Tell PyDrive which OAuth client-secrets JSON file to use.
# NOTE(review): the client-specific file name is hardcoded — consider moving it to configuration or an environment variable.
gauth.DEFAULT_SETTINGS['client_config_file'] = 'client_secret_1057507276332-5mk9ac9q22rsmtm1idlqvpraq08ar8p5.apps.googleusercontent.com.json'
# Try to load credentials saved by a previous run of this cell.
gauth.LoadCredentialsFile("mycreds.txt")
if gauth.credentials is None:
    # Nothing saved yet: run the local-webserver OAuth flow.
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Saved credentials exist but the access token has expired: refresh it.
    gauth.Refresh()
else:
    # Saved credentials are still usable: authorize with them.
    gauth.Authorize()

# Persist the current credentials so the next run can skip the interactive flow.
gauth.SaveCredentialsFile("mycreds.txt")
# `drive` is the module-level client used by the Drive helper functions below.
drive = GoogleDrive(gauth)

In [None]:
def save_item_to_file(games, file_path):
    """Pickle ``games`` into ``file_path``, replacing any existing file.

    Args:
        games: Any picklable object.
        file_path: Destination path, opened in binary write mode.
    """
    with open(file_path, 'wb') as sink:
        pickle.Pickler(sink).dump(games)

def load_item_from_file(file_path):
    """Return the object pickled at ``file_path``, or ``None`` when the path does not exist.

    Progress messages are printed before and after a successful load.
    """
    if not os.path.exists(file_path):
        return None

    print('loading item from cache...')
    # NOTE(review): unpickling can execute arbitrary code — only load cache files you trust.
    with open(file_path, 'rb') as source:
        cached = pickle.load(source)
    print('loaded')
    return cached

def load_pgns_from_text(pgns_text, num_games=None, start_index=0, encoding="utf-8"):
    """Parse games out of a PGN string.

    Args:
        pgns_text: PGN text containing zero or more games.
        num_games: Maximum number of games to return. ``None`` reads until the
            text is exhausted; ``0`` returns an empty list (it previously
            behaved like ``None`` because the value was tested for truthiness).
        start_index: Number of leading games to read and discard first.
        encoding: Unused; retained so existing callers keep working.

    Returns:
        A list of the game objects produced by ``chess.pgn.read_game``.
    """
    import itertools

    games = []
    file_io = io.StringIO(pgns_text)

    # Consume (and drop) the first `start_index` games.
    for _ in tqdm(range(start_index), desc='Skipping games', unit='game', leave=False):
        if chess.pgn.read_game(file_io) is None:
            break

    # A progress bar is only shown when the caller gave an explicit limit;
    # otherwise keep reading until read_game signals the end with None.
    if num_games is not None:
        slots = tqdm(range(num_games), desc='Loading games', unit='game', leave=True)
    else:
        slots = itertools.count()

    for _ in slots:
        game = chess.pgn.read_game(file_io)
        if game is None:
            break
        games.append(game)
    return games

In [None]:
def find_folder_id(folder_name):
    """Find and return the Google Drive folder ID for a given folder name.

    Uses the module-level ``drive`` client. Returns the ID of the first
    non-trashed folder whose title equals ``folder_name`` exactly, or ``None``
    when there is no such folder.
    """
    # Escape backslashes and single quotes so a name such as "Bob's games"
    # cannot terminate the quoted string inside the query early.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"title='{escaped_name}' and "
        "mimeType='application/vnd.google-apps.folder' and trashed=false"
    )
    for entry in drive.ListFile({'q': query}).GetList():
        # Re-check the title locally; only an exact match is accepted.
        if entry['title'] == folder_name:
            return entry['id']
    return None

def read_pkl_file_from_drive(file_title, parent_id):
    """Read a .pkl file directly from Google Drive into a Python object.

    Args:
        file_title: Title of the file to look up.
        parent_id: ID of the Drive folder that must contain the file.

    Returns:
        The unpickled object from the first matching file, or ``None`` (after
        printing a message) when no file matches.
    """
    # Escape backslashes and single quotes so the title cannot break the query string.
    escaped_title = file_title.replace("\\", "\\\\").replace("'", "\\'")
    query = f"'{parent_id}' in parents and trashed=false and title='{escaped_title}'"
    matches = drive.ListFile({'q': query}).GetList()
    if not matches:
        print(f"No file found with title: {file_title}")
        return None

    # The content is fetched as a cp437-decoded string and re-encoded with the
    # same codec to get the original bytes back (cp437 maps every byte value).
    raw_text = matches[0].GetContentString(encoding='cp437')
    # NOTE(review): unpickling runs arbitrary code from the payload — only use on trusted Drive files.
    return pickle.loads(raw_text.encode('cp437'))

def read_pgn_file_from_drive(file_title, parent_id):
    """Read a .pgn file directly from Google Drive into a text variable.

    Args:
        file_title: Title of the file to look up.
        parent_id: ID of the Drive folder that must contain the file.

    Returns:
        The UTF-8 decoded content of the first matching file, or ``None``
        (after printing a message) when no file matches.
    """
    # Escape backslashes and single quotes so the title cannot break the query string.
    escaped_title = file_title.replace("\\", "\\\\").replace("'", "\\'")
    query = f"'{parent_id}' in parents and trashed=false and title='{escaped_title}'"
    matches = drive.ListFile({'q': query}).GetList()
    if not matches:
        print(f"No file found with title: {file_title}")
        return None
    pgn_text = matches[0].GetContentString(encoding='UTF-8')
    return pgn_text

def save_item_to_drive(data, parent_id, file_title, drive):
    """Pickle ``data`` and upload it as a new Drive file.

    Args:
        data: Any picklable object.
        parent_id: ID of the Drive folder that will contain the file.
        file_title: Title given to the uploaded file.
        drive: Drive client providing ``CreateFile``.

    Prints the title and ID of the uploaded file once the upload returns.
    """
    payload = io.BytesIO(pickle.dumps(data))
    drive_file = drive.CreateFile({'title': file_title, 'parents': [{'id': parent_id}]})
    # The in-memory pickle is attached as the file body before uploading.
    drive_file.content = payload
    drive_file.Upload()
    print(f"Uploaded {file_title} to Google Drive with ID: {drive_file['id']}")

In [None]:
%%time
chess_games_loaded = False
asset_folder_id = find_folder_id(asset_dir)
cached_urls_file = file_name.split('.')[0] + f'_urls_list_{fold}.pkl'
cached_ratings_file = file_name.split('.')[0] + f'_ratings_list_{fold}.pkl'
cached_games_file = file_name.split('.')[0] + f'_game_arrays_{fold}.pkl'

if asset_folder_id is None:
    print("Asset folder not found.")
else:
    pgns_text = read_pgn_file_from_drive(file_name, asset_folder_id)
    pgns = load_pgns_from_text(pgns_text, num_games=max_games)
    if pgns is not None:
        print("pgn file loaded successfully.")

In [None]:
def chess_games_to_arrays(games_generator):
    """Convert games into per-game feature matrices plus rating groups and URLs.

    Args:
        games_generator: Iterable of game objects exposing ``white_elo``,
            ``black_elo``, ``url``, ``total_ply`` and ``moves``. Each move is
            read through ``getattr`` for every name in ``attributes``; an
            attribute a move does not have is recorded as ``None``.

    Returns:
        A tuple ``(game_arrays, ratings_list, urls_list)`` with one entry per game:
        ``game_arrays`` holds a float numpy array with one row per move,
        ``ratings_list`` holds ``[white_group, black_group]`` and
        ``urls_list`` holds ``game.url``.

    Notes:
        Relies on the notebook-level ``max_games`` for the progress-bar total.
        NOTE(review): games with no moves are not handled specially — confirm
        they cannot reach this function.
    """
    def rating_to_group(rating):
        """Bucket a rating: below 800 -> 0, then 200-point bands, 2400 and above -> 9."""
        rating = int(rating)
        if rating < 800:
            return 0
        if rating >= 2400:
            return 9
        return rating // 200 - 3

    def shifted_sets(column, periods):
        """Shift ``column`` down by ``periods`` rows; every entry that is not a set becomes an empty set.

        A real ``set()`` is used as the filler (not ``{}``, which is a dict) so
        that the ``|`` unions below always combine two sets.
        """
        return column.shift(periods).apply(lambda value: value if isinstance(value, set) else set())

    attributes = ["ply_count", "time_category", "classification_name", "count_legal_moves", "force_moves_percent",
                  "game_state", "distance", "is_endgame", "has_increment", "in_time_trouble", "can_dirty_flag",
                  "is_check", "is_double_check", "is_discovered_check", "is_capture", "is_threat", "is_developing",
                  "is_retreating", "was_hanging", "is_hanging", "was_true_hanging", "is_true_hanging", "is_create_tension",
                  "is_resolve_tension", "is_maintain_tension", "start_square", "end_square", "threats",
                  "create_tension", "maintain_tension", "resolve_tension", "piece_value"]

    # (new column, source column, value that maps to 1); the order here fixes the output column order.
    one_hot_specs = [
        ('moved_piece_king', 'piece_value', 6),
        ('moved_piece_queen', 'piece_value', 5),
        ('moved_piece_rook', 'piece_value', 4),
        ('moved_piece_bishop', 'piece_value', 3),
        ('moved_piece_knight', 'piece_value', 2),
        ('moved_piece_pawn', 'piece_value', 1),
        ('time_category_instant', 'time_category', 'instant'),
        ('time_category_fast', 'time_category', 'fast'),
        ('time_category_normal', 'time_category', 'normal'),
        ('time_category_slow', 'time_category', 'slow'),
        ('classification_name_Great', 'classification_name', 'Great'),
        ('classification_name_Good', 'classification_name', 'Good'),
        ('classification_name_Inaccuracy', 'classification_name', 'Inaccuracy'),
        ('classification_name_Blunder', 'classification_name', 'Blunder'),
        ('classification_name_Mistake', 'classification_name', 'Mistake'),
    ]

    game_arrays = []
    ratings_list = []
    urls_list = []
    # NOTE(review): `total` only sizes the progress bar; confirm 0.8 * max_games is the intended estimate.
    for game in tqdm(games_generator, total=int(max_games*0.8), desc="Processing games"):
        elo_w, elo_b, url = game.white_elo, game.black_elo, game.url
        total_plies = game.total_ply

        # Build the frame once from all moves instead of appending row by row.
        move_records = [
            {attribute: getattr(move, attribute, None) for attribute in attributes}
            for move in game.moves
        ]
        df = pd.DataFrame(move_records, columns=attributes)

        # Rescale the numeric features.
        df['ply_count'] = df['ply_count'] / total_plies
        df['count_legal_moves'] = df['count_legal_moves'] / 128
        df['distance'] = (df['distance'] - 1) / 6

        # Values from one row back ("prev_*") and two rows back ("last_move_*").
        # 64 fills the end square for rows that have no such earlier row.
        df['prev_end_square'] = df['end_square'].shift(1).fillna(64)
        df['prev_threats'] = shifted_sets(df['threats'], 1)
        df['prev_create_tension'] = shifted_sets(df['create_tension'], 1)
        df['last_move_end_square'] = df['end_square'].shift(2).fillna(64)
        df['last_move_create_tension'] = shifted_sets(df['create_tension'], 2)
        df['last_move_threats'] = shifted_sets(df['threats'], 2)

        df['is_reacting'] = df.apply(lambda row: row['prev_end_square'] in (row['create_tension'] | row['threats']), axis=1) | \
                            (df['prev_end_square'] == df['end_square']) | \
                            df.apply(lambda row: row['start_square'] in row['prev_threats'], axis=1)
        df['is_same_piece'] = df['last_move_end_square'] == df['start_square']
        df['veni_vidi_vici'] = df.apply(lambda row: row['end_square'] in (row['last_move_create_tension'] | row['last_move_threats']), axis=1)
        df['is_collinear'] = df.apply(lambda row: row['start_square'] in (row['prev_create_tension'] | row['prev_threats']), axis=1) | \
                            df.apply(lambda row: row['prev_end_square'] in row['create_tension'], axis=1)

        # The helper columns and the raw set-valued columns are not part of the output.
        df = df.drop(columns=['prev_end_square', 'last_move_end_square', 'prev_threats', 'last_move_create_tension',
                              'prev_create_tension', 'last_move_threats', 'threats', 'create_tension',
                              'maintain_tension', 'resolve_tension'])

        # One-hot encode the categorical columns, then drop their sources and the raw squares.
        for new_column, source_column, target in one_hot_specs:
            df[new_column] = (df[source_column] == target).astype(int)
        df = df.drop(columns=['classification_name', 'time_category', 'piece_value', 'start_square', 'end_square'])

        game_arrays.append(df.astype(float).to_numpy())
        ratings_list.append([rating_to_group(elo_w), rating_to_group(elo_b)])
        urls_list.append(url)
    return game_arrays, ratings_list, urls_list

In [None]:
# Build this fold's dataset and upload the three resulting lists to the Drive asset folder.
# NOTE(review): chess_games_loaded is only ever assigned False in the visible cells, so this branch always runs.
if not chess_games_loaded:
    # Lazily wrap the parsed games whose index i satisfies i % 5 == fold in ChessGame objects.
    games_generator = (ChessGame(pgn) for i, pgn in enumerate(pgns) if i % 5 == fold)
    game_arrays, ratings_list, urls_list  = chess_games_to_arrays(games_generator)
    
    # Each list is pickled and uploaded under its fold-specific cache-file name.
    save_item_to_drive(game_arrays, asset_folder_id, cached_games_file, drive)
    save_item_to_drive(ratings_list, asset_folder_id, cached_ratings_file, drive)
    save_item_to_drive(urls_list, asset_folder_id, cached_urls_file, drive)

In [None]:
# Collapse each game's [white_group, black_group] pair into a single label (floor of the mean).
# Entries that are no longer pairs are kept as they are, so re-running this cell
# does not fail after the list has already been collapsed.
ratings_list = [
    (entry[0] + entry[1]) // 2 if isinstance(entry, (list, tuple)) else entry
    for entry in ratings_list
]

In [None]:
# Show how many entries each of the three parallel lists holds.
tuple(len(collection) for collection in (game_arrays, ratings_list, urls_list))