In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import io
import pickle
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Move the working directory up one level; later cells resolve "mycreds.txt" and
# the asset directory relative to the current working directory.
# Not idempotent: re-running this cell moves up one more level each time.
os.chdir(os.pardir)

In [3]:
# Authenticate with Google Drive through PyDrive, reusing credentials saved in
# "mycreds.txt" when they exist, and expose the resulting client as `drive`.
gauth = GoogleAuth()
# NOTE(review): the OAuth client file name is hard-coded and written into
# DEFAULT_SETTINGS (presumably shared defaults rather than per-instance state) —
# consider moving it to configuration / an environment variable.
gauth.DEFAULT_SETTINGS['client_config_file'] = 'client_secret_1057507276332-5mk9ac9q22rsmtm1idlqvpraq08ar8p5.apps.googleusercontent.com.json'
gauth.LoadCredentialsFile("mycreds.txt")
if gauth.credentials is None:
    # Nothing was loaded from the credentials file: start the local web-server auth flow.
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Saved credentials exist but the access token has expired: refresh it.
    gauth.Refresh()
else:
    # Saved credentials are still usable: authorize with them directly.
    gauth.Authorize()

# Write the credentials back so the next run can reuse them.
gauth.SaveCredentialsFile("mycreds.txt")
drive = GoogleDrive(gauth)

In [4]:
# Notebook configuration.
# NOTE(review): max_games is not referenced by any of the cells visible here.
max_games = 500_000
asset_dir = 'asset'
file_name = '2023_tc_50000_games.pgn'

# Every cache file name is the part of file_name before its first '.' plus a
# fixed suffix (one pickle each for urls, ratings and game arrays).
cached_urls_file, cached_ratings_file, cached_games_file = (
    file_name.split('.')[0] + suffix
    for suffix in ('_urls_list.pkl', '_ratings_list.pkl', '_game_arrays.pkl')
)

In [5]:
def load_item_from_file(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when ``file_path`` does not exist
        (in which case nothing is read or printed).

    Note:
        ``pickle.load`` can execute arbitrary code; only point this at files
        you trust.
    """
    if not os.path.exists(file_path):
        return None

    print('loading item from cache...')
    with open(file_path, 'rb') as cache_file:
        cached_obj = pickle.load(cache_file)
    print('loaded')
    return cached_obj

# Try the local pickle caches first. Anything left as None here is expected to be
# fetched from Google Drive by the cell that follows.
assets_path = os.path.join(os.getcwd(), asset_dir)
cached_urls_path = os.path.join(assets_path, cached_urls_file)
cached_ratings_path = os.path.join(assets_path, cached_ratings_file)
cached_games_path = os.path.join(assets_path, cached_games_file)

# Bind the names up front so later cells can always compare them against None,
# even when loading fails part-way through.
urls_list = ratings_list = game_arrays = None
try:
    urls_list = load_item_from_file(cached_urls_path)
    ratings_list = load_item_from_file(cached_ratings_path)
    game_arrays = load_item_from_file(cached_games_path)
except Exception as exc:
    # The cache is best-effort: an unreadable or corrupt pickle must not stop the
    # notebook. Drop any partially loaded data so the three lists stay in sync.
    urls_list = ratings_list = game_arrays = None
    print(f'Could not load cached files ({exc!r}), reading from drive...')
else:
    # load_item_from_file returns None (without raising) for a missing file.
    if urls_list is None or ratings_list is None or game_arrays is None:
        print('Files not on disk, reading from drive...')

loading item from cache...
loaded
loading item from cache...
loaded
loading item from cache...
loaded


In [6]:
def find_folder_id(folder_name):
    """Return the Google Drive ID of the folder titled ``folder_name``.

    Uses the module-level ``drive`` client to list non-trashed folders whose
    title matches ``folder_name`` and returns the ID of the first listed entry
    whose title is exactly equal to it.

    Args:
        folder_name: Title of the folder to look for. It is inserted into the
            query string as-is (no escaping).

    Returns:
        The folder ID, or ``None`` when no listed entry has that exact title.
    """
    folder_query = f"title='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
    candidates = drive.ListFile({'q': folder_query}).GetList()
    return next(
        (entry['id'] for entry in candidates if entry['title'] == folder_name),
        None,
    )

def read_pkl_file_from_drive(file_title, parent_id):
    """Fetch a pickled file from a Google Drive folder and return the unpickled object.

    Uses the module-level ``drive`` client to list non-trashed files titled
    ``file_title`` inside the folder ``parent_id`` and unpickles the first one.

    Args:
        file_title: Title of the file to fetch (inserted into the query as-is).
        parent_id: Drive ID of the folder to search in.

    Returns:
        The unpickled object, or ``None`` (after printing a message) when the
        listing is empty.

    Note:
        Unpickling can execute arbitrary code; only use this on Drive content
        you trust.
    """
    matches = drive.ListFile(
        {'q': f"'{parent_id}' in parents and trashed=false and title='{file_title}'"}
    ).GetList()
    if not matches:
        print(f"No file found with title: {file_title}")
        return None

    # The content is fetched as text and re-encoded with the same codec to get
    # the raw bytes back; this relies on cp437 being a one-to-one byte<->character
    # mapping so the round trip is lossless.
    text_payload = matches[0].GetContentString(encoding='cp437')
    raw_bytes = text_payload.encode('cp437')
    return pickle.loads(raw_bytes)

# Fall back to Google Drive when the local cache did not provide all three items.
asset_folder_id = find_folder_id(asset_dir)
if asset_folder_id is None:
    print("Asset folder not found.")
elif urls_list is None or ratings_list is None or game_arrays is None:
    # Check every item, not just game_arrays: a partially populated cache would
    # otherwise leave urls_list / ratings_list as None. All three are re-read
    # from Drive together so they come from the same source.
    file_titles = {
        'urls_list': cached_urls_file,
        'ratings_list': cached_ratings_file,
        'game_arrays': cached_games_file,
    }

    urls_list = read_pkl_file_from_drive(file_titles['urls_list'], asset_folder_id)
    ratings_list = read_pkl_file_from_drive(file_titles['ratings_list'], asset_folder_id)
    game_arrays = read_pkl_file_from_drive(file_titles['game_arrays'], asset_folder_id)

# Report what is available, whether it came from the cache or from Drive.
if urls_list is not None:
    print("URLs list loaded successfully.")
if ratings_list is not None:
    print("Ratings list loaded successfully.")
if game_arrays is not None:
    print("Game arrays loaded successfully.")

##2m 26.4s

URLs list loaded successfully.
Ratings list loaded successfully.
Game arrays loaded successfully.


In [7]:
def train_test_split(game_arrays, ratings_list, urls_list, fold_number=0, n_folds=5):
    """Split games into train/test sets using a deterministic interleaved fold.

    Item ``i`` belongs to fold ``i % n_folds``; the fold ``fold_number`` becomes
    the test set and the remaining folds, concatenated in fold order, become
    the training set. Each game matrix is reduced to one vector by averaging
    over its first axis.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook.

    Args:
        game_arrays: Sequence of per-game arrays accepted by ``np.mean(..., axis=0)``.
        ratings_list: Sequence of targets, aligned with ``game_arrays``.
        urls_list: Sequence of game URLs, aligned with ``game_arrays``.
        fold_number: Index of the fold held out for testing (``0 <= fold_number < n_folds``).
        n_folds: Total number of folds; the default of 5 reproduces the original behaviour.

    Returns:
        ``(X_train, X_test, y_train, y_test, train_urls, test_urls)``.

    Raises:
        ValueError: If ``n_folds`` is below 2, ``fold_number`` is out of range,
            or the three inputs differ in length.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")
    if fold_number < 0 or fold_number > n_folds - 1:
        raise ValueError(f"fold_number must be between 0 and {n_folds - 1}")
    # Mismatched lengths would silently misalign features, targets and urls.
    if not (len(game_arrays) == len(ratings_list) == len(urls_list)):
        raise ValueError(
            "game_arrays, ratings_list and urls_list must have the same length "
            f"(got {len(game_arrays)}, {len(ratings_list)}, {len(urls_list)})"
        )

    game_vecs = [np.mean(matrix, axis=0) for matrix in game_arrays]
    train_folds = [i for i in range(n_folds) if i != fold_number]

    def _gather_train(seq):
        # Concatenate the training folds in ascending fold order.
        return [item for i in train_folds for item in seq[i::n_folds]]

    X_test = game_vecs[fold_number::n_folds]
    X_train = _gather_train(game_vecs)
    y_test = ratings_list[fold_number::n_folds]
    y_train = _gather_train(ratings_list)
    test_urls = urls_list[fold_number::n_folds]
    train_urls = _gather_train(urls_list)

    return X_train, X_test, y_train, y_test, train_urls, test_urls

In [8]:
# Hold out fold 0 (the function's default) as the test set.
(X_train, X_test,
 y_train, y_test,
 train_urls, test_urls) = train_test_split(game_arrays, ratings_list, urls_list, fold_number=0)

In [11]:
# Baseline classifier: Bernoulli naive Bayes on the per-game feature vectors.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0); confirm
# that is intended for averaged per-game features.
clf_nb = BernoulliNB().fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)
# accuracies[k] = number of test items whose prediction is within k of the true rating.
accuracies = [sum(abs(p - a) <= k for p, a in zip(y_pred_nb, y_test)) for k in range(10)]
# Normalise by the actual test-set size rather than a hard-coded 20000, so the
# fractions stay correct for any dataset size or fold.
[x / len(y_test) for x in accuracies]

[0.15415,
 0.40405,
 0.58295,
 0.71775,
 0.81835,
 0.89065,
 0.93495,
 0.9686,
 0.99285,
 1.0]

In [12]:
%%time
# Random forest on the same split; random_state is fixed for reproducibility and
# n_jobs=-1 uses all available cores.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
# accuracies[k] = number of test items whose prediction is within k of the true rating.
accuracies = [sum(abs(p - a) <= k for p, a in zip(y_pred_rf, y_test)) for k in range(10)]
# Normalise by the actual test-set size rather than a hard-coded 20000, so the
# fractions stay correct for any dataset size or fold.
[x / len(y_test) for x in accuracies]

#CPU times: total: 10min 5s
#Wall time: 17min 19s

CPU times: total: 2min 48s
Wall time: 29.6 s


[0.23465,
 0.57625,
 0.7719,
 0.8853,
 0.94775,
 0.9776,
 0.99195,
 0.99755,
 0.99975,
 1.0]

In [13]:
# Rank the random forest's feature importances and show the 20 largest.
feature_importances = clf_rf.feature_importances_
# NOTE(review): these names are assumed to be listed in the same order as the
# columns of the training vectors — confirm against the feature-extraction code.
features = ['ply_count', 'count_legal_moves', 'force_moves_percent', 'game_state',
            'distance', 'is_endgame', 'has_increment', 'in_time_trouble',
            'can_dirty_flag', 'is_check', 'is_double_check', 'is_discovered_check',
            'is_capture', 'is_threat', 'is_developing', 'is_retreating',
            'was_hanging', 'is_hanging', 'was_true_hanging', 'is_true_hanging',
            'is_create_tension', 'is_resolve_tension', 'is_maintain_tension',
            'is_reacting', 'is_same_piece', 'veni_vidi_vici', 'is_collinear',
            'moved_piece_king', 'moved_piece_queen', 'moved_piece_rook',
            'moved_piece_bishop', 'moved_piece_knight', 'moved_piece_pawn',
            'time_category_instant', 'time_category_fast', 'time_category_normal',
            'time_category_slow', 'classification_name_Great',
            'classification_name_Good', 'classification_name_Inaccuracy',
            'classification_name_Blunder', 'classification_name_Mistake']

# zip() stops at the shorter input, so a length mismatch would silently drop
# entries; fail loudly instead.
if len(features) != len(feature_importances):
    raise ValueError(
        f"Expected {len(features)} feature importances, got {len(feature_importances)}"
    )

# Materialise the pairs so the name stays reusable after sorting (a bare zip
# object would be left exhausted).
feature_importance_pairs = list(zip(features, feature_importances))
sorted_feature_importances = sorted(feature_importance_pairs, key=lambda x: x[1], reverse=True)
top_features = sorted_feature_importances[:20]
top_features

[('classification_name_Blunder', 0.035824666517166266),
 ('classification_name_Great', 0.03417247444188693),
 ('time_category_instant', 0.032114738292212563),
 ('time_category_normal', 0.0317092673939854),
 ('game_state', 0.031688642263981874),
 ('count_legal_moves', 0.030781828553123083),
 ('is_capture', 0.029654483636490585),
 ('veni_vidi_vici', 0.02931147456728802),
 ('is_same_piece', 0.029304778757139778),
 ('is_create_tension', 0.02834975587061691),
 ('is_reacting', 0.028308992234884756),
 ('time_category_fast', 0.02809154314663366),
 ('classification_name_Good', 0.02796288767813461),
 ('classification_name_Mistake', 0.027951748339499108),
 ('is_maintain_tension', 0.027598705633058798),
 ('force_moves_percent', 0.02739179654453292),
 ('is_threat', 0.027043384395000453),
 ('is_check', 0.02649315289929562),
 ('is_true_hanging', 0.02640261354632915),
 ('distance', 0.026361074704690634)]