In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import io
import pickle
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from itertools import combinations, chain
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter

In [2]:
# Local files used by the OAuth flow: the client configuration and the
# cached user credentials that are re-used between runs.
CLIENT_CONFIG_FILE = 'client_secret_1057507276332-5mk9ac9q22rsmtm1idlqvpraq08ar8p5.apps.googleusercontent.com.json'
CREDENTIALS_CACHE_FILE = "mycreds.txt"

gauth = GoogleAuth()
gauth.DEFAULT_SETTINGS['client_config_file'] = CLIENT_CONFIG_FILE
gauth.LoadCredentialsFile(CREDENTIALS_CACHE_FILE)

# Three cases: nothing cached -> run the local-webserver auth flow;
# cached but expired -> refresh; cached and still valid -> authorize as is.
if gauth.credentials is None:
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    gauth.Refresh()
else:
    gauth.Authorize()

# Write the (possibly new or refreshed) credentials back to the cache file.
gauth.SaveCredentialsFile(CREDENTIALS_CACHE_FILE)
drive = GoogleDrive(gauth)

In [3]:
# Dataset selection.
max_games = 500000
asset_dir = 'asset'
file_name = '2023_tc_50000_games.pgn'

# Every cached pickle shares the part of `file_name` before its first dot.
_cache_stem = file_name.split('.')[0]
cached_urls_file = f"{_cache_stem}_urls_list.pkl"
cached_ratings_file = f"{_cache_stem}_ratings_list.pkl"
cached_games_file = f"{_cache_stem}_game_arrays.pkl"

In [4]:
def find_folder_id(folder_name):
    """Find and return the Google Drive folder ID for a given folder name.

    Args:
        folder_name: Title of the folder to look up.

    Returns:
        The ``id`` of the first non-trashed folder whose title equals
        ``folder_name`` exactly, or ``None`` if no such folder is listed.
    """
    # The name is embedded in a single-quoted query literal, so backslashes
    # and single quotes must be escaped or the query becomes malformed.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"title='{escaped_name}' "
        "and mimeType='application/vnd.google-apps.folder' "
        "and trashed=false"
    )
    for folder in drive.ListFile({'q': query}).GetList():
        # Re-check the title locally so only an exact match is returned.
        if folder['title'] == folder_name:
            return folder['id']
    return None

def read_pkl_file_from_drive(file_title, parent_id):
    """Read a .pkl file directly from Google Drive into a Python object.

    Args:
        file_title: Title of the pickle file to load.
        parent_id: Drive ID of the folder that must contain the file.

    Returns:
        The unpickled object, or ``None`` (after printing a message) when no
        non-trashed file with that title exists under ``parent_id``. If
        several files match, the first one listed is used.
    """
    # The title is embedded in a single-quoted query literal, so backslashes
    # and single quotes must be escaped or the query becomes malformed.
    escaped_title = file_title.replace("\\", "\\\\").replace("'", "\\'")
    query = f"'{parent_id}' in parents and trashed=false and title='{escaped_title}'"
    file_list = drive.ListFile({'q': query}).GetList()
    if not file_list:
        print(f"No file found with title: {file_title}")
        return None
    file = file_list[0]
    # The content is fetched as text; decoding and re-encoding with the same
    # single-byte codec (cp437) is used here to get the raw bytes back.
    file_content = file.GetContentString(encoding='cp437')
    buffer = io.BytesIO(file_content.encode('cp437'))
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only use this on Drive files whose contents you trust.
    return pickle.load(buffer)

# Locate the Drive folder holding the cached pickles and load all three.
asset_folder_id = find_folder_id(asset_dir)
if asset_folder_id is None:
    # Fail fast: without the folder none of urls_list / ratings_list /
    # game_arrays would be defined and later cells would die with a NameError.
    raise FileNotFoundError(f"Asset folder not found: {asset_dir!r}")

file_titles = {
    'urls_list': cached_urls_file,
    'ratings_list': cached_ratings_file,
    'game_arrays': cached_games_file,
}

urls_list = read_pkl_file_from_drive(file_titles['urls_list'], asset_folder_id)
ratings_list = read_pkl_file_from_drive(file_titles['ratings_list'], asset_folder_id)
game_arrays = read_pkl_file_from_drive(file_titles['game_arrays'], asset_folder_id)

if urls_list is not None:
    print("URLs list loaded successfully.")
if ratings_list is not None:
    print("Ratings list loaded successfully.")
if game_arrays is not None:
    print("Game arrays loaded successfully.")

# read_pkl_file_from_drive returns None for a missing file; stop here with a
# clear message instead of letting a None propagate into training.
_loaded_assets = {
    'urls_list': urls_list,
    'ratings_list': ratings_list,
    'game_arrays': game_arrays,
}
_missing_titles = [file_titles[key] for key, value in _loaded_assets.items() if value is None]
if _missing_titles:
    raise FileNotFoundError(
        f"Missing cached file(s) in Drive folder {asset_dir!r}: {_missing_titles}"
    )

URLs list loaded successfully.
Ratings list loaded successfully.
Game arrays loaded successfully.


In [5]:
class RNN(nn.Module):
    """LSTM encoder followed by a 7-layer MLP with two output heads.

    The last time step of the LSTM output is passed through seven
    ``hidden_size -> hidden_size`` linear layers (ReLU + dropout after each)
    and then into two linear heads: one producing ``num_classes`` logits and
    one producing a single regression value.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM state and of every hidden linear layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of outputs of the classification head.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_rate=0):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Kept as individual attributes fc1..fc7 so state_dict keys stay the
        # same for checkpoints saved from this class.
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.fc7 = nn.Linear(hidden_size, hidden_size)
        self.fc_classification = nn.Linear(hidden_size, num_classes)
        self.fc_regression = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run the network on a batch-first input.

        Args:
            x: Tensor indexed as (batch, time, feature), matching
                ``batch_first=True`` on the LSTM.

        Returns:
            Tuple ``(classification_output, regression_output)`` with shapes
            ``(batch, num_classes)`` and ``(batch, 1)``.
        """
        # With no initial state given, nn.LSTM starts from zero hidden and
        # cell states on the input's own device. This replaces explicit zero
        # tensors that were moved to a global `device` variable.
        out, _ = self.lstm(x)
        # Keep only the output of the final time step.
        out = out[:, -1, :]

        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4,
                         self.fc5, self.fc6, self.fc7)
        for layer in hidden_layers:
            out = self.dropout(F.relu(layer(out)))

        classification_output = self.fc_classification(out)
        regression_output = self.fc_regression(out)
        return classification_output, regression_output

In [6]:
def combined_loss(classification_output, regression_output, target, alpha=0.5):
    """Weighted sum of a cross-entropy loss and an MSE loss on the same target.

    Args:
        classification_output: Logits of shape ``(batch, num_classes)``.
        regression_output: Regression head output of shape ``(batch, 1)``.
        target: Integer class labels of shape ``(batch,)``; the same labels,
            cast to float, are the regression target.
        alpha: Weight of the classification term; the regression term is
            weighted by ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * CE + (1 - alpha) * MSE``.
    """
    classification_loss = F.cross_entropy(classification_output, target)
    regression_target = target.float()
    # Drop only the trailing size-1 dim. A bare .squeeze() would also remove
    # the batch dim when batch == 1, giving a 0-d tensor that no longer
    # matches the (1,)-shaped target and triggers a size-mismatch warning.
    regression_prediction = regression_output.squeeze(-1)
    regression_loss = F.mse_loss(regression_prediction, regression_target)
    return alpha * classification_loss + (1 - alpha) * regression_loss

def train_model(model, train_loader, test_loader, optimizer, num_epochs, device, alpha=0.5):
    """Train ``model`` and record test-set closeness after every epoch.

    Args:
        model: Network returning ``(classification_output, regression_output)``.
        train_loader: Iterable of ``(moves, labels)`` training batches.
        test_loader: Iterable of ``(moves, labels)`` batches evaluated after
            each epoch via ``test_model``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        alpha: Classification weight forwarded to ``combined_loss``.

    Returns:
        A list with one entry per epoch. Each entry is a list of 10 floats,
        where entry ``k`` is the fraction of test samples whose predicted
        label is within ``k`` of the true label.
    """
    torets = []
    for _ in range(num_epochs):
        model.train()
        for moves, labels in train_loader:
            moves = moves.to(device)
            labels = labels.to(device)

            classification_output, regression_output = model(moves)
            loss = combined_loss(classification_output, regression_output, labels, alpha)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        _, predicted_labels, actual_labels = test_model(model, test_loader, device)
        # Normalise by the number of samples actually evaluated rather than a
        # hard-coded test-set size, so the fractions hold for any dataset.
        n_eval = len(actual_labels)
        abs_err = np.abs(np.asarray(predicted_labels) - np.asarray(actual_labels))
        toret = [
            (float(np.count_nonzero(abs_err <= k)) / n_eval) if n_eval else 0.0
            for k in range(10)
        ]
        torets.append(toret)
    return torets

def test_model(model, test_loader, device):
    model.eval()
    n_correct = 0
    n_samples = 0
    predicted_probs = []
    predicted_labels = []
    actual_labels = []
    with torch.no_grad():
        for moves, labels in test_loader:
            moves = moves.to(device)
            labels = labels.to(device)
            classification_output, _ = model(moves)
            probabilities = F.softmax(classification_output, dim=1)

            _, predicted = torch.max(classification_output.data, 1)
            predicted_probs.extend(probabilities.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    #print(f'Accuracy of the network on the test moves: {acc} %')
    return predicted_probs, predicted_labels, actual_labels

In [7]:
def pad_game(game, max_length=256, vector_size=42):
    """Force a game to exactly ``max_length`` rows.

    Args:
        game: 2-D array-like with one row per move.
        max_length: Number of rows in the result.
        vector_size: Number of columns in the padding rows; it has to match
            the width of ``game`` for the stacking to succeed.

    Returns:
        ``game[:max_length]`` when the game is longer than ``max_length``,
        otherwise ``game`` with rows of ``-1`` appended at the bottom.
    """
    n_moves = len(game)
    if n_moves > max_length:
        return game[:max_length]

    filler = np.full((max_length - n_moves, vector_size), -1)
    return np.vstack((game, filler))

In [8]:
def get_loaders(padded_games, ratings_list, urls_list, batch_size, fold_number=0):
    """Build train/test DataLoaders for one fold of a strided 5-fold split.

    Every fifth item starting at ``fold_number`` forms the test set. The
    training set is the remaining four strided slices, concatenated in
    increasing offset order.

    Args:
        padded_games: Sequence of equally shaped per-game arrays.
        ratings_list: Integer label per game, aligned with ``padded_games``.
        urls_list: Identifier per game, aligned with ``padded_games``.
        batch_size: Batch size used by both loaders.
        fold_number: Which fold (0-4) is held out for testing.

    Returns:
        Tuple ``(train_loader, test_loader, train_urls, test_urls)``. The
        training loader shuffles; the test loader does not.

    Raises:
        ValueError: If ``fold_number`` is outside 0-4.
    """
    if fold_number > 4 or fold_number < 0:
        raise ValueError("fold_number must be between 0 and 4")

    def _split(items):
        # Returns (items kept for training, items held out for testing).
        held_out = items[fold_number::5]
        kept = []
        for offset in range(5):
            if offset == fold_number:
                continue
            kept.extend(items[offset::5])
        return kept, held_out

    train_games, test_games = _split(padded_games)
    train_ratings, test_ratings = _split(ratings_list)
    train_urls, test_urls = _split(urls_list)

    train_features = torch.stack([torch.FloatTensor(game) for game in train_games])
    test_features = torch.stack([torch.FloatTensor(game) for game in test_games])

    train_dataset = TensorDataset(train_features, torch.LongTensor(train_ratings))
    test_dataset = TensorDataset(test_features, torch.LongTensor(test_ratings))

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, train_urls, test_urls

In [9]:
# --- Model architecture ---
input_size = 42          # features per move in the full feature set
hidden_size = 100
num_layers = 2
num_classes = 10
dropout_rate = 0.25

# --- Sequence length and batching ---
sequence_length = 80     # passed to pad_game as the fixed number of rows
batch_size = 80

# --- Optimisation ---
num_epochs = 18
learning_rate = 1e-3
decay = 6.4e-6           # passed to Adam as weight_decay
alpha = 0.8              # weight of the classification term in combined_loss

torch.manual_seed(64)

<torch._C.Generator at 0x152596879310>

In [10]:
# Column indices of each feature family within the per-move vectors.
# Together the four lists cover indices 0-41 exactly once.
piece_indices = [0, 1, 2, 4, 5, 9, 10, 11, 12, 14, 15, 27, 28, 29, 30, 31, 32] #17
domain_indices = [13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] #12
time_indices = [6, 7, 8, 33, 34, 35, 36] #7
engine_indices = [3, 37, 38, 39, 40, 41] #6
type_inds = [piece_indices, domain_indices, time_indices, engine_indices]
ind_names = ['piece_indices', 'domain_indices', 'time_indices', 'engine_indices']

# Loop-invariant setup, hoisted out of the ablation loop.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = file_name.split('.')[0] + '_pred.pth'

# Train and evaluate one model for every non-empty subset of feature families.
for r in range(1, len(type_inds) + 1):
    for combo in combinations(enumerate(type_inds), r):
        keep_indices = list(chain(*[c[1] for c in combo]))
        names = [ind_names[c[0]] for c in combo]

        # Restrict every game to the selected columns, then pad/truncate.
        input_size = len(keep_indices)
        game_arrays_trunc = [arr[:, keep_indices] for arr in game_arrays]
        padded_games = [pad_game(g, sequence_length, input_size) for g in game_arrays_trunc]

        train_loader, test_loader, train_urls, test_urls = get_loaders(padded_games, ratings_list, urls_list, batch_size)
        # NOTE(review): `dropout_rate` from the configuration cell is not
        # passed here, so the model uses the constructor default (0).
        # Confirm whether that is intended before changing it, since the
        # recorded results below were produced this way.
        model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay)
        num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f'Model: {names}, Parameters: {num_params:,}')

        lists = train_model(model, train_loader, test_loader, optimizer, num_epochs, device, alpha)
        predicted_probs, predicted_labels, actual_labels = test_model(model, test_loader, device)

        actual_arr = np.asarray(actual_labels)
        predicted_arr = np.asarray(predicted_labels)
        n_test = len(actual_arr)

        # Fraction of test games predicted within k classes of the truth,
        # normalised by the real test-set size instead of a hard-coded count.
        abs_err = np.abs(predicted_arr - actual_arr)
        pred_closeness = [int(np.count_nonzero(abs_err <= k)) for k in range(num_classes)]
        print([(x / n_test) if n_test else 0.0 for x in pred_closeness])

        truths_counter = Counter(actual_labels)
        preds_counter = Counter(predicted_labels)
        actual_df = pd.DataFrame(list(truths_counter.items()), columns=['Label', 'Actual Count'])
        predicted_df = pd.DataFrame(list(preds_counter.items()), columns=['Label', 'Predicted Count'])
        merged_df = pd.merge(actual_df, predicted_df, on='Label', how='outer').fillna(0)
        # Force one row per class, so the per-class columns added below
        # always line up even if a label never occurs in truth or prediction.
        sorted_df = (
            merged_df.set_index('Label')
            .sort_index()
            .reindex(pd.Index(range(num_classes), name='Label'), fill_value=0)
        )

        corrects_list = []
        recall_list = []
        for i in range(num_classes):
            is_class = actual_arr == i
            n_actual = int(np.count_nonzero(is_class))
            n_correct = int(np.count_nonzero(predicted_arr[is_class] == i))
            corrects_list.append(n_correct)
            # Recall is undefined for a class with no test samples; report
            # NaN instead of raising ZeroDivisionError.
            recall_list.append(n_correct / n_actual if n_actual else float('nan'))

        sorted_df['Correct Predictions'] = corrects_list
        sorted_df['Identify Rate'] = pd.Series(recall_list, index=sorted_df.index).round(3)
        display(sorted_df.T)
        print()

Model: ['piece_indices'], Parameters: 200,211
[0.2435, 0.61335, 0.81845, 0.9253, 0.97405, 0.99115, 0.99715, 0.99935, 0.99995, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1580.0,2421.0,2661.0,2506.0,0.0,4295.0,2202.0,1365.0,2318.0,652.0
Correct Predictions,672.0,704.0,564.0,484.0,0.0,781.0,402.0,288.0,681.0,294.0
Identify Rate,0.414,0.328,0.27,0.237,0.0,0.369,0.186,0.136,0.331,0.181



Model: ['domain_indices'], Parameters: 198,211
[0.2278, 0.5759, 0.784, 0.89575, 0.96075, 0.9853, 0.99445, 0.99915, 0.99985, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,777.0,3121.0,3082.0,0.0,1662.0,5144.0,0.0,3661.0,1272.0,1281.0
Correct Predictions,414.0,919.0,624.0,0.0,271.0,842.0,0.0,716.0,349.0,421.0
Identify Rate,0.255,0.428,0.298,0.0,0.134,0.398,0.0,0.339,0.169,0.26



Model: ['time_indices'], Parameters: 196,211
[0.19535, 0.49265, 0.68185, 0.82715, 0.9053, 0.95475, 0.97985, 0.993, 0.99965, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1427.0,5015.0,0.0,1728.0,2834.0,0.0,4534.0,0.0,2596.0,1866.0
Correct Predictions,488.0,1015.0,0.0,254.0,376.0,0.0,719.0,0.0,481.0,574.0
Identify Rate,0.3,0.472,0.0,0.124,0.186,0.0,0.333,0.0,0.234,0.354



Model: ['engine_indices'], Parameters: 195,811
[0.22555, 0.54305, 0.7505, 0.8721, 0.9413, 0.9783, 0.9945, 0.99905, 0.9999, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,2415.0,2521.0,2725.0,0.0,3380.0,0.0,1084.0,3024.0,2177.0,2674.0
Correct Predictions,839.0,611.0,477.0,0.0,552.0,0.0,173.0,531.0,463.0,865.0
Identify Rate,0.516,0.284,0.228,0.0,0.272,0.0,0.08,0.251,0.225,0.534



Model: ['piece_indices', 'domain_indices'], Parameters: 205,011
[0.2672, 0.6466, 0.85405, 0.94325, 0.98, 0.99315, 0.9982, 0.99965, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1346.0,2702.0,2079.0,1833.0,1468.0,3579.0,595.0,4065.0,1096.0,1237.0
Correct Predictions,706.0,864.0,506.0,367.0,298.0,745.0,116.0,912.0,333.0,497.0
Identify Rate,0.434,0.402,0.242,0.18,0.147,0.352,0.054,0.431,0.162,0.307



Model: ['piece_indices', 'time_indices'], Parameters: 203,011
[0.27345, 0.6758, 0.88045, 0.9575, 0.98925, 0.9962, 0.9989, 1.0, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1227.0,2001.0,2324.0,2776.0,1321.0,4305.0,299.0,3445.0,1512.0,790.0
Correct Predictions,650.0,698.0,602.0,612.0,289.0,866.0,62.0,828.0,453.0,409.0
Identify Rate,0.4,0.325,0.288,0.3,0.143,0.409,0.029,0.392,0.22,0.252



Model: ['piece_indices', 'engine_indices'], Parameters: 202,611
[0.27605, 0.65585, 0.86, 0.9532, 0.98685, 0.99645, 0.99875, 0.9997, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1303.0,3618.0,1374.0,497.0,3688.0,2071.0,1909.0,1539.0,927.0,3074.0
Correct Predictions,643.0,1090.0,330.0,127.0,737.0,452.0,438.0,372.0,237.0,1095.0
Identify Rate,0.396,0.507,0.158,0.062,0.364,0.214,0.203,0.176,0.115,0.676



Model: ['domain_indices', 'time_indices'], Parameters: 201,011
[0.27235, 0.64675, 0.83145, 0.92725, 0.9693, 0.98835, 0.99655, 0.99965, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1252.0,3055.0,3298.0,1503.0,1731.0,990.0,2553.0,811.0,3263.0,1544.0
Correct Predictions,691.0,979.0,745.0,284.0,356.0,190.0,555.0,180.0,831.0,636.0
Identify Rate,0.425,0.456,0.356,0.139,0.176,0.09,0.257,0.085,0.404,0.393



Model: ['domain_indices', 'engine_indices'], Parameters: 200,611
[0.2735, 0.64675, 0.8457, 0.94235, 0.98045, 0.99385, 0.99835, 0.9998, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1813.0,2032.0,3670.0,0.0,2975.0,232.0,3167.0,1801.0,1926.0,2384.0
Correct Predictions,850.0,660.0,827.0,0.0,595.0,52.0,645.0,435.0,505.0,901.0
Identify Rate,0.523,0.307,0.396,0.0,0.294,0.025,0.299,0.206,0.245,0.556



Model: ['time_indices', 'engine_indices'], Parameters: 198,611
[0.25265, 0.63195, 0.8255, 0.928, 0.9731, 0.9905, 0.99755, 0.9997, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,916.0,3033.0,3716.0,0.0,2419.0,2035.0,2408.0,739.0,3642.0,1092.0
Correct Predictions,468.0,937.0,763.0,0.0,425.0,362.0,428.0,155.0,988.0,527.0
Identify Rate,0.288,0.436,0.365,0.0,0.21,0.171,0.198,0.073,0.48,0.325



Model: ['piece_indices', 'domain_indices', 'time_indices'], Parameters: 207,811
[0.30025, 0.7144, 0.8899, 0.9643, 0.9899, 0.99725, 0.99935, 0.9998, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1453.0,2705.0,2629.0,1405.0,2005.0,2992.0,2502.0,589.0,3094.0,626.0
Correct Predictions,791.0,964.0,702.0,342.0,427.0,685.0,584.0,157.0,988.0,365.0
Identify Rate,0.487,0.449,0.336,0.167,0.211,0.324,0.271,0.074,0.48,0.225



Model: ['piece_indices', 'domain_indices', 'engine_indices'], Parameters: 207,411
[0.29685, 0.6923, 0.8774, 0.9581, 0.9871, 0.99595, 0.99885, 0.9998, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,2010.0,2813.0,2646.0,579.0,1625.0,1575.0,2663.0,2143.0,3057.0,889.0
Correct Predictions,954.0,876.0,630.0,134.0,353.0,384.0,626.0,553.0,966.0,461.0
Identify Rate,0.587,0.408,0.301,0.066,0.174,0.181,0.29,0.262,0.469,0.285



Model: ['piece_indices', 'time_indices', 'engine_indices'], Parameters: 205,411
[0.30085, 0.7069, 0.89515, 0.9669, 0.9906, 0.99715, 0.99955, 0.9999, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1291.0,2934.0,1912.0,2079.0,2184.0,1210.0,2802.0,1205.0,2015.0,2368.0
Correct Predictions,687.0,990.0,496.0,484.0,514.0,280.0,678.0,308.0,581.0,999.0
Identify Rate,0.423,0.461,0.237,0.237,0.254,0.132,0.314,0.146,0.282,0.617



Model: ['domain_indices', 'time_indices', 'engine_indices'], Parameters: 203,411
[0.29195, 0.69045, 0.88405, 0.9662, 0.9914, 0.9977, 0.99995, 1.0, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1173.0,2374.0,2349.0,1557.0,1998.0,450.0,5714.0,0.0,2474.0,1911.0
Correct Predictions,677.0,862.0,623.0,353.0,429.0,83.0,1243.0,0.0,711.0,858.0
Identify Rate,0.417,0.401,0.298,0.173,0.212,0.039,0.576,0.0,0.345,0.53



Model: ['piece_indices', 'domain_indices', 'time_indices', 'engine_indices'], Parameters: 210,211
[0.31845, 0.7371, 0.9139, 0.97605, 0.9938, 0.99845, 0.99965, 1.0, 1.0, 1.0]


Label,0,1,2,3,4,5,6,7,8,9
Actual Count,1625.0,2149.0,2091.0,2043.0,2026.0,2116.0,2157.0,2114.0,2059.0,1620.0
Predicted Count,1187.0,2040.0,2161.0,2413.0,956.0,2731.0,1882.0,2856.0,1862.0,1912.0
Correct Predictions,705.0,776.0,621.0,624.0,244.0,657.0,468.0,754.0,620.0,900.0
Identify Rate,0.434,0.361,0.297,0.305,0.12,0.31,0.217,0.357,0.301,0.556



