In [13]:
import csv
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import scipy.sparse as sp
import torch
import torch.nn.functional as F
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [2]:
# Report the NumPy / SciPy versions this notebook is running against.
# The bare name `scipy` has to be bound by the import cell (`import scipy`):
# `import scipy.sparse as sp` only binds `sp`, so on a fresh kernel the
# `scipy.__version__` lookup below would otherwise raise NameError.
print("Numpy version: ", np.__version__)
print("Scipy version: ", scipy.__version__)


Numpy version:  1.19.2
Scipy version:  1.6.2


In [3]:
# Show which PyTorch build the kernel picked up.
pytorch_version = torch.__version__
print("Loaded PyTorch Version: {}".format(pytorch_version))


Loaded PyTorch Version: 2.4.1+cu121


In [4]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available")
print("Using device: ", device)

GPU is available
Using device:  cuda


In [5]:
# Report the interpreter the kernel is running on.
import sys

python_version = sys.version
print("Python version: ", python_version)

Python version:  3.8.20 (default, Oct  3 2024, 15:24:27) 
[GCC 11.2.0]


# Load Dataset

In [6]:
from enum import Enum

class DiscardDataset(torch.utils.data.Dataset):
    """In-memory dataset of game states loaded from per-game sparse ``.npz`` files.

    Every ``<data_path>/<year>/*.npz`` file is read as one complete game whose
    rows are individual states.  Column layout this class relies on:

    * last column       -- the label ``y`` (34 classes);
    * columns 32 and 33 -- round number and step value; they are copied into
      ``round_numbers`` / ``step_numbers`` and then overwritten with padding;
    * columns 238:374   -- the POOL block.

    ``x_data`` holds the active feature view and can be switched after
    construction with ``use_pools()`` / ``use_discards()``.
    """

    class DiscardType(Enum):
        DISCARD = 0
        POOL    = 1

    _NUM_CLASSES = 34   # Number of distinct labels in the last column
    _ROUND_COL = 32     # Column holding the round number
    _STEP_COL = 33      # Column holding the step value
    _POOL_START = 238   # First column of the POOL block
    _POOL_END = 374     # One past the last column of the POOL block
    _PAD_VALUE = -128   # Padding value written over the round/step columns

    def __init__(self, data_path, years: list, n_rows: int = None, phase: int = None, balance_data: bool = False, discard_type=DiscardType.DISCARD, singular=False):
        """Load game states from ``data_path`` into memory.

        The global torch and NumPy seeds are reset to 0 on every call.

        :param data_path: Directory containing one sub-directory per year.
        :param years: Years to load, in the given order (2009..2019 only).
        :param n_rows: Number of rows to load; ``None`` loads everything.
        :param phase: 0, 1 or 2 keeps only the states of that phase (see
            ``generate_phase_column``); any other value keeps all states.
        :param balance_data: If True, keep ``n_rows // 34`` rows per label.
        :param discard_type: Initial feature view, a ``DiscardType`` member.
        :param singular: If True, pick 1 state per game at random!
        :raises ValueError: On invalid arguments, when fewer rows than
            requested are available, or when no game could be loaded.
        """

        # FORCE SEED
        torch.manual_seed(0)
        np.random.seed(0)

        num_classes = self._NUM_CLASSES

        # Invalid Parameter Combinations
        if balance_data:
            if not n_rows:
                raise ValueError("`n_rows` must be defined if `balance_data` is True!")
            elif n_rows < num_classes:
                raise ValueError("Cannot balance data if `n_rows` < 34!")

        ALL_YEARS = (2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)
        invalid_years = set(years) - set(ALL_YEARS)
        if invalid_years:
            raise ValueError(f"INVALID YEARS: {invalid_years}")

        # Check if given discard_type is valid
        if discard_type not in [DiscardDataset.DiscardType.DISCARD, DiscardDataset.DiscardType.POOL]:
            raise ValueError(f"INVALID discard type = {discard_type}! Use either `DiscardDataset.DiscardType.DISCARD` or `DiscardDataset.DiscardType.POOL`!")
        self.discard_type = discard_type

        # Dataset Print
        if n_rows:
            print(f"Loading Dataset with {n_rows:>13,} rows", end=' ')
        else:
            print(f"Loading Dataset with all rows", end=' ')

        if phase in [0, 1, 2]:
            print(f"(Phase {phase})", end=' ')
        else:
            print(f"(All Phases)", end=' ')

        print("{:<14}".format("<BALANCED>" if balance_data else '<NOT BALANCED>'), end=' ')

        print(years)

        # `game_ids[i]` is the game that produced `matrices[i]`; the two lists
        # are always appended together so they can never drift apart.
        game_ids = []
        matrices = []
        finished = False

        # Used when balance_data = False and n_rows != None
        loaded_rows = 0

        # Used when balance_data = True
        class_bins = np.zeros(num_classes)
        baseline_bin_size = n_rows // num_classes if balance_data else -1  # The expected size of the smallest bin

        if balance_data:
            paths_load_bar = tqdm(total=baseline_bin_size * num_classes, unit='rows', position=0)
        else:
            paths_load_bar = tqdm(total=n_rows, unit='rows', position=0)

        for year in years:

            # Sorted so that which games get loaded does not depend on the
            # order in which the filesystem happens to list the directory.
            year_dir = Path(data_path) / str(year)
            npz_paths = sorted(p for p in year_dir.iterdir() if p.suffix == '.npz')

            for files_seen, path in enumerate(npz_paths, start=1):

                arr = sp.load_npz(path).toarray()  # Loads a single complete game

                if phase in [0, 1, 2]:
                    arr = self.generate_phase_column(arr)[phase]

                if singular:
                    if arr.shape[0] <= 0:  # No rows found (This can happen if a game lack states from a certain phase)
                        continue
                    random_row_index = np.random.choice(arr.shape[0], 1, replace=False)
                    arr = arr[random_row_index]  # Select 1 row per loaded game

                matrices.append(arr)
                game_ids.append(path.stem)

                paths_load_bar.set_postfix(year=year, files_loaded=files_seen)  # Update Bar

                if balance_data:

                    class_bins += np.bincount(arr[:, -1], minlength=num_classes)
                    smallest_class_bin = int(np.amin(class_bins))

                    paths_load_bar.n = smallest_class_bin * num_classes
                    paths_load_bar.refresh()

                    # Stop as soon as even the rarest label has enough rows
                    if baseline_bin_size <= smallest_class_bin:
                        finished = True
                        break

                else:
                    paths_load_bar.update(arr.shape[0])

                    if n_rows:
                        loaded_rows += arr.shape[0]
                        if n_rows <= loaded_rows:
                            finished = True
                            break

            if finished:  # Early Stopping
                break

        if not finished and n_rows is not None:
            paths_load_bar.close()
            raise ValueError("`n_rows` is higher than found rows -- Either lower `n_rows` or include more annual datasets!")

        if not matrices:
            paths_load_bar.close()
            raise ValueError("No game states were loaded -- check `data_path`, `years` and `phase`!")

        # Expand the per-game ids to one id per row
        row_game_ids = np.repeat(np.array(game_ids), [m.shape[0] for m in matrices])

        if balance_data:
            final_arr, final_game_id_list = self._balance_rows(
                np.concatenate(matrices, axis=0), row_game_ids, baseline_bin_size
            )
        else:
            final_arr = np.vstack(matrices)
            final_game_id_list = row_game_ids

            if n_rows:
                final_arr = final_arr[:n_rows]
                final_game_id_list = final_game_id_list[:n_rows]

        # Extract Round Number and Steps from data
        self.round_numbers = final_arr[:, self._ROUND_COL].reshape(-1).tolist()
        self.step_numbers  = (final_arr[:, self._STEP_COL] + 128 - 1).reshape(-1).tolist()
        final_arr[:, self._ROUND_COL] = self._PAD_VALUE  # Reset to padding value
        final_arr[:, self._STEP_COL] = self._PAD_VALUE  # Reset to padding value

        # Finalize tqdm bar
        paths_load_bar.n = final_arr.shape[0]
        paths_load_bar.last_print_n = final_arr.shape[0]
        paths_load_bar.refresh()
        paths_load_bar.close()

        self.game_ids = list(final_game_id_list)
        self.combined_x_data = torch.FloatTensor(final_arr[:, :-1])  # Must be Float it seems

        self.x_data = None
        if self.discard_type == DiscardDataset.DiscardType.POOL:
            self.use_pools()
        else:
            self.use_discards()

        self.y_data = torch.LongTensor(final_arr[:, -1])  # Must be Long it seems

    @staticmethod
    def _balance_rows(matrix, row_game_ids, bin_size):
        """Keep at most ``bin_size`` rows per label; rows come back grouped by label."""
        order = np.argsort(matrix[:, -1])  # Sort rows by last column (the y-value)
        matrix = matrix[order]
        row_game_ids = row_game_ids[order]

        # Indices at which the sorted label changes, i.e. where a new label's run starts
        split_indices = np.where(np.diff(matrix[:, -1]) != 0)[0] + 1
        row_groups = [group[:bin_size] for group in np.array_split(matrix, split_indices)]  # The balancing action
        id_groups = [group[:bin_size] for group in np.array_split(row_game_ids, split_indices)]

        return np.concatenate(row_groups, axis=0), np.concatenate(id_groups, axis=0)

    def use_pools(self):
        """Switch ``x_data`` to the POOL view: feature columns 0:374."""
        self.discard_type = DiscardDataset.DiscardType.POOL
        self.x_data = self.combined_x_data[:, 0:self._POOL_END]

    def use_discards(self):
        """Switch ``x_data`` to the DISCARD view: every feature column except 238:374."""
        self.discard_type = DiscardDataset.DiscardType.DISCARD
        self.x_data = torch.hstack((self.combined_x_data[:, :self._POOL_START], self.combined_x_data[:, self._POOL_END:]))  # Slice away POOL data

    @staticmethod
    def generate_phase_column(array: np.ndarray) -> tuple:
        """Split the rows of one game into early / mid / end game.

        The phase of a row is decided by the sum of its POOL columns
        (238:374): a sum of at most 24 is phase 0, a sum in (24, 48] is
        phase 1 and a sum above 48 is phase 2.

        :param array: 2-D array with one state per row.
        :return: Tuple ``(early, mid, end)`` of row subsets of ``array``.
        """
        # Begin with merging all pools together
        merged_discards = np.sum(array[:, DiscardDataset._POOL_START:DiscardDataset._POOL_END], axis=1)

        phases = np.zeros([array.shape[0]])  # Early Game
        phases[(24 < merged_discards) & (merged_discards <= 48)] = 1  # Mid Game
        phases[(48 < merged_discards)] = 2  # End Game

        return array[(phases == 0)], array[(phases == 1)], array[(phases == 2)]

    def __len__(self):
        """Number of rows in the active feature view."""
        return self.x_data.shape[0]

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of game id, round, step, features and label."""
        return {
            'game_id': self.game_ids[idx],
            'round': self.round_numbers[idx],
            'step': self.step_numbers[idx],
            'X': self.x_data[idx],
            'y': self.y_data[idx]
        }

In [16]:
# Sanity check: rebuild one game's matrix by hand from the raw CSR components
# stored in its .npz archive and print the dense shape.
# NOTE(review): machine-specific absolute path -- point this at your local copy
# of the dataset (ideally derived from a configurable data directory).
file = '/home/lenzlaww/document/SBU/CSE537/finalProject/mjx/2019/2019010100gm-00a9-0000-127203b5.npz'
# np.load returns an NpzFile that keeps the archive open; the context manager
# closes it once the CSR arrays have been read (`npz` is closed afterwards).
with np.load(file) as npz:
    # .toarray() already returns a fresh dense ndarray, so no extra np.array() copy is needed
    mat1 = csr_matrix((npz['data'], npz['indices'], npz['indptr']), shape=npz['shape']).toarray()
print(mat1.shape)

(469, 511)


In [22]:
# Announce the stage and pin both global RNGs.
print("\nLOADING DATASETS:\n")

np.random.seed(0)
torch.manual_seed(0)

# Collect every game archive directly under the 2019 folder; the order is
# whatever the directory listing returns.
dataset_path = Path("2019")
npz_files = [entry for entry in dataset_path.iterdir() if entry.suffix == '.npz']
print(f"Total .npz files: {len(npz_files)}")

# Peek at the first few paths
npz_files[:5]




LOADING DATASETS:

Total .npz files: 171629


[PosixPath('2019/2019012700gm-00a9-0000-c920cee3.npz'),
 PosixPath('2019/2019071617gm-00a9-0000-0baa04f0.npz'),
 PosixPath('2019/2019100811gm-00a9-0000-6fc9b38a.npz'),
 PosixPath('2019/2019060921gm-00a9-0000-1786343a.npz'),
 PosixPath('2019/2019110621gm-00a9-0000-636733c5.npz')]

# Simple Model

In [None]:
class Net(torch.nn.Module):
    """Plain fully-connected classifier: 374 -> 1024 -> 512 -> 256 -> 128 -> 34.

    Every hidden layer is followed by its own LeakyReLU; the final layer
    returns raw scores (no activation is applied to the 34 outputs).
    """

    def __init__(self):
        super().__init__()

        self.name = "BasicDenseNetwork_testing"

        # Linear layers are created in order fc1..fc5 so that parameter
        # registration (and seeded initialisation) matches the layer numbering.
        nn = torch.nn
        self.fc1 = nn.Linear(11 * 34, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 34)

        # One activation module per hidden layer
        self.relu_1 = nn.LeakyReLU()
        self.relu_2 = nn.LeakyReLU()
        self.relu_3 = nn.LeakyReLU()
        self.relu_4 = nn.LeakyReLU()

    def forward(self, x):
        """Run ``x`` through the four activated hidden layers and the output layer."""
        hidden_stack = (
            (self.fc1, self.relu_1),
            (self.fc2, self.relu_2),
            (self.fc3, self.relu_3),
            (self.fc4, self.relu_4),
        )
        for dense, activation in hidden_stack:
            x = activation(dense(x))

        return self.fc5(x)