In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
import sys
sys.path.append(os.path.abspath(".."))
import torch

from datasets import PressingSequenceDataset, SoccerMapInputDataset

2025-06-11 23:19:44.743891: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 23:19:44.746390: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-11 23:19:44.776807: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-11 23:19:44.776840: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-11 23:19:44.776860: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
data_path = "/data/MHL/pressing-intensity"

with open(f"{data_path}/train_dataset.pkl", "rb") as f:
    train_dataset = pickle.load(f)

with open(f"{data_path}/test_dataset.pkl", "rb") as f:
    test_dataset = pickle.load(f)

len(train_dataset), len(test_dataset)

(7795, 888)

In [3]:
sample = train_dataset[0]
sample.keys()

dict_keys(['features', 'pressing_intensity', 'label', 'presser_id', 'agent_order'])

In [30]:
feature_len = []
for sample in train_dataset:
    feature_len.append(str(sample['features'][-1:, ...].shape))
    feature_len.append(str(sample['pressing_intensity'][-1:, ...].shape))
print(set(feature_len))

{'torch.Size([1, 11, 10])', 'torch.Size([1, 11, 11])', 'torch.Size([1, 23, 8])', 'torch.Size([1, 10, 10])', 'torch.Size([1, 10, 11])'}


In [10]:
sample['features'][-1:, ...].shape

torch.Size([1, 23, 8])

In [4]:
print(f"Features : {sample['features'].shape}")
print(f"Pressing Intensity : {sample['pressing_intensity'].shape}")
print(f"Labels : {sample['label']}")
print(f"Presser ID : {sample['presser_id']}")
print(f"Players Order : {sample['agent_order']}")

Features : torch.Size([2, 23, 8])
Pressing Intensity : torch.Size([2, 11, 11])
Labels : 0
Presser ID : 77414
Players Order : ['188178', '250079', '250101', '250102', '500133', '500140', '500141', '500142', '62365', '62386', '77414', '187259', '343587', '408792', '500113', '500115', '500116', '500117', '500118', '500121', '500502', '83615', 'ball']


In [22]:
x_tensor_lst = [sample['features'] for sample in train_dataset]
x_tensor_lst = torch.cat(x_tensor_lst)
feature_cols = ['x', 'y', 'vx', 'vy', 'v', 'ax', 'ay', 'a']

for i in range(8):
    print(f"{feature_cols[i]} : {x_tensor_lst[-1, ...][..., i].min()} ~ {x_tensor_lst[-1, ...][..., i].max()}")

x : -25.148563385009766 ~ 53.016780853271484
y : -7.853689670562744 ~ 20.099321365356445
vx : -1.6054884195327759 ~ 5.0920281410217285
vy : -12.45792007446289 ~ 0.8517857193946838
v : 0.07405897229909897 ~ 13.411462783813477
ax : -4.507462978363037 ~ 1.2211380004882812
ay : -7.128241539001465 ~ 2.685581922531128
a : 0.19751569628715515 ~ 6.0


# 1. XGBoost

In [10]:
num_seq = 150
num_agents = 11
use_pressing_intensity = False
selected_features_idx = [1, 2, 4, 7]

train_features = []
train_labels = []
for i in range(len(train_dataset)):
    sample = train_dataset[i]
    x_tensor = sample['features'][..., selected_features_idx][-90:]
    press_intensity = sample['pressing_intensity'][-90:]
    y_tensor = sample['label']
    
    # Flatten the sequence data: shape (sequence_length, num_features) -> (sequence_length*num_features,)
    feature_vector = x_tensor.flatten().numpy()
    if use_pressing_intensity:
        if press_intensity.shape[1] != num_agents:
            pad_tensor = torch.zeros(press_intensity.shape[0], num_agents-press_intensity.shape[1], press_intensity.shape[2])
            press_intensity = torch.cat([press_intensity, pad_tensor], dim=1)
            
        if press_intensity.shape[2] != num_agents:
            pad_tensor = torch.zeros(press_intensity.shape[0], press_intensity.shape[1], num_agents-press_intensity.shape[2])
            press_intensity = torch.cat([press_intensity, pad_tensor], dim=2)
        
        press_vector = press_intensity.flatten().numpy()
        # Concatenate the flattened sequence data with the pressing intensity
        feature_vector = np.concatenate((feature_vector, press_vector))
    train_features.append(feature_vector)
    train_labels.append(y_tensor.item())


test_features = []
test_labels = []
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    x_tensor = sample['features'][..., selected_features_idx][-90:]
    press_intensity = sample['pressing_intensity'][-90:]
    y_tensor = sample['label']
    
    # Flatten the sequence data: shape (sequence_length, num_features) -> (sequence_length*num_features,)
    feature_vector = x_tensor.flatten().numpy()
    
    if use_pressing_intensity:
        if press_intensity.shape[1] != num_agents:
            pad_tensor = torch.zeros(press_intensity.shape[0], num_agents-press_intensity.shape[1], press_intensity.shape[2])
            press_intensity = torch.cat([press_intensity, pad_tensor], dim=1)
        if press_intensity.shape[2] != num_agents:
            pad_tensor = torch.zeros(press_intensity.shape[0], press_intensity.shape[1], num_agents-press_intensity.shape[2])
            press_intensity = torch.cat([press_intensity, pad_tensor], dim=2)
        
        press_vector = press_intensity.flatten().numpy()
        # Concatenate the flattened sequence data with the pressing intensity
        feature_vector = np.concatenate((feature_vector, press_vector))
    test_features.append(feature_vector)
    test_labels.append(y_tensor.item())

X_train = np.array(train_features)
y_train = np.array(train_labels)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_test = np.array(test_features)
y_test = np.array(test_labels)

In [12]:
def print_dataset_distribution(y_train, y_val, y_test):
    def _print_split(name, labels):
        total = len(labels)
        unique, counts = np.unique(labels, return_counts=True)
        print(f"{name} Set:")
        print(f"  Total samples: {total}")
        for label, count in zip(unique, counts):
            percent = (count / total) * 100
            print(f"    Label {label}: {count:>5} samples ({percent:5.2f}%)")
        print("-" * 40)

    print("\nðŸ“Š Dataset Distribution Summary")
    print("=" * 40)
    _print_split("Train", y_train)
    _print_split("Validation", y_val)
    _print_split("Test", y_test)

print_dataset_distribution(y_train, y_val, y_test)


ðŸ“Š Dataset Distribution Summary
Train Set:
  Total samples: 11392
    Label 0:  9138 samples (80.21%)
    Label 1:  2254 samples (19.79%)
----------------------------------------
Validation Set:
  Total samples: 2849
    Label 0:  2281 samples (80.06%)
    Label 1:   568 samples (19.94%)
----------------------------------------
Test Set:
  Total samples: 1721
    Label 0:  1358 samples (78.91%)
    Label 1:   363 samples (21.09%)
----------------------------------------


In [9]:
# Create XGBoost DMatrix objects for train and test sets
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_val, label=y_val)

# Set XGBoost training parameters
params = {
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'auc',            # evaluation metric: AUC
    'max_depth': 6,                  # maximum depth of trees
    'eta': 0.1,                      # learning rate
    'seed': 42
}

# Specify the watchlist to evaluate performance on training and test sets during training
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
num_rounds = 100

In [10]:
# Train XGBoost model with early stopping on the evaluation set
bst = xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds=10)



[0]	train-auc:0.72404	eval-auc:0.59755
[1]	train-auc:0.77449	eval-auc:0.61684
[2]	train-auc:0.80143	eval-auc:0.61842
[3]	train-auc:0.82563	eval-auc:0.62081
[4]	train-auc:0.83993	eval-auc:0.62739
[5]	train-auc:0.85611	eval-auc:0.63009
[6]	train-auc:0.86313	eval-auc:0.63397
[7]	train-auc:0.87409	eval-auc:0.63416
[8]	train-auc:0.88935	eval-auc:0.63792
[9]	train-auc:0.89944	eval-auc:0.63648
[10]	train-auc:0.90767	eval-auc:0.63826
[11]	train-auc:0.91626	eval-auc:0.63678
[12]	train-auc:0.92196	eval-auc:0.63293
[13]	train-auc:0.92891	eval-auc:0.63344
[14]	train-auc:0.93481	eval-auc:0.63360
[15]	train-auc:0.93753	eval-auc:0.63349
[16]	train-auc:0.94254	eval-auc:0.63322
[17]	train-auc:0.94739	eval-auc:0.63347
[18]	train-auc:0.94938	eval-auc:0.63573
[19]	train-auc:0.95371	eval-auc:0.63360
[20]	train-auc:0.95596	eval-auc:0.63550


In [None]:
# W/O Pressing Intensity
# Get predictions on the test set
y_pred = bst.predict(dtest)
y_pred_label = (y_pred > 0.5).astype(int)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_val, y_pred_label)
auc = roc_auc_score(y_val, y_pred)

print("Test Accuracy: {:.4f}".format(accuracy))
print("Test AUC: {:.4f}".format(auc))

Test Accuracy: 0.8027
Test AUC: 0.6491


: 

In [11]:
# W/O Pressing Intensity
# Get predictions on the test set
y_pred = bst.predict(dtest)
y_pred_label = (y_pred > 0.5).astype(int)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_val, y_pred_label)
auc = roc_auc_score(y_val, y_pred)

print("Test Accuracy: {:.4f}".format(accuracy))
print("Test AUC: {:.4f}".format(auc))

Test Accuracy: 0.7989
Test AUC: 0.6465


# 2. SoccerMap / exPress Evaluation

In [13]:
from datasets import exPressInputDataset
from tqdm import tqdm

In [24]:
train_dataset[0]['pressing_intensity'].shape == (1, 11, 11)

True

In [None]:
train_dataset = exPressInputDataset(f"{data_path}/train_dataset.pkl")

cnt = 0
for idx in range(len(train_dataset)):
    if train_dataset[idx]['pressing_intensity'].shape == (1, 11, 11):
        cnt += 1
cnt

Loading dataset from /data/MHL/pressing-intensity/train_dataset.pkl...


7795

In [15]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16)

In [19]:
feat_shape = []
press_shape = []
for batch in train_loader:
    feat_shape.append(batch['features'].shape)
    press_shape.append(batch['pressing_intensity'].shape)    

RuntimeError: each element in list of batch should be of equal size

In [39]:
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import json
import os
os.chdir('/home/work/MHL/express-v2')
import argparse # To accept checkpoint path as argument

# Import project modules
# import config  # Import static configurations
from model import PytorchSoccerMapModel # Import Lightning model
from datasets import PressingSequenceDataset, SoccerMapInputDataset 


In [40]:
pl.seed_everything(42, workers=True) # Ensure reproducibility

DATA_PATH = "/data/MHL/pressing-intensity" # Path where pickled datasets are saved
test_dataset = SoccerMapInputDataset(os.path.join(DATA_PATH, "test_dataset.pkl"))

if len(test_dataset) == 0:
    print("Loaded test dataset is empty. Exiting.")

# Custom collate function to handle potential None values from dataset errors
def collate_fn_skip_none(batch):
    batch = list(filter(lambda x: x is not None, batch))
    if not batch: return None
    try: return torch.utils.data.dataloader.default_collate(batch)
    except RuntimeError: return None # Skip batch if collation error

test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True,
    # collate_fn=collate_fn_skip_none
)

Seed set to 42


Loading dataset from /data/MHL/pressing-intensity/test_dataset.pkl...


In [41]:
import argparse

parser = argparse.ArgumentParser(description="Train a pressing evaluation model.")
# parser.add_argument("--model_type", type=str, default="soccermap", choices=['soccermap', 'xgboost', 'exPress'], help="Path to the model checkpoint (.ckpt) file saved during training.")
# parser.add_argument("--root_path", type=str, default="/data/MHL/pressing-intensity", help="Path to the data file.")
parser.add_argument("--model_type", type=str, default="soccermap", choices=['soccermap', 'xgboost', 'exPress'], help="Path to the model checkpoint (.ckpt) file saved during training.")
parser.add_argument("--root_path", type=str, default="/data/MHL/pressing-intensity", help="Path to the data file.")
parser.add_argument("--mode", type=str, default="train", choices=['train', 'test'], help="Mode: 'train' or 'test'.")
parser.add_argument("--ckpt_path", type=str, default=None, help="Path to checkpoint file (Required for 'test' mode).")
parser.add_argument("--params_path", type=str, default="params.json", help="Path to the JSON containing configurations.")
parser.add_argument("--seed", type=int, default=42, help="Seed number.")

args = parser.parse_args([])

args.mode = 'test'
args.model_type = "exPress"
args.ckpt_path = "/data/MHL/pressing-intensity/checkpoints/exPress-epoch=28-val_loss=0.49.ckpt"

In [42]:
from components import press


component_dict = {
                    "soccermap": press.SoccerMapComponent,
                    "exPress": press.exPressComponent,
                }

exp = component_dict[args.model_type](args)

Seed set to 42


Configurations loaded from params.json.
