In [3]:
from kloppy import wyscout
from kloppy.domain import Provider, Dimension, NormalizedPitchDimensions, Orientation
import pandas as pd
import json
import pickle
import os
import numpy as np
import sklearn
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from concurrent.futures import ThreadPoolExecutor
from pandas.api.types import is_categorical_dtype

In [2]:
def _load_raw_match(matchid):
    """Load a single match (no transform here)."""
    try:
        dataset = wyscout.load_open_data(
            match_id=matchid,
            coordinates="wyscout"
        )
        return matchid, dataset
    except Exception as e:
        print(f"Failed match {matchid}: {e}")
        return matchid, None

In [None]:
class KloppyDataset(Dataset):

    def __init__(self, matches_path):
        super().__init__()
        self.filenames = os.listdir(matches_path)
        match_ids = []
        for file in self.filenames:
            with open(matches_path + file, 'r') as file:
                data = json.load(file)

            matches = pd.DataFrame(data)
            match = matches.loc[:, "wyId"].to_list()
            match_ids += match
        self.match_ids = match_ids
    
    def transform(self, dataset):
        ds = dataset.transform(
        to_pitch_dimensions=NormalizedPitchDimensions(
            pitch_length=105,
            pitch_width=68,
            x_dim = Dimension(0, 1),
            y_dim = Dimension(0, 1)
        ),
        to_coordinate_system = Provider.WYSCOUT,
        to_orientation = Orientation.STATIC_HOME_AWAY)
        return ds
    
    def load_match(self, max_workers = None):
        df_dict = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for matchid, dataset in executor.map(_load_raw_match, self.match_ids):
                if dataset is None:
                    continue

                ds = self.transform(dataset)
                df = ds.to_df()
                df_dict[str(matchid)] = df
                print(f"Loaded + transformed match {matchid} (total: {len(df_dict)})")

        return df_dict

In [None]:
kloppy_ds = KloppyDataset(matches_path= 'matches/')
data = kloppy_ds.load_match(max_workers=None)
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)

In [4]:
with open("data.pkl", "rb") as f:
    data_dict = pickle.load(f)

In [5]:
# Just use columns that are needed
cols = [ 'event_type', 'period_id', 'team_id', 'player_id',
        'coordinates_x', 'coordinates_y', 'success', 'result', 'timestamp' ]

# Create the beginning of match and end of match token
begin = pd.DataFrame(["<BOM>", None, None, None, None, None, None, None, None], index=cols).T # Transpose to make this a row series
end = pd.DataFrame(["<EOM>", None, None, None, None, None, None, None, None], index=cols).T # Transpose to make this a row series

# Add the tokens
for matchid, values in data_dict.items():   
    extracted = values[cols]
    data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)

# Create a new dataframe with all matches, separated by beginning and end of match tokens
# Create a column 'matchid' to keep track of matches
extracted = pd.concat(
    [df.assign(matchid = k) for k, df in data_dict.items()],
    ignore_index = True
)

# Combine results and event_type into one column 
fltr = extracted.event_type != None 
extracted['input_event'] = pd.Series()
extracted.input_event[fltr] = extracted['event_type'][fltr] +  '_' + extracted['result'][fltr]
NANS = extracted.input_event.isna()
extracted['input_event'][NANS] = extracted['event_type'] # Where there is no result, only use input event

# Create times in seconds and delta_times
times = extracted.timestamp.dt.total_seconds()
delta_times = extracted.timestamp.dt.total_seconds().diff()

# Create Delta_x and Delta_y
delta_x = extracted.coordinates_x.diff()
delta_y = extracted.coordinates_y.diff()

# Add columns to the dataframe
extracted['timestamp_seconds'] = times
extracted['delta_time'] = delta_times
extracted['delta_x'] = delta_x
extracted['delta_y'] = delta_y

# Drop the columns that are not needed anymore
final_df = extracted.drop(
    columns = ['success', 'result', 'event_type', 'timestamp']
)

# Put Categorical Variables into type category
factor_cols = ["period_id", "team_id", "player_id", "input_event", "matchid"]
final_df[factor_cols] = final_df[factor_cols].astype("category")

  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore_index= True)
  data_dict[matchid] = pd.concat((begin, extracted, end), ignore

In [None]:
# with open("final_df.pkl", "wb") as f:
#     pickle.dump(final_df, f)

In [6]:
label = pd.Series(final_df.input_event[1:].to_numpy())
label[label.index.max() + 1] = "<BOM>"
final_df['label'] = label
unique_match_ids = final_df.matchid.unique().to_numpy()

# Train-test split
train_ids, test_ids = sklearn.model_selection.train_test_split(unique_match_ids, test_size=0.1) # 10% test data

# Train-validation split
train_ids, val_ids = sklearn.model_selection.train_test_split(train_ids, test_size=0.25) # 25% validation data

# indexing from data set
X_train = final_df[final_df.matchid.isin(train_ids)]

In [None]:
# with open("X_train.pkl", "rb") as f:
#     X_train = pickle.load(f)

In [7]:
# Convert new label column to category type
factor_cols = ["label"]
X_train[factor_cols] = X_train[factor_cols].astype("category")


# Fill all NAs with <missing>
for col in X_train.columns:
    if is_categorical_dtype(X_train[col]):
        X_train[col] = X_train[col].cat.add_categories(["<missing>"]).fillna("<missing>")

# Convert category variables from strings to their category code
for col in X_train.select_dtypes(['category']).columns:
    X_train[col] = X_train[col].cat.codes

# Fill all continuous column NAs with -1
X_train = X_train.fillna(-1)  

# Extract continuous variables into separate DF
co_cols = [
    'coordinates_x', 
    'coordinates_y', 
    'delta_x', 
    'delta_y', 
    'timestamp_seconds', 
    'delta_time'
    ]
coordinates_and_time = X_train[co_cols]
X_train = X_train.drop(
    columns = co_cols
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[factor_cols] = X_train[factor_cols].astype("category")
  if is_categorical_dtype(X_train[col]):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].cat.add_categories(["<missing>"]).fillna("<missing>")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_tr

In [8]:
# Create the tensors for categorical and continuous variables
X_train_numpy = X_train.to_numpy()
X_train_coordinates = coordinates_and_time.to_numpy()
X_train_tensor = torch.tensor(X_train_numpy)
X_train_coordinates_tensor =  torch.tensor(X_train_coordinates)

In [9]:
class FootballTransformer(nn.Module):
    def __init__(self, num_events, num_players, num_teams, d_model=128):
        super().__init__()

        self.event_emb = nn.Embedding(num_events, d_model)
        self.player_emb = nn.Embedding(num_players, d_model)
        self.team_emb = nn.Embedding(num_teams, d_model)
    
        self.num_linear = nn.Linear(6, d_model)  # still need to create time or delta time
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=4)
    
        self.fc_out = nn.Linear(d_model, num_events)

    def forward(self, event_id, player_id, team_id, numeric):
        x = (
            self.event_emb(event_id)
            + self.player_emb(player_id)
            + self.team_emb(team_id)
            + self.num_linear(numeric)
        )

        x = self.transformer(x)
        x = x[:, -1]  # last timestep
        return self.fc_out(x)

In [None]:
# !!!!!!!!! DIESER CODE FUNKTIONIERT !!!!!!!!!

event_ids  = X_train_tensor[:5, 4].long()
player_ids = X_train_tensor[:5, 2].long()
team_ids   = X_train_tensor[:5, 1].long()

num_events  = int(event_ids.max().item())  + 1
num_players = int(player_ids.max().item()) + 1
num_teams   = int(team_ids.max().item())   + 1

model = FootballTransformer(
    num_events=num_events,
    num_players=num_players,
    num_teams=num_teams,
    d_model=128,
)
numeric_batch = X_train_coordinates_tensor[:5,:].float().unsqueeze(0) 

with torch.no_grad():
    out = model(event_ids, player_ids, team_ids, numeric_batch)

In [None]:
# One full sequence as batch size 1
# event_id  = X_train_tensor[:5, 4].long().unsqueeze(0)  # (1, seq_len)
# player_id = X_train_tensor[:5, 2].long().unsqueeze(0)  # (1, seq_len)
# team_id   = X_train_tensor[:5, 1].long().unsqueeze(0)  # (1, seq_len)

# numeric   = X_train_coordinates_tensor[:5,:].float().unsqueeze(0)   # (1, seq_len, 2)

# with torch.no_grad():
#     out = test(event_id, player_id, team_id, numeric)

In [None]:
# ChatGPT Test Tensors


# batch_size = 4
# seq_len = 10

# event_id  = torch.randint(0, 500, (batch_size, seq_len))
# player_id = torch.randint(0, 300, (batch_size, seq_len))
# team_id   = torch.randint(0, 50,  (batch_size, seq_len))

# numeric = torch.randn(batch_size, seq_len, 2)  # x, y coords or x, y, deltaT

In [10]:
class FootballSequenceDataset(Dataset):
    def __init__(self, X_events: torch.Tensor, X_numeric: torch.Tensor, seq_len: int):
        """
        X_events:  (N_events, num_cols)       # your X_train_tensor
        X_numeric: (N_events, 6)              # your X_train_coordinates_tensor
        seq_len:   length of context window
        """
        self.X_events = X_events
        self.X_numeric = X_numeric
        self.seq_len = seq_len

        # we stop at N - seq_len so we always have a "next" event as target
        self.N = X_events.size(0) - seq_len
            
    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        # slice [idx, idx+seq_len) as input sequence
        window = slice(idx, idx + self.seq_len)

        # NOTE: no conversion to tensor, they already ARE tensors
        events_window = self.X_events[window]          # (seq_len, num_cols)
        numeric       = self.X_numeric[window]         # (seq_len, 6)

        # extract IDs from the right columns
        event_ids  = events_window[:, 4].long()        # (seq_len,)
        player_ids = events_window[:, 2].long()        # (seq_len,)
        team_ids   = events_window[:, 1].long()        # (seq_len,)

        # target: the event ID *after* this window
        target = self.X_events[idx + self.seq_len, 4].long()  # scalar

        return event_ids, player_ids, team_ids, numeric, target

In [11]:
seq_len = 20  # context length

train_dataset = FootballSequenceDataset(
    X_train_tensor,
    X_train_coordinates_tensor,
    seq_len=seq_len,
)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)

In [12]:
num_events  = int(X_train_tensor[:, 4].max().item()) + 1
num_players = int(X_train_tensor[:, 2].max().item()) + 1
num_teams   = int(X_train_tensor[:, 1].max().item()) + 1

model = FootballTransformer(
    num_events=num_events,
    num_players=num_players,
    num_teams=num_teams,
    d_model=128,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

FootballTransformer(
  (event_emb): Embedding(27, 128)
  (player_emb): Embedding(3030, 128)
  (team_emb): Embedding(143, 128)
  (num_linear): Linear(in_features=6, out_features=128, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=27, bias=True)
)

In [13]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [14]:
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for event_ids, player_ids, team_ids, numeric, target in train_loader:
        # Move to device
        event_ids  = event_ids.to(device)          # (batch, seq_len)
        player_ids = player_ids.to(device)
        team_ids   = team_ids.to(device)
        numeric    = numeric.to(device).float()    # (batch, seq_len, 6)
        target     = target.to(device)             # (batch,)

        optimizer.zero_grad()

        # Forward
        logits = model(event_ids, player_ids, team_ids, numeric)  # (batch, num_events)

        # Compute loss: predict next event_id
        loss = criterion(logits, target)

        # Backprop + update
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * event_ids.size(0)

    avg_loss = total_loss / len(train_dataset)
    print(f"Epoch {epoch+1} | loss = {avg_loss:.4f}")

KeyboardInterrupt: 