Potential hyper-parameters to tune: weight_decay, dropout_prob, number of hidden layers and their sizes

Add TensorBoard logging to monitor training?

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from pathlib import Path
import copy
import matplotlib.pyplot as plt

In [None]:
from src.utils import calculate_performance_metrics

### Define model and other functions

In [None]:
class MLP(nn.Module):
    """Fully connected feed-forward network.

    Layer layout for ``hidden_sizes = [h0, h1, ..., hk]``:

    * ``Linear(input_size, h0) -> ReLU`` (the first hidden layer has no
      batch normalisation and no dropout),
    * for every further hidden layer:
      ``Linear -> BatchNorm1d -> ReLU -> Dropout(dropout_prob)``,
    * ``Linear(hk, output_size)`` as the output layer.

    An empty ``hidden_sizes`` yields a single ``Linear(input_size,
    output_size)`` layer instead of raising ``IndexError``.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sequence with the width of each hidden layer, in order.
        output_size: Number of output values.
        dropout_prob: Dropout probability used after every hidden layer
            except the first.
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout_prob=0.2):
        super().__init__()

        layers = []
        in_features = input_size

        # One pass over all hidden layers; only layers after the first get
        # BatchNorm and Dropout, which keeps the module order (and therefore
        # the state_dict keys) the same as before for non-empty hidden_sizes.
        for depth, width in enumerate(hidden_sizes):
            layers.append(nn.Linear(in_features, width))
            if depth > 0:
                layers.append(nn.BatchNorm1d(width))
            layers.append(nn.ReLU())
            if depth > 0:
                layers.append(nn.Dropout(dropout_prob))
            in_features = width

        # Output layer (last hidden layer, or the input itself, to output)
        layers.append(nn.Linear(in_features, output_size))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.model(x)

In [None]:
def average_model_weights(models):
    """Return a new model whose state is the element-wise mean of ``models``.

    The previous implementation accumulated the sum in place but assigned the
    division result into a temporary dict returned by ``state_dict()``, so the
    returned model held summed rather than averaged weights. Here the averaged
    state is built explicitly and loaded with ``load_state_dict``.

    Args:
        models: Non-empty sequence of modules with identical state_dict keys
            and tensor shapes.

    Returns:
        A deep copy of ``models[0]`` loaded with the averaged state. The input
        models are not modified.

    Raises:
        IndexError: If ``models`` is empty.

    Note:
        Non-floating entries (for example BatchNorm's batch counter) are
        averaged in float64 and cast back to their original dtype, which
        truncates the fractional part.
    """
    avg_model = copy.deepcopy(models[0])  # template that receives the averaged state
    state_dicts = [model.state_dict() for model in models]

    averaged_state = {}
    with torch.no_grad():
        for key, reference in state_dicts[0].items():
            stacked = torch.stack([state[key] for state in state_dicts])
            if not stacked.is_floating_point():
                # mean() is not defined for integer tensors
                stacked = stacked.to(torch.float64)
            averaged_state[key] = stacked.mean(dim=0).to(reference.dtype)

    avg_model.load_state_dict(averaged_state)
    return avg_model

In [None]:
class EarlyStopping:
    """Track the best validation loss and signal when to stop training.

    Call the instance once per epoch with the current validation loss and the
    model. When the loss improves on the best seen so far by more than
    ``min_delta``, a deep copy of the model's ``state_dict()`` is stored in
    ``best_model_state``. After ``patience`` consecutive calls without such an
    improvement, ``early_stop`` becomes ``True``.

    Args:
        patience: Number of consecutive non-improving calls before
            ``early_stop`` is set.
        min_delta: Minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        # float('inf') instead of np.Inf: the np.Inf alias no longer exists in
        # NumPy 2.0 and raised AttributeError on construction.
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False
        self.best_model_state = None  # stays None until the first improvement

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch and update the stopping state."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            # Deep copy so later optimizer steps do not overwrite the snapshot
            self.best_model_state = copy.deepcopy(model.state_dict())
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [None]:
def ensemble_predict(models, X):
    """Return the element-wise mean of the predictions of ``models`` on ``X``.

    Every model is switched to evaluation mode for the forward pass so that
    layers such as Dropout and BatchNorm behave deterministically. The
    previous training/eval flag of each sub-module is restored afterwards.
    Gradients are not tracked.

    Args:
        models: Non-empty sequence of modules that accept ``X``.
        X: Input tensor passed unchanged to every model.

    Returns:
        Tensor with the same shape as a single model's output.
    """
    # Remember each sub-module's mode so the caller's models come back unchanged
    saved_modes = [(module, module.training)
                   for model in models
                   for module in model.modules()]
    try:
        for model in models:
            model.eval()
        with torch.no_grad():
            predictions = [model(X) for model in models]
    finally:
        for module, was_training in saved_modes:
            module.training = was_training
    return torch.mean(torch.stack(predictions), dim=0)

### Data

In [None]:
# Notebook-level settings
shift_param = 1      # number of rows the lagged features are shifted by (per player)
latest_gameweek = 0

# Read the modelling table and parse the retrieval timestamp column
filepath = Path('../../data/modeling/fpl_df.csv')
fpl_df = pd.read_csv(filepath, index_col=0, low_memory=False).assign(
    data_retrieved_datetime=lambda frame: pd.to_datetime(frame['data_retrieved_datetime'])
)

# Quick look at the first rows and the table dimensions
for preview in (fpl_df.head(), fpl_df.shape):
    display(preview)

In [None]:
# Input columns that are used as they appear in the table: they are NOT lagged
# by the per-player shift applied later in the notebook.
features_no_shift = ['element_type', 'home', 'opponent_xG_ewm_5', 'opponent_xG_ewm_10',
       'opponent_xG_ewm_20', 'opponent_xG_ewm_40', 'opponent_xGA_ewm_5',
       'opponent_xGA_ewm_10', 'opponent_xGA_ewm_20',
       'opponent_xGA_ewm_40', ]

# Input columns that are shifted by `shift_param` rows within each player's
# group before being fed to the model (presumably so a row only sees values
# from earlier gameweeks -- confirm against how fpl_df is built).
# The order of both lists defines the column order of the model input.
features_shift = ['corners_and_indirect_freekicks_order', 'creativity_rank', 
       'direct_freekicks_order', 'ict_index_rank', 'influence_rank',
       'minutes', 'now_cost', 'penalties_order', 'points_per_game', 
       'selected_by_percent', 'threat_rank',
       'team_xG_ewm_5', 'team_xG_ewm_10', 'team_xG_ewm_20',
       'team_xG_ewm_40', 'team_xGA_ewm_5', 'team_xGA_ewm_10',
       'team_xGA_ewm_20', 'team_xGA_ewm_40', 
       'gameweek_assists_ewm_5', 'gameweek_bps_ewm_5',
       'gameweek_creativity_ewm_5', 'event_points_ewm_5',
       'gameweek_goals_scored_ewm_5', 'gameweek_goals_conceded_ewm_5',
       'gameweek_saves_ewm_5', 'gameweek_threat_ewm_5',
       'gameweek_xG_ewm_5', 'gameweek_xA_ewm_5', 'gameweek_xGA_ewm_5',
       'gameweek_minutes_ewm_5', 'gameweek_xPoints_ewm_5',
       'gameweek_assists_ewm_10', 'gameweek_bps_ewm_10',
       'gameweek_creativity_ewm_10', 'event_points_ewm_10',
       'gameweek_goals_scored_ewm_10', 'gameweek_goals_conceded_ewm_10',
       'gameweek_saves_ewm_10', 'gameweek_threat_ewm_10',
       'gameweek_xG_ewm_10', 'gameweek_xA_ewm_10', 'gameweek_xGA_ewm_10',
       'gameweek_minutes_ewm_10', 'gameweek_xPoints_ewm_10',
       'gameweek_assists_ewm_20', 'gameweek_bps_ewm_20',
       'gameweek_creativity_ewm_20', 'event_points_ewm_20',
       'gameweek_goals_scored_ewm_20', 'gameweek_goals_conceded_ewm_20',
       'gameweek_saves_ewm_20', 'gameweek_threat_ewm_20',
       'gameweek_xG_ewm_20', 'gameweek_xA_ewm_20', 'gameweek_xGA_ewm_20',
       'gameweek_minutes_ewm_20', 'gameweek_xPoints_ewm_20',
       'gameweek_assists_ewm_40', 'gameweek_bps_ewm_40',
       'gameweek_creativity_ewm_40', 'event_points_ewm_40',
       'gameweek_goals_scored_ewm_40', 'gameweek_goals_conceded_ewm_40',
       'gameweek_saves_ewm_40', 'gameweek_threat_ewm_40',
       'gameweek_xG_ewm_40', 'gameweek_xA_ewm_40', 'gameweek_xGA_ewm_40',
       'gameweek_minutes_ewm_40', 'gameweek_xPoints_ewm_40',
       'gameweek_assists_expanding', 'gameweek_bps_expanding',
       'gameweek_creativity_expanding', 'event_points_expanding',
       'gameweek_goals_scored_expanding',
       'gameweek_goals_conceded_expanding', 'gameweek_saves_expanding',
       'gameweek_threat_expanding', 'gameweek_xG_expanding',
       'gameweek_xA_expanding', 'gameweek_xGA_expanding',
       'gameweek_minutes_expanding', 'gameweek_xPoints_expanding',
       'gameweek_assists_expanding_per90', 'gameweek_bps_expanding_per90',
       'gameweek_creativity_expanding_per90',
       'event_points_expanding_per90',
       'gameweek_goals_scored_expanding_per90',
       'gameweek_goals_conceded_expanding_per90',
       'gameweek_saves_expanding_per90',
       'gameweek_threat_expanding_per90', 'gameweek_xG_expanding_per90',
       'gameweek_xA_expanding_per90', 'gameweek_xGA_expanding_per90',
       'gameweek_xPoints_expanding_per90', 'xG_overperformance'
    ]

# Regression target column; kept as a one-element list so that selecting it
# from the DataFrame yields a 2-D (single-column) frame.
target = ['event_points']

In [None]:
# Lag the columns in features_shift by shift_param rows within each player
# (players are identified by first and second name); work on a copy of fpl_df.
df = fpl_df.copy()
player_groups = df.groupby(['first_name', 'second_name'])
df[features_shift] = player_groups[features_shift].shift(shift_param)

# Inspect both ends of the shifted table and its dimensions
for preview in (df.head(), df.tail(), df.shape):
    display(preview)

In [None]:
# Histogram of how many values are missing in each row of df
df.isnull().sum(axis=1).plot(kind='hist')

In [None]:
# Keep only rows with at most 90 missing values, then renumber the index
missing_per_row = df.isnull().sum(axis=1)
has_enough_data = missing_per_row <= 90
df = df[has_enough_data].reset_index(drop=True)
display(df.shape)

### Train-test split

In [None]:
# Rows retrieved after the cut-off date form the test set; all others are training rows
in_test_period = df.data_retrieved_datetime > '1-1-2024'

train_index = df[~in_test_period].index
display(train_index)
test_index = df[in_test_period].index
display(test_index)

In [None]:
# Model inputs are the un-shifted columns followed by the shifted ones
feature_columns = features_no_shift + features_shift

# Full table (all rows)
X_df, y_df = df[feature_columns].copy(), df[target].copy()

# Training rows
X_train_df, y_train_df = (
    df.loc[train_index, columns].copy() for columns in (feature_columns, target)
)

# Test rows
X_test_df, y_test_df = (
    df.loc[test_index, columns].copy() for columns in (feature_columns, target)
)

In [None]:
# Number of missing target values over all rows (a NaN target would turn the
# squared-error loss into NaN)
y_df.isnull().sum()

In [None]:
# Unfitted preprocessing steps; nothing is fitted in this cell.
# Missing entries are replaced by the column median, and all-missing columns
# are kept rather than dropped.
imputer = SimpleImputer(strategy='median', missing_values=np.nan, keep_empty_features=True)
scaler = StandardScaler()

# Scaling runs first, imputation second
preprocess_pipeline = Pipeline(steps=[("scaler", scaler), ("imputer", imputer)])

In [None]:
# Fit the preprocessing pipeline on the training rows only and transform them;
# the test rows are transformed with the already-fitted pipeline.
X_train = preprocess_pipeline.fit_transform(X_train_df)
X_test = preprocess_pipeline.transform(X_test_df)

In [None]:
# Turn the preprocessed inputs and the target frames into float32 tensors
X_train, X_test = (
    torch.tensor(array, dtype=torch.float32) for array in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(frame.values, dtype=torch.float32) for frame in (y_train_df, y_test_df)
)

### Training

In [None]:
# Hyperparameters
input_size = X_train.shape[1]  # number of input columns
hidden_sizes = [64, 32, 16, 8] # width of each hidden layer, in order
output_size = y_train.shape[1]  # number of target columns
learning_rate = 0.001  # passed to the Adam optimizer
num_epochs = 100  # upper bound on epochs per fold (early stopping may end sooner)
k_folds = 3  # Number of folds for cross-validation
patience = 10  # Number of epochs with no improvement for early stopping

In [None]:
# K-Fold Cross Validation
# NOTE(review): shuffle=True without random_state gives different folds on
# every run; pass random_state if reproducible folds are wanted.
kf = KFold(n_splits=k_folds, shuffle=True)

In [None]:
# Initialize for storing fold results
# NOTE: the training cell appends to these lists, so re-run this cell before
# re-running training, otherwise models from earlier runs stay in the lists.
fold_models = []  # one trained model per fold
fold_performance = []  # one validation loss per fold

In [None]:
# K-fold cross-validation loop
#
# For every fold: train a fresh MLP with early stopping on the fold's
# validation rows, restore the best weights seen during training, and store
# the model together with its best validation loss.
batch_size = 16

# Build the dataset once; each fold only selects a subset of its rows.
full_train_dataset = TensorDataset(X_train, y_train)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f'Fold {fold + 1}/{k_folds}')

    # Create DataLoader for training and validation sets
    train_dataset = Subset(full_train_dataset, train_idx)
    val_dataset = Subset(full_train_dataset, val_idx)

    # BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode, so the trailing batch is dropped only when it would
    # contain exactly one row.
    drop_single_row_batch = len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_row_batch)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Instantiate the model, loss function, and optimizer
    model = MLP(input_size, hidden_sizes, output_size)
    # Summed (not averaged) squared error; the validation total is divided by
    # the number of validation rows below to get a per-row value.
    criterion = nn.MSELoss(reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

    # Early stopping instance
    early_stopping = EarlyStopping(patience=patience)

    # Training loop with early stopping
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        for batch_X, batch_y in train_loader:
            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation after every epoch
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                val_outputs = model(batch_X)
                val_loss += criterion(val_outputs, batch_y).item()

        val_loss /= len(val_dataset)  # Average validation loss

        print(f'Epoch [{epoch+1}/{num_epochs}], Fold {fold+1}, Validation Loss: {val_loss:.4f}')

        # Early stopping check
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch+1} for fold {fold+1}")
            break

    # Restore the best weights no matter how the epoch loop ended. Previously
    # this only happened when early stopping fired, so a fold that ran all
    # epochs kept its last (possibly worse) weights and reported the
    # last-epoch loss.
    if early_stopping.best_model_state is not None:
        model.load_state_dict(early_stopping.best_model_state)
        val_loss = early_stopping.best_loss
    model.eval()  # stored models are used for inference only

    print(f'Fold {fold + 1}, Validation Loss: {val_loss:.4f}')
    fold_performance.append(val_loss)

    # Save the model for this fold
    fold_models.append(copy.deepcopy(model))

# Average the weights from all fold models
average_model = average_model_weights(fold_models)

# Final performance across folds
avg_performance = np.mean(fold_performance)
print(f'Average Validation Loss across {k_folds} folds: {avg_performance:.4f}')

TODO: test final model, save final model + preprocessing pipeline

### Average model test

In [None]:
# average model mse
# `criterion` is the summed squared error created in the training cell, so
# dividing by the number of test rows gives the mean squared error per row.

average_model.eval() # Set model to evaluation mode
with torch.no_grad():
    predictions = average_model(X_test)
criterion(predictions, y_test).item() / X_test.shape[0]

In [None]:
# Metrics of the weight-averaged model on the test set. The project helper's
# result is unpacked as (mae, rmse, r2); only RMSE and R^2 are printed.
mae, rmse, r2 = calculate_performance_metrics(y_test.numpy().flatten(), predictions.numpy().flatten())
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

### Test individual models

In [None]:
# Evaluate every fold model separately on the test set
for fold_model in fold_models:
    with torch.no_grad():
        predictions = fold_model(X_test)

    # criterion sums the squared errors, so divide by the row count
    mse = criterion(predictions, y_test).item() / X_test.shape[0]

    targets_flat = y_test.numpy().flatten()
    predictions_flat = predictions.numpy().flatten()
    mae, rmse, r2 = calculate_performance_metrics(targets_flat, predictions_flat)

    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'R^2: {r2}')

### Ensemble model test

In [None]:
# predictions on test set: mean of the fold models' outputs
predictions = ensemble_predict(fold_models, X_test) 

In [None]:
# test mse: summed squared error from `criterion` divided by the number of test rows
criterion(predictions, y_test).item() / X_test.shape[0]

In [None]:
# Metrics of the ensemble predictions on the test set. The project helper's
# result is unpacked as (mae, rmse, r2); only RMSE and R^2 are printed.
mae, rmse, r2 = calculate_performance_metrics(y_test.numpy().flatten(), predictions.numpy().flatten())
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

In [None]:
# Collect the model-input columns for one player. Rows are taken from the
# whole of df (not just the test rows) wherever the name contains player_name.
player_name = 'Salah'
named_rows = df[df['name'].notnull()].copy()
name_matches = named_rows['name'].str.contains(player_name)
aux = named_rows.loc[name_matches, features_no_shift+features_shift]
aux.shape

In [None]:
# Total number of missing values in the selected player's rows before preprocessing
aux.isnull().sum().sum()

In [None]:
# Apply the fitted preprocessing to the player's rows. The DataFrame itself is
# passed (not `.values`) because the pipeline was fitted on a DataFrame:
# scikit-learn can then check the column names instead of warning about
# missing feature names. The result is a NumPy array.
aux = preprocess_pipeline.transform(aux)

In [None]:
# Number of NaNs left after preprocessing. A single vectorised reduction
# replaces the nested builtin sum(), which iterated in Python and raised
# TypeError when `aux` had no rows.
np.isnan(aux).sum()

In [None]:
# Shape of the preprocessed player rows
aux.shape

In [None]:
# Ensemble prediction (mean over the fold models) for the selected player's rows
ensemble_predict(fold_models, torch.tensor(aux, dtype=torch.float32))

In [None]:
# Prediction of the first fold model alone for the selected player's rows.
# `aux` is already the NumPy array returned by preprocess_pipeline.transform,
# so it has no `.values` attribute and is converted to a tensor directly.
# Gradients are not needed for inference.
with torch.no_grad():
    first_model_predictions = fold_models[0](torch.tensor(aux, dtype=torch.float32))
first_model_predictions