In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import mlflow
import xgboost as xgb

In [2]:
# Report the tracking URI MLflow is currently configured to use.
current_uri = mlflow.get_tracking_uri()
print(f"Current MLflow Tracking URI: {current_uri}")

Current MLflow Tracking URI: file:///Users/christianhellum/Cand.%20Merc./Data-Science-Project/data_science_project/Beat%20the%20bookies/mlflow/mlruns


In [3]:
# Set the URI to your desired MLflow tracking server with host and port.
# NOTE(review): presumably a server must already be listening at this address
# (e.g. started with `mlflow server --host 127.0.0.1 --port 8080`) — confirm before running.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [4]:
def train_xgboost(file_path, random_state=42, split_date='2024-01-01'):
    """
    Tune, train and evaluate an XGBoost classifier, then log it to MLflow.

    The CSV is loaded, ordered chronologically and split at ``split_date``.
    Hyperparameters are grid-searched with a time-series cross-validation on
    the training part only, the winning model is evaluated on both parts, and
    the metrics, the chosen hyperparameters and the model itself are logged to
    the MLflow experiment "XGBoost Model Training" (the model is registered as
    "XGBoostClassifier").

    Args:
        file_path (str): The path to the CSV file containing the dataset. Its first
            column is used as the index, and it must provide 'Date', 'FTR' and
            every feature column listed in this function.
        random_state (int, optional): The seed used by the random number generator. Defaults to 42.
        split_date (str, optional): Rows dated strictly before this value form the
            training set; rows on or after it form the test set. Defaults to '2024-01-01'.

    Returns:
        tuple: ``(best_xgb_model, train_accuracy, test_accuracy, train_f1, test_f1)``
        where the model is the grid search's best estimator fitted on the whole
        training set and the F1 scores are weighted averages.

    Raises:
        ValueError: If the date split leaves the training or the test set empty.
    """
    feature_columns = ['Home', 'Away', 'Attendance', 'B365H', 'B365D', 'B365A',
                       'AttackStrengthHome', 'AttackStrengthAway', 'DefenseWeaknessHome',
                       'DefenseWeaknessAway', 'AvgHomePoints', 'AvgAwayPoints',
                       'AvgLosingHomePoints', 'AvgLosingAwayPoints', 'AvgGoalDiffHome',
                       'AvgGoalDiffAway', 'HomeWinsRatio', 'HomeDrawsRatio', 'AwayWinsRatio',
                       'AwayDrawsRatio', 'AvgHomeCornersLast5', 'AvgAwayCornersLast5',
                       'AvgHomeShotsLast5', 'AvgHomeShotsOnTargetLast5', 'AvgAwayShotsLast5',
                       'AvgAwayShotsOnTargetLast5', 'elo', 'elo_away', 'FormHomeTeam',
                       'FormAwayTeam', 'ProbabilityDraw', 'ProbabilityHomeWin',
                       'ProbabilityAwayWin']

    # Load and prepare the data
    pred_df = pd.read_csv(file_path, index_col=0)

    for column in ('Home', 'Away', 'FTR'):
        pred_df[column] = pred_df[column].astype('category')

    # Ensure 'Date' is in datetime format
    pred_df['Date'] = pd.to_datetime(pred_df['Date'])

    # TimeSeriesSplit builds its folds purely from row position, so the rows must
    # be in chronological order or later matches end up in earlier training folds.
    # mergesort is stable: matches played on the same day keep their file order.
    pred_df = pred_df.sort_values('Date', kind='mergesort')

    # Split data into train and test sets based on date
    train = pred_df[pred_df['Date'] < split_date]
    test = pred_df[pred_df['Date'] >= split_date]
    if train.empty or test.empty:
        raise ValueError(
            f"Splitting at {split_date} left an empty set "
            f"(train rows: {len(train)}, test rows: {len(test)}); both must be non-empty."
        )

    # Define predictors and target for train and test sets
    X_train = train[feature_columns]
    X_test = test[feature_columns]
    y_train = train['FTR']
    y_test = test['FTR']

    # Encode the target labels (the encoder is fitted on the training labels only)
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Standardize the data (statistics come from the training set only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the parameter grid for XGBoost
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Initialize XGBoost Classifier
    xgb_model = xgb.XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='mlogloss')

    # Grid search with TimeSeriesSplit on the training set only. refit=True makes
    # GridSearchCV retrain the winning configuration on the entire training set,
    # so best_estimator_ needs no further fit() call afterwards.
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tscv,
                               n_jobs=-1, scoring='accuracy', refit=True)
    grid_search.fit(X_train_scaled, y_train_encoded)

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    # Best model, already fitted on the entire training set by the grid search
    best_xgb_model = grid_search.best_estimator_

    # Make predictions on training and test sets
    y_train_pred_encoded = best_xgb_model.predict(X_train_scaled)
    y_test_pred_encoded = best_xgb_model.predict(X_test_scaled)

    # Decode the predictions back to original labels
    y_train_pred = label_encoder.inverse_transform(y_train_pred_encoded)
    y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    # Print the metrics
    print(f"Training Accuracy with Best XGBoost: {train_accuracy:.4f}")
    print(f"Test Accuracy with Best XGBoost: {test_accuracy:.4f}")
    print(f"Training F1 Score with Best XGBoost: {train_f1:.4f}")
    print(f"Test F1 Score with Best XGBoost: {test_f1:.4f}")

    # Print the confusion matrix
    print("Confusion Matrix - Training Set with Best XGBoost")
    print(confusion_matrix(y_train, y_train_pred))

    print("Confusion Matrix - Test Set with Best XGBoost")
    print(confusion_matrix(y_test, y_test_pred))

    # Print classification report
    print("Classification Report - Training Set with Best XGBoost")
    print(classification_report(y_train, y_train_pred))

    print("Classification Report - Test Set with Best XGBoost")
    print(classification_report(y_test, y_test_pred))

    # Runs go to whatever tracking URI MLflow is currently configured with.
    # A local tracking server can be started with:
    # mlflow server --host 127.0.0.1 --port 8080

    # Set experiment name
    mlflow.set_experiment("XGBoost Model Training")

    # Start an MLflow run
    with mlflow.start_run():

        # Record the selected hyperparameters so the metrics below can be traced
        # back to the configuration that produced them
        mlflow.log_params(best_params)

        # Log the accuracy and F1 score metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("train_f1", train_f1)
        mlflow.log_metric("test_f1", test_f1)

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", "XGBoost model training with accuracy and F1 score metrics.")

        # Log the model
        # NOTE(review): the model was fitted on standardized features and
        # integer-encoded labels, but neither the scaler nor the label encoder is
        # stored with it — anyone loading the registered model must reproduce both.
        mlflow.xgboost.log_model(
            xgb_model=best_xgb_model,
            artifact_path="xgboost_model",
            registered_model_name="XGBoostClassifier"
        )

    return best_xgb_model, train_accuracy, test_accuracy, train_f1, test_f1



In [5]:
# Path is relative to the notebook's working directory instead of one user's
# home directory, so the notebook can run from any checkout of the project.
# NOTE(review): assumes the kernel starts in the notebook's own folder, a sibling
# of `data/` — confirm against the project layout.
DATA_PATH = "../data/df_preprocessed.csv"
best_model, train_acc, test_acc, train_f1, test_f1 = train_xgboost(DATA_PATH)

Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Training Accuracy with Best XGBoost: 0.6482
Test Accuracy with Best XGBoost: 0.5437
Training F1 Score with Best XGBoost: 0.6294
Test F1 Score with Best XGBoost: 0.4973
Confusion Matrix - Training Set with Best XGBoost
[[284  36 151]
 [ 85 167 219]
 [ 77  30 651]]
Confusion Matrix - Test Set with Best XGBoost
[[24  5 20]
 [ 8  3 27]
 [ 8  5 60]]
Classification Report - Training Set with Best XGBoost
              precision    recall  f1-score   support

           A       0.64      0.60      0.62       471
           D       0.72      0.35      0.47       471
           H       0.64      0.86      0.73       758

    accuracy                           0.65      1700
   macro avg       0.66      0.61      0.61      1700
weighted avg       0.66      0.65      0.63      1700

Classification Report - Test Set with Best XGBoost
              precision    recall  f1-score   s

Registered model 'XGBoostClassifier' already exists. Creating a new version of this model...
2024/06/18 12:28:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostClassifier, version 2
Created version '2' of model 'XGBoostClassifier'.
