In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import mlflow
import mlflow.keras

2024-06-15 17:01:44.137333: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
current_uri = mlflow.get_tracking_uri()
print(f"Current MLflow Tracking URI: {current_uri}")

Current MLflow Tracking URI: http://127.0.0.1:8080


In [3]:
# Set the URI to your desired MLflow tracking server with host and port
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [4]:
def train_deep_learning(file_path, random_state=42):
    """
    Train a Deep Learning model on the given dataset.

    Args:
        file_path (str): The path to the CSV file containing the dataset.
        random_state (int, optional): The seed used by the random number generator. Defaults to 42.
    """
    # Set random seed for reproducibility
    np.random.seed(random_state)
    import tensorflow as tf
    tf.random.set_seed(random_state)

    # Load and prepare the data
    pred_df = pd.read_csv(file_path, index_col=0)

    pred_df['Home'] = pred_df['Home'].astype('category')
    pred_df['Away'] = pred_df['Away'].astype('category')
    pred_df['FTR'] = pred_df['FTR'].astype('category')

    # Ensure 'Date' is in datetime format
    pred_df['Date'] = pd.to_datetime(pred_df['Date'])

    # Split data into train and test sets based on date
    train = pred_df[pred_df['Date'] < '2024-01-01']
    test = pred_df[pred_df['Date'] >= '2024-01-01']

    # Define predictors and target for train and test sets
    X_train = train[['Home', 'Away', 'Attendance', 'B365H', 'B365D', 'B365A',
                     'AttackStrengthHome', 'AttackStrengthAway', 'DefenseWeaknessHome',
                     'DefenseWeaknessAway', 'AvgHomePoints', 'AvgAwayPoints',
                     'AvgLosingHomePoints', 'AvgLosingAwayPoints', 'AvgGoalDiffHome',
                     'AvgGoalDiffAway', 'HomeWinsRatio', 'HomeDrawsRatio', 'AwayWinsRatio',
                     'AwayDrawsRatio', 'AvgHomeCornersLast5', 'AvgAwayCornersLast5',
                     'AvgHomeShotsLast5', 'AvgHomeShotsOnTargetLast5', 'AvgAwayShotsLast5',
                     'AvgAwayShotsOnTargetLast5', 'elo', 'elo_away', 'FormHomeTeam',
                     'FormAwayTeam', 'ProbabilityDraw', 'ProbabilityHomeWin',
                     'ProbabilityAwayWin']]
    y_train = train['FTR']
    X_test = test[X_train.columns]
    y_test = test['FTR']

    # Encode the target labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Convert to one-hot encoding
    y_train_one_hot = to_categorical(y_train_encoded)
    y_test_one_hot = to_categorical(y_test_encoded)

    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the neural network model
    def create_model(input_dim):
        model = Sequential()
        model.add(Dense(64, activation='relu', input_dim=input_dim))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(3, activation='softmax'))  # 3 classes: 'A', 'D', 'H'
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    # Initialize the model
    model = create_model(X_train_scaled.shape[1])

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Train the model using cross-validation
    for train_index, val_index in tscv.split(X_train_scaled):
        X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
        y_train_fold, y_val_fold = y_train_one_hot[train_index], y_train_one_hot[val_index]

        model.fit(X_train_fold, y_train_fold, epochs=50, batch_size=32, validation_data=(X_val_fold, y_val_fold), verbose=2)

    # Evaluate the model on the training set
    train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_one_hot, verbose=0)
    train_pred_prob = model.predict(X_train_scaled)
    train_pred = np.argmax(train_pred_prob, axis=1)
    train_f1 = f1_score(y_train_encoded, train_pred, average='weighted')

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_one_hot, verbose=0)
    test_pred_prob = model.predict(X_test_scaled)
    test_pred = np.argmax(test_pred_prob, axis=1)
    test_f1 = f1_score(y_test_encoded, test_pred, average='weighted')

    # Print the metrics
    print(f"Training Accuracy with Best Neural Network: {train_accuracy:.4f}")
    print(f"Test Accuracy with Best Neural Network: {test_accuracy:.4f}")
    print(f"Training F1 Score with Best Neural Network: {train_f1:.4f}")
    print(f"Test F1 Score with Best Neural Network: {test_f1:.4f}")

    # Print the confusion matrix
    print("Confusion Matrix - Training Set with Best Neural Network")
    print(confusion_matrix(y_train_encoded, train_pred))

    print("Confusion Matrix - Test Set with Best Neural Network")
    print(confusion_matrix(y_test_encoded, test_pred))

    # Print classification report
    print("Classification Report - Training Set with Best Neural Network")
    print(classification_report(y_train_encoded, train_pred))

    print("Classification Report - Test Set with Best Neural Network")
    print(classification_report(y_test_encoded, test_pred))

    # Set our tracking server uri for logging
    # mlflow server --host 127.0.0.1 --port 8080

    # Set experiment name
    mlflow.set_experiment("Deep Learning Model Training")

    # Start an MLflow run
    with mlflow.start_run():

        # Log the accuracy and F1 score metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("train_f1", train_f1)
        mlflow.log_metric("test_f1", test_f1)

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", "Deep Learning model training with accuracy and F1 score metrics.")

        # Log the model
        mlflow.keras.log_model(
            model,
            artifact_path="deep_learning_model",
            registered_model_name="DeepLearningClassifier"
        )

    return model, train_accuracy, test_accuracy, train_f1, test_f1


In [6]:
best_model, train_acc, test_acc, train_f1, test_f1 = train_deep_learning("/Users/christianhellum/Cand. Merc./Data-Science-Project/data_science_project/Beat the bookies/data/df_preprocessed.csv")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


9/9 - 1s - 137ms/step - accuracy: 0.4070 - loss: 1.4031 - val_accuracy: 0.3852 - val_loss: 1.1699
Epoch 2/50
9/9 - 0s - 11ms/step - accuracy: 0.3579 - loss: 1.2534 - val_accuracy: 0.4134 - val_loss: 1.1279
Epoch 3/50
9/9 - 0s - 10ms/step - accuracy: 0.3474 - loss: 1.2364 - val_accuracy: 0.4311 - val_loss: 1.1036
Epoch 4/50
9/9 - 0s - 10ms/step - accuracy: 0.4561 - loss: 1.1125 - val_accuracy: 0.4346 - val_loss: 1.0883
Epoch 5/50
9/9 - 0s - 10ms/step - accuracy: 0.4211 - loss: 1.1569 - val_accuracy: 0.4311 - val_loss: 1.0805
Epoch 6/50
9/9 - 0s - 7ms/step - accuracy: 0.4281 - loss: 1.0812 - val_accuracy: 0.4170 - val_loss: 1.0761
Epoch 7/50
9/9 - 0s - 7ms/step - accuracy: 0.4632 - loss: 1.0843 - val_accuracy: 0.4276 - val_loss: 1.0730
Epoch 8/50
9/9 - 0s - 6ms/step - accuracy: 0.4912 - loss: 1.0414 - val_accuracy: 0.4488 - val_loss: 1.0694
Epoch 9/50
9/9 - 0s - 7ms/step - accuracy: 0.4667 - loss: 1.0992 - val_accuracy: 0.4452 - val_loss: 1.0678
Epoch 10/50
9/9 - 0s - 7ms/step - accuracy

2024/06/15 17:03:17 INFO mlflow.tracking.fluent: Experiment with name 'Deep Learning Model Training' does not exist. Creating a new experiment.
Successfully registered model 'DeepLearningClassifier'.
2024/06/15 17:03:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DeepLearningClassifier, version 1
Created version '1' of model 'DeepLearningClassifier'.
