In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import mlflow
import mlflow.sklearn

In [2]:
current_uri = mlflow.get_tracking_uri()
print(f"Current MLflow Tracking URI: {current_uri}")

Current MLflow Tracking URI: file:///C:/Users/Christian/Documents/Cand.%20merc/Data-Science-Project/data_science_project/Beat%20the%20bookies/mlflow/mlruns


In [4]:
# Set the URI to your desired MLflow tracking server with host and port
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [5]:
mlflow server --host 127.0.0.1 --port 8080

SyntaxError: invalid syntax (436093207.py, line 1)

In [6]:
def train_random_forest(file_path, random_state=42):
    """
    Train a Random Forest classifier on the given dataset.

    Args:
        file_path (str): The path to the CSV file containing the dataset.
        test_size (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.2.
        random_state (int, optional): The seed used by the random number generator. Defaults to 42.
    """
    # Load and prepare the data
    pred_df = pd.read_csv(file_path, index_col=0)

    pred_df['Home'] = pred_df['Home'].astype('category')
    pred_df['Away'] = pred_df['Away'].astype('category')
    pred_df['FTR'] = pred_df['FTR'].astype('category')

    # Ensure 'Date' is in datetime format
    pred_df['Date'] = pd.to_datetime(pred_df['Date'])

    # Split data into train and test sets based on date
    train = pred_df[pred_df['Date'] < '2024-01-01']
    test = pred_df[pred_df['Date'] >= '2024-01-01']

    # Define predictors and target for train and test sets
    X_train = train[['Home', 'Away', 'Attendance', 'B365H', 'B365D', 'B365A',
                     'AttackStrengthHome', 'AttackStrengthAway', 'DefenseWeaknessHome',
                     'DefenseWeaknessAway', 'AvgHomePoints', 'AvgAwayPoints',
                     'AvgLosingHomePoints', 'AvgLosingAwayPoints', 'AvgGoalDiffHome',
                     'AvgGoalDiffAway', 'HomeWinsRatio', 'HomeDrawsRatio', 'AwayWinsRatio',
                     'AwayDrawsRatio', 'AvgHomeCornersLast5', 'AvgAwayCornersLast5',
                     'AvgHomeShotsLast5', 'AvgHomeShotsOnTargetLast5', 'AvgAwayShotsLast5',
                     'AvgAwayShotsOnTargetLast5', 'elo', 'elo_away', 'FormHomeTeam',
                     'FormAwayTeam', 'ProbabilityDraw', 'ProbabilityHomeWin',
                     'ProbabilityAwayWin']]
    y_train = train['FTR']
    X_test = test[X_train.columns]
    y_test = test['FTR']

    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the parameter grid with stricter regularization
    param_grid = {
        'n_estimators': [50, 500],  # Use fewer estimators to reduce complexity
        'max_depth': [5, 10, 15],
        'max_features': ['sqrt', 'log2']  # Limit the number of features considered for splitting at each node
    }

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Initialize RandomForestClassifier
    rf_model = RandomForestClassifier(random_state=random_state)

    # Initialize GridSearchCV with TimeSeriesSplit on training set only
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=tscv, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    # Get the best model
    best_rf_model = grid_search.best_estimator_

    # Train the best model on the entire training set
    best_rf_model.fit(X_train_scaled, y_train)

    # Make predictions on training set
    y_train_pred = best_rf_model.predict(X_train_scaled)

    # Make predictions on test set
    y_test_pred = best_rf_model.predict(X_test_scaled)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    # Print the metrics
    print(f"Training Accuracy with Best Random Forest: {train_accuracy:.4f}")
    print(f"Test Accuracy with Best Random Forest: {test_accuracy:.4f}")
    print(f"Training F1 Score with Best Random Forest: {train_f1:.4f}")
    print(f"Test F1 Score with Best Random Forest: {test_f1:.4f}")

    # Print the confusion matrix
    print("Confusion Matrix - Training Set with Best Random Forest")
    print(confusion_matrix(y_train, y_train_pred))

    print("Confusion Matrix - Test Set with Best Random Forest")
    print(confusion_matrix(y_test, y_test_pred))

    # Print classification report
    print("Classification Report - Training Set with Best Random Forest")
    print(classification_report(y_train, y_train_pred))

    print("Classification Report - Test Set with Best Random Forest")
    print(classification_report(y_test, y_test_pred))

    # Set our tracking server uri for logging
    #mlflow server --host 127.0.0.1 --port 8080
    
    # Set experiment name
    mlflow.set_experiment("Random Forest Model Training")

    # Start an MLflow run
    with mlflow.start_run():

        # Log the accuracy and F1 score metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("train_f1", train_f1)
        mlflow.log_metric("test_f1", test_f1)

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", "Random Forest model training with accuracy and F1 score metrics.")

        # Log the model
        mlflow.sklearn.log_model(
            sk_model=best_rf_model,
            artifact_path="random_forest_model",
            registered_model_name="RandomForestClassifier"
        )

    return best_rf_model, train_accuracy, test_accuracy, train_f1, test_f1

In [7]:
best_model, train_acc, test_acc, train_f1, test_f1 = train_random_forest("C:/Users/Christian/Documents/Cand. merc/Data-Science-Project/data_science_project/Beat the bookies/data/df_preprocessed.csv")

Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 500}


2024/06/15 11:31:38 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest Model Training' does not exist. Creating a new experiment.


Training Accuracy with Best Random Forest: 0.6100
Test Accuracy with Best Random Forest: 0.5375
Training F1 Score with Best Random Forest: 0.5739
Test F1 Score with Best Random Forest: 0.4956
Confusion Matrix - Training Set with Best Random Forest
[[281  18 172]
 [106 103 262]
 [ 89  16 653]]
Confusion Matrix - Test Set with Best Random Forest
[[24  8 17]
 [ 8  3 27]
 [ 9  5 59]]
Classification Report - Training Set with Best Random Forest
              precision    recall  f1-score   support

           A       0.59      0.60      0.59       471
           D       0.75      0.22      0.34       471
           H       0.60      0.86      0.71       758

    accuracy                           0.61      1700
   macro avg       0.65      0.56      0.55      1700
weighted avg       0.64      0.61      0.57      1700

Classification Report - Test Set with Best Random Forest
              precision    recall  f1-score   support

           A       0.59      0.49      0.53        49
         

Successfully registered model 'RandomForestClassifier'.
2024/06/15 11:31:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 1
Created version '1' of model 'RandomForestClassifier'.


In [15]:
train_random_forest.py

AttributeError: 'function' object has no attribute 'py'