In [2]:
# MODULE IMPORTS
import requests
import os
import pandas as pd
import datetime
import pickle
import matplotlib.pyplot as plt
import numpy as np
import boto3
import io
import random
import json
import time
import joblib


from dotenv import load_dotenv

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# MODEL INIT

In [3]:
MODEL_CONFIG = dict(
    NAME = "premier_league",
    VERSION = "v1.0",
    TYPE = "win_outcome"
)
load_dotenv("../boto3_cloudflare.env")

True

# LOADING THE DATA

In [5]:
# TRAINING DATASET
X = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-train-set.csv").drop_duplicates().dropna()
Y = X.pop("target")
print(f"""TRAINING SHAPE: {X.shape}, {X.columns}""")

# TESTING/EVALUATION DATASET
X_test = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-test-set.csv").drop_duplicates().dropna()
Y_test = X_test.pop("target")
print(f"""TESTING SHAPE: {X_test.shape}""")

TRAINING SHAPE: (1689, 11), Index(['day', 'month', 'year', 'weekday', 'hour', 'minute', 'matchday',
       'homeId', 'awayId', 'scoreHomeHt', 'scoreAwayHt'],
      dtype='object')
TESTING SHAPE: (423, 11)


# UTILITY FUNCTIONS

In [91]:
def save_model_as_sklearn(selected_model, best_transformer = None):
    model_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}.joblib"
    transformer_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}-transformer.joblib"
    model_folder = f"../models/{model_name}"
    transformer_folder = f"../models/{transformer_name}"
    
    """Save te best selected model and the transformer used in training."""
    joblib.dump(selected_model, model_folder)
    joblib.dump(best_transformer, transformer_folder)

    if MODELS_REGISTRY["models"].get(MODEL_CONFIG["NAME"]) == None:
        MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]] = dict()
    
    print(MODELS_REGISTRY)
    MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]][MODEL_CONFIG["TYPE"]] = dict(
        model = model_name,
        transformer = transformer_name,
        params = selected_model.get_params()
    )

    print(MODELS_REGISTRY)
    return save_model_registry()


def load_model_registry():
  with open("../models/models-registry.json", "r") as registry:
    model_registry_data = "".join(registry.readlines())
    return json.loads(model_registry_data)
MODELS_REGISTRY = load_model_registry()


def save_model_registry(model_params = None):
    with open("../models/models-registry.json", "wb") as registry:
      file_contents = json.dumps(MODELS_REGISTRY, indent=4)
      registry.write(file_contents.encode())
      print("Updated registry:", MODELS_REGISTRY)


def betslip_win_rate(X_test, y_test, model, odds_range, bet_amount = 10, matches_per_bet = 5):
    """
    Calculate win and loss rate, and estimate winnings.

    Parameters:
    X_test (ndarray): Test features.
    y_test (ndarray): Actual outcomes for the test set.
    model (tf.keras.Model / sklearn Model): Trained prediction model.
    odds (list): List of odds for each match.
    bet_amount (float): Amount of money placed on each bet. Default is 10.

    Returns:
    tuple: Average win rate, average loss rate, number of bets, total winnings.
    """
    # Predict outcomes
    y_preds = (model.predict(X_test).reshape(-1) > 0.5).astype(int)
    odds = [round(random.random() * odds_range[-1], 2) for _ in range(0, len(y_preds))]
    
    # Ensure the lengths of the predictions and actual values are the same
    if len(y_preds) != len(y_test) or len(y_preds) != len(odds):
        raise ValueError("Predictions, actual values, and odds must have the same length")

    # Initialize counters for win, loss rates and bets
    win_rate = 0
    loss_rate = 0
    bets_count = 0
    total_winnings = 0
    total_odds = 0
    
    # Loop over the predictions in chunks of matches_per_bet
    for i in range(0, len(y_preds), matches_per_bet):
        print(f"Matches: {i}-{i+matches_per_bet}")
        win_count = 0
        loss_count = 0
        bet_winnings = 0
        
        # Ensure there are enough matches left for a full bet
        if i + matches_per_bet > len(y_preds):
            break

        # Calculate wins and losses within the current bet and compute winnings
        for j in range(i, i + matches_per_bet):
            if y_preds[j] == y_test[j]:
                win_count += 1
                bet_winnings += odds[j] * bet_amount  # Calculate winnings for the match
                total_odds += odds[j]
                print(f"W: R{round(bet_winnings, 2)}, (win count:  {win_count}, odds: {odds[j]}, bet amount: {bet_amount})")
            else:
                loss_count += 1
                bet_winnings -= bet_amount  # Subtract the bet amount for a loss
                print(f"L: R{round(bet_winnings, 2)}, (loss count:  {loss_count}, odds: {odds[j]}, bet amount: {bet_amount}))")

        # Update overall win and loss rates and bets count
        if win_count + loss_count > 0:  # Avoid division by zero
            win_rate += win_count / (win_count + loss_count)
            loss_rate += loss_count / (win_count + loss_count)
        bets_count += 1

        total_winnings += bet_winnings - bet_amount

        print(f"Balance: R{total_winnings}")
        print("__________")
        print("")

    # Calculate average win and loss rates
    if bets_count > 0:  # Avoid division by zero
        avg_win_rate = win_rate / bets_count
        avg_loss_rate = loss_rate / bets_count
    else:
        avg_win_rate = 0
        avg_loss_rate = 0

    # Print results
    print(f"Win rate: {avg_win_rate:.2f}%")
    print(f"Loss rate: {avg_loss_rate:.2f}%")
    print(f"Bets count: {bets_count} bets")
    print(f"Bets cost: R{round(bets_count * bet_amount, 2)}")
    print(f"Total odds: x{round(total_odds, 2)}")
    print(f"Total winnings: R{total_winnings:.2f}")

    return avg_win_rate, avg_loss_rate, bets_count, total_winnings


def perfomance_metric(X_test, Y_test, model):
  Y_preds = (model.predict(X_test).reshape(-1) > 0.5).astype(int)
  combined_actual_to_preds = pd.DataFrame(dict(actual=Y_test, prediction=Y_preds))
  print(classification_report(Y_test, Y_preds))
  return pd.crosstab(index=combined_actual_to_preds["actual"], columns=combined_actual_to_preds["prediction"])

# RANDOM OVERSAMPLING

In [7]:
# Randomly oversample the data to equalize the buy/sell count
ros = RandomOverSampler()
X_sampled, Y_sampled = ros.fit_resample(X.values, Y.values)
print("Random Over Sampler: ", X_sampled.shape)

Random Over Sampler:  (1964, 11)


# DATA NORMALIZATION AND STANDARDIZATION

In [8]:
# MIN_MAX SCALER
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_sampled)
X_scaled

array([[0.8       , 0.90909091, 0.66666667, ..., 0.26641998, 0.2       ,
        0.25      ],
       [0.3       , 1.        , 0.33333333, ..., 0.01017576, 0.        ,
        0.25      ],
       [0.33333333, 0.90909091, 0.66666667, ..., 0.2506938 , 0.2       ,
        0.        ],
       ...,
       [0.13333333, 0.        , 0.66666667, ..., 0.00740056, 0.        ,
        0.        ],
       [0.36666667, 0.63636364, 0.66666667, ..., 0.01110083, 0.        ,
        0.25      ],
       [0.23333333, 0.36363636, 0.66666667, ..., 0.01295097, 0.        ,
        0.        ]])

In [None]:
# STANDARD SCALER
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sampled)
X_scaled

In [79]:
# ROBUST SCALER
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_sampled)
X_scaled

array([[ 0.625     ,  0.42857143,  0.        , ...,  0.0221519 ,
         1.        ,  1.        ],
       [-0.3125    ,  0.57142857, -1.        , ..., -0.85443038,
         0.        ,  1.        ],
       [-0.25      ,  0.42857143,  0.        , ..., -0.03164557,
         1.        ,  0.        ],
       ...,
       [-0.875     ,  0.42857143, -1.        , ...,  0.01582278,
         0.        ,  1.        ],
       [-0.625     ,  0.42857143, -1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.625     ,  0.42857143, -1.        , ..., -0.85443038,
         0.        ,  1.        ]])

In [None]:
# MAX ABS SCALER
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_sampled)
X_scaled

# GRID SEARCH MODEL TRAINING

In [9]:
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

param_grids = {
    'LogisticRegression': {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'SVC': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
}

best_models = {}
best_params = {}
for model_name in models:
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=models[model_name], param_grid=param_grids[model_name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_scaled, Y_sampled)
    best_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

Running GridSearchCV for LogisticRegression...


Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation accuracy for LogisticRegression: 0.7566183725398556
Running GridSearchCV for RandomForestClassifier...
Best parameters for RandomForestClassifier: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation accuracy for RandomForestClassifier: 0.8233512488965052
Running GridSearchCV for SVC...
Best parameters for SVC: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best cross-validation accuracy for SVC: 0.774462533104845
Running GridSearchCV for KNeighborsClassifier...
Best parameters for KNeighborsClassifier: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best cross-validation accuracy for KNeighborsClassifier: 0.7693929480189022
Running GridSearchCV for DecisionTreeClassifier...
Best parameters for DecisionTreeClassifier: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2}
Best cross-validation accuracy for DecisionTreeClassifier

In [48]:
# SAVE THE BEST SELECTED MODEL:
save_model_as_sklearn(selected_model=best_model)

TypeError: 'str' object does not support item assignment

# MODEL EVALUATIONS

In [None]:
best_model = None
best_model_params = None
precision_benchmark = 0
for model_name, model in best_models.items():
    y_pred = model.predict(scaler.transform(X_test.values))
    precision = precision_score(Y_test, y_pred)
    if precision > precision_benchmark:
        precision_benchmark = precision
        best_model = model
        best_model_params = best_params[model_name]
        
    print(f"Test precision_score for {model_name}: {precision:.2f}")

print()
print("--------------------")
print()

# BETSLIP EVALUATIONS
X_test_scaled = scaler.transform(X_test.values)
betslip_win_rate(X_scaled, Y_sampled, odds_range=[1, 3], model=best_model, bet_amount=10, matches_per_bet=1)
perfomance_metric(X_scaled, Y_sampled, model=best_model)

In [None]:
save_model_as_sklearn(selected_model=best_model, best_transformer=scaler)