In [1]:
# MODULE IMPORTS
import pandas as pd
import random
import json
import joblib


from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score

# MODEL INIT

In [2]:
MODEL_CONFIG = dict(
    NAME = "PREMIER_LEAGUE",
    VERSION = "v1.0",
    TYPE = "win_outcome"
)
load_dotenv("../boto3_cloudflare.env")

True

# LOADING THE DATA

In [8]:
# TRAINING DATASET
X = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-train-set.csv").drop_duplicates().dropna()
Y = X.pop("target")
print(f"""TRAINING SHAPE: {X.shape}, {X.columns}""")

# TESTING/EVALUATION DATASET
X_test = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-test-set.csv").drop_duplicates().dropna()
Y_test = X_test.pop("target")
print(f"""TESTING SHAPE: {X_test.shape}""")

TRAINING SHAPE: (1689, 11), Index(['day', 'month', 'year', 'weekday', 'hour', 'minute', 'matchday',
       'homeId', 'awayId', 'scoreHomeHt', 'scoreAwayHt'],
      dtype='object')
TESTING SHAPE: (423, 11)


# UTILITY FUNCTIONS

In [137]:
def save_model_as_sklearn(selected_model, best_transformer = None):
    model_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}.joblib"
    transformer_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}-transformer.joblib"
    model_folder = f"../models/{model_name}"
    transformer_folder = f"../models/{transformer_name}"
    
    """Save te best selected model and the transformer used in training."""
    joblib.dump(selected_model, model_folder)
    joblib.dump(best_transformer, transformer_folder)

    if MODELS_REGISTRY["models"].get(MODEL_CONFIG["NAME"]) == None:
        MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]] = dict()
    
    print(MODELS_REGISTRY)
    MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]][MODEL_CONFIG["TYPE"]] = dict(
        model = model_name,
        transformer = transformer_name,
        params = selected_model.get_params()
    )

    print(MODELS_REGISTRY)
    return save_model_registry()


def load_model_registry():
  with open("../models/models-registry.json", "r") as registry:
    model_registry_data = "".join(registry.readlines())
    return json.loads(model_registry_data)
MODELS_REGISTRY = load_model_registry()


def save_model_registry(model_params = None):
    with open("../models/models-registry.json", "wb") as registry:
      file_contents = json.dumps(MODELS_REGISTRY, indent=4)
      registry.write(file_contents.encode())
      print("Updated registry:", MODELS_REGISTRY)


def betslip_win_rate(x_test, y_test, model, odds, bet_amount = 10, matches_per_bet = 5, broker_balance = 50, available_topup = 100):
    """
    Calculate win and loss rate, and estimate winnings.

    Parameters:
    X_test (ndarray): Test features.
    y_test (ndarray): Actual outcomes for the test set.
    model (tf.keras.Model / sklearn Model): Trained prediction model.
    odds (list): List of odds for each match.
    bet_amount (float): Amount of money placed on each bet. Default is 10.

    Returns:
    tuple: Average win rate, average loss rate, number of bets, total winnings.
    """
    # Predict outcomes
    y_preds = (model.predict(x_test).reshape(-1) > 0.5).astype(int)
    
    # Ensure the lengths of the predictions and actual values are the same
    if len(y_preds) != len(y_test) or len(y_preds) != len(odds):
        print(len(y_preds), len(odds), len(y_test))
        raise ValueError("Predictions, actual values, and odds must have the same length")

    # Initialize counters for win, loss rates and bets
    win_rate = 0
    loss_rate = 0
    bets_count = 0
    total_odds = 0
    
    # Loop over the predictions in chunks of matches_per_bet
    for i in range(0, len(y_preds), matches_per_bet):
        win_count = 0
        loss_count = 0
        has_loss = False
        bet_odds = 0
        
        # Ensure there are enough matches left for a full bet
        if i + matches_per_bet > len(y_preds):
            break

        if broker_balance < bet_amount:
            # Check if topup is available
            if available_topup > 0:
                broker_balance += available_topup * 0.5
                available_topup -= available_topup * 0.5
            else:
                print(f"Balance ran out: {broker_balance}")
                break

        # Calculate wins and losses within the current bet and compute winnings
        for j in range(i, i + matches_per_bet):
            broker_balance -= bet_amount

            if y_preds[j] == y_test[j]:
                win_count += 1
                bet_odds += odds[j]

            else:
                loss_count += 1
                bet_odds = 1
                has_loss = True
            
            if has_loss:
                break

        # Update overall win and loss rates and bets count
        if win_count + loss_count > 0:  # Avoid division by zero
            win_rate += win_count / (win_count + loss_count)
            loss_rate += loss_count / (win_count + loss_count)

        if has_loss == False:
            bet_returns = round((bet_odds - 1) * bet_amount, 2)
            total_odds += bet_odds
        else:
            bet_returns = -bet_amount


        broker_balance += (bet_returns + bet_amount)
        bets_count += 1

    # Calculate average win and loss rates
    if bets_count > 0:  # Avoid division by zero
        avg_win_rate = win_rate / bets_count
        avg_loss_rate = loss_rate / bets_count
    else:
        avg_win_rate = 0
        avg_loss_rate = 0

    # Print results
    print(f"win rate    : {avg_win_rate:.2f}")
    print(f"loss rate   : {avg_loss_rate:.2f}")
    print(f"bets count  : {bets_count} bets")
    print(f"bet odds    : {round(total_odds, 2)}")
    print(f"balance     : R{broker_balance:.2f}")

    return avg_win_rate, avg_loss_rate, total_odds, broker_balance


def perfomance_metric(X_test, Y_test, model):
  Y_preds = (model.predict(X_test).reshape(-1) > 0.5).astype(int)
  combined_actual_to_preds = pd.DataFrame(dict(actual=Y_test, prediction=Y_preds))
  print(classification_report(Y_test, Y_preds))
  return pd.crosstab(index=combined_actual_to_preds["actual"], columns=combined_actual_to_preds["prediction"])

# RANDOM OVERSAMPLING

In [12]:
# Randomly oversample the data to equalize the buy/sell count
ros = RandomOverSampler()
X_resampled, Y_resampled = ros.fit_resample(X.values, Y.values)
print("Random Over Sampler: ", X_resampled.shape)

Random Over Sampler:  (1964, 11)


# DATA NORMALIZATION AND STANDARDIZATION

In [13]:
# MIN_MAX SCALER
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

array([[0.8       , 0.90909091, 0.66666667, ..., 0.26641998, 0.2       ,
        0.25      ],
       [0.3       , 1.        , 0.33333333, ..., 0.01017576, 0.        ,
        0.25      ],
       [0.33333333, 0.90909091, 0.66666667, ..., 0.2506938 , 0.2       ,
        0.        ],
       ...,
       [0.43333333, 0.09090909, 0.66666667, ..., 0.60869565, 0.        ,
        0.        ],
       [0.83333333, 1.        , 0.66666667, ..., 0.01295097, 0.        ,
        0.        ],
       [0.46666667, 0.27272727, 0.66666667, ..., 0.01572618, 0.2       ,
        0.        ]])

In [None]:
# STANDARD SCALER
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

In [79]:
# ROBUST SCALER
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

array([[ 0.625     ,  0.42857143,  0.        , ...,  0.0221519 ,
         1.        ,  1.        ],
       [-0.3125    ,  0.57142857, -1.        , ..., -0.85443038,
         0.        ,  1.        ],
       [-0.25      ,  0.42857143,  0.        , ..., -0.03164557,
         1.        ,  0.        ],
       ...,
       [-0.875     ,  0.42857143, -1.        , ...,  0.01582278,
         0.        ,  1.        ],
       [-0.625     ,  0.42857143, -1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.625     ,  0.42857143, -1.        , ..., -0.85443038,
         0.        ,  1.        ]])

In [None]:
# MAX ABS SCALER
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

# GRID SEARCH MODEL TRAINING

In [14]:
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

param_grids = {
    'LogisticRegression': {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'SVC': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
}

best_models = {}
best_params = {}
for model_name in models:
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=models[model_name], param_grid=param_grids[model_name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_scaled, Y_resampled)
    best_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

Running GridSearchCV for LogisticRegression...


Best parameters for LogisticRegression: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.7500090876045074
Running GridSearchCV for RandomForestClassifier...
Best parameters for RandomForestClassifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation accuracy for RandomForestClassifier: 0.8248831593706185
Running GridSearchCV for SVC...
Best parameters for SVC: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best cross-validation accuracy for SVC: 0.7754894324141871
Running GridSearchCV for KNeighborsClassifier...
Best parameters for KNeighborsClassifier: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best cross-validation accuracy for KNeighborsClassifier: 0.7739679077737966
Running GridSearchCV for DecisionTreeClassifier...
Best parameters for DecisionTreeClassifier: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2}
Best cross-validation accuracy for DecisionTreeCla

# MODEL EVALUATIONS

In [158]:
X_test = X_test.sort_values(["year", "month", "day", "matchday", "hour", "minute"])
X_test_sampled = X_test[-100:]
Y_test_sampled = Y_test[-100:]
X_test_sampled

Unnamed: 0,day,month,year,weekday,hour,minute,matchday,homeId,awayId,scoreHomeHt,scoreAwayHt
309,8.0,10.0,2023.0,7.0,15.0,30.0,8.0,57.0,65.0,0.0,0.0
191,21.0,10.0,2023.0,6.0,14.0,0.0,9.0,402.0,328.0,1.0,0.0
93,21.0,10.0,2023.0,6.0,14.0,0.0,12.0,322.0,340.0,1.0,1.0
267,24.0,10.0,2023.0,2.0,18.0,45.0,13.0,394.0,715.0,0.0,3.0
277,24.0,10.0,2023.0,2.0,18.0,45.0,13.0,384.0,59.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
192,4.0,5.0,2024.0,6.0,11.0,30.0,46.0,385.0,715.0,2.0,1.0
356,5.0,5.0,2024.0,7.0,13.0,0.0,36.0,397.0,58.0,0.0,0.0
118,6.0,5.0,2024.0,1.0,19.0,0.0,36.0,354.0,66.0,2.0,0.0
50,12.0,5.0,2024.0,7.0,15.0,30.0,37.0,66.0,57.0,0.0,1.0


In [157]:
X_test_scaled = scaler.transform(X_test_sampled.values)
odds = [round(1 + random.random() * 2, 2) for _ in range(0, len(X_test_scaled))]

best_model = None
precision_benchmark = 0
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    precision = precision_score(Y_test_sampled, y_pred)
    if precision > precision_benchmark:
        precision_benchmark = precision
        best_model = model
        
    print(f"Test precision_score for {model_name}: {precision:.2f}")

print()
print("-------------------------------------------------------------------------------------------------")
print()

# BETSLIP EVALUATIONS
betslip_win_rate(X_test_scaled, Y_test_sampled, model=best_model, odds=odds, bet_amount=50, matches_per_bet=1, broker_balance = 66.08)

print()
print("-------------------------------------------------------------------------------------------------")
print()

perfomance_metric(X_test_scaled, Y_test_sampled, model=best_model)

Test precision_score for LogisticRegression: 0.57
Test precision_score for RandomForestClassifier: 0.64
Test precision_score for SVC: 0.61
Test precision_score for KNeighborsClassifier: 0.61
Test precision_score for DecisionTreeClassifier: 0.58

-------------------------------------------------------------------------------------------------



KeyError: 0

In [None]:
# SAVE THE BEST SELECTED MODEL:
save_model_as_sklearn(selected_model=best_model)

In [None]:
save_model_as_sklearn(selected_model=best_model, best_transformer=scaler)

array([ 8.,  9., 10., 11., 12.,  1.,  2.,  3.])