In [None]:
# MODULE IMPORTS
import pandas as pd
import numpy as np
import random
import json
import joblib
import locale
import logging
import math

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score

# MODEL INIT

In [None]:
MODEL_CONFIG = dict(
    NAME = "all_leagues",
    VERSION = "v1.0",
    TYPE = "fulltime_win_outcome"
)
load_dotenv("../boto3_cloudflare.env")

# LOADING THE DATA

In [None]:
# TRAINING DATASET
X = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}_{MODEL_CONFIG['TYPE']}-{MODEL_CONFIG['VERSION']}-train-set.csv").drop_duplicates().dropna()
X = X.sort_values(["year", "month", "day", "weekday", "matchday", "hour"])
Y = X.pop("target")
print(f"""TRAINING SHAPE: {X.shape}, {X.columns}""")

# TESTING/EVALUATION DATASET
X_eval = pd.read_csv(f"https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/relative_datasets/cleaned/{MODEL_CONFIG['NAME']}_{MODEL_CONFIG['TYPE']}-{MODEL_CONFIG['VERSION']}-test-set.csv").drop_duplicates().dropna()
X_eval = X_eval.sort_values(["year", "month", "day", "weekday", "matchday", "hour"])
Y_test = X_eval.pop("target")
print(f"""TESTING SHAPE: {X_eval.shape}""")

# UTILITY FUNCTIONS

In [None]:
def save_model_as_sklearn(selected_model, best_transformer = None):
    model_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}.joblib"
    transformer_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}-transformer.joblib"
    model_folder = f"../models/{model_name}"
    transformer_folder = f"../models/{transformer_name}"
    
    """Save te best selected model and the transformer used in training."""
    joblib.dump(selected_model, model_folder)
    joblib.dump(best_transformer, transformer_folder)

    if MODELS_REGISTRY["models"].get(MODEL_CONFIG["NAME"]) == None:
        MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]] = dict()
    
    print(MODELS_REGISTRY)
    MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]][MODEL_CONFIG["TYPE"]] = dict(
        model = model_name,
        transformer = transformer_name,
        params = selected_model.get_params()
    )

    print(MODELS_REGISTRY)
    return save_model_registry()


def load_model_registry():
  with open("../models/models-registry.json", "r") as registry:
    model_registry_data = "".join(registry.readlines())
    return json.loads(model_registry_data)
MODELS_REGISTRY = load_model_registry()


def save_model_registry(model_params = None):
    with open("../models/models-registry.json", "wb") as registry:
      file_contents = json.dumps(MODELS_REGISTRY, indent=4)
      registry.write(file_contents.encode())
      print("Updated registry:", MODELS_REGISTRY)


def format_number_with_space(number):
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    formatted_number = locale.format_string("%n", number, grouping=True).replace(',', ' ')
    return formatted_number

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def betslip_win_rate(X_eval, y_eval, model, odds, bet_amount=10, matches_per_bet=5, broker_balance=100, available_topup=100, log=False):
    """
    Calculate win and loss rate, and estimate winnings.

    Parameters:
    X_test (ndarray): Test features.
    y_test (ndarray): Actual outcomes for the test set.
    model (tf.keras.Model / sklearn Model): Trained prediction model.
    odds (list): List of odds for each match.
    bet_amount (float): Amount of money placed on each bet. Default is 10.

    Returns:
    tuple: Average win rate, average loss rate, total odds, final broker balance.
    """
    # Generate predictions
    if hasattr(model, 'predict'):
        predictions = model.predict(X_eval)
    else:
        raise ValueError("Model does not have a predict method.")
    
    # Ensure predictions are binary (0 or 1)
    predictions = (predictions > 0.5).astype(bool)
    
    # Initialize counts and earnings
    win_count = 0
    loss_count = 0
    odds_earned = 0

    # Loop over matches per bet
    for i in range(0, len(y_eval), matches_per_bet):
        # Calculate the stake as 10% of the available balance
        current_bet_amount = round(broker_balance * 0.1, 2)
        broker_balance -= current_bet_amount
        
        # If balance is too low to continue betting, break the loop
        if broker_balance < 0:
            logging.info("Balance depleted.")
            break
        
        # Calculate total odds for the current set of matches
        total_odds = math.prod(odds[i:i+matches_per_bet])
        if log:
            print("---------------------------------", i)
            print(f"{X_test.iloc[i].day}, {X_test.iloc[i].month}, {X_test.iloc[i].year}: bet amount           | R{current_bet_amount:.2f} ({total_odds})".replace(",", " "))
            print(f"{X_test.iloc[i].day}, {X_test.iloc[i].month}, {X_test.iloc[i].year}: balance              | R{broker_balance:.2f}".replace(",", " "))
        
        # Determine if the current bet is a win or loss
        betslip_comparison = predictions[i:i+matches_per_bet] == y_eval[i:i+matches_per_bet]
        if all(betslip_comparison):
            broker_balance += current_bet_amount * total_odds
            if log:
                print(f"{X_test.iloc[i].day} {X_test.iloc[i].month} {X_test.iloc[i].year}: amount earned           | R{current_bet_amount*total_odds:.2f}".replace(",", " "))
                print(f"{X_test.iloc[i].day}, {X_test.iloc[i].month}, {X_test.iloc[i].year}: balance               | R{broker_balance:.2f}".replace(",", " "))
                print("---------------------------------")


            odds_earned += total_odds
            win_count += 1
        else:
            loss_count += 1
        
        # If balance drops below zero, consider the available topup
        if broker_balance < 0 and available_topup > 0:
            topup_needed = -broker_balance
            topup_used = min(topup_needed, available_topup)
            broker_balance += topup_used
            available_topup -= topup_used
            if broker_balance < 0:
                broker_balance = 0  # Ensure the balance does not go negative


    # Calculate average win rate and loss rate
    total_bets = win_count + loss_count
    avg_win_rate = win_count / total_bets if total_bets else 0
    avg_loss_rate = loss_count / total_bets if total_bets else 0
        
    logging.info(f"win rate           | {avg_win_rate*100:.2f}%")
    logging.info(f"loss rate          | {avg_loss_rate*100:.2f}%")
    logging.info(f"odds earned        | {odds_earned:.2f}")
    logging.info(f"final balance      | R{broker_balance:,.2f}".replace(",", " "))
    logging.info(f"topup left         | R{available_topup:,.2f}".replace(",", " "))
  
    return avg_win_rate, avg_loss_rate, total_odds, broker_balance, available_topup

# Example usage:
# X_test, y_test should be numpy arrays, model should be a trained model, odds should be a list of odds
# avg_win_rate, avg_loss_rate, total_odds, final_balance = betslip_win_rate(X_test, y_test, model, odds)


def perfomance_metric(X_test, Y_test, model):
    Y_preds = (model.predict(X_test).reshape(-1) > 0.5).astype(int)
    combined_actual_to_preds = pd.DataFrame(dict(actual=Y_test, prediction=Y_preds))
    print(classification_report(Y_test, Y_preds))
    return pd.crosstab(index=combined_actual_to_preds["actual"], columns=combined_actual_to_preds["prediction"])

# RANDOM OVERSAMPLING

In [None]:
# Randomly oversample the data to equalize the buy/sell count
ros = RandomOverSampler()
X_resampled, Y_resampled = ros.fit_resample(X, Y)
print("Random Over Sampler: ", X_resampled.shape)
# X_resampled = X_resampled.sort_values(["year", "month", "day", "matchday", "hour", "minute"])
X_resampled

# DATA NORMALIZATION AND STANDARDIZATION

In [None]:
# MIN_MAX SCALER
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

In [None]:
# STANDARD SCALER
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

In [None]:
# ROBUST SCALER
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

In [None]:
# MAX ABS SCALER
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

# GRID SEARCH MODEL TRAINING

In [None]:
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    # 'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

param_grids = {
    'LogisticRegression': {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    # 'SVC': {
    #     'C': [0.1, 1, 10, 100],
    #     'gamma': [1, 0.1, 0.01, 0.001],
    #     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
    # },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
}

best_models = {}
best_params = {}
for model_name in models:
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=models[model_name], param_grid=param_grids[model_name], cv=5, n_jobs=-1, scoring='precision')
    grid_search.fit(X_scaled, Y_resampled)
    best_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

# MODEL EVALUATIONS

In [None]:
X_test_scaled = scaler.transform(X_test.values)
odds = [round(1 + random.random() * 1.5, 2) for _ in range(0, len(X_test_scaled))]

best_model = None
precision_benchmark = 0
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    precision = precision_score(Y_test, y_pred)
    if precision > precision_benchmark:
        precision_benchmark = precision
        best_model = model
    print(f"Test precision_score for {model_name}: {precision:.2f}")

In [None]:
print()
print("-------------------------------------------------------------------------------------------------")
print()

# BETSLIP EVALUATIONS
betslip_win_rate(X_test_scaled, Y_test, model=best_model, odds=odds, matches_per_bet=1, broker_balance = 100, log=True)

print()
print("-------------------------------------------------------------------------------------------------")
print()

perfomance_metric(X_test_scaled, Y_test, model=best_model)

In [None]:
# SAVE THE BEST SELECTED MODEL:
save_model_as_sklearn(selected_model=best_model)

In [129]:
save_model_as_sklearn(selected_model=best_model, best_transformer=scaler)

{'root': 'https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/models/', 'models': {'premier_league': {'win_outcome': {'model': 'premier_league-v1.0-win_outcome.joblib', 'transformer': 'premier_league-v1.0-win_outcome-transformer.joblib', 'params': {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}}}, 'all_leagues': {}}}
{'root': 'https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/models/', 'models': {'premier_league': {'win_outcome': {'model': 'premier_league-v1.0-win_outcome.joblib', 'transformer': 'premier_league-v1.0-win_outcome-transformer.joblib', 'params': {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': Non