In [21]:
# MODULE IMPORTS
import pandas as pd
import numpy as np
import random
import json
import joblib
import locale
import logging
import math
import yfinance

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score


# MODEL INIT

In [3]:
MODEL_CONFIG = dict(
    NAME = "fulltime_win_outcome",
    VERSION = "v1.0"
)
load_dotenv("../boto3_cloudflare.env")

True

# LOADING THE DATA

In [18]:
ticker = "ZAR=X"
data = yfinance.download(ticker, period="2y", interval="1h")
data = data.drop(["Close", "Volume"], axis=1).rename({"Adj Close": "Close"}, axis=1)

def relative_strength_index(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
data['RSI_14'] = relative_strength_index(data['Close'], 14)

def exponential_moving_average(data, window):
    return data.ewm(span=window, adjust=False).mean()
data['EMA_20'] = exponential_moving_average(data['Close'], 20)

def macd(data, slow=26, fast=12, signal=9):
    fast_ema = exponential_moving_average(data, fast)
    slow_ema = exponential_moving_average(data, slow)
    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=signal, adjust=False).mean()
    return macd_line, signal_line
data['MACD'], data['MACD_Signal'] = macd(data['Close'])

def bollinger_bands(data, window=20, num_of_std=2):
    rolling_mean = data.rolling(window).mean()
    rolling_std = data.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * num_of_std)
    lower_band = rolling_mean - (rolling_std * num_of_std)
    return rolling_mean, upper_band, lower_band
data['BB_Middle'], data['BB_Upper'], data['BB_Lower'] = bollinger_bands(data['Close'])

def average_true_range(high, low, close, window=14):
    high_low = high - low
    high_close = (high - close.shift()).abs()
    low_close = (low - close.shift()).abs()

    # Ensure the series have names
    high_low.name = 'high_low'
    high_close.name = 'high_close'
    low_close.name = 'low_close'

    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    atr = true_range.rolling(window=window).mean()
    return atr
data['ATR'] = average_true_range(data['High'], data['Low'], data['Close'])

def compare_with_previous(close_price, previous_close):
    if close_price > previous_close:
        return 1
    elif close_price < previous_close:
        return 0
    else:
        return -1

# Shift the 'Close' column to compare with the previous candle
data['Previous_Close'] = data['Close'].shift(1)
# Apply the function to determine the direction
data['Direction'] = data.apply(lambda row: compare_with_previous(row['Close'], row['Previous_Close']), axis=1)
data = data.drop(["Previous_Close", "Low", "Close"], axis=1)

data = data.dropna()
data = data[data["Direction"] != -1]
data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,RSI_14,EMA_20,MACD,MACD_Signal,BB_Middle,BB_Upper,BB_Lower,ATR,Direction
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-06-27 19:00:00+01:00,15.842300,15.853880,57.734865,15.839083,0.012179,0.014916,15.838535,15.912599,15.764471,0.049312,1
2022-06-27 20:00:00+01:00,15.850210,15.868450,57.406753,15.841442,0.012691,0.014471,15.842040,15.913778,15.770302,0.048889,1
2022-06-27 21:00:00+01:00,15.861100,15.866380,57.603976,15.843047,0.012505,0.014078,15.843605,15.915327,15.771883,0.046334,0
2022-06-27 22:00:00+01:00,15.848100,15.857200,61.482273,15.842897,0.010874,0.013437,15.844598,15.915584,15.773613,0.043939,0
2022-06-27 23:00:00+01:00,15.839130,15.852800,45.174293,15.843126,0.009778,0.012705,15.846371,15.915448,15.777294,0.038667,1
...,...,...,...,...,...,...,...,...,...,...,...
2024-06-25 07:00:00+01:00,18.069160,18.104380,35.635910,18.092352,0.007731,0.016415,18.115963,18.190480,18.041446,0.038860,1
2024-06-25 08:00:00+01:00,18.086809,18.118601,48.292308,18.093699,0.008130,0.014758,18.112631,18.182177,18.043085,0.038110,1
2024-06-25 09:00:00+01:00,18.104950,18.186911,72.244889,18.099953,0.012567,0.014320,18.109400,18.160794,18.058005,0.041432,1
2024-06-25 10:00:00+01:00,18.162680,18.215099,70.369298,18.107772,0.017710,0.014998,18.110280,18.166059,18.054500,0.042382,1


In [19]:
data = data.reset_index(drop=True)
data_Y = data.pop("Direction")
X, X_test, Y, Y_test = train_test_split(data, data_Y, test_size=0.3, stratify=data_Y)
X

Unnamed: 0,Open,High,RSI_14,EMA_20,MACD,MACD_Signal,BB_Middle,BB_Upper,BB_Lower,ATR
10926,18.793560,18.821470,49.758959,18.800513,-0.010390,-0.014135,18.787059,18.831038,18.743080,0.043104
8753,18.841530,18.844400,49.834438,18.842743,0.013587,0.018948,18.841818,18.923243,18.760393,0.064634
314,17.161900,17.163300,58.282875,17.119391,0.040506,0.045592,17.124797,17.274052,16.975542,0.069729
12169,18.378799,18.410690,44.104953,18.400280,-0.008405,-0.008855,18.403366,18.478249,18.328483,0.085925
5327,18.318911,18.334101,39.903742,18.333709,-0.012725,-0.009471,18.339262,18.406467,18.272056,0.041415
...,...,...,...,...,...,...,...,...,...,...
9017,19.077829,19.092699,60.757763,19.056176,0.036953,0.044160,19.064098,19.156909,18.971288,0.050155
9452,18.711960,18.745590,60.424275,18.718516,0.006837,0.003624,18.699094,18.775612,18.622575,0.046957
5239,18.429701,18.449699,60.404295,18.435442,0.017498,0.021952,18.439091,18.505199,18.372984,0.033255
10986,18.704050,18.708651,49.538801,18.678505,0.003370,-0.000502,18.661926,18.732670,18.591182,0.040461


# UTILITY FUNCTIONS

In [20]:
def save_model_as_sklearn(selected_model, best_transformer = None):
    model_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}.joblib"
    transformer_name = f"{MODEL_CONFIG['NAME']}-{MODEL_CONFIG['VERSION']}-{MODEL_CONFIG['TYPE']}-transformer.joblib"
    model_folder = f"../models/{model_name}"
    transformer_folder = f"../models/{transformer_name}"
    
    """Save te best selected model and the transformer used in training."""
    joblib.dump(selected_model, model_folder)
    joblib.dump(best_transformer, transformer_folder)

    MODELS_REGISTRY["models"][MODEL_CONFIG["NAME"]] = dict(
        model = model_name,
        transformer = transformer_name,
        params = selected_model.get_params()
    )
    print(MODELS_REGISTRY)
    return save_model_registry()


def load_model_registry():
  with open("../models/models-registry.json", "r") as registry:
    model_registry_data = "".join(registry.readlines())
    return json.loads(model_registry_data)
MODELS_REGISTRY = load_model_registry()


def save_model_registry(model_params = None):
    with open("../models/models-registry.json", "wb") as registry:
      file_contents = json.dumps(MODELS_REGISTRY, indent=4)
      registry.write(file_contents.encode())
      print("Updated registry:", MODELS_REGISTRY)


def format_number_with_space(number):
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    formatted_number = locale.format_string("%n", number, grouping=True).replace(',', ' ')
    return formatted_number

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Example usage:
# X_test, y_test should be numpy arrays, model should be a trained model, odds should be a list of odds
# avg_win_rate, avg_loss_rate, total_odds, final_balance = betslip_win_rate(X_test, y_test, model, odds)


def perfomance_metric(X_test, Y_test, model):
    Y_preds = (model.predict(X_test).reshape(-1) > 0.5).astype(int)
    combined_actual_to_preds = pd.DataFrame(dict(actual=Y_test, prediction=Y_preds))
    print(classification_report(Y_test, Y_preds))
    return pd.crosstab(index=combined_actual_to_preds["actual"], columns=combined_actual_to_preds["prediction"])

# RANDOM OVERSAMPLING

In [22]:
# Randomly oversample the data to equalize the buy/sell count
ros = RandomUnderSampler()
X_resampled, Y_resampled = ros.fit_resample(X, Y)
print("Random Over Sampler: ", X_resampled.shape)
# X_resampled = X_resampled.sort_values(["year", "month", "day", "matchday", "hour", "minute"])
X_resampled

Random Over Sampler:  (8574, 10)


Unnamed: 0,Open,High,RSI_14,EMA_20,MACD,MACD_Signal,BB_Middle,BB_Upper,BB_Lower,ATR
1393,17.693140,17.701370,64.822895,17.615620,0.045488,0.040235,17.607244,17.714336,17.500152,0.052711
9689,19.163321,19.168631,68.133301,19.065770,0.041367,0.029165,19.057287,19.159770,18.954805,0.043181
5407,19.178711,19.180000,63.702915,19.123204,0.070226,0.083618,19.150165,19.308486,18.991845,0.028271
1165,17.304899,17.304899,49.635779,17.276511,0.014889,0.017224,17.283625,17.352445,17.214805,0.069579
10549,18.702400,18.728910,47.172681,18.656065,-0.009830,-0.016076,18.655401,18.741853,18.568949,0.060718
...,...,...,...,...,...,...,...,...,...,...
3322,16.950701,17.042980,68.512212,16.963346,-0.015676,-0.030058,16.936545,16.996168,16.876922,0.047679
3743,17.078100,17.096701,55.922909,17.071467,-0.013569,-0.025359,17.058276,17.117296,16.999256,0.060003
9490,18.671989,18.690201,46.764316,18.654264,0.008411,0.006381,18.654690,18.716312,18.593068,0.056715
9452,18.711960,18.745590,60.424275,18.718516,0.006837,0.003624,18.699094,18.775612,18.622575,0.046957


# DATA NORMALIZATION AND STANDARDIZATION

In [26]:
# MIN_MAX SCALER
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled.values)
X_scaled

array([[0.46126877, 0.45890234, 0.64326239, ..., 0.43905469, 0.44633291,
        0.28202084],
       [0.82430321, 0.8210494 , 0.67683383, ..., 0.78454242, 0.81632745,
        0.2166226 ],
       [0.82810359, 0.82385565, 0.63190446, ..., 0.82008852, 0.82574879,
        0.11430457],
       ...,
       [0.7029779 , 0.70296406, 0.46012696, ..., 0.6785472 , 0.72431897,
        0.30949425],
       [0.71284786, 0.71663519, 0.59865516, ..., 0.69272107, 0.73182414,
        0.24253003],
       [0.64314912, 0.64360385, 0.59845254, ..., 0.6280869 , 0.66833987,
        0.14850283]])

In [None]:
# STANDARD SCALER
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled.values)
X_scaled

In [None]:
# ROBUST SCALER
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_resampled.values)
X_scaled

In [None]:
# MAX ABS SCALER
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled

# GRID SEARCH MODEL TRAINING

In [14]:
models = {
    'LogisticRegression': LogisticRegression(multi_class='auto'),
    'RandomForestClassifier': RandomForestClassifier(),
    # 'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

param_grids = {
    'LogisticRegression': {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs', 'liblinear']
    },
    'RandomForestClassifier': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'KNeighborsClassifier': {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'DecisionTreeClassifier': {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__criterion': ['gini', 'entropy']
    }
}

# Create a dictionary of classifiers
classifiers = {
    'LogisticRegression': LogisticRegression(multi_class='auto', max_iter=10000),
    'RandomForestClassifier': RandomForestClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

# Placeholder for best models
best_models = {}

# Assuming X_train and y_train are your training data
# X_train = ...
# y_train = ...
from sklearn.pipeline import Pipeline
for clf_name, clf in classifiers.items():
    print(f"Running GridSearchCV for {clf_name}")
    
    # Create a pipeline with a scaler and the classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', clf)
    ])
    
    # Create the GridSearchCV object
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[clf_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    
    # Fit the GridSearchCV
    grid_search.fit(X, Y)
    
    # Get the best model and its parameters
    best_models[clf_name] = grid_search.best_estimator_
    print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best cross-validation score for {clf_name}: {grid_search.best_score_}\n")

# best_models = {}
# best_params = {}
# for model_name in models:
#     print(f"Running GridSearchCV for {model_name}...")
#     grid_search = GridSearchCV(estimator=models[model_name], param_grid=param_grids[model_name], cv=5, n_jobs=-1, scoring='precision')
#     grid_search.fit(X_scaled, Y_resampled)
#     best_models[model_name] = grid_search.best_estimator_
#     best_params[model_name] = grid_search.best_params_
#     print(f"Best parameters for {model_name}: {grid_search.best_params_}")
#     print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

Running GridSearchCV for LogisticRegression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for LogisticRegression: {'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best cross-validation score for LogisticRegression: 0.799446286126925

Running GridSearchCV for RandomForestClassifier
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for RandomForestClassifier: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Best cross-validation score for RandomForestClassifier: 0.6016688666095689

Running GridSearchCV for KNeighborsClassifier
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for KNeighborsClassifier: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 9, 'classifier__weights': 'uniform'}
Best cross-validation score for KNeighborsClassifier: 0.5479957008749391

Running GridSearchCV for DecisionTreeClassif

In [29]:
best_model = LogisticRegression(C=100, penalty="l2", solver="liblinear")
best_model.fit(X, Y)
best_model

# MODEL EVALUATIONS

In [27]:
X_test_scaled = scaler.transform(X_test.values)
odds = [round(1 + random.random() * 1.5, 2) for _ in range(0, len(X_test_scaled))]

best_model = None
precision_benchmark = 0
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    precision = precision_score(Y_test, y_pred)
    if precision > precision_benchmark:
        precision_benchmark = precision
        best_model = model
    print(f"Test precision_score for {model_name}: {precision:.2f}")



Test precision_score for LogisticRegression: 0.52
Test precision_score for RandomForestClassifier: 0.47
Test precision_score for KNeighborsClassifier: 0.50
Test precision_score for DecisionTreeClassifier: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
X_test_scaled = scaler.transform(X_test.values)
X_test_scaled

array([[0.33295309, 0.32942862, 0.5807327 , ..., 0.31659103, 0.33895963,
        0.27432088],
       [0.70147922, 0.70064364, 0.57832134, ..., 0.67430432, 0.72842344,
        0.18100982],
       [0.31510982, 0.31418747, 0.69617668, ..., 0.31391201, 0.30638236,
        0.28127571],
       ...,
       [0.77804776, 0.77839632, 0.72391511, ..., 0.73937749, 0.77656226,
        0.19469324],
       [0.77822061, 0.77396355, 0.42861134, ..., 0.74346281, 0.81352174,
        0.12697267],
       [0.71345025, 0.71866138, 0.62653768, ..., 0.68849671, 0.73072568,
        0.35249286]])

In [30]:
y_pred = best_model.predict(X_test_scaled)
precision = precision_score(Y_test, y_pred)
precision



0.723404255319149

In [31]:
perfomance_metric(X_test_scaled, Y_test, model=best_model)

              precision    recall  f1-score   support

           0       0.51      0.98      0.67      1861
           1       0.72      0.06      0.10      1837

    accuracy                           0.52      3698
   macro avg       0.62      0.52      0.39      3698
weighted avg       0.62      0.52      0.39      3698





prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1822,39
1,1735,102


In [None]:
# SAVE THE BEST SELECTED MODEL:
save_model_as_sklearn(selected_model=best_model)

In [129]:
save_model_as_sklearn(selected_model=best_model, best_transformer=scaler)

{'root': 'https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/models/', 'models': {'premier_league': {'win_outcome': {'model': 'premier_league-v1.0-win_outcome.joblib', 'transformer': 'premier_league-v1.0-win_outcome-transformer.joblib', 'params': {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}}}, 'all_leagues': {}}}
{'root': 'https://raw.githubusercontent.com/lebyanelm/neural-trained-models/main/models/', 'models': {'premier_league': {'win_outcome': {'model': 'premier_league-v1.0-win_outcome.joblib', 'transformer': 'premier_league-v1.0-win_outcome-transformer.joblib', 'params': {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': Non