In [14]:
import pandas as pd
# pd.reset_option("all")
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from tqdm import tqdm
import xgboost as xgb

In [23]:
df = pd.read_csv('final_df.csv', index_col = [0])

In [24]:
feature_cols = [col for col in df.columns if col != 'target']
X = df[feature_cols]
y = df['target']

In [68]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
odds_test = X_test[['mean_0','mean_1','mean_2']]

In [60]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters for the XGBoost classifier
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training, output probability
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)

# Predict probabilities
preds = bst.predict(dtest)

# Convert the encoded labels back to original form
preds_labels = le.inverse_transform(preds.argmax(axis=1))

Parameters: { "silent" } are not used.



In [75]:
def profit_score(y_true, y_pred_proba, **kwargs):
    odds_data = kwargs.get('odds_data')
    alpha = kwargs.get('alpha', np.array([0.057, 0.034, 0.037]))
#     alpha = kwargs.get('alpha', np.array([0.0, 0.0, 0.0]))
    bankroll = kwargs.get('bankroll', 10000)
    kelly_fraction = kwargs.get('kelly_fraction', 0.025)
    outcomes = ["draw    ", "home win", "away win"]

    total_profit = 0
    actual_outcomes = y_true

    for i in range(len(y_pred_proba)):
        odds = np.array(odds_data.iloc[i])
        pred_probs = y_pred_proba[i]
        implied_probs = 1 / odds
        diffs = pred_probs - implied_probs
        j = np.argmax(diffs)

        if diffs[j] > alpha[j]:
            bet_fraction = kelly_fraction * (pred_probs[j] * odds[j] - (1 - pred_probs[j])) / (odds[j])
            bet_amount = bet_fraction * bankroll

            if actual_outcomes[i] == j:
                profit = (odds[j] - 1) * bet_amount
            else:
                profit = -bet_amount

            bankroll += profit
            total_profit += profit

            print(f"bankroll: {bankroll:.1f} - game: {i+1} - outcome: {outcomes[actual_outcomes[i]]} - bet: {outcomes[j]} - pred prob: {pred_probs[j]:.3f} - given prob: {implied_probs[j]:.3f} - odds: {odds[j]:.3f} - bet size: {bet_amount:.3f} - profit/Loss: {profit:.3f}")

    return total_profit

In [76]:
profit = profit_score(y_test, preds, odds_data = odds_test)

bankroll: 9877.5 - game: 2 - outcome: draw     - bet: home win - pred prob: 0.688 - given prob: 0.633 - odds: 1.580 - bet size: 122.536 - profit/Loss: -122.536
bankroll: 9855.3 - game: 9 - outcome: home win - bet: away win - pred prob: 0.199 - given prob: 0.136 - odds: 7.364 - bet size: 22.151 - profit/Loss: -22.151
bankroll: 10011.2 - game: 11 - outcome: home win - bet: home win - pred prob: 0.301 - given prob: 0.202 - odds: 4.953 - bet size: 39.428 - profit/Loss: 155.854
bankroll: 9867.0 - game: 12 - outcome: draw     - bet: home win - pred prob: 0.749 - given prob: 0.692 - odds: 1.445 - bet size: 144.168 - profit/Loss: -144.168
bankroll: 9947.2 - game: 14 - outcome: home win - bet: home win - pred prob: 0.580 - given prob: 0.525 - odds: 1.904 - bet size: 88.736 - profit/Loss: 80.179
bankroll: 9893.6 - game: 21 - outcome: home win - bet: draw     - pred prob: 0.395 - given prob: 0.296 - odds: 3.374 - bet size: 53.529 - profit/Loss: -53.529
bankroll: 9865.0 - game: 25 - outcome: home 

In [61]:
preds_df = pd.DataFrame(preds, columns = ['pred_0','pred_1','pred_2'])

odds_df = X_test[['mean_0','mean_1','mean_2']].apply(lambda x: 1/x)

In [62]:
po_df = pd.concat([preds_df, odds_df.reset_index(drop = True)], axis = 1)

In [63]:
po_df = po_df[['mean_0','pred_0','mean_1','pred_1','mean_2','pred_2']]

In [64]:
po_df['target'] = y_test

In [None]:
class TqdmScore:
    def __init__(self, n_splits):
        self.bar = tqdm(total=n_splits)
        
    def __call__(self, estimator, X, y):
        score = accuracy_score(y, estimator.predict(X))
        self.bar.update()
        return score

def select_features(X, y, model):
    feature_sets = {}

    rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2),
                  scoring='accuracy',
                  min_features_to_select=30)
    rfecv.fit(X, y)

    selected_features_rfe = [f for f, s in zip(X.columns, rfecv.support_) if s]
    feature_sets['rfe'] = {'features': selected_features_rfe, 'score': rfecv.grid_scores_[-1]}

    mi_selector = SelectKBest(score_func=mutual_info_classif, k=10)
    mi_selector.fit(X, y)

    selected_features_mi = [f for f, s in zip(X.columns, mi_selector.get_support()) if s]
    feature_sets['mutual_info'] = {'features': selected_features_mi, 'score': mi_selector.scores_}

    corr_matrix = X.corr().abs()
    high_corr_vars = [column for column in corr_matrix.columns if any(corr_matrix[column] > 0.75)]
    
    selected_features_corr = high_corr_vars
    feature_sets['correlation'] = {'features': selected_features_corr, 'score': np.mean([corr_matrix.loc[f, f] for f in high_corr_vars])}

    return feature_sets