In [2]:
import pandas as pd
# pd.reset_option("all")
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from tqdm import tqdm
# import xgboost as xgb

In [3]:
df = pd.read_csv('data/final_df.csv', index_col = [0])

In [4]:
feature_cols = [col for col in df.columns if col != 'target']
X = df[feature_cols]
y = df['target']

In [5]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
odds_test = X_test[['mean_0','mean_1','mean_2']]

In [60]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth': 3, 
    'eta': 0.3,
    'silent': 1, 
    'objective': 'multi:softprob', 
    'num_class': 3} 
num_round = 20  

bst = xgb.train(param, dtrain, num_round)

preds = bst.predict(dtest)

preds_labels = le.inverse_transform(preds.argmax(axis=1))

Parameters: { "silent" } are not used.



In [8]:
def profit_score(y_true, y_pred_proba, **kwargs):
    odds_data = kwargs.get('odds_data')
    alpha = kwargs.get('alpha', np.array([0.057, 0.034, 0.037]))
#     alpha = kwargs.get('alpha', np.array([0.0, 0.0, 0.0]))
    bankroll = kwargs.get('bankroll', 10000)
    kelly_fraction = kwargs.get('kelly_fraction', 0.025)
    outcomes = ["draw    ", "home win", "away win"]

    total_profit = 0
    actual_outcomes = y_true

    for i in range(len(y_pred_proba)):
        odds = np.array(odds_data.iloc[i])
        pred_probs = y_pred_proba[i]
        implied_probs = 1 / odds
        diffs = pred_probs - implied_probs
        j = np.argmax(diffs)

        if diffs[j] > alpha[j]:
            bet_fraction = kelly_fraction * (pred_probs[j] * odds[j] - (1 - pred_probs[j])) / (odds[j])
            bet_amount = bet_fraction * bankroll

            if actual_outcomes[i] == j:
                profit = (odds[j] - 1) * bet_amount
            else:
                profit = -bet_amount

            bankroll += profit
            total_profit += profit

            print(f"bankroll: {bankroll:.1f} - game: {i+1} - outcome: {outcomes[actual_outcomes[i]]} - bet: {outcomes[j]} - pred prob: {pred_probs[j]:.3f} - given prob: {implied_probs[j]:.3f} - odds: {odds[j]:.3f} - bet size: {bet_amount:.3f} - profit/Loss: {profit:.3f}")

    return total_profit

In [9]:
profit = profit_score(y_test, preds, odds_data = odds_test)

In [61]:
preds_df = pd.DataFrame(preds, columns = ['pred_0','pred_1','pred_2'])

odds_df = X_test[['mean_0','mean_1','mean_2']].apply(lambda x: 1/x)

In [62]:
po_df = pd.concat([preds_df, odds_df.reset_index(drop = True)], axis = 1)

In [63]:
po_df = po_df[['mean_0','pred_0','mean_1','pred_1','mean_2','pred_2']]

In [64]:
po_df['target'] = y_test