In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np
from pycaret.classification import *

In [2]:
def calcular_mediacg(_df, window=7):
    """Add rolling "goal cost" statistics for the home and away team of each game.

    For every row, the team's previous ``window`` games are collected: rows with
    a strictly earlier ``Date`` in which the team appears as ``Home`` or ``Away``.
    The goal cost used for each of those games is ``CustoGolHome`` when the team
    played at home and ``CustoGolAway`` when it played away. The mean, the sample
    standard deviation and the coefficient of variation (std / mean) of those
    values are stored in ``Avg_CG_H``, ``DP_CG_H`` and ``CV_CG_H`` for the home
    team and in ``Avg_CG_A``, ``DP_CG_A`` and ``CV_CG_A`` for the away team.
    A team with fewer than ``window`` earlier games gets 0 in its three columns.

    Parameters
    ----------
    _df : pandas.DataFrame
        Needs the columns ``Date``, ``Home``, ``Away``, ``CustoGolHome`` and
        ``CustoGolAway``.
        NOTE(review): "previous games" are the last matching rows in the frame's
        current row order, so the frame is assumed to be sorted by date — confirm.
    window : int, default 7
        Number of earlier games required (and used) per team.

    Returns
    -------
    pandas.DataFrame
        A copy of ``_df`` with the six float columns added; ``_df`` itself is
        left untouched.
    """
    stat_cols = ['Avg_CG_H', 'DP_CG_H', 'CV_CG_H', 'Avg_CG_A', 'DP_CG_A', 'CV_CG_A']

    def _team_stats(date, team):
        """Return (mean, std, cv) of the team's goal cost over its previous games."""
        involved = (_df['Home'] == team) | (_df['Away'] == team)
        history = _df[(_df['Date'] < date) & involved].tail(window)
        if len(history) != window:
            return 0.0, 0.0, 0.0
        # Every row of `history` has the team on one side: use the away cost where
        # the team was the visitor and the home cost otherwise. Building a new
        # Series (instead of writing a column into the slice) avoids
        # SettingWithCopyWarning.
        real_cg = history['CustoGolHome'].where(history['Away'] != team, history['CustoGolAway'])
        media_cg = real_cg.mean()
        dp_cg = real_cg.std()
        return media_cg, dp_cg, dp_cg / media_cg

    # One (home stats + away stats) tuple per game, in row order.
    rows = [
        _team_stats(date, home) + _team_stats(date, away)
        for date, home, away in zip(_df['Date'], _df['Home'], _df['Away'])
    ]
    stats = pd.DataFrame(rows, columns=stat_cols, dtype=float)

    final_df = _df.copy()
    for col in stat_cols:
        # Assign positionally so the result does not depend on the index labels.
        final_df[col] = stats[col].to_numpy()

    return final_df


# Cria classes do target
def cria_alvos(_df):
    """Add the outcome flags and the per-bet profit/loss columns to ``_df`` in place.

    Flags (1 = bet wins, 0 = bet loses; rows matching neither condition are not
    written):
      * ``Back_Home`` / ``Back_Away`` - compare ``Home_Pts`` with ``Away_Pts``.
      * ``Back_Over`` - compares total points with ``Over_Line``.
      * ``Back_HA_H`` - compares ``Home_Pts + HA_Line`` with ``Away_Pts``;
        the value 2 marks an exact tie against the handicap.

    Profit/loss columns hold ``odds - 1`` for a winning one-unit stake, -1 for a
    losing one and 0 for a handicap tie: ``PL_Home``, ``PL_Away``, ``PL_Over``,
    ``PL_Under``, ``PL_HA_H`` and ``PL_HA_A``.

    Returns the same DataFrame object that was passed in.
    """

    def assign(mask, column, value):
        # Masked write: only the rows selected by `mask` receive `value`.
        _df.loc[mask, column] = value

    def settle(flag_column, pl_column, outcomes):
        # `outcomes` is an ordered sequence of (flag value, payout) pairs; the
        # flag column is re-read for every pair, as the writes happen one by one.
        for flag_value, payout in outcomes:
            assign(_df[flag_column] == flag_value, pl_column, payout)

    home_pts, away_pts = _df['Home_Pts'], _df['Away_Pts']
    home_won = home_pts > away_pts
    away_won = home_pts < away_pts

    # Back Home
    assign(home_won, 'Back_Home', 1)
    assign(away_won, 'Back_Home', 0)
    settle('Back_Home', 'PL_Home', [(1, _df.Odds_H - 1), (0, -1)])

    # Back Away
    assign(away_won, 'Back_Away', 1)
    assign(home_won, 'Back_Away', 0)
    settle('Back_Away', 'PL_Away', [(1, _df.Odds_A - 1), (0, -1)])

    # Over/Under
    total_pts = home_pts + away_pts
    assign(total_pts > _df['Over_Line'], 'Back_Over', 1)
    assign(total_pts < _df['Over_Line'], 'Back_Over', 0)
    settle('Back_Over', 'PL_Over', [(1, _df.Odds_Over - 1), (0, -1)])
    settle('Back_Over', 'PL_Under', [(0, _df.Odds_Under - 1), (1, -1)])

    # Handicap (HA)
    adjusted_home = home_pts + _df['HA_Line']
    assign(adjusted_home > away_pts, 'Back_HA_H', 1)
    assign(adjusted_home < away_pts, 'Back_HA_H', 0)
    assign(adjusted_home == away_pts, 'Back_HA_H', 2)
    settle('Back_HA_H', 'PL_HA_H', [(1, _df.HA_Odds_H - 1), (0, -1), (2, 0)])
    settle('Back_HA_H', 'PL_HA_A', [(0, _df.HA_Odds_A - 1), (1, -1), (2, 0)])

    return _df


################################################################################
# Prepara o _DF
################################################################################

def prepara_df(_df):
    """Filter the raw games and build every target and feature column.

    Steps, in order:
      1. Drop games with ``Over_Line`` below 5, games whose total points equal
         ``Over_Line`` exactly, and games where any of ``HA_Odds_A``, ``Odds_H``,
         ``Odds_A``, ``Odds_Over`` or ``Odds_Under`` is 0.
      2. Add the outcome / profit columns via ``cria_alvos``.
      3. Add the implied probabilities ``P(H)``, ``P(A)``, ``P(O)``, ``P(U)`` and
         ``P_Diff`` (``P(H) + P(A) - 1``).
      4. Add ``Porc_Over_Home`` / ``Porc_Over_Away``: mean of ``Back_Over`` over
         the previous 7 games of the same ``Home`` (resp. ``Away``) value,
         lagged one game; 0 when there is not enough history.
      5. Add the goal-cost features (``Last_CG_*``, ``Avg_CG_*``, ``DP_CG_*``,
         ``CV_CG_*``, ``Acima_Last_CG_*``, ``Abaixo_Last_CG_*``) and the odds
         dispersion features ``CV_ML`` and ``CV_Over``.
      6. Return the columns in a fixed order.

    Parameters
    ----------
    _df : pandas.DataFrame
        Raw games. Must contain ``Date``, ``Season``, ``Season_Time``, ``Time``,
        ``Home``, ``Away``, ``Home_Pts``, ``Away_Pts``, ``Odds_H``, ``Odds_A``,
        ``Over_Line``, ``Odds_Over``, ``Odds_Under``, ``HA_Line``, ``HA_Odds_H``
        and ``HA_Odds_A``. The caller's frame is not modified.
        NOTE(review): the lagged/rolling features follow row order, so the rows
        are assumed to be in chronological order — confirm for the input files.

    Returns
    -------
    pandas.DataFrame
        The filtered games with a fresh 0..n-1 index and the 47 columns listed
        at the end of this function.
    """
    # One combined mask instead of seven chained filters. The explicit copy makes
    # the result an independent frame, so the in-place writes done by cria_alvos
    # and the column assignments below no longer trigger SettingWithCopyWarning.
    jogos_validos = (
        (_df['Over_Line'] >= 5)
        & ((_df['Home_Pts'] + _df['Away_Pts']) != _df['Over_Line'])
        & (_df['HA_Odds_A'] != 0)
        & (_df['Odds_H'] != 0)
        & (_df['Odds_A'] != 0)
        & (_df['Odds_Over'] != 0)
        & (_df['Odds_Under'] != 0)
    )
    _df = _df[jogos_validos].copy().reset_index(drop=True)

    _df = cria_alvos(_df)

    # Implied probabilities from the decimal odds.
    _df['P(H)'] = 1 / _df['Odds_H']
    _df['P(A)'] = 1 / _df['Odds_A']
    _df['P(O)'] = 1 / _df['Odds_Over']
    _df['P(U)'] = 1 / _df['Odds_Under']
    _df['P_Diff'] = (_df['P(H)'] + _df['P(A)']) - 1

    # Share of "over" results in the previous 7 games per Home / Away value.
    # shift(1) keeps the current game's own result out of its feature.
    for lado, coluna in (('Home', 'Porc_Over_Home'), ('Away', 'Porc_Over_Away')):
        _df[coluna] = (
            _df.groupby(lado)['Back_Over'].rolling(7).mean().reset_index(level=0, drop=True)
        )
        _df[coluna] = _df.groupby(lado)[coluna].shift(1).fillna(0)

    # Goal cost: points scored divided by the implied win probability.
    _df['CustoGolHome'] = (_df['Home_Pts'] / _df['P(H)']).replace(np.inf, 0)
    _df['CustoGolAway'] = (_df['Away_Pts'] / _df['P(A)']).replace(np.inf, 0)

    # Mean, standard deviation and CV of the goal cost over previous games.
    _df = calcular_mediacg(_df)

    for lado, sufixo in (('Home', 'H'), ('Away', 'A')):
        custo = _df[f'CustoGol{lado}']
        times = _df[lado]

        # Goal cost of the previous game with the same Home / Away value.
        _df[f'Last_CG_{sufixo}'] = custo.groupby(times).shift(1).fillna(0)

        # Was that previous goal cost more than one standard deviation above /
        # below the mean? NOTE(review): mean and std are taken over the whole
        # filtered frame, i.e. they also include later games.
        media, desvio = custo.mean(), custo.std()
        acima = (custo > media + desvio).astype(float)
        abaixo = (custo < media - desvio).astype(float)
        _df[f'Acima_Last_CG_{sufixo}'] = acima.groupby(times).shift(1).fillna(0)
        _df[f'Abaixo_Last_CG_{sufixo}'] = abaixo.groupby(times).shift(1).fillna(0)

    # Coefficient of variation between the two sides of each market.
    moneyline = _df[['Odds_H', 'Odds_A']]
    totals = _df[['Odds_Over', 'Odds_Under']]
    _df['CV_ML'] = moneyline.std(axis=1) / moneyline.mean(axis=1)
    _df['CV_Over'] = totals.std(axis=1) / totals.mean(axis=1)

    # Final column order; the helper columns CustoGolHome / CustoGolAway are
    # intentionally left out.
    colunas_finais = [
        'Date', 'Season', 'Season_Time', 'Time', 'Home', 'Away', 'Home_Pts',
        'Away_Pts', 'Odds_H', 'Odds_A', 'Over_Line', 'Odds_Over', 'Odds_Under',
        'HA_Line', 'HA_Odds_H', 'HA_Odds_A', 'P(H)', 'P(A)', 'P(O)', 'P(U)', 'P_Diff',
        'Porc_Over_Home', 'Porc_Over_Away', 'Last_CG_H', 'Last_CG_A', 'Avg_CG_H', 'Avg_CG_A',
        'DP_CG_H', 'DP_CG_A', 'CV_CG_H', 'CV_CG_A',
        'Acima_Last_CG_H', 'Acima_Last_CG_A', 'Abaixo_Last_CG_H', 'Abaixo_Last_CG_A', 'CV_ML',
        'CV_Over', 'Back_Home', 'PL_Home',
        'Back_Away', 'PL_Away', 'Back_Over', 'PL_Over', 'PL_Under', 'Back_HA_H',
        'PL_HA_H', 'PL_HA_A',
    ]
    return _df[colunas_finais]

In [3]:
# One CSV per WNBA season, 2019 through 2023; each season frame stays available by name.
season_paths = (
    '../data/wnba/wnba-2019.csv',
    '../data/wnba/wnba-2020.csv',
    '../data/wnba/wnba-2021.csv',
    '../data/wnba/wnba-2022.csv',
    '../data/wnba/wnba-2023.csv',
)
df0, df1, df2, df3, df4 = (pd.read_csv(path) for path in season_paths)

# Stack the seasons in file order with a fresh 0..n-1 index, parse the dates
# and build the modelling dataset.
df = pd.concat([df0, df1, df2, df3, df4], ignore_index=True)
df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d')
dataset = prepara_df(df)

In [4]:
# Keep only the games whose home moneyline odds fall in [1.6, 50.5).
# Note: `df` is rebound here from the raw concatenated frame to this filtered view.
odds_h_in_range = (dataset['Odds_H'] >= 1.6) & (dataset['Odds_H'] < 50.5)
df = dataset[odds_h_in_range]
df.shape

(455, 47)

In [5]:
# Label column the classifier is trained on: per cria_alvos, Back_Home is 1 when the
# home team scored more points than the away team and 0 when it scored fewer.
target = 'Back_Home'

In [6]:
# Feature columns handed to the model; every other column except the target is ignored.
features = [
    'CV_Over',
    'CV_ML',
    'HA_Odds_A',
    'Last_CG_H',
    'Odds_Under',
    'DP_CG_A',
    'Odds_Over',
    'Abaixo_Last_CG_A',
    'Odds_H',
]

In [7]:
# Every column that is neither a selected feature nor the target is hidden from PyCaret.
colunas_usadas = set(features) | {target}
colunas_ignoradas = [coluna for coluna in df.columns.to_list() if coluna not in colunas_usadas]

# Half of the rows go to training; features are min-max normalised and the
# session id is fixed so the experiment can be repeated.
cls = setup(
    data=df,
    ignore_features=colunas_ignoradas,
    target=target,
    train_size=0.5,
    normalize=True,
    normalize_method='minmax',
    session_id=2023,
    verbose=False,
)

# Feature matrices of the two partitions created by setup().
x_treino = get_config(variable='X_train')
x_teste = get_config(variable='X_test')

In [None]:
# Compare the estimators available in PyCaret and sort the score grid by precision.
# Precision is the relevant metric here because a bet is only placed when the model
# predicts 1, and each wrong positive prediction costs a full stake (see the
# PL_Back_Home_Predictions calculation further down).
compare_models(sort='precision')

In [111]:
# PyCaret estimator ids to be trained one by one in the next cell.
list_models = [
    'lda',
    'lightgbm',
    'rf',
    'gbc',
    'dt',
    'et',
    'lr',
    'ridge',
    'knn',
    'nb',
    'ada',
    'qda',
    'svm',
]

In [None]:
# Train every estimator in list_models and run predict_model on it (presumably
# scoring the hold-out split created by setup() — confirm against the PyCaret docs).
# NOTE(review): `model` and `previsoes` are overwritten on every pass, so after the
# loop they refer to the last estimator in the list only; per-model results are
# visible solely in the output that predict_model displays.
for item in list_models:
    model = create_model(item, verbose=False)
    previsoes = predict_model(model)

In [8]:
# Train the chosen estimator ('gbc', shown in the output as Gradient Boosting
# Classifier) and keep its predictions; the cells below read the
# `prediction_label` and `Odds_H` columns of `previsoes`.
model = create_model('gbc', verbose=False)
previsoes = predict_model(model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.6798,0.6782,0.5506,0.5976,0.5731,0.3176,0.3183


In [None]:
# Home moneyline odds from the modelling frame, indexed like `previsoes`.
data2_subset = df['Odds_H']

# The cells below read previsoes.Odds_H. When Odds_H is one of the model features,
# predict_model's output already carries that column, and merging it in again would
# make pandas rename the clashing columns to Odds_H_x / Odds_H_y, which breaks those
# cells. The odds are therefore attached (joined on the index) only when missing,
# which also makes this cell safe to re-run.
if 'Odds_H' not in previsoes.columns:
    previsoes = previsoes.merge(data2_subset, left_index=True, right_index=True)

In [9]:
# Profit/loss of backing the home team with a one-unit stake whenever the model predicts 1.
bet_placed = previsoes['prediction_label'] == 1
no_bet = previsoes['prediction_label'] != 1
home_won = previsoes[target] == 1
home_lost = previsoes[target] == 0

previsoes.loc[bet_placed & home_won, 'PL_Back_Home_Predictions'] = previsoes.Odds_H - 1  # winning bet pays odds - 1
previsoes.loc[bet_placed & home_lost, 'PL_Back_Home_Predictions'] = -1  # losing bet costs the stake
previsoes.loc[no_bet, 'PL_Back_Home_Predictions'] = 0  # no bet, no result

# Total profit over the scored rows.
previsoes['PL_Back_Home_Predictions'].sum()

19.73

In [10]:
# Number of bets the model would place and the average home odds of those bets.
home_bets = previsoes[previsoes['prediction_label'] == 1]
home_bets.shape[0], home_bets['Odds_H'].mean()

(82, 2.2557318)

In [12]:
# Produce the final version of the chosen model with PyCaret's finalize_model
# (presumably a refit that also uses the hold-out rows — confirm against the PyCaret docs).
final = finalize_model(model)

In [18]:
# Persist the finalized model under the name 'back_home'; per the cell output the
# transformation pipeline is saved together with the estimator.
save_model(final, 'back_home')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\johnn\AppData\Local\Temp\joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Odds_H', 'Odds_Over',
                                              'Odds_Under', 'HA_Odds_A',
                                              'Last_CG_H', 'DP_CG_A',
                                              'Abaixo_Last_CG_A', 'CV_ML',
                                              'CV_Over'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log

# Script para melhores variáveis

In [104]:
# Candidate feature columns for the random feature-subset search below.
# The order is significant: it is the population random.sample draws from.
colunas = [
    # totals and handicap markets
    'Over_Line', 'Odds_Over', 'Odds_Under',
    'HA_Line', 'HA_Odds_H', 'HA_Odds_A',
    # implied probabilities
    'P(H)', 'P(A)', 'P(O)', 'P(U)', 'P_Diff',
    # lagged share of "over" results
    'Porc_Over_Home', 'Porc_Over_Away',
    # goal-cost features
    'Last_CG_H', 'Last_CG_A',
    'Avg_CG_H', 'Avg_CG_A',
    'DP_CG_H', 'DP_CG_A',
    'CV_CG_H', 'CV_CG_A',
    'Acima_Last_CG_H', 'Acima_Last_CG_A',
    'Abaixo_Last_CG_H', 'Abaixo_Last_CG_A',
    # dispersion between the two sides of each market
    'CV_ML', 'CV_Over',
]

In [105]:
import random

def pegar_elementos_aleatorios(lista, minimo=2, maximo=10):
    """Return a random selection of distinct positions of ``lista``, of random size.

    The size is drawn uniformly from ``minimo``..``maximo`` (both inclusive) and
    then capped at ``len(lista)``, so a short or empty list never raises.

    Parameters
    ----------
    lista : sequence
        Population to sample from (passed to ``random.sample``).
    minimo, maximo : int, default 2 and 10
        Inclusive bounds for the number of elements requested.

    Returns
    -------
    list
        A new list with the sampled elements, in the order ``random.sample``
        produced them.

    Raises
    ------
    ValueError
        If ``minimo`` is greater than ``maximo`` (raised by ``random.randint``).

    Notes
    -----
    Uses the module-level ``random`` generator; call ``random.seed(...)`` first
    for reproducible draws.
    """
    # Draw the requested size, between `minimo` and `maximo` inclusive.
    num_elementos = random.randint(minimo, maximo)

    # Never request more elements than the list holds.
    num_elementos = min(num_elementos, len(lista))

    # Sample without replacement.
    return random.sample(lista, num_elementos)

In [107]:
# Random search over feature subsets: train a 'gbc' model on each subset and keep
# the subset with the highest total profit of backing the home team.
# NOTE(review): the profit is measured on the rows returned by predict_model
# (presumably the hold-out split), so picking the best of 100 subsets this way
# tunes the feature list to that split — validate the winner on unseen data.
# NOTE(review): each setup() call replaces PyCaret's active experiment; re-run the
# main setup cell before training or finalizing the production model.

# Seed the sampler so the same subsets are tried after a kernel restart.
random.seed(2023)

melhor_PL = float('-inf')
melhores_features = None  # best subset found so far, kept for later cells

for tentativa in range(100):
    # Odds_H is always included: the profit calculation reads it from the predictions.
    # Loop-local names are used so the notebook-level `features`, `model` and
    # `previsoes` are not overwritten by the search.
    features_candidatas = pegar_elementos_aleatorios(colunas)
    features_candidatas.append('Odds_H')

    cls = setup(data = df,
            ignore_features = [x for x in df.columns.to_list() if x not in features_candidatas and x != target],
            target = target,
            train_size = 0.5,
            normalize = True,
            normalize_method = 'minmax',
            session_id = 2023,
            verbose = False
            )

    modelo_candidato = create_model('gbc', verbose = False)
    previsoes_candidatas = predict_model(modelo_candidato, verbose = False)

    # One-unit stake on the home team whenever the model predicts 1.
    aposta = previsoes_candidatas['prediction_label'] == 1
    previsoes_candidatas.loc[(aposta & (previsoes_candidatas[target] == 1)), 'PL_Back_Home_Predictions'] = previsoes_candidatas.Odds_H - 1
    previsoes_candidatas.loc[(aposta & (previsoes_candidatas[target] == 0)), 'PL_Back_Home_Predictions'] = - 1
    previsoes_candidatas.loc[(previsoes_candidatas['prediction_label'] != 1), 'PL_Back_Home_Predictions'] = 0

    PL = previsoes_candidatas['PL_Back_Home_Predictions'].sum()

    if PL > melhor_PL:
        melhor_PL = PL
        melhores_features = list(features_candidatas)
        print(f'**Novo melhor PL: {melhor_PL}**')
        print(f'Features: {features_candidatas}')

**Novo melhor PL: -3.4600002765655518**
Features: ['Porc_Over_Away', 'Acima_Last_CG_A', 'P(O)', 'P(A)', 'DP_CG_A', 'Over_Line', 'Last_CG_H', 'CV_Over', 'Acima_Last_CG_H', 'Odds_H']
**Novo melhor PL: 19.03000259399414**
Features: ['CV_Over', 'CV_ML', 'P(H)', 'HA_Odds_A', 'Last_CG_H', 'Odds_Under', 'P(A)', 'DP_CG_A', 'Odds_Over', 'Abaixo_Last_CG_A', 'Odds_H']
