### Importações de bibliotecas

In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
# from pycaret.classification import *
# import random
# import pickle
import numpy as np
# import statsmodels.stats.proportion as sm
# from scipy import stats
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import matplotlib.pyplot as plt
# import math

### Informações do método e funções

In [2]:
# Define a variável target
target = 'Back_Over'

# Cria classes do target
def cria_alvos(df):
    # Back Home
    df.loc[(df['Home_Pts'] > df['Away_Pts']), 'Back_Home'] = 1
    df.loc[(df['Home_Pts'] < df['Away_Pts']), 'Back_Home'] = 0
    
    df.loc[(df['Back_Home']) == 1, 'PL_Home'] = df.Odds_H - 1
    df.loc[(df['Back_Home']) == 0, 'PL_Home'] = - 1
    
    # Back Away
    df.loc[(df['Home_Pts'] < df['Away_Pts']), 'Back_Away'] = 1
    df.loc[(df['Home_Pts'] > df['Away_Pts']), 'Back_Away'] = 0
    
    df.loc[(df['Back_Away']) == 1, 'PL_Home'] = df.Odds_A - 1
    df.loc[(df['Back_Away']) == 0, 'PL_Home'] = - 1
    
    # Over/Under
    df.loc[(df['Home_Pts'] + df['Away_Pts']) > df['Over_Line'], 'Back_Over'] = 1
    df.loc[(df['Home_Pts'] + df['Away_Pts']) < df['Over_Line'], 'Back_Over'] = 0
    
    df.loc[(df['Back_Over']) == 1, 'PL_Over'] = df.Odds_Over - 1
    df.loc[(df['Back_Over']) == 0, 'PL_Over'] = - 1

    df.loc[(df['Back_Over']) == 0, 'PL_Under'] = df.Odds_Under - 1
    df.loc[(df['Back_Over']) == 1, 'PL_Under'] = - 1
    
    # HA
    df.loc[((df['Home_Pts'] + df['HA_Line'])) > df['Away_Pts'], 'Back_HA_H'] = 1
    df.loc[((df['Home_Pts'] + df['HA_Line'])) < df['Away_Pts'], 'Back_HA_H'] = 0
    df.loc[((df['Home_Pts'] + df['HA_Line'])) == df['Away_Pts'], 'Back_HA_H'] = 2
    
    df.loc[(df['Back_HA_H']) == 1, 'PL_HA_H'] = df.HA_Odds_H - 1
    df.loc[(df['Back_HA_H']) == 0, 'PL_HA_H'] = - 1
    df.loc[(df['Back_HA_H']) == 2, 'PL_HA_H'] = - 0

    df.loc[(df['Back_HA_H']) == 0, 'PL_HA_A'] = df.HA_Odds_A - 1
    df.loc[(df['Back_HA_H']) == 1, 'PL_HA_A'] = - 1
    df.loc[(df['Back_HA_H']) == 2, 'PL_HA_A'] = - 0

    return df

### Funções

In [3]:
# Calcula o profit
def calcula_profit(df):
    stake = 1
    green = stake * (df.Odds_Over - 1)
    red = -stake

    df.loc[(df['prediction_label'] == 1) & (df[target] == 1), 'Profit'] = green
    df.loc[(df['prediction_label'] == 1) & (df[target] == 0), 'Profit'] = red
    df.loc[(df['prediction_label'] != 1),'Profit'] = 0

    return df

In [4]:
# Calcula o winrate
def calcula_wr(df):

    certos = df[(df['prediction_label'] == 1) & (df[target] == 1)]
    winrate = len(certos) / len(df)
    winrate = round(winrate*100, 2)

    return winrate

In [5]:
def grafico_por_mes(df):
    # Construindo o gráfico de profit acumulado
    df['Date'] = pd.to_datetime(df['Date'])
    df['mes'] = df['Date'].dt.month
    df['Profit_acu'] = df['Profit'].cumsum()
    ax = df.groupby('mes')['Profit_acu'].last().plot(kind='line', figsize=(5,3), marker='o')
    ax.set_xlabel('Mês')
    ax.set_ylabel('Profit Acumulado')
    ax.set_xticks(df['mes'].unique())
    for i,j in zip(df['mes'].unique(),df.groupby('mes')['Profit_acu'].last()):
        ax.annotate('{:.2f}'.format(j),xy=(i,j), fontsize=9)

    plt.show()

In [6]:
def plota_grafico(df):
    df.Profit_acu.plot(figsize=(6,5), marker='o')
    last_x = df['Profit_acu'].index[-1]
    last_y = df['Profit_acu'].values[-1]
    plt.text(last_x, last_y, str(round(last_y, 2)))
    plt.show()

In [7]:
def risco_ruina(df, stake=1, perc_banca=1, num_blocos=5):
     dados = df.Profit.tolist()
     
     if stake != 1:
          multiplicado = []
          for valor in dados:
               multiplicado.append(valor * stake)
          dados = multiplicado

     blocos = np.array_split(dados, num_blocos)

     medias = []

     for bloco in blocos:
          meds = np.mean(bloco)
          medias.append(meds)

     desv_pad = np.std(medias)
     media_geral = np.mean(medias)

     # Risco de ruína
     risco = math.exp((-2 * media_geral * 1) / (desv_pad ** 2)) * 100

     # Intervalo de confiança
     margem_erro = 1.96 * (desv_pad/(num_blocos ** (1/2)))
     margem_cima = round((media_geral + margem_erro), 2)
     margem_baixo = round((media_geral - margem_erro), 2)
     
     media_por_dp = media_geral / desv_pad
     raiz = 1.96 / (num_blocos ** (1/2))

     print(f'Média: {round(media_geral,2)}')
     print(f'Desvio padrão: {round(desv_pad,2)}')
     print('')
     print(f'Blocos de {len(bloco)} partidas')
     print(f'Risco de ruína: {risco:.2f}%')
     print('')
     print(f"Intervalo de confiança: ({margem_baixo}, {margem_cima})")
     print('')
     print(f'Média/DP: {media_por_dp:.2f}')
     print(f'1,96/raiz: {raiz:.2f}')

In [8]:
def int_confianca(df):
    dados = df.Profit.tolist()

    # Nível de confiança desejado (por exemplo, 95%)
    confianca = 0.95

    # Cálculo do intervalo de confiança para a média
    media = np.mean(dados)
    n = len(dados)
    erro_padrao = stats.sem(dados)
    intervalo = stats.t.interval(confianca, df=n-1, loc=media, scale=erro_padrao)
    intervalo = tuple(round(item, 2) for item in intervalo)

    # Exibição do resultado
    print(f'Média: {round(media,2)}')
    print(f'Desvio padrão: {round(erro_padrao,2)}')
    print(f"Intervalo de confiança: {intervalo}")

In [9]:
def teste_variancia(df):
    dados = df.Profit.tolist()

    # Cálculo do intervalo de confiança para a média
    media = np.mean(dados)
    n = len(dados)
    erro_padrao = stats.sem(dados)
    desvio_padrao = np.std(dados)

    media_por_dp = media / desvio_padrao
    raiz = 1.96 / math.sqrt(n)

    # Exibição do resultado
    print(f'Média/DP: {media_por_dp:.2f}')
    print(f'1,96/raiz: {raiz:.2f}')

In [10]:
################################################################################
# Prepara o DF
################################################################################

def prepara_df(df):
  df = df[(df.Over_Line >= 5)]
  df = df[(df['Home_Pts'] + df['Away_Pts']) != df['Over_Line']]
  df = df[df['HA_Odds_A'] != 0]
  df = df[df['Odds_H'] != 0]
  df = df[df['Odds_A'] != 0]
  df = df[df['Odds_Over'] != 0]
  df = df[df['Odds_Under'] != 0]

  df = cria_alvos(df)

  df['P(H)'] = 1 / df['Odds_H']
  df['P(A)'] = 1 / df['Odds_A']
  df['P(O)'] = 1 / df['Odds_Over']
  df['P(U)'] = 1 / df['Odds_Under']
  
  df['P_Diff'] = ((1 / df['Odds_H']) + (1 / df['Odds_A'])) - 1
  df['Porc_Over_Home'] = df.groupby('Home')[target].rolling(3).mean().reset_index(level=0, drop=True)
  df['Porc_Over_Away'] = df.groupby('Away')[target].rolling(3).mean().reset_index(level=0, drop=True)
  df['Porc_Over_Home']  = df.groupby('Home')['Porc_Over_Home'].shift(1)
  df['Porc_Over_Away']  = df.groupby('Away')['Porc_Over_Away'].shift(1)
  df['Porc_Over_Home'] = df['Porc_Over_Home'].replace(np.nan, 0)
  df['Porc_Over_Away'] = df['Porc_Over_Away'].replace(np.nan, 0)
  df.reset_index(inplace=True, drop=True)

  # Custo do gol
  df['CustoGolHome'] = df['Home_Pts'] / (1 / df['Odds_H'])
  df['CustoGolAway'] = df['Away_Pts'] / (1 / df['Odds_A'])
  df['CustoGolHome'] = df['CustoGolHome'].replace(np.inf, 0)
  df['CustoGolAway'] = df['CustoGolAway'].replace(np.inf, 0)
  df.reset_index(drop=True, inplace=True)
  
  # Último custo do gol
  df['Last_CG_H']  = df.groupby('Home')['CustoGolHome'].shift(1)
  df['Last_CG_A']  = df.groupby('Away')['CustoGolAway'].shift(1)
  df['Last_CG_H'] = df['Last_CG_H'].replace(np.nan, 0)
  df['Last_CG_A'] = df['Last_CG_A'].replace(np.nan, 0)

  # Média móvel do custo do gol
  df['MediaCustoGolHome'] = df.groupby('Home')['CustoGolHome'].rolling(window=3).mean().reset_index(level=0, drop=True)
  df['MediaCustoGolAway'] = df.groupby('Away')['CustoGolAway'].rolling(window=3).mean().reset_index(level=0, drop=True)

  df['MediaCustoGolHome']  = df.groupby('Home')['MediaCustoGolHome'].shift(1)
  df['MediaCustoGolAway']  = df.groupby('Away')['MediaCustoGolAway'].shift(1)

  df['MediaCustoGolHome'] = df['MediaCustoGolHome'].replace(np.nan, 0)
  df['MediaCustoGolAway'] = df['MediaCustoGolAway'].replace(np.nan, 0)

  limit_up_h = df.CustoGolHome.mean() + df.CustoGolHome.std()
  limit_up_a = df.CustoGolAway.mean() + df.CustoGolAway.std()
  df.loc[(df['CustoGolHome'] > limit_up_h), 'Acima_Last_CG_H'] = 1
  df.loc[(df['CustoGolHome'] <= limit_up_h), 'Acima_Last_CG_H'] = 0
  df.loc[(df['CustoGolAway'] > limit_up_a), 'Acima_Last_CG_A'] = 1
  df.loc[(df['CustoGolAway'] <= limit_up_a), 'Acima_Last_CG_A'] = 0
  df['Acima_Last_CG_H']  = df.groupby('Home')['Acima_Last_CG_H'].shift(1)
  df['Acima_Last_CG_A']  = df.groupby('Away')['Acima_Last_CG_A'].shift(1)
  df['Acima_Last_CG_H'] = df['Acima_Last_CG_H'].replace(np.nan, 0)
  df['Acima_Last_CG_A'] = df['Acima_Last_CG_A'].replace(np.nan, 0)

  limit_down_h = df.CustoGolHome.mean() - df.CustoGolHome.std()
  limit_down_a = df.CustoGolAway.mean() - df.CustoGolAway.std()
  df.loc[(df['CustoGolHome'] < limit_down_h), 'Abaixo_Last_CG_H'] = 1
  df.loc[(df['CustoGolHome'] >= limit_down_h), 'Abaixo_Last_CG_H'] = 0
  df.loc[(df['CustoGolAway'] < limit_down_a), 'Abaixo_Last_CG_A'] = 1
  df.loc[(df['CustoGolAway'] >= limit_down_a), 'Abaixo_Last_CG_A'] = 0
  df['Abaixo_Last_CG_H']  = df.groupby('Home')['Abaixo_Last_CG_H'].shift(1)
  df['Abaixo_Last_CG_A']  = df.groupby('Away')['Abaixo_Last_CG_A'].shift(1)
  df['Abaixo_Last_CG_H'] = df['Abaixo_Last_CG_H'].replace(np.nan, 0)
  df['Abaixo_Last_CG_A'] = df['Abaixo_Last_CG_A'].replace(np.nan, 0)

  df['CV_ML'] = (df[['Odds_H', 'Odds_A']].std(axis=1)) / (df[['Odds_H', 'Odds_A']].mean(axis=1))
  df['CV_Over'] = (df[['Odds_Over', 'Odds_Under']].std(axis=1)) / (df[['Odds_Over', 'Odds_Under']].mean(axis=1))

  df.pop('CustoGolHome')
  df.pop('CustoGolAway')
  
  return df

### Ajuste dos datasets de treino e teste

In [11]:
treino0 = pd.read_csv('../data/wnba/wnba-2019.csv')
treino1 = pd.read_csv('../data/wnba/wnba-2020.csv')
treino2 = pd.read_csv('../data/wnba/wnba-2021.csv')
treino3 = pd.read_csv('../data/wnba/wnba-2022.csv')
treino4 = pd.read_csv('../data/wnba/wnba-2023.csv')
treino = pd.concat([treino0, treino1, treino2, treino3, treino4])
treino.sort_values('Date', inplace=True)
flt = (treino['Home_Pts'] != treino['Away_Pts'])
treino = treino[flt]
treino.reset_index(inplace=True)

treino = prepara_df(treino)
treino.drop('index', axis=1, inplace=True)

colunas_float = ['Home_Pts', 'Away_Pts',
       'Odds_H', 'Odds_A', 'Over_Line', 'Odds_Over', 'Odds_Under', 'HA_Line',
       'HA_Odds_H', 'HA_Odds_A', 'Back_Home', 'PL_Home', 'PL_Over', 'PL_Under', 'P(H)', 'P(A)', 'P(O)',
       'P(U)', 'P_Diff', 'Porc_Over_Home', 'Porc_Over_Away', 'MediaCustoGolHome', 'MediaCustoGolAway', 'Last_CG_H', 'Last_CG_A', 'CV_ML', 'CV_Over']
treino[colunas_float] = treino[colunas_float].astype(float)

In [13]:
treino

Unnamed: 0,Date,Season,Season_Time,Time,Home,Away,Home_Pts,Away_Pts,Odds_H,Odds_A,Over_Line,Odds_Over,Odds_Under,HA_Line,HA_Odds_H,HA_Odds_A,Back_Home,PL_Home,Back_Away,Back_Over,PL_Over,PL_Under,Back_HA_H,PL_HA_H,PL_HA_A,P(H),P(A),P(O),P(U),P_Diff,Porc_Over_Home,Porc_Over_Away,Last_CG_H,Last_CG_A,MediaCustoGolHome,MediaCustoGolAway,Acima_Last_CG_H,Acima_Last_CG_A,Abaixo_Last_CG_H,Abaixo_Last_CG_A,CV_ML,CV_Over
0,2019-05-11,2019,WNBA - PRÉ-TEMPORADA,23:00,Phoenix Mercury F,Los Angeles Sparks F,82.0,75.0,1.71,2.00,157.5,1.83,1.83,-4.5,1.99,1.71,1.0,-1.00,0.0,0.0,-1.00,0.83,1.0,0.99,-1.00,0.584795,0.500000,0.546448,0.546448,0.084795,0.000000,0.000000,0.00,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.110545,0.000000
1,2019-05-13,2019,WNBA - PRÉ-TEMPORADA,20:00,Connecticut Sun F,New York Liberty F,100.0,66.0,1.34,3.15,155.0,1.81,1.89,-8.0,1.95,1.74,1.0,-1.00,0.0,1.0,0.81,-1.00,1.0,0.95,-1.00,0.746269,0.317460,0.552486,0.529101,0.063729,0.000000,0.000000,0.00,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.570095,0.030578
2,2019-05-13,2019,WNBA - PRÉ-TEMPORADA,18:00,Atlanta Dream F,Dallas Wings F,82.0,59.0,1.50,2.50,155.0,1.83,1.83,-7.0,1.97,1.73,1.0,-1.00,0.0,0.0,-1.00,0.83,1.0,0.97,-1.00,0.666667,0.400000,0.546448,0.546448,0.066667,0.000000,0.000000,0.00,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.353553,0.000000
3,2019-05-14,2019,WNBA - PRÉ-TEMPORADA,13:00,Chicago Sky F,Indiana Fever F,58.0,69.0,1.45,2.60,156.5,1.83,1.83,-5.5,1.99,1.71,0.0,1.60,1.0,0.0,-1.00,0.83,0.0,-1.00,0.71,0.689655,0.384615,0.546448,0.546448,0.074271,0.000000,0.000000,0.00,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.401567,0.000000
4,2019-05-15,2019,WNBA - PRÉ-TEMPORADA,23:00,Seattle Storm F,Phoenix Mercury F,84.0,87.0,2.20,1.62,155.5,1.83,1.83,1.5,1.97,1.73,0.0,0.62,1.0,1.0,0.83,-1.00,0.0,-1.00,0.73,0.454545,0.617284,0.546448,0.546448,0.071829,0.000000,0.000000,0.00,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.214724,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,2023-06-29,2023,WNBA,23:00,Las Vegas Aces F,New York Liberty F,98.0,81.0,1.36,3.30,172.0,1.91,1.91,-8.0,1.91,1.91,1.0,-1.00,0.0,1.0,0.91,-1.00,1.0,0.91,-1.00,0.735294,0.303030,0.523560,0.523560,0.038324,0.333333,1.000000,108.07,131.72,102.163333,139.626667,0.0,0.0,0.0,0.0,0.588750,0.000000
847,2023-06-30,2023,WNBA,21:00,Chicago Sky F,Los Angeles Sparks F,86.0,78.0,1.74,2.15,154.5,1.91,1.91,-3.0,1.91,1.91,1.0,-1.00,0.0,1.0,0.91,-1.00,1.0,0.91,-1.00,0.574713,0.465116,0.523560,0.523560,0.039829,0.666667,0.000000,135.20,141.75,158.766667,154.316667,0.0,0.0,0.0,0.0,0.149056,0.000000
848,2023-07-01,2023,WNBA,23:00,Phoenix Mercury F,Minnesota Lynx F,76.0,86.0,1.80,2.05,159.0,1.82,1.97,-999.0,1.92,1.92,0.0,1.05,1.0,1.0,0.82,-1.00,0.0,-1.00,0.92,0.555556,0.487805,0.549451,0.507614,0.043360,0.333333,0.333333,161.20,222.75,506.576667,365.683333,0.0,0.0,0.0,0.0,0.091832,0.055972
849,2023-07-01,2023,WNBA,16:00,Las Vegas Aces F,Connecticut Sun F,102.0,84.0,1.13,6.25,168.5,1.91,1.91,-12.0,2.00,1.79,1.0,-1.00,0.0,1.0,0.91,-1.00,1.0,1.00,-1.00,0.884956,0.160000,0.523560,0.523560,0.044956,0.666667,1.000000,133.28,137.06,113.310000,138.033333,0.0,0.0,0.0,0.0,0.981135,0.000000


In [17]:
treino.to_excel('WNBA-2019-2023.xlsx', index=False)