In [1]:
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreadvancedv2
from nba_api.stats.endpoints import boxscorescoringv2

import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import time as time
from time import sleep
from datetime import date
from IPython.core.display import clear_output
import sqlite3

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait       
from selenium.webdriver.common.by import By       
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.headless = True

import random

# Data Refresh

In [2]:
def season_string(season):
    return str(season) + '-' + str(season+1)[-2:]

In [3]:
def update_team_basic_boxscores(conn, season):
    table_name = 'team_basic_boxscores'
    season_str = season_string(season)
        
    dfs = []
    for season_type in ['Regular Season', 'Playoffs']:
        team_gamelogs = leaguegamelog.LeagueGameLog(season=season_str, season_type_all_star=season_type).get_data_frames()[0]
        dfs.append(team_gamelogs)
        
    team_gamelogs_updated = pd.concat(dfs)
    team_gamelogs_updated['SEASON'] = season_str
    team_gamelogs_updated.drop(columns = ['SEASON_ID', 'VIDEO_AVAILABLE'], inplace=True)
    
    team_gamelogs_updated.to_sql(table_name, conn, if_exists='append', index=False)

    cur = conn.cursor()
    cur.execute('DELETE FROM {} WHERE rowid NOT IN (SELECT min(rowid) FROM {} GROUP BY TEAM_ID, GAME_ID)'.format(table_name, table_name))
    conn.commit()
    
    return None

In [4]:
def update_team_advanced_boxscores(conn, season, dates):
    table_name = 'team_advanced_boxscores'
    
    season_str = season_string(season)
    
    game_ids_not_added = []
    
    # Pull the GAME_IDs from my data
    game_ids_in_db = pd.read_sql('''SELECT DISTINCT team_basic_boxscores.GAME_ID FROM team_basic_boxscores
                INNER JOIN team_advanced_boxscores 
                ON team_basic_boxscores.GAME_ID = team_advanced_boxscores.GAME_ID
                AND team_basic_boxscores.TEAM_ID = team_advanced_boxscores.TEAM_ID
                WHERE SEASON = "{}" '''.format(season_str), conn)

    game_ids_in_db = game_ids_in_db['GAME_ID'].tolist()
    
    missing_game_ids = []
    if len(dates) != 0:
        for date in dates:
            gamelogs = leaguegamelog.LeagueGameLog(
                season=season_str, date_from_nullable=date, date_to_nullable=date).get_data_frames()[0]
            missing_game_ids.extend(gamelogs['GAME_ID'].unique())
            
    else:        
        # get up to date GAME_IDs
        to_date_game_ids = []
        for season_type in ['Regular Season', 'Playoffs']:
            to_date_gamelogs = leaguegamelog.LeagueGameLog(season=season_str, season_type_all_star=season_type).get_data_frames()[0]
            to_date_game_ids.extend(to_date_gamelogs['GAME_ID'].unique())
        
        # See which game_ids are missing
        missing_game_ids = set(to_date_game_ids) - set(game_ids_in_db)
        
    num_games_updated = len(missing_game_ids)
    print("num_games_updated:", num_games_updated)
    
    if num_games_updated == 0:
        print("All team advanced boxscores up to date in season {}".format(season_str))
        return None
    
    for game_id in tqdm(missing_game_ids, desc='progress'):
        try:
            boxscores = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id).get_data_frames()[1]
            boxscores.to_sql(table_name, conn, if_exists='append', index=False)
            sleep(2)
        except:
            game_ids_not_added.append(game_id)  
    
    cur = conn.cursor()
    cur.execute('DELETE FROM {} WHERE rowid NOT IN (SELECT max(rowid) FROM {} GROUP BY TEAM_ID, GAME_ID)'.format(table_name, table_name))
    conn.commit()
    
    return game_ids_not_added, missing_game_ids

In [7]:
#update boxscores with any missing data
date = date.today()
year = date.year
month = date.month

if month >= 9:
    season = year
else:
    season = year-1
    
con = sqlite3.connect('nba.db')
update_team_basic_boxscores(con, season)
try:
    game_ids_not_added, game_ids_added = update_team_advanced_boxscores(con, season, [])
    print('Number of games missing: ', len(game_ids_not_added))
except:
    print('Database already up to date')


num_games_updated: 0
All team advanced boxscores up to date in season 2021-22
Database already up to date


# Data Preprocessing

In [17]:
#pull in full updated datasets
team_basic_boxscores_df = pd.read_sql('select * from team_basic_boxscores', con)
team_advanced_boxscores_df = pd.read_sql('select * from team_advanced_boxscores', con)

team_boxscores_df = team_basic_boxscores_df.merge(team_advanced_boxscores_df, how='inner', on=['GAME_ID', 'TEAM_ID'])

team_boxscores_df.head()

Unnamed: 0,SEASON,TEAM_ID,TEAM_ABBREVIATION_x,TEAM_NAME_x,GAME_ID,GAME_DATE,MATCHUP,WL,MIN_x,FGM,...,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,2021-22,1610612747,LAL,Los Angeles Lakers,22100002,2021-10-19,LAL vs. GSW,L,240,45,...,16.1,0.553,0.551,1.0,0.198,115.28,112.5,93.75,112.0,0.422
1,2021-22,1610612744,GSW,Golden State Warriors,22100002,2021-10-19,GSW @ LAL,W,240,41,...,15.0,0.516,0.57,1.0,0.2,115.28,112.5,93.75,113.0,0.578
2,2021-22,1610612751,BKN,Brooklyn Nets,22100001,2021-10-19,BKN @ MIL,L,240,37,...,12.7,0.542,0.552,1.0,0.2,105.02,102.0,85.0,102.0,0.407
3,2021-22,1610612749,MIL,Milwaukee Bucks,22100001,2021-10-19,MIL vs. BKN,W,240,48,...,7.8,0.538,0.562,1.0,0.194,105.02,102.0,85.0,102.0,0.593
4,2021-22,1610612754,IND,Indiana Pacers,22100003,2021-10-20,IND @ CHA,L,240,42,...,15.9,0.561,0.607,1.0,0.199,112.22,106.5,88.75,107.0,0.52


In [18]:
#add home team flag
team_boxscores_df['HOME_TEAM'] = team_boxscores_df['MATCHUP'].str[4] == 'v'

team_boxscores_df['HOME_TEAM']

0         True
1        False
2        False
3         True
4        False
         ...  
56303     True
56304     True
56305    False
56306    False
56307     True
Name: HOME_TEAM, Length: 56308, dtype: bool

In [19]:
team_boxscores_df.drop(columns=['TEAM_ABBREVIATION_x',
                                'TEAM_NAME_x',
                                'MATCHUP',
                                'TEAM_NAME_y',
                                'TEAM_ABBREVIATION_y',
                                'MIN_y'], inplace=True)
team_boxscores_df.rename(columns={'MIN_x':'MIN'}, inplace=True)

In [20]:
#manually calculate estimation of missing rebound percentage stats
oreb_pct_calc = np.empty(len(team_boxscores_df))
dreb_pct_calc = np.empty(len(team_boxscores_df))
reb_pct_calc = np.empty(len(team_boxscores_df))

for i, row in tqdm(team_boxscores_df.iterrows()):
    game_id = row['GAME_ID']
    team_id = row['TEAM_ID']
    
    opp_row = team_boxscores_df[team_boxscores_df['GAME_ID'] == game_id]
    opp_row = opp_row[opp_row['TEAM_ID'] != team_id]
    
    oreb_pct_calc[i] = row['OREB'] / (row['OREB'] + opp_row['DREB'])
    dreb_pct_calc[i] = row['DREB'] / (row['DREB'] + opp_row['OREB'])
    reb_pct_calc[i] = row['REB'] / (row['REB'] + opp_row['REB'])

team_boxscores_df['OREB_PCT_CALC'] = oreb_pct_calc
team_boxscores_df['DREB_PCT_CALC'] = dreb_pct_calc
team_boxscores_df['REB_PCT_CALC'] = reb_pct_calc

56308it [10:56, 85.80it/s] 


In [21]:
#fill in missing rebound percentage stats with calculated values
rebound_pct_cols = ['OREB_PCT', 'DREB_PCT', 'REB_PCT']

for col in rebound_pct_cols:
    team_boxscores_df[col].fillna(value=team_boxscores_df[col + '_CALC'], inplace=True)

team_boxscores_df.drop(columns=['OREB_PCT_CALC',
                                'DREB_PCT_CALC',
                                'REB_PCT_CALC'], inplace=True)
team_boxscores_df.head()

Unnamed: 0,SEASON,TEAM_ID,GAME_ID,GAME_DATE,WL,MIN,FGM,FGA,FG_PCT,FG3M,...,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,HOME_TEAM
0,2021-22,1610612747,22100002,2021-10-19,L,240,45,95,0.474,15,...,0.553,0.551,1.0,0.198,115.28,112.5,93.75,112.0,0.422,True
1,2021-22,1610612744,22100002,2021-10-19,W,240,41,93,0.441,14,...,0.516,0.57,1.0,0.2,115.28,112.5,93.75,113.0,0.578,False
2,2021-22,1610612751,22100001,2021-10-19,L,240,37,84,0.44,17,...,0.542,0.552,1.0,0.2,105.02,102.0,85.0,102.0,0.407,False
3,2021-22,1610612749,22100001,2021-10-19,W,240,48,105,0.457,17,...,0.538,0.562,1.0,0.194,105.02,102.0,85.0,102.0,0.593,True
4,2021-22,1610612754,22100003,2021-10-20,L,240,42,90,0.467,17,...,0.561,0.607,1.0,0.199,112.22,106.5,88.75,107.0,0.52,False


In [22]:
#store actual point spread for each game
game_ids = team_boxscores_df['GAME_ID'].unique()
spreads = np.empty(len(game_ids))

for i, game_id in tqdm(enumerate(game_ids)):
    spread = team_boxscores_df[(team_boxscores_df['GAME_ID']==game_id) &
                               (team_boxscores_df['HOME_TEAM']==True)]['PLUS_MINUS']
    spreads[i] = spread

spreads_df = pd.DataFrame(data={'GAME_ID':game_ids, 'SPREAD':spreads})
spreads_df.head()

28154it [04:44, 98.98it/s] 


Unnamed: 0,GAME_ID,SPREAD
0,22100002,-7.0
1,22100001,23.0
2,22100003,1.0
3,22100004,-6.0
4,22100006,-15.0


In [23]:
#change W/L column to 1's and 0's
team_boxscores_df['WL'] = team_boxscores_df['WL'].map({'W':1, 'L':0})

#convert game date to datetime
team_boxscores_df['GAME_DATE'] = pd.to_datetime(team_boxscores_df['GAME_DATE'])

In [24]:
#elo rating helper functions
#credit to rogerfitz

def get_K(MOV, elo_diff):
    """This K multiplier """
    K_0 = 20    

    if MOV > 0:
        multiplier = (MOV+3)**(0.8)/(7.5+0.006*(elo_diff))
    else:
        multiplier = (-MOV+3)**(0.8)/(7.5+0.006*(-elo_diff))
        
    return K_0*multiplier, K_0*multiplier

def get_S(team_score, opp_score):
    """S is 1 if the team wins, and 0 if the team loses"""
    S_team, S_opp = 0, 0
    if team_score > opp_score:
        S_team = 1
    else:
        S_opp = 1
    return S_team, S_opp


def elo_prediction(team_rating, opp_rating):
    """Generate the probability of a home victory based on the teams' elo ratings"""
    E_team = 1.0/(1 + 10 ** ((opp_rating - team_rating) / (400.0)))
    return E_team

def elo_update(team_score, opp_score, team_rating, opp_rating):
    # Add 100 to the home_rating for home court advantage   
    team_rating += 100
    
    E_team = elo_prediction(team_rating, opp_rating)
    E_opp = 1.0 - E_team
    
    MOV = team_score - opp_score
    if MOV > 0:
        elo_diff = team_rating - opp_rating
    else:
        elo_diff = opp_rating - team_rating
            
    S_team, S_opp = get_S(team_score, opp_score)
    
    K_team, K_opp = get_K(MOV, elo_diff)

    return K_team*(S_team-E_team), K_opp*(S_opp-E_opp)
    

def season_reset(rating):
    new_rating = 0.75*rating + 0.25*1505
    return new_rating

In [25]:
def add_elo_ratings(df):
    df.sort_values(['GAME_DATE', 'GAME_ID', 'HOME_TEAM'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    elo_col = np.empty(df.shape[0])
    elo_dict = {}
    cur_season = df.iloc[0]['SEASON']
    
    for i, row in tqdm(df.iterrows()):
        
        if i%2 != 0:
            continue
        
        if row['SEASON'] != cur_season:
            cur_season = row['SEASON']
            elo_dict = {team_id:season_reset(elo) for team_id, elo in elo_dict.items()}
        
        away_row = row
        home_row = df.iloc[i+1]
        
        #check both rows are from same game
        if away_row['GAME_ID'] != home_row['GAME_ID']:
            print('game ids do not match')
            print('home game id: ', home_row['GAME_ID'])
            print('away game id: ', away_row['GAME_ID'])
            print('iteration: ', i)
            break
        
        home_team_id = home_row['TEAM_ID']
        away_team_id = away_row['TEAM_ID']
        
        if home_team_id not in elo_dict:
            elo_dict[home_team_id] = 1300
        if away_team_id not in elo_dict:
            elo_dict[away_team_id] = 1300
        
        home_elo = elo_dict[home_team_id]
        away_elo = elo_dict[away_team_id]

        elo_col[i+1] = home_elo
        elo_col[i] = away_elo
        
        home_pts = home_row['PTS']
        away_pts = away_row['PTS']

        home_elo_update, away_elo_update = elo_update(home_pts, away_pts, home_elo, away_elo)
        
        new_home_elo = home_elo + home_elo_update
        new_away_elo = away_elo + away_elo_update
        
        elo_dict[home_team_id] = new_home_elo
        elo_dict[away_team_id] = new_away_elo
    
    df['ELO'] = elo_col
        
    return df, elo_dict

In [26]:
#add elo ratings
team_boxscores_df, elo_dict = add_elo_ratings(team_boxscores_df)
team_boxscores_df.head()

56308it [00:12, 4405.56it/s]


Unnamed: 0,SEASON,TEAM_ID,GAME_ID,GAME_DATE,WL,MIN,FGM,FGA,FG_PCT,FG3M,...,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,HOME_TEAM,ELO
0,2000-01,1610612755,20000001,2000-10-31,1,240,38,66,0.576,3,...,0.638,1.0,0.2,86.88,88.0,73.33,88.0,0.819,False,1300.0
1,2000-01,1610612752,20000001,2000-10-31,0,240,25,70,0.357,3,...,0.447,1.0,0.201,86.88,88.0,73.33,88.0,0.181,True,1300.0
2,2000-01,1610612739,20000002,2000-10-31,1,240,32,78,0.41,2,...,0.467,1.0,0.202,99.76,98.0,81.67,98.0,0.512,False,1300.0
3,2000-01,1610612751,20000002,2000-10-31,0,240,31,85,0.365,3,...,0.425,1.0,0.196,99.76,98.0,81.67,98.0,0.488,True,1300.0
4,2000-01,1610612764,20000003,2000-10-31,0,240,33,72,0.458,4,...,0.521,1.0,0.195,100.54,98.0,81.67,98.0,0.415,False,1300.0


In [27]:
#sort dataframe by date
team_boxscores_df.sort_values(by=['GAME_DATE'], ascending=True, inplace=True)

team_boxscores_df.head()

Unnamed: 0,SEASON,TEAM_ID,GAME_ID,GAME_DATE,WL,MIN,FGM,FGA,FG_PCT,FG3M,...,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,HOME_TEAM,ELO
0,2000-01,1610612755,20000001,2000-10-31,1,240,38,66,0.576,3,...,0.638,1.0,0.2,86.88,88.0,73.33,88.0,0.819,False,1300.0
25,2000-01,1610612763,20000013,2000-10-31,1,240,39,90,0.433,4,...,0.463,1.0,0.201,98.56,97.0,80.83,97.0,0.581,True,1300.0
24,2000-01,1610612760,20000013,2000-10-31,0,240,32,84,0.381,8,...,0.47,1.0,0.199,98.56,97.0,80.83,97.0,0.419,False,1300.0
23,2000-01,1610612757,20000012,2000-10-31,0,240,34,85,0.4,4,...,0.467,1.0,0.2,88.62,87.5,72.92,88.0,0.361,True,1300.0
21,2000-01,1610612744,20000011,2000-10-31,1,240,32,79,0.405,4,...,0.501,1.0,0.202,103.76,104.0,86.67,104.0,0.526,True,1300.0


In [30]:
#take exponentially weighted moving average of stats for each game
num_games = 50

non_feature_cols = {'SEASON', 'TEAM_ID', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'TEAM_CITY', 'ELO'}
feature_cols = set(team_boxscores_df.columns) - non_feature_cols

weighted_avgs = []

for i, row in tqdm(team_boxscores_df.iterrows()):
    team_id = row['TEAM_ID']
    game_date = row['GAME_DATE']

    temp_df = team_boxscores_df[(team_boxscores_df['TEAM_ID'] == team_id) &
                                (team_boxscores_df['GAME_DATE'] < game_date)].copy()
    temp_df.sort_values(by=['GAME_DATE'], ascending=True, inplace=True)
    temp_df = temp_df.tail(num_games)
    
    if len(temp_df) < num_games:
        continue
    
    temp_df[list(feature_cols)] = temp_df[list(feature_cols)].ewm(span=num_games).mean()
    
    row[list(feature_cols)] = temp_df.iloc[-1]
    
    weighted_avgs.append(row)

weighted_avg_df = pd.DataFrame(weighted_avgs)
weighted_avg_df.reset_index(drop=True, inplace=True)
weighted_avg_df.head()

56308it [12:26, 75.47it/s]


Unnamed: 0,SEASON,TEAM_ID,GAME_ID,GAME_DATE,WL,MIN,FGM,FGA,FG_PCT,FG3M,...,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,HOME_TEAM,ELO
0,2000-01,1610612760,20000689,2001-02-05,0.569296,240.928247,36.836888,80.440563,0.459182,5.612216,...,0.536996,1.0,0.198435,95.770107,94.28911,78.574564,94.476716,0.497257,True,1321.428336
1,2000-01,1610612760,20000704,2001-02-07,0.592322,240.891845,36.964668,80.489445,0.460425,5.569799,...,0.536773,1.0,0.198366,95.543005,94.168661,78.474349,94.34891,0.501037,False,1325.541563
2,2000-01,1610612743,20000708,2001-02-08,0.523831,240.848196,36.659073,83.50921,0.438861,6.778872,...,0.522728,1.0,0.198908,95.604322,94.263939,78.553205,94.690775,0.491382,False,1312.520179
3,2000-01,1610612751,20000706,2001-02-08,0.313914,240.0,33.258465,79.768853,0.41608,4.035858,...,0.495208,1.0,0.198558,94.494865,93.587443,77.98997,93.638356,0.435177,True,1158.589256
4,2000-01,1610612766,20000707,2001-02-08,0.405933,243.483534,33.042004,78.836785,0.42157,3.881714,...,0.495746,1.0,0.199187,90.996865,90.377281,75.313878,91.692683,0.500157,True,1299.264022


In [31]:
#add number of rest days
rest_days = np.empty(weighted_avg_df.shape[0])

for i, row in tqdm(weighted_avg_df.iterrows()):
    game_date = row['GAME_DATE']
    team_id = row['TEAM_ID']
    rest_days_df = weighted_avg_df[(weighted_avg_df['TEAM_ID'] == team_id) &
                                     (weighted_avg_df['GAME_DATE'] < game_date)].copy()
    if len(rest_days_df) == 0:
        #assuming earliest game for each team was at the start of the 2000 season, so will assume 4 months rest since their last game of the 1999-2000 season
        rest_days[i] = 120
        continue
    
    rest_days_df.sort_values(by=['GAME_DATE'], ascending=False, inplace=True, ignore_index=True)
    last_game_date = rest_days_df.iloc[0]['GAME_DATE']
    
    delta = game_date - last_game_date
    rest_days[i] = delta.days
    
weighted_avg_df['REST_DAYS'] = rest_days

54808it [04:39, 195.96it/s]


In [None]:
#reformat so each game is represented by a single row which is the difference between each team's stats
game_ids = weighted_avg_df['GAME_ID'].unique()

revised_rows = []
missing_game_ids = []
missing_game_count = 0

feature_cols.add('ELO')

for game_id in tqdm(game_ids):
    
    home_team_row = weighted_avg_df[(weighted_avg_df['GAME_ID']==game_id) &
                                      (weighted_avg_df['HOME_TEAM']==True)]
    away_team_row = weighted_avg_df[(weighted_avg_df['GAME_ID']==game_id) &
                                      (weighted_avg_df['HOME_TEAM']==False)]
    
    try:
        stats_diff = home_team_row[feature_cols].subtract(np.array(away_team_row[feature_cols]))
        stats_diff[['SEASON','GAME_DATE','GAME_ID','HOME_TEAM_ID','HOME_TEAM_CITY']] = home_team_row[['SEASON',
                                                                                                      'GAME_DATE',
                                                                                                      'GAME_ID',
                                                                                                      'TEAM_ID',
                                                                                                      'TEAM_CITY']]
        revised_rows.append(stats_diff)
    
    except:
        missing_game_ids.append(game_id)
        missing_game_count += 1

final_df = pd.concat(revised_rows)
final_df.head()

print('Number of missing games: ', missing_game_count)

In [None]:
#add spread actuals
final_df = final_df.merge(spreads_df, how='inner', on=['GAME_ID'])
final_df.head()

In [None]:
def check_missing_vals(df):
    cols_w_missing_vals = []
    for col in df.columns:
        if df[col].isna().sum() != 0:
            cols_w_missing_vals.append(col)
    return cols_w_missing_vals

In [None]:
#check for missing values
if len(check_missing_vals(team_boxscores_df)) > 0:
    raise(ValueError('Data has missing values'))

In [None]:
#save updated data as csv
final_df.to_csv('training_data.csv')

In [None]:
#possibly add code to remove data that is more that 20 seasons old

# Pull Betting Spreads and Moneylines For Current Day's Games

In [14]:
#pull spreads and moneylines for the day
def pull_spreads(date):
    
    dates_with_no_data = []
    
    seasons = []
    gm_dates = []
    away_teams = []
    home_teams = []
    away_scoreboards = []
    home_scoreboards = []
    away_spreads = []
    home_spreads = []
    
    web = 'https://www.sportsbookreview.com/betting-odds/nba-basketball/?date={}'.format(date)
    path = '../Downloads/chromedriver'
    driver = webdriver.Chrome(path)
    driver.get(web)
    sleep(random.randint(1,2))

    try:
        single_row_events = driver.find_elements_by_class_name('eventMarketGridContainer-3QipG')

    except:
        print("No Data for {}".format(date))
        dates_with_no_data.append(date)

    num_postponed_events = len(driver.find_elements_by_class_name('eventStatus-3EHqw'))

    num_listed_events = len(single_row_events)
    cutoff = num_listed_events - num_postponed_events

    for event in single_row_events[:cutoff]:

        away_team = event.find_elements_by_class_name('participantBox-3ar9Y')[0].text
        home_team = event.find_elements_by_class_name('participantBox-3ar9Y')[1].text
        away_teams.append(away_team)
        home_teams.append(home_team)
        gm_dates.append(date)

        scoreboard = event.find_elements_by_class_name('scoreboard-1TXQV')

        home_score = []
        away_score = []

        for score in scoreboard:
            quarters = score.find_elements_by_class_name('scoreboardColumn-2OtpR')
            for i in range(len(quarters)):
                scores = quarters[i].text.split('\n')
                away_score.append(scores[0])
                home_score.append(scores[1])

            home_score = ",".join(home_score)
            away_score = ",".join(away_score)

            away_scoreboards.append(away_score)
            home_scoreboards.append(home_score)


        if len(away_scoreboards) != len(away_teams):
            num_to_add = len(away_teams) - len(away_scoreboards)
            for i in range(num_to_add):
                away_scoreboards.append('')
                home_scoreboards.append('')

        spreads = event.find_elements_by_class_name('pointer-2j4Dk')
        away_lines = []
        home_lines = []
        for i in range(len(spreads)):    
            if i % 2 == 0:
                away_lines.append(spreads[i].text)
            else:
                home_lines.append(spreads[i].text)

        away_lines = ",".join(away_lines)
        home_lines = ",".join(home_lines)

        away_spreads.append(away_lines)
        home_spreads.append(home_lines)

        if len(away_spreads) != len(away_teams):
            num_to_add = len(away_teams) - len(away_spreads)
            for i in range(num_to_add):
                away_scoreboards.append('')
                home_scoreboards.append('')

    driver.quit()
    clear_output(wait=True)

    df = pd.DataFrame({'GM_DATE':gm_dates,
                      'AWAY_TEAM':away_teams,
                      'HOME_TEAM':home_teams,
                      'AWAY_SCOREBOARD':away_scoreboards,
                      'HOME_SCOREBOARD':home_scoreboards,
                      'AWAY_SPREAD':away_spreads,
                      'HOME_SPREAD':home_spreads})

    df = df.sort_values(['GM_DATE']).reset_index(drop=True)

    
    return df

In [19]:
spreads_df = pull_spreads(date)
#spreads_df = pull_spreads('2022-06-16')

  driver = webdriver.Chrome(path)
  single_row_events = driver.find_elements_by_class_name('eventMarketGridContainer-3QipG')
  num_postponed_events = len(driver.find_elements_by_class_name('eventStatus-3EHqw'))


In [20]:
spreads_df

Unnamed: 0,GM_DATE,AWAY_TEAM,HOME_TEAM,AWAY_SCOREBOARD,HOME_SCOREBOARD,AWAY_SPREAD,HOME_SPREAD
0,2022-06-16,Golden State,Boston,,,"-,+4-113,-,+3½-118,-","-,-4-109,-,-3½-118,-"


In [None]:
#function to create model input for a given game
#i.e. turns any given game into the difference of each team's average stats and elo and adds rest days

def create_model_input(home_team, away_team, game_date):
    
    non_feature_cols = {'SEASON', 'TEAM_ID', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'TEAM_CITY', 'ELO'}
    
    
    #pull each team's last n games
    home_team_df = team_boxscores_df[team_boxscores_df['TEAM_CITY']==home_team].tail(num_games)
    away_team_df = team_boxscores_df[team_boxscores_df['TEAM_CITY']==away_team].tail(num_games)
    
    #compute weighted average of each team's stats over the last n games
    home_team_df[list(feature_cols)] = home_team_df[list(feature_cols)].ewm(span=num_games).mean()
    away_team_df[list(feature_cols)] = away_team_df[list(feature_cols)].ewm(span=num_games).mean()
    
    home_row = home_team_df.iloc[-1]
    away_row = away_team_df.iloc[-1]
    
    #add rest days
    home_row['REST_DAYS'] = game_date - home_row['GAME_DATE']
    away_row['REST_DAYS'] = game_date - away_row['GAME_DATE']
    
    non_feature_cols.remove('ELO')
    feature_cols = set(team_boxscores_df.columns) - non_feature_cols
    
    #compute difference between home team stats and away team stats
    output_row = home_row[feature_cols].subtract(np.array(away_row[feature_cols]))
    
    return output_row
    

In [None]:
#create model inputs for current day's games

test_data = np.empty(spreads_df.shape[0])

for i, game in spreads_df.iterrows():
    home_team = game['HOME_TEAM']
    away_team = game['AWAY_TEAM']
    game_date = game['GM_DATE']
    
    test_row = create_model_input(home_team, away_team, game_date)
    
test_df = pd.DataFrame(test_data)
test_df.head()

In [None]:
#add code to delete data earlier than 20 seasons ago

In [None]:
#evaluate available bets

In [None]:
#send email notification