In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import selenium

import sys
sys.path.append('..')
from src.etl import *
from src.data.update_data import *

import optuna
%load_ext autoreload
%autoreload 2

from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb



In [2]:
Path.home().joinpath('NBA_Model_v1')

WindowsPath('C:/Users/Jordan Nishimura/NBA_Model_v1')

1) Update Data
2) Preprocess and reload into SQL DB
3) Pull Updated Preprocessed Data from SQL DB
4) Train Model on fully updated data
5) Pull Days Matchups
6) Predict on Days Matchups

## Update Data

In [110]:
db_path = Path.home() / 'NBA_Model_v1' / 'data' / 'nba.db'
season = 2021
update_all_data(db_path=db_path, season=season)



progress: 100%|██████████| 1/1 [00:12<00:00, 12.65s/it]


## Preprocess and Reload into SQL DB

In [111]:
%run ..\\src\\etl.py

Loading raw team boxscore data from sql database...
Loading betting data from sql database...
Cleaning Data...
Merging Boxscore and Betting Data...
Aggregating over last 5, 10, and 20 game windows
adding rest days
creating matchups between Home and Away team aggregated stats
Resorting by date
dropping nulls
loading table back into sql db as team_stats_ewa_matchup


In [170]:
def season_to_string(x):
    return str(x) + '-' + str(x+1)[-2:]

def get_training_data_all(target, con):   

    df = pd.read_sql('SELECT * FROM team_stats_ewa_matchup', con=con)
    df = df.drop(columns=['index'])

    df = df.sort_values('GAME_DATE')

    df = df.dropna()

    columns_to_drop = ['SEASON', 'HOME_TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP',
                        'HOME_HOME_GAME', 'HOME_TEAM_SCORE', 'HOME_ML', 'HOME_SPREAD',
                        'HOME_ATS_DIFF', 'HOME_TEAM_COVERED', 'HOME_POINT_DIFF',
                        'HOME_WL', 'AWAY_ML', 'AWAY_TEAM_SCORE',
                        'HOME_PTS_L5', 'HOME_PTS_L10', 'HOME_PTS_L20',
                        'HOME_PLUS_MINUS_L5', 'HOME_PLUS_MINUS_L10', 'HOME_PLUS_MINUS_L20',
                        'HOME_NET_RATING_L5', 'HOME_NET_RATING_L10', 'HOME_NET_RATING_L20',
                        'HOME_POSS_L5', 'HOME_POSS_L10', 'HOME_POSS_L20',
                        'HOME_PTS_opp_L5', 'HOME_PTS_opp_L10', 'HOME_PTS_opp_L20',
                        'HOME_PLUS_MINUS_opp_L5', 'HOME_PLUS_MINUS_opp_L10', 'HOME_PLUS_MINUS_opp_L20',
                        'HOME_NET_RATING_opp_L5', 'HOME_NET_RATING_opp_L10', 'HOME_NET_RATING_opp_L20',
                        'HOME_POSS_opp_L5', 'HOME_POSS_opp_L10', 'HOME_POSS_opp_L20',
                        'HOME_REB_L5', 'HOME_REB_L10', 'HOME_REB_L20',  
                        'HOME_REB_opp_L5', 'HOME_REB_opp_L10', 'HOME_REB_opp_L20',       
                        'AWAY_PTS_L5', 'AWAY_PTS_L10', 'AWAY_PTS_L20',
                        'AWAY_PLUS_MINUS_L5', 'AWAY_PLUS_MINUS_L10', 'AWAY_PLUS_MINUS_L20',
                        'AWAY_NET_RATING_L5', 'AWAY_NET_RATING_L10', 'AWAY_NET_RATING_L20',
                        'AWAY_POSS_L5', 'AWAY_POSS_L10', 'AWAY_POSS_L20',
                        'AWAY_PTS_opp_L5', 'AWAY_PTS_opp_L10', 'AWAY_PTS_opp_L20',
                        'AWAY_PLUS_MINUS_opp_L5', 'AWAY_PLUS_MINUS_opp_L10', 'AWAY_PLUS_MINUS_opp_L20',
                        'AWAY_NET_RATING_opp_L5', 'AWAY_NET_RATING_opp_L10', 'AWAY_NET_RATING_opp_L20',
                        'AWAY_POSS_opp_L5', 'AWAY_POSS_opp_L10', 'AWAY_POSS_opp_L20',
                        'AWAY_REB_L5', 'AWAY_REB_L10', 'AWAY_REB_L20',
                        'AWAY_REB_opp_L5', 'AWAY_REB_opp_L10', 'AWAY_REB_opp_L20']

    X_train = df.drop(columns=columns_to_drop)
    y_train = df[target]

    return X_train, y_train


In [171]:
target = ['HOME_TEAM_SCORE', 'AWAY_TEAM_SCORE']
db_filepath = Path.home().joinpath('NBA_model_v1', 'data', 'nba.db')
connection = sqlite3.connect(db_filepath)

X_train, y_train = get_training_data_all(target = target, con=connection)

In [172]:
X_train.shape

(10955, 566)

In [199]:
#load study with best hyperparameters
study_name = str(Path.home().joinpath('NBA_model_v1', 'models', 'hyperparameter_tuning', 'LGBMRegressor'))    
storage_name = "sqlite:///{}.db".format(study_name)

study = optuna.load_study(study_name = study_name, storage = storage_name)

params = study.best_params
print(params)

# instantiate model with hyperparameters
lgbr_model = MultiOutputRegressor(lgb.LGBMRegressor(**params))

## train model on full data

lgbr_model.fit(X_train, y_train)




{'boosting_type': 'gbdt', 'colsample_bytree': 0.7318848324101718, 'learning_rate': 0.10103401117148772, 'max_depth': 61, 'min_child_weight': 0.8780493419526695, 'num_leaves': 10, 'reg_alpha': 0.6129570091832104, 'reg_lambda': 8.769459451498605, 'subsample': 0.8136328018522077}


In [205]:
#load study with best hyperparameters
from sklearn import pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler




study_name = str(Path.home().joinpath('NBA_model_v1', 'models', 'hyperparameter_tuning', 'SGDRegressor_ScorePredictor'))    
storage_name = "sqlite:///{}.db".format(study_name)

study = optuna.load_study(study_name = study_name, storage = storage_name)

params = study.best_params
print(params)

# instantiate model with hyperparameters
sgd_model = MultiOutputRegressor(Pipeline([('scaler', StandardScaler()),
                                           ('sgd', SGDRegressor(**params, max_iter=10000))
                                           ]
                                          )
                                 )

## train model on full data

sgd_model.fit(X_train, y_train)



{'alpha': 0.04008569825513905, 'epsilon': 5.2966062017695155, 'l1_ratio': 0.9498835228685188, 'loss': 'huber'}


In [174]:
HOME_TEAM = 'SAS'
AWAY_TEAM = 'LAL'


In [216]:
import numpy as np
import pandas as pd
import sqlite3
from pathlib import Path
import warnings


def load_team_data(conn, start_season, end_season):
    """Loads basic, advanced, and scoring boxscores 
    from sqlite db and merges them into one dataframe"""
    

    basic = pd.read_sql("SELECT * FROM team_basic_boxscores", conn)
    adv = pd.read_sql("SELECT * FROM team_advanced_boxscores", conn)
    scoring = pd.read_sql("SELECT * FROM team_scoring_boxscores", conn)
    tracking = pd.read_sql("SELECT * FROM team_tracking_boxscores", conn)

    basic = basic.loc[basic['SEASON'].between(start_season, end_season)]
    basic[['GAME_ID', 'TEAM_ID']] = basic[['GAME_ID', 'TEAM_ID']].astype(str)
    adv[['GAME_ID', 'TEAM_ID']] = adv[['GAME_ID', 'TEAM_ID']].astype(str)
    scoring[['GAME_ID', 'TEAM_ID']] = scoring[['GAME_ID', 'TEAM_ID']].astype(str)
    tracking[['GAME_ID', 'TEAM_ID']] = tracking[['GAME_ID', 'TEAM_ID']].astype(str)

    df = pd.merge(basic, adv, how='left', on=[
                    'GAME_ID', 'TEAM_ID'], suffixes=['', '_y'])
    df = pd.merge(df, scoring, how='left', on=[
                  'GAME_ID', 'TEAM_ID'], suffixes=['', '_y'])
    
    df = pd.merge(df, tracking, how='left', on=['GAME_ID', 'TEAM_ID'],
                  suffixes=['', '_y'])
    

    df = df.drop(columns=['TEAM_NAME_y', 'TEAM_CITY',
                          'TEAM_ABBREVIATION_y',
                          'TEAM_CITY_y', 'MIN_y',
                          'FG_PCT_y', 'AST_y'])
    
    return df


def clean_team_data(df):
    """This function cleans the team_data
    1) Changes W/L to 1/0 
    2) Changes franchise abbreviations to their most 
    recent abbreviation for consistency
    3) Converts GAME_DATE to datetime object
    4) Creates a binary column 'HOME_GAME'
    5) Removes 3 games where advanced stats were not collected
    """
    df = df.copy()
    df['WL'] = (df['WL'] == 'W').astype(int)

    abbr_mapping = {'NJN': 'BKN',
                    'CHH': 'CHA',
                    'VAN': 'MEM',
                    'NOH': 'NOP',
                    'NOK': 'NOP',
                    'SEA': 'OKC'}

    df['TEAM_ABBREVIATION'] = df['TEAM_ABBREVIATION'].replace(abbr_mapping)
    df['MATCHUP'] = df['MATCHUP'].str.replace('NJN', 'BKN')
    df['MATCHUP'] = df['MATCHUP'].str.replace('CHH', 'CHA')
    df['MATCHUP'] = df['MATCHUP'].str.replace('VAN', 'MEM')
    df['MATCHUP'] = df['MATCHUP'].str.replace('NOH', 'NOP')
    df['MATCHUP'] = df['MATCHUP'].str.replace('NOK', 'NOP')
    df['MATCHUP'] = df['MATCHUP'].str.replace('SEA', 'OKC')

    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

    df['HOME_GAME'] = df['MATCHUP'].str.contains('vs').astype(int)

    return df


# from src.data.make_team_dataset import prep_for_aggregation

def prep_for_aggregation(df):
    """This function...
    1) Removes categories that are percentages,
    as we will be averaging them and do not want to average 
    percentages. 
    2) Converts shooting percentage stats into raw values"""
    df = df.copy()

    df = df.drop(columns=['FT_PCT', 'FG_PCT', 'FG3_PCT', 'DREB_PCT',
                          'OREB_PCT', 'REB_PCT', 'AST_PCT', 'AST_TOV',
                          'AST_RATIO', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                          'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT',
                          'PACE_PER40', 'MIN', 'PIE', 'CFG_PCT', 'UFG_PCT',
                          'DFG_PCT', 'E_OFF_RATING', 'E_DEF_RATING', 'E_NET_RATING'])

    df['FG2M'] = df['FGM'] - df['FG3M']
    df['FG2A'] = df['FGA'] - df['FG3A']
    df['PTS_2PT_MR'] = (df['PTS'] * df['PCT_PTS_2PT_MR']).astype('int8')
    df['PTS_FB'] = (df['PTS'] * df['PCT_PTS_FB']).astype('int8')
    df['PTS_OFF_TOV'] = (df['PTS'] * df['PCT_PTS_OFF_TOV']).astype('int8')
    df['PTS_PAINT'] = (df['PTS'] * df['PCT_PTS_PAINT']).astype('int8')
    df['AST_2PM'] = (df['FG2M'] * df['PCT_AST_2PM']).astype('int8')
    df['AST_3PM'] = (df['FG3M'] * df['PCT_AST_3PM']).astype('int8')
    df['UAST_2PM'] = (df['FG2M'] * df['PCT_UAST_2PM']).astype('int8')
    df['UAST_3PM'] = (df['FG3M'] * df['PCT_UAST_3PM']).astype('int8')

    df['POINT_DIFF'] = df['PLUS_MINUS']
    df['RECORD'] = df['WL']
    df['TEAM_SCORE'] = df['PTS']
    
    df = df.drop(columns = ['PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT',
                          'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 'PCT_PTS_FB',
                          'PCT_PTS_FT','PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT', 
                          'PCT_AST_2PM', 'PCT_UAST_2PM','PCT_AST_3PM',
                          'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM',
                          'E_PACE'])
    
    ## Reorder Columns
    
    df = df[['SEASON', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'TEAM_SCORE', 'WL', 'POINT_DIFF', 'HOME_GAME', 'RECORD',
       'FG2M', 'FG2A', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE',
       'POSS', 'DIST', 'ORBC', 'DRBC', 'RBC', 'TCHS', 'SAST', 'FTAST', 'PASS',
       'CFGM', 'CFGA', 'UFGM', 'UFGA', 'DFGM', 'DFGA', 'PTS_2PT_MR', 'PTS_FB', 'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
       'AST_3PM', 'UAST_2PM', 'UAST_3PM']]

    return df


def load_betting_data(conn):
    spreads = pd.read_sql("SELECT * FROM spreads", conn)
    moneylines = pd.read_sql("SELECT * FROM moneylines", conn)

    return spreads, moneylines


def convert_american_to_decimal(x):
    return np.where(x>0, (100+x)/100, 1+(100.0/-x))          


def clean_moneyline_df(df):
    abbr_mapping = {'Boston': 'BOS', 'Portland': 'POR',
                    'L.A. Lakers': 'LAL', 'Brooklyn': 'BKN',
                    'Cleveland': 'CLE', 'Toronto': 'TOR',
                    'Philadelphia': 'PHI', 'Memphis': 'MEM',
                    'Minnesota': 'MIN', 'New Orleans': 'NOP',
                    'Oklahoma City': 'OKC', 'Dallas': 'DAL',
                    'San Antonio': 'SAS', 'Denver': 'DEN',
                    'Golden State': 'GSW', 'L.A. Clippers': 'LAC',
                    'Orlando': 'ORL', 'Utah': 'UTA',
                    'Charlotte': 'CHA', 'Detroit': 'DET',
                    'Miami': 'MIA', 'Phoenix': 'PHX',
                    'Atlanta': 'ATL', 'New York': 'NYK',
                    'Indiana': 'IND', 'Chicago': 'CHI',
                    'Houston': 'HOU', 'Milwaukee': 'MIL',
                    'Sacramento': 'SAC', 'Washington': 'WAS'}

    df['HOME_TEAM'] = df['HOME_TEAM'].replace(abbr_mapping)
    df['AWAY_TEAM'] = df['AWAY_TEAM'].replace(abbr_mapping)

    away_mls = df['AWAY_ML'].str.split(",", expand=True)
    home_mls = df['HOME_ML'].str.split(",", expand=True)

    away_mls = away_mls.replace('-', np.nan).replace('', np.nan)
    away_mls = away_mls.fillna(value=np.nan)
    away_mls = away_mls.astype(float)

    home_mls = home_mls.replace('-', np.nan).replace('', np.nan)
    home_mls = home_mls.fillna(value=np.nan)
    home_mls = home_mls.astype(float)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
    
        highest_away_ml = away_mls.apply(lambda row: np.nanmax(
            abs(row)) if np.nanmax(row) > 0 else -np.nanmax(abs(row)), axis=1)
        highest_away_ml = convert_american_to_decimal(highest_away_ml)
        highest_away_ml = pd.DataFrame(
            highest_away_ml, columns=['HIGHEST_AWAY_ML'])

        highest_home_ml = home_mls.apply(lambda row: np.nanmax(
            abs(row)) if np.nanmax(row) > 0 else -np.nanmax(abs(row)), axis=1)
        highest_home_ml = convert_american_to_decimal(highest_home_ml)
        highest_home_ml = pd.DataFrame(
            highest_home_ml, columns=['HIGHEST_HOME_ML'])

    moneylines = pd.concat(
        [df.iloc[:, :4], highest_home_ml, highest_away_ml], axis=1)
    
    moneylines['GM_DATE'] = pd.to_datetime(moneylines['GM_DATE'])

    return moneylines

def clean_spreads_df(df):
    abbr_mapping = {'Boston': 'BOS', 'Portland': 'POR',
                    'L.A. Lakers': 'LAL', 'Brooklyn': 'BKN',
                    'Cleveland': 'CLE', 'Toronto': 'TOR',
                    'Philadelphia': 'PHI', 'Memphis': 'MEM',
                    'Minnesota': 'MIN', 'New Orleans': 'NOP',
                    'Oklahoma City': 'OKC', 'Dallas': 'DAL',
                    'San Antonio': 'SAS', 'Denver': 'DEN',
                    'Golden State': 'GSW', 'L.A. Clippers': 'LAC',
                    'Orlando': 'ORL', 'Utah': 'UTA',
                    'Charlotte': 'CHA', 'Detroit': 'DET',
                    'Miami': 'MIA', 'Phoenix': 'PHX',
                    'Atlanta': 'ATL', 'New York': 'NYK',
                    'Indiana': 'IND', 'Chicago': 'CHI',
                    'Houston': 'HOU', 'Milwaukee': 'MIL',
                    'Sacramento': 'SAC', 'Washington': 'WAS'}

    df['HOME_TEAM'] = df['HOME_TEAM'].replace(abbr_mapping)
    df['AWAY_TEAM'] = df['AWAY_TEAM'].replace(abbr_mapping)

    away_spreads = df['AWAY_SPREAD'].str.split(",", expand=True)
    home_spreads = df['HOME_SPREAD'].str.split(",", expand=True)

    for col in away_spreads.columns:
        away_spreads[col] = away_spreads[col].str[:-4]
        away_spreads[col] = away_spreads[col].str.replace('½', '.5')
        away_spreads[col] = away_spreads[col].str.replace('PK', '0')

        away_spreads[col] = away_spreads[col].astype(str).apply(
            lambda x: x if x == '' else (x[:-1] if x[-1] == '-' else x))

    away_spreads = away_spreads.replace('-', np.nan)
    away_spreads = away_spreads.replace('', np.nan)
    away_spreads = away_spreads.replace('None', np.nan)
    away_spreads = away_spreads.fillna(value=np.nan)

    away_spreads = away_spreads.astype(float)

    for col in home_spreads.columns:
        home_spreads[col] = home_spreads[col].str[:-4]
        home_spreads[col] = home_spreads[col].str.replace('½', '.5')
        home_spreads[col] = home_spreads[col].str.replace('PK', '0')

        home_spreads[col] = home_spreads[col].astype(str).apply(
            lambda x: x if x == '' else (x[:-1] if x[-1] == '-' else x))

    home_spreads = home_spreads.replace('-', np.nan).replace('', np.nan).replace('None', np.nan)
    home_spreads = home_spreads.fillna(value=np.nan)

    home_spreads = home_spreads.astype(float)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        highest_away_spread = away_spreads.apply(
            lambda row: -np.nanmax(abs(row)) if np.nanmax(row) < 0 else np.nanmax(abs(row)), axis=1)
        highest_away_spread = pd.DataFrame(
            highest_away_spread, columns=['HIGHEST_AWAY_SPREAD'])

        highest_home_spread = home_spreads.apply(
            lambda row: -np.nanmax(abs(row)) if np.nanmax(row) < 0 else np.nanmax(abs(row)), axis=1)
        highest_home_spread = pd.DataFrame(
            highest_home_spread, columns=['HIGHEST_HOME_SPREAD'])

    spreads = pd.concat(
        [df.iloc[:, :4], highest_home_spread, highest_away_spread], axis=1)
    spreads['GM_DATE'] = pd.to_datetime(spreads['GM_DATE'])

    return spreads


def merge_betting_and_boxscore_data(clean_spreads, clean_mls, clean_boxscores):
    clean_boxscores['HOME_TEAM'] = clean_boxscores['MATCHUP'].apply(
        lambda x: x[:3] if 'vs' in x else x[-3:])
    clean_boxscores['AWAY_TEAM'] = clean_boxscores['MATCHUP'].apply(
        lambda x: x[:3] if '@' in x else x[-3:])

    temp = pd.merge(clean_mls, clean_spreads, on=[
                    'SEASON', 'GM_DATE', 'HOME_TEAM', 'AWAY_TEAM'])

    merged_df = pd.merge(clean_boxscores, temp, how='left', 
                         left_on=['SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'GAME_DATE'],
                         right_on=['SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'GM_DATE'])

    merged_df['ML'] = merged_df.apply(lambda row: row['HIGHEST_HOME_ML'] if row['HOME_GAME'] == 1
                                      else row['HIGHEST_AWAY_ML'], axis=1)

    merged_df['SPREAD'] = merged_df.apply(lambda row: row['HIGHEST_HOME_SPREAD'] if row['HOME_GAME'] == 1
                                          else -row['HIGHEST_HOME_SPREAD'], axis=1)

    merged_df = merged_df.drop(columns=['HOME_TEAM', 'AWAY_TEAM', 'GM_DATE',
                                        'HIGHEST_HOME_ML', 'HIGHEST_AWAY_ML',
                                        'HIGHEST_HOME_SPREAD', 'HIGHEST_AWAY_SPREAD'])

    merged_df['ATS_DIFF'] = merged_df['POINT_DIFF'] + merged_df['SPREAD']

    merged_df['TEAM_COVERED'] = (merged_df['ATS_DIFF'] > 0).astype(int)
    

    return merged_df


def normalize_per_100_poss(df):
    df = df.copy(deep=True)
    
    df.iloc[:, 12:27] = 100*df.iloc[:, 12:27].div(df['PACE'], axis=0) 
    df.iloc[:,  34:-4] = 100*df.iloc[:, 34:-4].div(df['PACE'], axis=0) 
    
    return df


def create_matchups(df):
    """This function makes each row a matchup between 
    team and opp"""
    df = df.copy()
    

    matchups = pd.merge(df, df.iloc[:, :-4], on=['GAME_ID'], suffixes=['', '_opp'])
    matchups = matchups.loc[matchups['TEAM_ABBREVIATION'] != matchups['TEAM_ABBREVIATION_opp']]

    matchups = matchups.drop(columns = ['SEASON_opp', 'TEAM_ABBREVIATION_opp', 'GAME_DATE_opp',
                                        'MATCHUP_opp', 'HOME_GAME_opp', 'TEAM_NAME_opp', 
                                        'TEAM_ID_opp', 'WL_opp']
                             )
    
    matchups
    
    return matchups


def build_team_avg_stats_df(df: pd.DataFrame, span = 10) -> pd.DataFrame:    
    """This function finds the average for each team and opp statistic up to (and NOT including) the given date.
    """
    
    df = df.copy(deep=True)

    df = df.sort_values(['TEAM_ABBREVIATION', 'GAME_DATE']).reset_index(drop=True)

    

    drop_cols = ['TEAM_ID', 'TEAM_NAME', 'GAME_ID', 'MATCHUP', 
                 'HOME_GAME', 'TEAM_SCORE', 'ML', 'SPREAD', 
                'GAME_DATE', 'POINT_DIFF', 'WL', 'TEAM_SCORE_opp',
                'POINT_DIFF_opp', 'RECORD', 'RECORD_opp', 'TEAM_COVERED']

    stats = df.drop(columns=drop_cols)

    avg_stat_holder = []

    for stat in stats.columns[2:]:
        avg_stats = stats.groupby(['TEAM_ABBREVIATION'])[stat].ewm(span=span).mean().reset_index(drop=True)
        avg_stat_holder.append(avg_stats)
    
    
    matchup_info = df[['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE',
                          'GAME_ID', 'MATCHUP', 'HOME_GAME', 'TEAM_SCORE',
                          'ML', 'SPREAD', 'ATS_DIFF', 'RECORD', 'TEAM_COVERED', 
                          'POINT_DIFF', 'WL']]   

    avg_stats = pd.concat(avg_stat_holder, axis=1)
    

    avg_stats = avg_stats.rename(columns={'ATS_DIFF':'AVG_ATS_DIFF'})
    
    avg_stats = pd.concat([matchup_info, avg_stats], axis=1)
    
    avg_stats['WIN_PCT'] = avg_stats.groupby(['TEAM_ABBREVIATION'])['RECORD'].rolling(window=span).mean().values
    avg_stats['COVER_PCT'] = avg_stats.groupby(['TEAM_ABBREVIATION'])['TEAM_COVERED'].rolling(window=span).mean().values

    avg_stats = avg_stats.drop(columns='RECORD')

    avg_stats = avg_stats.sort_values(['TEAM_ABBREVIATION', 'GAME_DATE'])
    avg_stats.iloc[:, 14:] = avg_stats.iloc[:, 14:].shift(1).where(avg_stats['TEAM_ABBREVIATION'].eq(avg_stats['TEAM_ABBREVIATION'].shift()))

    avg_stats = avg_stats.add_suffix('_L{}'.format(span))
    
    avg_stats = avg_stats.rename(columns = {'SEASON_L{}'.format(span):'SEASON',
                                           'TEAM_ABBREVIATION_L{}'.format(span):'TEAM_ABBREVIATION',
                                           'GAME_DATE_L{}'.format(span):'GAME_DATE',
                                           'GAME_ID_L{}'.format(span):'GAME_ID',
                                           'MATCHUP_L{}'.format(span): 'MATCHUP', 
                                           'HOME_GAME_L{}'.format(span): 'HOME_GAME', 
                                           'TEAM_SCORE_L{}'.format(span):'TEAM_SCORE',
                                           'ML_L{}'.format(span):'ML', 
                                           'SPREAD_L{}'.format(span):'SPREAD',
                                           'ATS_DIFF_L{}'.format(span):'ATS_DIFF',
                                           'RECORD_L{}'.format(span):'RECORD', 
                                           'TEAM_COVERED_L{}'.format(span):'TEAM_COVERED',
                                           'POINT_DIFF_L{}'.format(span):'POINT_DIFF',
                                           'WL_L{}'.format(span):'WL'})
    
    return avg_stats


def add_percentage_features(df, span):
    """Add the following features for both team and opp:
    OREB_PCT, DREB_PCT, REB_PCT, TS_PCT, EFG_PCT, AST_RATIO, TOV_PCT, PIE.
    """
    
    df = df.copy()
    
    df['OREB_PCT_L{}'.format(span)] = df['OREB_L{}'.format(span)] / (df['OREB_L{}'.format(span)] + df['DREB_opp_L{}'.format(span)])
    df['OREB_PCT_opp_L{}'.format(span)] = df['OREB_opp_L{}'.format(span)] / (df['OREB_opp_L{}'.format(span)] + df['DREB_L{}'.format(span)])

    df['DREB_PCT_L{}'.format(span)] = df['DREB_L{}'.format(span)] / (df['DREB_L{}'.format(span)] + df['OREB_opp_L{}'.format(span)])
    df['DREB_PCT_opp_L{}'.format(span)] = df['DREB_opp_L{}'.format(span)] / (df['DREB_opp_L{}'.format(span)] + df['OREB_L{}'.format(span)])

    df['REB_PCT_L{}'.format(span)] = df['REB_L{}'.format(span)] / (df['REB_L{}'.format(span)] + df['REB_opp_L{}'.format(span)])
    df['REB_PCT_opp_L{}'.format(span)] = df['REB_opp_L{}'.format(span)] / (df['REB_opp_L{}'.format(span)] + df['REB_L{}'.format(span)])

    df['TS_PCT_L{}'.format(span)] = df['PTS_L{}'.format(span)] / ((2*(df['FG2A_L{}'.format(span)] + df['FG3A_L{}'.format(span)]) + 0.44*df['FTA_L{}'.format(span)]))
    
    df['TS_PCT_opp_L{}'.format(span)] = df['PTS_opp_L{}'.format(span)] / ((2*(df['FG2A_opp_L{}'.format(span)] + df['FG3A_opp_L{}'.format(span)]) + 0.44*df['FTA_opp_L{}'.format(span)]))

    df['EFG_PCT_L{}'.format(span)] = (df['FG2M_L{}'.format(span)] + 1.5*df['FG3M_L{}'.format(span)]) / (df['FG2A_L{}'.format(span)]
                                                                    + df['FG3A_L{}'.format(span)])
    df['EFG_PCT_opp_L{}'.format(span)] = (df['FG2M_opp_L{}'.format(span)] + 1.5*df['FG3M_opp_L{}'.format(span)]) / (df['FG2A_opp_L{}'.format(span)] 
                                                                 + df['FG3A_opp_L{}'.format(span)])

    df['AST_RATIO_L{}'.format(span)] = (df['AST_L{}'.format(span)] * 100) / df['PACE_L{}'.format(span)]
    df['AST_RATIO_opp_L{}'.format(span)] = (df['AST_opp_L{}'.format(span)] * 100) / df['PACE_opp_L{}'.format(span)]

    df['TOV_PCT_L{}'.format(span)] = 100*df['TOV_L{}'.format(span)] / (df['FG2A_L{}'.format(span)] 
                                               + df['FG3A_L{}'.format(span)] 
                                               + 0.44*df['FTA_L{}'.format(span)] 
                                               + df['TOV_L{}'.format(span)])
    
    df['TOV_PCT_opp_L{}'.format(span)] = 100*df['TOV_opp_L{}'.format(span)] / (df['FG2A_opp_L{}'.format(span)] 
                                             + df['FG3A_opp_L{}'.format(span)] 
                                             + 0.44*df['FTA_opp_L{}'.format(span)] 
                                             + df['TOV_opp_L{}'.format(span)])
    
    
    df['PIE_L{}'.format(span)] = ((df['PTS_L{}'.format(span)] + df['FG2M_L{}'.format(span)] + df['FG3M_L{}'.format(span)] + df['FTM_L{}'.format(span)] 
                 - df['FG2A_L{}'.format(span)] - df['FG3A_L{}'.format(span)] - df['FTA_L{}'.format(span)] 
                 + df['DREB_L{}'.format(span)] + df['OREB_L{}'.format(span)]/2
                + df['AST_L{}'.format(span)] + df['STL_L{}'.format(span)] + df['BLK_L{}'.format(span)]/2
                - df['PF_L{}'.format(span)] - df['TOV_L{}'.format(span)]) 
                 / (df['PTS_L{}'.format(span)] + df['PTS_opp_L{}'.format(span)] + df['FG2M_L{}'.format(span)] + df['FG2M_opp_L{}'.format(span)]
                   + df['FG3M_L{}'.format(span)] + df['FG3M_opp_L{}'.format(span)] + df['FTM_L{}'.format(span)] + df['FTM_opp_L{}'.format(span)]
                   - df['FG2A_L{}'.format(span)] - df['FG2A_opp_L{}'.format(span)] - df['FG3A_L{}'.format(span)] - df['FG3A_opp_L{}'.format(span)] 
                    - df['FTA_L{}'.format(span)] - df['FTA_opp_L{}'.format(span)] + df['DREB_L{}'.format(span)] + df['DREB_opp_L{}'.format(span)]
                    + (df['OREB_L{}'.format(span)]+df['OREB_opp_L{}'.format(span)])/2 + df['AST_L{}'.format(span)] + df['AST_opp_L{}'.format(span)]
                    + df['STL_L{}'.format(span)] + df['STL_opp_L{}'.format(span)] + (df['BLK_L{}'.format(span)] + df['BLK_opp_L{}'.format(span)])/2
                    - df['PF_L{}'.format(span)] - df['PF_opp_L{}'.format(span)] - df['TOV_L{}'.format(span)] - df['TOV_opp_L{}'.format(span)]))
        
    return df


def add_rest_days(df):
    
    df['prev_game'] = df.groupby(['SEASON', 'TEAM_ABBREVIATION'])['GAME_DATE'].shift(1)

    df['REST'] = (df['GAME_DATE'] - df['prev_game']) / np.timedelta64(1, 'D')
            
    df.loc[df['REST'] >= 8, 'REST'] = 8
    
    df = df.drop(columns=['prev_game'])
    
    return df


def season_to_string(x):
    return str(x) + '-' + str(x+1)[-2:]


def load_and_process_data(start_season, end_season):
    start_season = season_to_string(start_season)
    end_season = season_to_string(end_season)

    db_filepath = Path.home().joinpath('NBA_model_v1', 'data', 'nba.db')

    conn = sqlite3.connect(db_filepath)
    
    print("Loading raw team boxscore data from sql database...")
    
    df = load_team_data(conn, start_season, end_season)
    print("Loading betting data from sql database...")
    spreads, moneylines = load_betting_data(conn)
    
    print("Cleaning Data...")
    df = clean_team_data(df)
    df = prep_for_aggregation(df)

    clean_mls = clean_moneyline_df(df = moneylines)
    clean_spreads = clean_spreads_df(df = spreads)
    
    print("Merging Boxscore and Betting Data...")
    merged_df = merge_betting_and_boxscore_data(
        clean_spreads, clean_mls, clean_boxscores = df)
    
    
    stats_per_100 = normalize_per_100_poss(merged_df)

    print("Aggregating over last 5, 10, and 20 game windows")
    
    matchups = create_matchups(stats_per_100)
    
    team_stats_ewa_5 = build_team_avg_stats_df(matchups, span=5)
    team_stats_ewa_5 = add_percentage_features(team_stats_ewa_5, span=5)

    team_stats_ewa_10 = build_team_avg_stats_df(matchups, span=10)
    team_stats_ewa_10 = add_percentage_features(team_stats_ewa_10, span=10)

    team_stats_ewa_20 = build_team_avg_stats_df(matchups, span=20)
    team_stats_ewa_20 = add_percentage_features(team_stats_ewa_20, span=20)


    temp = pd.merge(team_stats_ewa_5, team_stats_ewa_10, how='inner',
                    on=['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE',
                        'GAME_ID', 'MATCHUP', 'HOME_GAME', 'TEAM_SCORE',
                        'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED', 
                        'POINT_DIFF', 'WL'])

    df_full = pd.merge(temp, team_stats_ewa_20, how='inner', 
                       on=['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE',
                            'GAME_ID', 'MATCHUP', 'HOME_GAME', 'TEAM_SCORE',
                            'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED', 
                            'POINT_DIFF', 'WL'])

    df_full = df_full.sort_values(['GAME_DATE', 'GAME_ID', 'HOME_GAME'])
    
    
    columns_to_drop = ['PTS_L5', 'PTS_L10', 'PTS_L20',
                        'PLUS_MINUS_L5', 'PLUS_MINUS_L10', 'PLUS_MINUS_L20',
                        'NET_RATING_L5', 'NET_RATING_L10', 'NET_RATING_L20',
                        'POSS_L5', 'POSS_L10', 'POSS_L20',
                        'REB_L5', 'REB_L10', 'REB_L20',
                        'REB_opp_L5', 'REB_opp_L10', 'REB_opp_L20',
                        'PTS_opp_L5', 'PTS_opp_L10', 'PTS_opp_L20',
                        'PLUS_MINUS_opp_L5', 'PLUS_MINUS_opp_L10', 'PLUS_MINUS_opp_L20',
                        'NET_RATING_opp_L5', 'NET_RATING_opp_L10', 'NET_RATING_opp_L20',
                        'POSS_opp_L5', 'POSS_opp_L10', 'POSS_opp_L20']
    
    df_full = df_full.drop(columns = columns_to_drop)
    
    print("adding rest days")
    df_full = add_rest_days(df_full)
    
    return df_full
    

def make_matchup_row(home_team, away_team, df):
    
    print("creating matchups between Home and Away team aggregated stats")

    matchup_info_cols = ['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP',
        'HOME_GAME', 'TEAM_SCORE', 'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED',
        'POINT_DIFF', 'WL']

    most_recent_home_stats = df.loc[df['TEAM_ABBREVIATION'] == home_team].tail(1).drop(columns=matchup_info_cols).values
    most_recent_away_stats = df.loc[df['TEAM_ABBREVIATION'] == away_team].tail(1).drop(columns=matchup_info_cols).values

    matchup_row = pd.DataFrame(np.concatenate([most_recent_home_stats, most_recent_away_stats], axis=1), columns=X_train.columns)
        
    return matchup_row

In [217]:
df_full = load_and_process_data(start_season = 2013, end_season = 2021)


# row = get_data_for_matchup(home_team='LAL', away_team='OKC', start_season=2013, end_season=2021, table_name='team_stats_ewa_matchup')

Loading raw team boxscore data from sql database...
Loading betting data from sql database...
Cleaning Data...
Merging Boxscore and Betting Data...
Aggregating over last 5, 10, and 20 game windows
adding rest days


In [219]:
home_team = 'GSW'
away_team = 'LAL'

row = make_matchup_row(home_team, away_team, df = df_full)
lgbr_model.predict(row)

creating matchups between Home and Away team aggregated stats


array([[110.81939504, 111.24257733]])

In [220]:
home_team = 'PHI'
away_team = 'BOS'

row = make_matchup_row(home_team, away_team, df = df_full)
lgbr_model.predict(row)

creating matchups between Home and Away team aggregated stats


array([[102.70060833, 100.81850387]])

In [221]:
home_team = 'LAL'
away_team = 'OKC'

row = make_matchup_row(home_team, away_team, df = df_full)
lgbr_model.predict(row)

creating matchups between Home and Away team aggregated stats


array([[113.77664723, 114.94091434]])

In [198]:
lgbr_model.predict(row)

array([[113.77664723, 114.94091434]])

In [184]:
home_team = 'LAL'
away_team = 'SAS'
start_season = 2013
end_season = 2021

start_season = season_to_string(start_season)
end_season = season_to_string(end_season)

db_filepath = Path.home().joinpath('NBA_model_v1', 'data', 'nba.db')

conn = sqlite3.connect(db_filepath)

print("Loading raw team boxscore data from sql database...")

df = load_team_data(conn, start_season, end_season)
print("Loading betting data from sql database...")
spreads, moneylines = load_betting_data(conn)

print("Cleaning Data...")

df = clean_team_data(df)
df = prep_for_aggregation(df)

clean_mls = clean_moneyline_df(df = moneylines)
clean_spreads = clean_spreads_df(df = spreads)


print("Merging Boxscore and Betting Data...")
merged_df = merge_betting_and_boxscore_data(
    clean_spreads, clean_mls, clean_boxscores = df)


stats_per_100 = normalize_per_100_poss(merged_df)

print("Aggregating over last 5, 10, and 20 game windows")

matchups = create_matchups(stats_per_100)

team_stats_ewa_5 = build_team_avg_stats_df(matchups, span=5)
team_stats_ewa_5 = add_percentage_features(team_stats_ewa_5, span=5)

team_stats_ewa_10 = build_team_avg_stats_df(matchups, span=10)
team_stats_ewa_10 = add_percentage_features(team_stats_ewa_10, span=10)

team_stats_ewa_20 = build_team_avg_stats_df(matchups, span=20)
team_stats_ewa_20 = add_percentage_features(team_stats_ewa_20, span=20)


temp = pd.merge(team_stats_ewa_5, team_stats_ewa_10, how='inner',
                on=['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE',
                    'GAME_ID', 'MATCHUP', 'HOME_GAME', 'TEAM_SCORE',
                    'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED', 
                    'POINT_DIFF', 'WL'])

df_full = pd.merge(temp, team_stats_ewa_20, how='inner', 
                    on=['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE',
                        'GAME_ID', 'MATCHUP', 'HOME_GAME', 'TEAM_SCORE',
                        'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED', 
                        'POINT_DIFF', 'WL'])

df_full = df_full.sort_values(['GAME_DATE', 'GAME_ID', 'HOME_GAME'])


columns_to_drop = ['PTS_L5', 'PTS_L10', 'PTS_L20',
                    'PLUS_MINUS_L5', 'PLUS_MINUS_L10', 'PLUS_MINUS_L20',
                    'NET_RATING_L5', 'NET_RATING_L10', 'NET_RATING_L20',
                    'POSS_L5', 'POSS_L10', 'POSS_L20',
                    'REB_L5', 'REB_L10', 'REB_L20',
                    'REB_opp_L5', 'REB_opp_L10', 'REB_opp_L20',
                    'PTS_opp_L5', 'PTS_opp_L10', 'PTS_opp_L20',
                    'PLUS_MINUS_opp_L5', 'PLUS_MINUS_opp_L10', 'PLUS_MINUS_opp_L20',
                    'NET_RATING_opp_L5', 'NET_RATING_opp_L10', 'NET_RATING_opp_L20',
                    'POSS_opp_L5', 'POSS_opp_L10', 'POSS_opp_L20']


df_full = df_full.drop(columns = columns_to_drop)

print("adding rest days")
df_full = add_rest_days(df_full)

print("creating matchups between Home and Away team aggregated stats")

matchup_info_cols = ['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP',
       'HOME_GAME', 'TEAM_SCORE', 'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED',
       'POINT_DIFF', 'WL']

most_recent_home_stats = df_full.loc[df_full['TEAM_ABBREVIATION'] == home_team].tail(1).drop(columns=matchup_info_cols).values
most_recent_away_stats = df_full.loc[df_full['TEAM_ABBREVIATION'] == away_team].tail(1).drop(columns=matchup_info_cols).values

matchup_row = pd.DataFrame(np.concatenate([most_recent_home_stats, most_recent_away_stats], axis=1), columns=X_train.columns)

Loading raw team boxscore data from sql database...
Loading betting data from sql database...
Cleaning Data...
Merging Boxscore and Betting Data...
Aggregating over last 5, 10, and 20 game windows
adding rest days


creating matchups between Home and Away team aggregated stats


Unnamed: 0,FG2M_L5,FG2A_L5,FG3M_L5,FG3A_L5,FTM_L5,FTA_L5,OREB_L5,DREB_L5,AST_L5,STL_L5,...,TS_PCT_L20,TS_PCT_opp_L20,EFG_PCT_L20,EFG_PCT_opp_L20,AST_RATIO_L20,AST_RATIO_opp_L20,TOV_PCT_L20,TOV_PCT_opp_L20,PIE_L20,REST
10810,29.598352,53.032355,8.836455,29.109184,23.71382,31.905133,7.354883,34.745477,21.104468,7.071961,...,0.614004,0.630166,0.54171,0.569435,23.057299,27.382987,12.22665,11.003484,0.461889,2.0


In [187]:
matchup_info_cols = ['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP',
       'HOME_GAME', 'TEAM_SCORE', 'ML', 'SPREAD', 'ATS_DIFF', 'TEAM_COVERED',
       'POINT_DIFF', 'WL']
most_recent_home_stats = df_full.loc[df_full['TEAM_ABBREVIATION'] == home_team].tail(1).drop(columns=matchup_info_cols)
most_recent_away_stats = df_full.loc[df_full['TEAM_ABBREVIATION'] == away_team].tail(1).drop(columns=matchup_info_cols)

most_recent_away_stats

Unnamed: 0,AWAY_FG2M_L5,AWAY_FG2A_L5,AWAY_FG3M_L5,AWAY_FG3A_L5,AWAY_FTM_L5,AWAY_FTA_L5,AWAY_OREB_L5,AWAY_DREB_L5,AWAY_AST_L5,AWAY_STL_L5,...,AWAY_TS_PCT_L20,AWAY_TS_PCT_opp_L20,AWAY_EFG_PCT_L20,AWAY_EFG_PCT_opp_L20,AWAY_AST_RATIO_L20,AWAY_AST_RATIO_opp_L20,AWAY_TOV_PCT_L20,AWAY_TOV_PCT_opp_L20,AWAY_PIE_L20,AWAY_REST
20663,29.125098,53.691494,11.348029,31.593497,20.697453,27.793212,6.999337,37.758586,25.913601,6.192778,...,0.601159,0.583452,0.533056,0.519636,27.158502,23.635871,11.279083,11.404486,0.525536,1.0


In [3]:
date = '2022-10-18'

web = 'https://www.sportsbookreview.com/betting-odds/nba-basketball/?date={}'.format(date)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(web)
sleep(random.randint(2,3))

# try:
#     single_row_events = driver.find_elements(By.CLASS_NAME, 'eventMarketGridContainer-3QipG')
    
# except:
#     print("No Data for {}".format(date))
#     dates_with_no_data.append(date)
#     continue
    
# num_postponed_events = len(driver.find_elements(By.CLASS_NAME, 'eventStatus-3EHqw'))

# num_listed_events = len(single_row_events)
# cutoff = num_listed_events - num_postponed_events

# for event in single_row_events[:cutoff]:

#     away_team = event.find_elements(By.CLASS_NAME, 'participantBox-3ar9Y')[0].text
#     home_team = event.find_elements(By.CLASS_NAME, 'participantBox-3ar9Y')[1].text
#     away_teams.append(away_team)
#     home_teams.append(home_team)
#     gm_dates.append(date)

#     seasons.append(season_string(season))
    
#     scoreboard = event.find_elements(By.CLASS_NAME, 'scoreboard-1TXQV')

#     home_score = []
#     away_score = []

#     for score in scoreboard:
#         quarters = score.find_elements(By.CLASS_NAME, 'scoreboardColumn-2OtpR')
#         for i in range(len(quarters)):
#             scores = quarters[i].text.split('\n')
#             away_score.append(scores[0])
#             home_score.append(scores[1])
            
#         home_score = ",".join(home_score)
#         away_score = ",".join(away_score)
        
#         away_scoreboards.append(away_score)
#         home_scoreboards.append(home_score)


#     if len(away_scoreboards) != len(away_teams):
#         num_to_add = len(away_teams) - len(away_scoreboards)
#         for i in range(num_to_add):
#             away_scoreboards.append('')
#             home_scoreboards.append('')

#     spreads = event.find_elements(By.CLASS_NAME, 'pointer-2j4Dk')
#     away_lines = []
#     home_lines = []
#     for i in range(len(spreads)):    
#         if i % 2 == 0:
#             away_lines.append(spreads[i].text)
#         else:
#             home_lines.append(spreads[i].text)
    
#     away_lines = ",".join(away_lines)
#     home_lines = ",".join(home_lines)
    
#     away_spreads.append(away_lines)
#     home_spreads.append(home_lines)

#     if len(away_spreads) != len(away_teams):
#         num_to_add = len(away_teams) - len(away_spreads)
#         for i in range(num_to_add):
#             away_scoreboards.append('')
#             home_scoreboards.append('')

# driver.quit()
# clear_output(wait=True)