In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import sqlite3
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from nba_api.stats.endpoints import leaguegamelog, playergamelogs, teamgamelogs
from time import sleep
import time

In [2]:
def load_team_data(connection, start_season, end_season):
    """Loads basic, advanced, and scoring boxscores 
    from sqlite db and merges them into one dataframe"""
    

    df = pd.read_sql("""SELECT b.*, E_OFF_RATING, E_DEF_RATING, E_NET_RATING, AST_PCT, AST_TOV, AST_RATIO, OREB_PCT, DREB_PCT, REB_PCT,
        E_TM_TOV_PCT, TM_TOV_PCT, EFG_PCT, TS_PCT, E_USG_PCT, E_PACE, POSS, PIE, PCT_FGA_2PT, PCT_FGA_3PT, PCT_PTS_2PT, PCT_PTS_2PT_MR, 
        PCT_PTS_3PT, PCT_AST_2PM, PCT_UAST_2PM, PCT_AST_3PM, PCT_UAST_3PM, 
        PCT_AST_FGM, PCT_UAST_FGM, contestedShots, contestedShots2pt, contestedShots3pt, deflections, chargesDrawn, screenAssists,
        screenAssistPoints, looseBallsRecoveredOffensive, looseBallsRecoveredDefensive, looseBallsRecoveredTotal, offensiveBoxOuts,
        defensiveBoxOuts, boxOutPlayerTeamRebounds, boxOutPlayerRebounds, boxOuts, DIST, ORBC, DRBC, RBC, TCHS, SAST, FTAST, PASS,
        CFGM, CFGA, CFG_PCT, UFGM, UFGA, UFG_PCT, DFGM, DFGA, DFG_PCT, PTS_OFF_TOV, PTS_2ND_CHANCE, PTS_FB, PTS_PAINT
        FROM team_basic_stats b 
        LEFT JOIN team_advanced_stats adv ON b.team_ID = adv.team_ID AND b.GAME_ID = adv.GAME_ID
        LEFT JOIN team_scoring_stats sco ON b.team_ID = sco.team_ID AND b.GAME_ID = sco.GAME_ID 
        LEFT JOIN team_hustle_stats hust ON b.team_ID = hust.teamId AND b.GAME_ID = hust.gameId
        LEFT JOIN team_track_stats track ON b.team_ID = track.team_ID AND b.GAME_ID = track.GAME_ID
        LEFT JOIN team_miscellaneous_stats misc ON b.team_ID = misc.team_ID AND b.GAME_ID = misc.GAME_ID
                                """, connection)
    
    df = df.loc[df['SEASON_YEAR'].between(start_season, end_season)]
    
    return df


start_season = '2016-17'
end_season = '2023-24'

connection = sqlite3.connect('nba_stats.db')
df = load_team_data(connection, start_season, end_season)
connection.close()

In [65]:
def clean_team_data(df):
    """This function cleans the team_data
    1) Changes W/L to 1/0 
    2) Changes franchise abbreviations to their most 
    recent abbreviation for consistency
    3) Converts GAME_DATE to datetime object
    4) Creates a binary column 'HOME_GAME'
    5) Removes 3 games where advanced stats were not collected
    """
    df = df.copy()
    df['WL'] = (df['WL'] == 'W').astype(int)
    df = df.rename(columns={'SEASON_YEAR': 'SEASON'})

    abbr_mapping = {'NJN': 'BKN',
                    'CHH': 'CHA',
                    'VAN': 'MEM',
                    'NOH': 'NOP',
                    'NOK': 'NOP',
                    'SEA': 'OKC'}

    df['TEAM_ABBREVIATION'] = df['TEAM_ABBREVIATION'].replace(abbr_mapping)
    df['MATCHUP'] = df['MATCHUP'].str.replace('NJN', 'BKN')
    df['MATCHUP'] = df['MATCHUP'].str.replace('CHH', 'CHA')
    df['MATCHUP'] = df['MATCHUP'].str.replace('VAN', 'MEM')
    df['MATCHUP'] = df['MATCHUP'].str.replace('NOH', 'NOP')
    df['MATCHUP'] = df['MATCHUP'].str.replace('NOK', 'NOP')
    df['MATCHUP'] = df['MATCHUP'].str.replace('SEA', 'OKC')

    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

    df['HOME_GAME'] = df['MATCHUP'].str.contains('vs').astype(int)

    return df


clean_df = clean_team_data(df)
clean_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,E_TM_TOV_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,E_USG_PCT,E_PACE,POSS,PIE,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,PCT_PTS_3PT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,DFGM,DFGA,DFG_PCT,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME
2632,22016,1610612739,CLE,Cleveland Cavaliers,0021600001,2016-10-25,CLE vs. NYK,1,240,45,94,0.479,13,35,0.371,14,19,0.737,11,40,51,31,12,5,15,22,117,29,2016-17,110.0,87.3,22.7,0.689,2.07,20.9,0.353,0.729,0.555,14.103,14.9,0.548,0.572,0.198,103.58,101,0.703,0.628,0.372,0.547,0.103,0.333,0.625,0.375,0.846,0.154,0.689,0.311,65.0,46.0,19.0,16.0,0.0,6.0,12.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,17.12,41.0,72.0,108.0,441.0,2.0,0.0,314.0,17.0,39.0,0.436,28.0,55.0,0.509,10.0,18.0,0.556,21.0,16.0,16.0,52.0,1
2633,22016,1610612752,NYK,New York Knicks,0021600001,2016-10-25,NYK @ CLE,0,240,32,87,0.368,9,27,0.333,15,20,0.750,13,29,42,17,6,6,18,22,88,-29,2016-17,87.3,110.0,-22.7,0.531,0.94,13.0,0.271,0.647,0.445,17.857,17.8,0.420,0.459,0.202,103.58,101,0.297,0.690,0.310,0.523,0.205,0.307,0.391,0.609,0.889,0.111,0.531,0.469,65.0,39.0,26.0,9.0,2.0,7.0,15.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,17.29,34.0,71.0,104.0,408.0,1.0,1.0,287.0,12.0,31.0,0.387,20.0,56.0,0.357,21.0,31.0,0.677,14.0,16.0,9.0,28.0,0
2634,22016,1610612757,POR,Portland Trail Blazers,0021600002,2016-10-25,POR vs. UTA,1,240,39,75,0.520,13,19,0.684,22,22,1.000,5,29,34,22,5,3,13,18,113,9,2016-17,121.9,107.2,14.8,0.564,1.69,18.4,0.222,0.738,0.500,14.027,14.1,0.607,0.667,0.197,94.86,92,0.548,0.747,0.253,0.460,0.159,0.345,0.500,0.500,0.692,0.308,0.564,0.436,62.0,44.0,18.0,9.0,0.0,18.0,43.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,16.65,19.0,54.0,73.0,362.0,0.0,0.0,255.0,19.0,43.0,0.442,20.0,32.0,0.625,14.0,20.0,0.700,20.0,15.0,4.0,34.0,1
2635,22016,1610612762,UTA,Utah Jazz,0021600002,2016-10-25,UTA @ POR,0,240,40,82,0.488,8,24,0.333,16,16,1.000,6,25,31,19,9,5,14,19,104,-9,2016-17,107.2,121.9,-14.8,0.475,1.36,15.6,0.262,0.778,0.500,14.427,15.4,0.537,0.584,0.190,94.86,91,0.452,0.707,0.293,0.615,0.135,0.231,0.344,0.656,1.000,0.000,0.475,0.525,54.0,44.0,10.0,16.0,0.0,10.0,20.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,16.76,19.0,47.0,66.0,425.0,2.0,1.0,310.0,22.0,44.0,0.500,18.0,38.0,0.474,11.0,18.0,0.611,8.0,4.0,8.0,50.0,0
2636,22016,1610612744,GSW,Golden State Warriors,0021600003,2016-10-25,GSW vs. SAS,0,240,40,85,0.471,7,33,0.212,13,18,0.722,8,27,35,24,11,6,16,19,100,-29,2016-17,99.1,125.9,-26.8,0.600,1.50,18.1,0.208,0.528,0.376,15.854,15.8,0.512,0.538,0.198,101.68,101,0.400,0.612,0.388,0.660,0.180,0.210,0.576,0.424,0.714,0.286,0.600,0.400,83.0,62.0,21.0,23.0,0.0,5.0,11.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,16.90,23.0,45.0,67.0,397.0,4.0,1.0,277.0,21.0,38.0,0.553,19.0,47.0,0.404,20.0,32.0,0.625,15.0,4.0,20.0,48.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23043,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,1,240,38,82,0.463,17,46,0.370,13,14,0.929,6,30,36,26,4,6,9,19,106,7,2023-24,116.3,104.2,12.1,0.684,2.89,21.1,0.136,0.750,0.457,9.873,9.8,0.567,0.601,0.200,93.10,92,0.544,0.439,0.561,0.396,0.057,0.481,0.619,0.381,0.765,0.235,0.684,0.316,45.0,29.0,16.0,8.0,0.0,7.0,16.0,1.0,4.0,5.0,0.0,7.0,6.0,3.0,7.0,16.52,12.0,60.0,72.0,405.0,2.0,0.0,296.0,15.0,26.0,0.577,23.0,56.0,0.411,17.0,28.0,0.607,13.0,6.0,12.0,36.0,0
23044,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,1,240,46,91,0.505,15,37,0.405,15,22,0.682,13,39,52,21,7,2,9,17,122,38,2023-24,126.2,87.8,38.4,0.457,2.33,16.1,0.319,0.882,0.612,9.309,9.4,0.588,0.606,0.197,96.20,96,0.701,0.593,0.407,0.508,0.016,0.369,0.258,0.742,0.867,0.133,0.457,0.543,37.0,20.0,17.0,15.0,0.0,11.0,28.0,3.0,2.0,5.0,7.0,5.0,11.0,8.0,12.0,17.63,26.0,59.0,85.0,349.0,1.0,2.0,232.0,17.0,35.0,0.486,29.0,56.0,0.518,11.0,21.0,0.524,17.0,16.0,11.0,60.0,1
23045,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,0,240,29,80,0.363,14,41,0.341,12,13,0.923,4,27,31,18,2,5,14,19,84,-38,2023-24,87.8,126.2,-38.4,0.621,1.29,15.3,0.118,0.681,0.388,14.626,14.6,0.450,0.490,0.199,96.20,96,0.299,0.488,0.513,0.357,0.048,0.500,0.533,0.467,0.714,0.286,0.621,0.379,44.0,24.0,20.0,11.0,0.0,6.0,17.0,1.0,2.0,3.0,2.0,3.0,5.0,3.0,5.0,17.26,18.0,55.0,71.0,404.0,6.0,2.0,290.0,11.0,27.0,0.407,18.0,53.0,0.340,20.0,29.0,0.690,9.0,2.0,6.0,26.0,0
23046,42023,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17,DAL @ BOS,0,240,35,78,0.449,11,37,0.297,7,13,0.538,7,28,35,18,4,4,13,20,88,-18,2023-24,98.1,115.5,-17.4,0.514,1.38,15.7,0.196,0.635,0.429,14.490,14.6,0.519,0.526,0.199,90.76,89,0.366,0.526,0.474,0.545,0.068,0.375,0.500,0.500,0.545,0.455,0.514,0.486,43.0,17.0,26.0,13.0,0.0,11.0,26.0,3.0,1.0,4.0,2.0,6.0,8.0,4.0,8.0,16.67,14.0,56.0,70.0,315.0,1.0,0.0,207.0,13.0,22.0,0.591,22.0,56.0,0.393,16.0,30.0,0.533,6.0,6.0,9.0,42.0,0


In [66]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
clean_df.isnull().sum()

SEASON_ID                         0
TEAM_ID                           0
TEAM_ABBREVIATION                 0
TEAM_NAME                         0
GAME_ID                           0
GAME_DATE                         0
MATCHUP                           0
WL                                0
MIN                               0
FGM                               0
FGA                               0
FG_PCT                            0
FG3M                              0
FG3A                              0
FG3_PCT                           0
FTM                               0
FTA                               0
FT_PCT                            1
OREB                              0
DREB                              0
REB                               0
AST                               0
STL                               0
BLK                               0
TOV                               0
PF                                0
PTS                               0
PLUS_MINUS                  

In [67]:
df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'SEASON_YEAR',
       'E_OFF_RATING', 'E_DEF_RATING', 'E_NET_RATING', 'AST_PCT', 'AST_TOV',
       'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT',
       'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'E_USG_PCT', 'E_PACE', 'POSS', 'PIE',
       'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT', 'PCT_PTS_2PT_MR',
       'PCT_PTS_3PT', 'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
       'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 'contestedShots',
       'contestedShots2pt', 'contestedShots3pt', 'deflections', 'chargesDrawn',
       'screenAssists', 'screenAssistPoints', 'looseBallsRecoveredOffensive',
       'looseBallsRecoveredDefensive', 'looseBallsRecoveredTotal',
       'offensiveBoxOuts', 'defensi

In [198]:
def prep_for_aggregation(df):
    """This function...
    1) Removes categories that are percentages,
    as we will be averaging them and do not want to average 
    percentages. 
    2) Converts shooting percentage stats into raw values"""
    df = df.copy()

    df['FG2M'] = df['FGM'] - df['FG3M']
    df['FG2A'] = df['FGA'] - df['FG3A']
    df['PTS_2PT_MR'] = (df['PTS'] * df['PCT_PTS_2PT_MR']).astype('int8')
    df['AST_2PM'] = (df['FG2M'] * df['PCT_AST_2PM']).astype('int8')
    df['AST_3PM'] = (df['FG3M'] * df['PCT_AST_3PM']).astype('int8')
    df['UAST_2PM'] = (df['FG2M'] * df['PCT_UAST_2PM']).astype('int8')
    df['UAST_3PM'] = (df['FG3M'] * df['PCT_UAST_3PM']).astype('int8')

    df['POINT_DIFF'] = df['PLUS_MINUS']
    df['RECORD'] = df['WL']
    df['TEAM_SCORE'] = df['PTS']
    
    df = df.drop(columns = ['PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT',
                          'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 'POSS',                       
                          'PCT_AST_2PM', 'PCT_UAST_2PM','PCT_AST_3PM',
                          'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM',
                          'FT_PCT', 'FG_PCT', 'FG3_PCT', 'DREB_PCT',
                          'OREB_PCT', 'REB_PCT', 'AST_PCT', 'AST_TOV', 
                          'AST_RATIO', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                          'EFG_PCT', 'TS_PCT', 'E_USG_PCT',
                           'MIN', 'PIE', 'CFG_PCT', 'UFG_PCT',
                          'DFG_PCT'])
    
    ## Reorder Columns


    return df


clean_df2 = prep_for_aggregation(clean_df)

clean_df2

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME,FG2M,FG2A,PTS_2PT_MR,AST_2PM,AST_3PM,UAST_2PM,UAST_3PM,POINT_DIFF,RECORD,TEAM_SCORE
2632,22016,1610612739,CLE,Cleveland Cavaliers,0021600001,2016-10-25,CLE vs. NYK,1,45,94,13,35,14,19,11,40,51,31,12,5,15,22,117,29,2016-17,110.0,87.3,22.7,103.58,65.0,46.0,19.0,16.0,0.0,6.0,12.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,17.12,41.0,72.0,108.0,441.0,2.0,0.0,314.0,17.0,39.0,28.0,55.0,10.0,18.0,21.0,16.0,16.0,52.0,1,32,59,12,20,10,12,2,29,1,117
2633,22016,1610612752,NYK,New York Knicks,0021600001,2016-10-25,NYK @ CLE,0,32,87,9,27,15,20,13,29,42,17,6,6,18,22,88,-29,2016-17,87.3,110.0,-22.7,103.58,65.0,39.0,26.0,9.0,2.0,7.0,15.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,17.29,34.0,71.0,104.0,408.0,1.0,1.0,287.0,12.0,31.0,20.0,56.0,21.0,31.0,14.0,16.0,9.0,28.0,0,23,60,18,8,8,14,0,-29,0,88
2634,22016,1610612757,POR,Portland Trail Blazers,0021600002,2016-10-25,POR vs. UTA,1,39,75,13,19,22,22,5,29,34,22,5,3,13,18,113,9,2016-17,121.9,107.2,14.8,94.86,62.0,44.0,18.0,9.0,0.0,18.0,43.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,16.65,19.0,54.0,73.0,362.0,0.0,0.0,255.0,19.0,43.0,20.0,32.0,14.0,20.0,20.0,15.0,4.0,34.0,1,26,56,17,13,8,13,4,9,1,113
2635,22016,1610612762,UTA,Utah Jazz,0021600002,2016-10-25,UTA @ POR,0,40,82,8,24,16,16,6,25,31,19,9,5,14,19,104,-9,2016-17,107.2,121.9,-14.8,94.86,54.0,44.0,10.0,16.0,0.0,10.0,20.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,16.76,19.0,47.0,66.0,425.0,2.0,1.0,310.0,22.0,44.0,18.0,38.0,11.0,18.0,8.0,4.0,8.0,50.0,0,32,58,14,11,8,20,0,-9,0,104
2636,22016,1610612744,GSW,Golden State Warriors,0021600003,2016-10-25,GSW vs. SAS,0,40,85,7,33,13,18,8,27,35,24,11,6,16,19,100,-29,2016-17,99.1,125.9,-26.8,101.68,83.0,62.0,21.0,23.0,0.0,5.0,11.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,16.90,23.0,45.0,67.0,397.0,4.0,1.0,277.0,21.0,38.0,19.0,47.0,20.0,32.0,15.0,4.0,20.0,48.0,1,33,52,18,19,4,13,2,-29,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23043,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,1,38,82,17,46,13,14,6,30,36,26,4,6,9,19,106,7,2023-24,116.3,104.2,12.1,93.10,45.0,29.0,16.0,8.0,0.0,7.0,16.0,1.0,4.0,5.0,0.0,7.0,6.0,3.0,7.0,16.52,12.0,60.0,72.0,405.0,2.0,0.0,296.0,15.0,26.0,23.0,56.0,17.0,28.0,13.0,6.0,12.0,36.0,0,21,36,6,12,13,8,3,7,1,106
23044,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,1,46,91,15,37,15,22,13,39,52,21,7,2,9,17,122,38,2023-24,126.2,87.8,38.4,96.20,37.0,20.0,17.0,15.0,0.0,11.0,28.0,3.0,2.0,5.0,7.0,5.0,11.0,8.0,12.0,17.63,26.0,59.0,85.0,349.0,1.0,2.0,232.0,17.0,35.0,29.0,56.0,11.0,21.0,17.0,16.0,11.0,60.0,1,31,54,1,7,13,23,1,38,1,122
23045,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,0,29,80,14,41,12,13,4,27,31,18,2,5,14,19,84,-38,2023-24,87.8,126.2,-38.4,96.20,44.0,24.0,20.0,11.0,0.0,6.0,17.0,1.0,2.0,3.0,2.0,3.0,5.0,3.0,5.0,17.26,18.0,55.0,71.0,404.0,6.0,2.0,290.0,11.0,27.0,18.0,53.0,20.0,29.0,9.0,2.0,6.0,26.0,0,15,39,4,7,9,7,4,-38,0,84
23046,42023,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17,DAL @ BOS,0,35,78,11,37,7,13,7,28,35,18,4,4,13,20,88,-18,2023-24,98.1,115.5,-17.4,90.76,43.0,17.0,26.0,13.0,0.0,11.0,26.0,3.0,1.0,4.0,2.0,6.0,8.0,4.0,8.0,16.67,14.0,56.0,70.0,315.0,1.0,0.0,207.0,13.0,22.0,22.0,56.0,16.0,30.0,6.0,6.0,9.0,42.0,0,24,41,5,12,5,12,5,-18,0,88


In [246]:
def find_high_correlations(df, threshold=0.7):
    # Select only numeric columns
    numeric_df = df.select_dtypes(include=[np.number])
    
    # Compute the correlation matrix
    correlation_matrix = numeric_df.corr()
    
    # Create a list to store highly correlated pairs
    high_correlations = []
    
    # Loop through the correlation matrix
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                col1 = correlation_matrix.columns[i]
                col2 = correlation_matrix.columns[j]
                corr_value = correlation_matrix.iloc[i, j]
                high_correlations.append((col1, col2, corr_value))
    
    # Sort the list by absolute correlation value in descending order
    high_correlations.sort(key=lambda x: abs(x[2]), reverse=True)
    
    return high_correlations

# Find highly correlated variables
high_corr_pairs = find_high_correlations(clean_df2, threshold=0.7)

# Print the results
print("Highly correlated variable pairs (correlation > 0.7):")
for col1, col2, corr in high_corr_pairs:
    print(f"{col1} and {col2}: {corr:.2f}")

Highly correlated variable pairs (correlation > 0.7):
REL_E_DEF_RATING_L10 and REL_E_OFF_RATING_opp_L10: 1.00
REL_PLUS_MINUS_L10 and REL_PLUS_MINUS_opp_L10: -1.00
REL_OREB_PCT_opp_L10 and REL_DREB_PCT_L10: -1.00
REL_E_NET_RATING_L10 and REL_E_NET_RATING_opp_L10: -1.00
REL_E_OFF_RATING_L10 and REL_E_DEF_RATING_opp_L10: 1.00
REL_E_PACE_opp_L10 and REL_E_PACE_L10: 1.00
REL_REB_PCT_L10 and REL_REB_PCT_opp_L10: -1.00
REL_OREB_PCT_L10 and REL_DREB_PCT_opp_L10: -1.00
REL_TOV_opp_L10 and REL_TOV_PCT_opp_L10: 0.99
REL_screenAssists_opp_L10 and REL_screenAssistPoints_opp_L10: 0.99
REL_PASS_L10 and REL_TCHS_L10: 0.99
REL_PLUS_MINUS_opp_L10 and REL_E_NET_RATING_opp_L10: 0.99
REL_PLUS_MINUS_L10 and REL_E_NET_RATING_opp_L10: -0.99
REL_E_NET_RATING_L10 and REL_PLUS_MINUS_opp_L10: -0.99
REL_E_NET_RATING_L10 and REL_PLUS_MINUS_L10: 0.99
REL_TOV_PCT_L10 and REL_TOV_L10: 0.98
REL_PASS_opp_L10 and REL_TCHS_opp_L10: 0.98
REL_screenAssists_L10 and REL_screenAssistPoints_L10: 0.98
REL_PTS_L10 and REL_E_DEF_R

In [247]:
high_corr_pairs

[('REL_E_DEF_RATING_L10', 'REL_E_OFF_RATING_opp_L10', 1.0),
 ('REL_PLUS_MINUS_L10', 'REL_PLUS_MINUS_opp_L10', -1.0),
 ('REL_OREB_PCT_opp_L10', 'REL_DREB_PCT_L10', -1.0),
 ('REL_E_NET_RATING_L10', 'REL_E_NET_RATING_opp_L10', -1.0),
 ('REL_E_OFF_RATING_L10', 'REL_E_DEF_RATING_opp_L10', 1.0),
 ('REL_E_PACE_opp_L10', 'REL_E_PACE_L10', 1.0),
 ('REL_REB_PCT_L10', 'REL_REB_PCT_opp_L10', -0.9999999999999998),
 ('REL_OREB_PCT_L10', 'REL_DREB_PCT_opp_L10', -0.9999999999999998),
 ('REL_TOV_opp_L10', 'REL_TOV_PCT_opp_L10', 0.9902291776746244),
 ('REL_screenAssists_opp_L10',
  'REL_screenAssistPoints_opp_L10',
  0.9890193015571056),
 ('REL_PASS_L10', 'REL_TCHS_L10', 0.9884518974321779),
 ('REL_PLUS_MINUS_opp_L10', 'REL_E_NET_RATING_opp_L10', 0.9854770826463026),
 ('REL_PLUS_MINUS_L10', 'REL_E_NET_RATING_opp_L10', -0.9854770826463026),
 ('REL_E_NET_RATING_L10', 'REL_PLUS_MINUS_opp_L10', -0.9854770826463026),
 ('REL_E_NET_RATING_L10', 'REL_PLUS_MINUS_L10', 0.9854770826463026),
 ('REL_TOV_PCT_L10', 'R

In [200]:
def normalize_per_100_poss(df):
    df = df.copy(deep=True)
    
    stats = ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM',
       'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'contestedShots',
       'contestedShots2pt', 'contestedShots3pt', 'deflections', 'chargesDrawn',
       'screenAssists', 'screenAssistPoints', 'looseBallsRecoveredOffensive',
       'looseBallsRecoveredDefensive', 'looseBallsRecoveredTotal',
       'offensiveBoxOuts', 'defensiveBoxOuts', 'boxOutPlayerTeamRebounds',
       'boxOutPlayerRebounds', 'boxOuts', 'DIST', 'ORBC', 'DRBC', 'RBC',
       'TCHS', 'SAST', 'FTAST', 'PASS', 'CFGM', 'CFGA', 'UFGM', 'UFGA', 'DFGM',
       'DFGA', 'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT',
        'FG2M', 'FG2A', 'PTS_2PT_MR', 'AST_2PM', 'AST_3PM',
       'UAST_2PM', 'UAST_3PM']
    
    df[stats] = 100*df[stats].div(df['E_PACE'], axis=0) 
    
    return df
 
normalized_df = normalize_per_100_poss(clean_df2)
normalized_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME,FG2M,FG2A,PTS_2PT_MR,AST_2PM,AST_3PM,UAST_2PM,UAST_3PM,POINT_DIFF,RECORD,TEAM_SCORE
2632,22016,1610612739,CLE,Cleveland Cavaliers,0021600001,2016-10-25,CLE vs. NYK,1,43.444680,90.751110,12.550685,33.790307,13.516123,18.343310,10.619811,38.617494,49.237304,29.928558,11.585248,4.827187,14.481560,21.239622,112.956169,27.997683,2016-17,110.0,87.3,22.7,103.58,62.753427,44.410118,18.343310,15.446997,0.000000,5.792624,11.585248,0.000000,0.000000,11.585248,0.000000,0.000000,0.000000,0.000000,0.000000,16.528287,39.582931,69.511489,104.267233,425.757868,1.930875,0.000000,303.147326,16.412435,37.652056,27.032246,53.099054,9.654373,17.377872,20.274184,15.446997,15.446997,50.202742,1,30.893995,56.960803,11.585248,19.308747,9.654373,11.585248,1.930875,29,1,117
2633,22016,1610612752,NYK,New York Knicks,0021600001,2016-10-25,NYK @ CLE,0,30.893995,83.993049,8.688936,26.066808,14.481560,19.308747,12.550685,27.997683,40.548368,16.412435,5.792624,5.792624,17.377872,21.239622,84.958486,-27.997683,2016-17,87.3,110.0,-22.7,103.58,62.753427,37.652056,25.101371,8.688936,1.930875,6.758061,14.481560,0.000000,0.000000,9.654373,0.000000,0.000000,0.000000,0.000000,0.000000,16.692412,32.824870,68.546051,100.405484,393.898436,0.965437,0.965437,277.080517,11.585248,29.928558,19.308747,54.064491,20.274184,29.928558,13.516123,15.446997,8.688936,27.032246,0,22.205059,57.926241,17.377872,7.723499,7.723499,13.516123,0.000000,-29,0,88
2634,22016,1610612757,POR,Portland Trail Blazers,0021600002,2016-10-25,POR vs. UTA,1,41.113219,79.063884,13.704406,20.029517,23.192073,23.192073,5.270926,30.571368,35.842294,23.192073,5.270926,3.162555,13.704406,18.975332,119.122918,9.487666,2016-17,121.9,107.2,14.8,94.86,65.359477,46.384145,18.975332,9.487666,0.000000,18.975332,45.329960,0.000000,0.000000,4.216740,0.000000,0.000000,0.000000,0.000000,0.000000,17.552182,20.029517,56.925996,76.955513,381.615012,0.000000,0.000000,268.817204,20.029517,45.329960,21.083702,33.733924,14.758592,21.083702,21.083702,15.812777,4.216740,35.842294,1,27.408813,59.034366,17.921147,13.704406,8.433481,13.704406,4.216740,9,1,113
2635,22016,1610612762,UTA,Utah Jazz,0021600002,2016-10-25,UTA @ POR,0,42.167405,86.443179,8.433481,25.300443,16.866962,16.866962,6.325111,26.354628,32.679739,20.029517,9.487666,5.270926,14.758592,20.029517,109.635252,-9.487666,2016-17,107.2,121.9,-14.8,94.86,56.925996,46.384145,10.541851,16.866962,0.000000,10.541851,21.083702,0.000000,0.000000,7.379296,0.000000,0.000000,0.000000,0.000000,0.000000,17.668143,20.029517,49.546700,69.576218,448.028674,2.108370,1.054185,326.797386,23.192073,46.384145,18.975332,40.059034,11.596036,18.975332,8.433481,4.216740,8.433481,52.709256,0,33.733924,61.142737,14.758592,11.596036,8.433481,21.083702,0.000000,-9,0,104
2636,22016,1610612744,GSW,Golden State Warriors,0021600003,2016-10-25,GSW vs. SAS,0,39.339103,83.595594,6.884343,32.454760,12.785208,17.702596,7.867821,26.553895,34.421715,23.603462,10.818253,5.900865,15.735641,18.686074,98.347758,-28.520850,2016-17,99.1,125.9,-26.8,101.68,81.628639,60.975610,20.653029,22.619984,0.000000,4.917388,10.818253,0.000000,0.000000,7.867821,0.000000,0.000000,0.000000,0.000000,0.000000,16.620771,22.619984,44.256491,65.892998,390.440598,3.933910,0.983478,272.423289,20.653029,37.372148,18.686074,46.223446,19.669552,31.471282,14.752164,3.933910,19.669552,47.206924,1,32.454760,51.140834,17.702596,18.686074,3.933910,12.785208,1.966955,-29,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23043,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,1,40.816327,88.077336,18.259936,49.409237,13.963480,15.037594,6.444683,32.223416,38.668099,27.926960,4.296455,6.444683,9.667025,20.408163,113.856069,7.518797,2023-24,116.3,104.2,12.1,93.10,48.335124,31.149302,17.185822,8.592911,0.000000,7.518797,17.185822,1.074114,4.296455,5.370569,0.000000,7.518797,6.444683,3.222342,7.518797,17.744361,12.889366,64.446831,77.336198,435.016112,2.148228,0.000000,317.937701,16.111708,27.926960,24.704619,60.150376,18.259936,30.075188,13.963480,6.444683,12.889366,38.668099,0,22.556391,38.668099,6.444683,12.889366,13.963480,8.592911,3.222342,7,1,106
23044,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,1,47.817048,94.594595,15.592516,38.461538,15.592516,22.869023,13.513514,40.540541,54.054054,21.829522,7.276507,2.079002,9.355509,17.671518,126.819127,39.501040,2023-24,126.2,87.8,38.4,96.20,38.461538,20.790021,17.671518,15.592516,0.000000,11.434511,29.106029,3.118503,2.079002,5.197505,7.276507,5.197505,11.434511,8.316008,12.474012,18.326403,27.027027,61.330561,88.357588,362.785863,1.039501,2.079002,241.164241,17.671518,36.382536,30.145530,58.212058,11.434511,21.829522,17.671518,16.632017,11.434511,62.370062,1,32.224532,56.133056,1.039501,7.276507,13.513514,23.908524,1.039501,38,1,122
23045,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,0,30.145530,83.160083,14.553015,42.619543,12.474012,13.513514,4.158004,28.066528,32.224532,18.711019,2.079002,5.197505,14.553015,19.750520,87.318087,-39.501040,2023-24,87.8,126.2,-38.4,96.20,45.738046,24.948025,20.790021,11.434511,0.000000,6.237006,17.671518,1.039501,2.079002,3.118503,2.079002,3.118503,5.197505,3.118503,5.197505,17.941788,18.711019,57.172557,73.804574,419.958420,6.237006,2.079002,301.455301,11.434511,28.066528,18.711019,55.093555,20.790021,30.145530,9.355509,2.079002,6.237006,27.027027,0,15.592516,40.540541,4.158004,7.276507,9.355509,7.276507,4.158004,-38,0,84
23046,42023,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17,DAL @ BOS,0,38.563244,85.940943,12.119877,40.766858,7.712649,14.323491,7.712649,30.850595,38.563244,19.832525,4.407228,4.407228,14.323491,22.036139,96.959013,-19.832525,2023-24,98.1,115.5,-17.4,90.76,47.377699,18.730718,28.646981,14.323491,0.000000,12.119877,28.646981,3.305421,1.101807,4.407228,2.203614,6.610842,8.814456,4.407228,8.814456,18.367122,15.425297,61.701190,77.126487,347.069193,1.101807,0.000000,228.074041,14.323491,24.239753,24.239753,61.701190,17.628911,33.054209,6.610842,6.610842,9.916263,46.275892,0,26.443367,45.174086,5.509035,13.221684,5.509035,13.221684,5.509035,-18,0,88


In [98]:
normalized_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM',
       'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'SEASON', 'E_PACE', 'contestedShots', 'contestedShots2pt',
       'contestedShots3pt', 'deflections', 'chargesDrawn', 'screenAssists',
       'screenAssistPoints', 'looseBallsRecoveredOffensive',
       'looseBallsRecoveredDefensive', 'looseBallsRecoveredTotal',
       'offensiveBoxOuts', 'defensiveBoxOuts', 'boxOutPlayerTeamRebounds',
       'boxOutPlayerRebounds', 'boxOuts', 'DIST', 'ORBC', 'DRBC', 'RBC',
       'TCHS', 'SAST', 'FTAST', 'PASS', 'CFGM', 'CFGA', 'UFGM', 'UFGA', 'DFGM',
       'DFGA', 'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT',
       'HOME_GAME', 'FG2M', 'FG2A', 'PTS_2PT_MR', 'AST_2PM', 'AST_3PM',
       'UAST_2PM', 'UAST_3PM', 'POINT_DIFF', 'RECORD', 'TEAM_SCORE'],
      dtype='object')

## Add Betting Data

In [201]:
def load_betting_data(conn):
    betting_data = pd.read_sql("SELECT * FROM nba_odds", conn)
    betting_data = betting_data[['season', 'date', 'status', 'home_team_abbr', 'away_team_abbr', 'home_spread_draftkings',
                                'home_spread_odds_draftkings',
                                'away_spread_draftkings',
                                'away_spread_odds_draftkings',
                                'home_ml_draftkings',
                                'away_ml_draftkings',
                                'total_draftkings',
                                'over_odds_draftkings',
                                'under_odds_draftkings']]
    return betting_data


connection = sqlite3.connect("nba_stats.db")
betting_data = load_betting_data(connection)
betting_data

connection.close()

In [202]:
def convert_american_to_decimal(x):
    return np.where(x>0, (100+x)/100, 1+(100.0/-x))          

In [203]:
normalized_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME,FG2M,FG2A,PTS_2PT_MR,AST_2PM,AST_3PM,UAST_2PM,UAST_3PM,POINT_DIFF,RECORD,TEAM_SCORE
2632,22016,1610612739,CLE,Cleveland Cavaliers,0021600001,2016-10-25,CLE vs. NYK,1,43.444680,90.751110,12.550685,33.790307,13.516123,18.343310,10.619811,38.617494,49.237304,29.928558,11.585248,4.827187,14.481560,21.239622,112.956169,27.997683,2016-17,110.0,87.3,22.7,103.58,62.753427,44.410118,18.343310,15.446997,0.000000,5.792624,11.585248,0.000000,0.000000,11.585248,0.000000,0.000000,0.000000,0.000000,0.000000,16.528287,39.582931,69.511489,104.267233,425.757868,1.930875,0.000000,303.147326,16.412435,37.652056,27.032246,53.099054,9.654373,17.377872,20.274184,15.446997,15.446997,50.202742,1,30.893995,56.960803,11.585248,19.308747,9.654373,11.585248,1.930875,29,1,117
2633,22016,1610612752,NYK,New York Knicks,0021600001,2016-10-25,NYK @ CLE,0,30.893995,83.993049,8.688936,26.066808,14.481560,19.308747,12.550685,27.997683,40.548368,16.412435,5.792624,5.792624,17.377872,21.239622,84.958486,-27.997683,2016-17,87.3,110.0,-22.7,103.58,62.753427,37.652056,25.101371,8.688936,1.930875,6.758061,14.481560,0.000000,0.000000,9.654373,0.000000,0.000000,0.000000,0.000000,0.000000,16.692412,32.824870,68.546051,100.405484,393.898436,0.965437,0.965437,277.080517,11.585248,29.928558,19.308747,54.064491,20.274184,29.928558,13.516123,15.446997,8.688936,27.032246,0,22.205059,57.926241,17.377872,7.723499,7.723499,13.516123,0.000000,-29,0,88
2634,22016,1610612757,POR,Portland Trail Blazers,0021600002,2016-10-25,POR vs. UTA,1,41.113219,79.063884,13.704406,20.029517,23.192073,23.192073,5.270926,30.571368,35.842294,23.192073,5.270926,3.162555,13.704406,18.975332,119.122918,9.487666,2016-17,121.9,107.2,14.8,94.86,65.359477,46.384145,18.975332,9.487666,0.000000,18.975332,45.329960,0.000000,0.000000,4.216740,0.000000,0.000000,0.000000,0.000000,0.000000,17.552182,20.029517,56.925996,76.955513,381.615012,0.000000,0.000000,268.817204,20.029517,45.329960,21.083702,33.733924,14.758592,21.083702,21.083702,15.812777,4.216740,35.842294,1,27.408813,59.034366,17.921147,13.704406,8.433481,13.704406,4.216740,9,1,113
2635,22016,1610612762,UTA,Utah Jazz,0021600002,2016-10-25,UTA @ POR,0,42.167405,86.443179,8.433481,25.300443,16.866962,16.866962,6.325111,26.354628,32.679739,20.029517,9.487666,5.270926,14.758592,20.029517,109.635252,-9.487666,2016-17,107.2,121.9,-14.8,94.86,56.925996,46.384145,10.541851,16.866962,0.000000,10.541851,21.083702,0.000000,0.000000,7.379296,0.000000,0.000000,0.000000,0.000000,0.000000,17.668143,20.029517,49.546700,69.576218,448.028674,2.108370,1.054185,326.797386,23.192073,46.384145,18.975332,40.059034,11.596036,18.975332,8.433481,4.216740,8.433481,52.709256,0,33.733924,61.142737,14.758592,11.596036,8.433481,21.083702,0.000000,-9,0,104
2636,22016,1610612744,GSW,Golden State Warriors,0021600003,2016-10-25,GSW vs. SAS,0,39.339103,83.595594,6.884343,32.454760,12.785208,17.702596,7.867821,26.553895,34.421715,23.603462,10.818253,5.900865,15.735641,18.686074,98.347758,-28.520850,2016-17,99.1,125.9,-26.8,101.68,81.628639,60.975610,20.653029,22.619984,0.000000,4.917388,10.818253,0.000000,0.000000,7.867821,0.000000,0.000000,0.000000,0.000000,0.000000,16.620771,22.619984,44.256491,65.892998,390.440598,3.933910,0.983478,272.423289,20.653029,37.372148,18.686074,46.223446,19.669552,31.471282,14.752164,3.933910,19.669552,47.206924,1,32.454760,51.140834,17.702596,18.686074,3.933910,12.785208,1.966955,-29,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23043,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,1,40.816327,88.077336,18.259936,49.409237,13.963480,15.037594,6.444683,32.223416,38.668099,27.926960,4.296455,6.444683,9.667025,20.408163,113.856069,7.518797,2023-24,116.3,104.2,12.1,93.10,48.335124,31.149302,17.185822,8.592911,0.000000,7.518797,17.185822,1.074114,4.296455,5.370569,0.000000,7.518797,6.444683,3.222342,7.518797,17.744361,12.889366,64.446831,77.336198,435.016112,2.148228,0.000000,317.937701,16.111708,27.926960,24.704619,60.150376,18.259936,30.075188,13.963480,6.444683,12.889366,38.668099,0,22.556391,38.668099,6.444683,12.889366,13.963480,8.592911,3.222342,7,1,106
23044,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,1,47.817048,94.594595,15.592516,38.461538,15.592516,22.869023,13.513514,40.540541,54.054054,21.829522,7.276507,2.079002,9.355509,17.671518,126.819127,39.501040,2023-24,126.2,87.8,38.4,96.20,38.461538,20.790021,17.671518,15.592516,0.000000,11.434511,29.106029,3.118503,2.079002,5.197505,7.276507,5.197505,11.434511,8.316008,12.474012,18.326403,27.027027,61.330561,88.357588,362.785863,1.039501,2.079002,241.164241,17.671518,36.382536,30.145530,58.212058,11.434511,21.829522,17.671518,16.632017,11.434511,62.370062,1,32.224532,56.133056,1.039501,7.276507,13.513514,23.908524,1.039501,38,1,122
23045,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,0,30.145530,83.160083,14.553015,42.619543,12.474012,13.513514,4.158004,28.066528,32.224532,18.711019,2.079002,5.197505,14.553015,19.750520,87.318087,-39.501040,2023-24,87.8,126.2,-38.4,96.20,45.738046,24.948025,20.790021,11.434511,0.000000,6.237006,17.671518,1.039501,2.079002,3.118503,2.079002,3.118503,5.197505,3.118503,5.197505,17.941788,18.711019,57.172557,73.804574,419.958420,6.237006,2.079002,301.455301,11.434511,28.066528,18.711019,55.093555,20.790021,30.145530,9.355509,2.079002,6.237006,27.027027,0,15.592516,40.540541,4.158004,7.276507,9.355509,7.276507,4.158004,-38,0,84
23046,42023,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17,DAL @ BOS,0,38.563244,85.940943,12.119877,40.766858,7.712649,14.323491,7.712649,30.850595,38.563244,19.832525,4.407228,4.407228,14.323491,22.036139,96.959013,-19.832525,2023-24,98.1,115.5,-17.4,90.76,47.377699,18.730718,28.646981,14.323491,0.000000,12.119877,28.646981,3.305421,1.101807,4.407228,2.203614,6.610842,8.814456,4.407228,8.814456,18.367122,15.425297,61.701190,77.126487,347.069193,1.101807,0.000000,228.074041,14.323491,24.239753,24.239753,61.701190,17.628911,33.054209,6.610842,6.610842,9.916263,46.275892,0,26.443367,45.174086,5.509035,13.221684,5.509035,13.221684,5.509035,-18,0,88


In [204]:
def clean_betting_data(df):
    abbr_mapping = {'BK':'BKN',
                    'GS':'GSW',
                    'NO':'NOP',
                    'NY':'NYK',
                    'PHO':'PHX',
                    'SA':'SAS'
                    }

    df['home_team_abbr'] = df['home_team_abbr'].replace(abbr_mapping)
    df['away_team_abbr'] = df['away_team_abbr'].replace(abbr_mapping)

    df['date'] = pd.to_datetime(df['date']).dt.date
    df['GAME_DATE'] = df['date'] - pd.Timedelta(days=1)

    df['home_spread_odds_draftkings'] = convert_american_to_decimal(df['home_spread_odds_draftkings'])
    df['away_spread_odds_draftkings'] = convert_american_to_decimal(df['away_spread_odds_draftkings'])

    df['home_ml_draftkings'] = convert_american_to_decimal(df['home_ml_draftkings'])
    df['away_ml_draftkings'] = convert_american_to_decimal(df['away_ml_draftkings'])
    
    df['over_odds_draftkings'] = convert_american_to_decimal(df['over_odds_draftkings'])
    df['under_odds_draftkings'] = convert_american_to_decimal(df['under_odds_draftkings'])

    return df

In [205]:
clean_betting_df = clean_betting_data(betting_data)

In [206]:
clean_betting_df.head()

Unnamed: 0,season,date,status,home_team_abbr,away_team_abbr,home_spread_draftkings,home_spread_odds_draftkings,away_spread_draftkings,away_spread_odds_draftkings,home_ml_draftkings,away_ml_draftkings,total_draftkings,over_odds_draftkings,under_odds_draftkings,GAME_DATE
0,2019-20,2020-10-11,Final,MIA,LAL,6.0,1.909091,-6.0,1.909091,3.15,1.377358,214.5,1.925926,1.884956,2020-10-10
1,2019-20,2020-10-11,Final,MIA,LAL,6.0,1.909091,-6.0,1.909091,3.15,1.377358,214.5,1.925926,1.884956,2020-10-10
2,2019-20,2020-10-10,Final,LAL,MIA,-7.0,1.892857,7.0,1.925926,1.298507,3.7,215.5,1.892857,1.917431,2020-10-09
3,2019-20,2020-10-07,Final,MIA,LAL,7.5,1.909091,-7.5,1.909091,3.75,1.298507,218.5,1.884956,1.925926,2020-10-06
4,2019-20,2020-10-04,Final,MIA,LAL,9.5,1.909091,-9.5,1.909091,4.5,1.21978,219.5,1.884956,1.925926,2020-10-03


In [207]:
normalized_df['GAME_DATE'] = pd.to_datetime(normalized_df['GAME_DATE'])

betting_data['season'].unique()

array(['2019-20', '2020-21', '2021-22', '2022-23', '2023-24'],
      dtype=object)

In [208]:
clean_betting_df[['season', 'date', 'GAME_DATE', 'home_team_abbr', 'away_team_abbr']]

Unnamed: 0,season,date,GAME_DATE,home_team_abbr,away_team_abbr
0,2019-20,2020-10-11,2020-10-10,MIA,LAL
1,2019-20,2020-10-11,2020-10-10,MIA,LAL
2,2019-20,2020-10-10,2020-10-09,LAL,MIA
3,2019-20,2020-10-07,2020-10-06,MIA,LAL
4,2019-20,2020-10-04,2020-10-03,MIA,LAL
...,...,...,...,...,...
6456,2023-24,2024-06-07,2024-06-06,BOS,DAL
6457,2023-24,2024-06-10,2024-06-09,BOS,DAL
6458,2023-24,2024-06-13,2024-06-12,DAL,BOS
6459,2023-24,2024-06-15,2024-06-14,DAL,BOS


In [209]:
# Create copy of input DataFrames to avoid modifying originals
clean_boxscores = normalized_df.copy()
clean_betting_df = clean_betting_df.copy()

# Ensure date columns are in the same format
clean_boxscores['GAME_DATE'] = pd.to_datetime(clean_boxscores['GAME_DATE']) #.dt.tz_localize(None)
clean_betting_df['GAME_DATE'] = pd.to_datetime(clean_betting_df['GAME_DATE']) #.dt.tz_localize(None)    

clean_boxscores['HOME_TEAM'] = clean_boxscores['MATCHUP'].apply(
    lambda x: x[:3] if 'vs' in x else x[-3:])
clean_boxscores['AWAY_TEAM'] = clean_boxscores['MATCHUP'].apply(
    lambda x: x[:3] if '@' in x else x[-3:])


merged_df = pd.merge(clean_boxscores, clean_betting_df, how='left', 
                        left_on=['HOME_TEAM', 'AWAY_TEAM', 'GAME_DATE'],
                        right_on=['home_team_abbr', 'away_team_abbr', 'GAME_DATE'])


merged_df['ML'] = merged_df.apply(lambda row: row['home_ml_draftkings'] if row['HOME_GAME'] == 1
                                    else row['away_ml_draftkings'], axis=1)

merged_df['SPREAD'] = merged_df.apply(lambda row: row['home_spread_odds_draftkings'] if row['HOME_GAME'] == 1
                                        else -row['away_spread_odds_draftkings'], axis=1)


merged_df['SPREAD_ODDS'] = merged_df.apply(lambda row: row['home_spread_draftkings'] if row['HOME_GAME'] == 1
                                        else -row['away_spread_draftkings'], axis=1)


merged_df = merged_df.drop(columns=['HOME_TEAM', 'AWAY_TEAM', 'date'])

merged_df['ATS_DIFF'] = merged_df['POINT_DIFF'] + merged_df['SPREAD']

merged_df['TEAM_COVERED'] = (merged_df['ATS_DIFF'] > 0).astype(int)


In [210]:
def merge_betting_and_boxscore_data(clean_betting_df, clean_boxscores):
    
    # Create copy of input DataFrames to avoid modifying originals
    clean_boxscores = clean_boxscores.copy()
    clean_betting_df = clean_betting_df.copy()

    # Ensure date columns are in the same format
    clean_boxscores['GAME_DATE'] = pd.to_datetime(clean_boxscores['GAME_DATE']) #.dt.tz_localize(None)
    clean_betting_df['GAME_DATE'] = pd.to_datetime(clean_betting_df['GAME_DATE']) #.dt.tz_localize(None)    

    clean_boxscores['HOME_TEAM'] = clean_boxscores['MATCHUP'].apply(
        lambda x: x[:3] if 'vs' in x else x[-3:])
    clean_boxscores['AWAY_TEAM'] = clean_boxscores['MATCHUP'].apply(
        lambda x: x[:3] if '@' in x else x[-3:])


    merged_df = pd.merge(clean_boxscores, clean_betting_df, how='left', 
                            left_on=['HOME_TEAM', 'AWAY_TEAM', 'GAME_DATE'],
                            right_on=['home_team_abbr', 'away_team_abbr', 'GAME_DATE'])

    merged_df['ML'] = merged_df.apply(lambda row: row['home_ml_draftkings'] if row['HOME_GAME'] == 1
                                      else row['away_ml_draftkings'], axis=1)

    merged_df['SPREAD'] = merged_df.apply(lambda row: row['home_spread_odds_draftkings'] if row['HOME_GAME'] == 1
                                          else -row['away_spread_odds_draftkings'], axis=1)
    
    
    merged_df['SPREAD_ODDS'] = merged_df.apply(lambda row: row['home_spread_draftkings'] if row['HOME_GAME'] == 1
                                          else -row['away_spread_draftkings'], axis=1)
    

    merged_df = merged_df.drop(columns=['HOME_TEAM', 'AWAY_TEAM', 'date'])

    merged_df['ATS_DIFF'] = merged_df['POINT_DIFF'] + merged_df['SPREAD']

    merged_df['TEAM_COVERED'] = (merged_df['ATS_DIFF'] > 0).astype(int)
    
#     merged_df.loc[merged_df['ATS_DIFF'] == 0.0, 'TEAM_COVERED'] = 0.5

    return merged_df


merged_df = merge_betting_and_boxscore_data(clean_betting_df, clean_boxscores=normalized_df)

merged_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME,FG2M,FG2A,PTS_2PT_MR,AST_2PM,AST_3PM,UAST_2PM,UAST_3PM,POINT_DIFF,RECORD,TEAM_SCORE,season,status,home_team_abbr,away_team_abbr,home_spread_draftkings,home_spread_odds_draftkings,away_spread_draftkings,away_spread_odds_draftkings,home_ml_draftkings,away_ml_draftkings,total_draftkings,over_odds_draftkings,under_odds_draftkings,ML,SPREAD,SPREAD_ODDS,ATS_DIFF,TEAM_COVERED
0,22016,1610612739,CLE,Cleveland Cavaliers,0021600001,2016-10-25,CLE vs. NYK,1,43.444680,90.751110,12.550685,33.790307,13.516123,18.343310,10.619811,38.617494,49.237304,29.928558,11.585248,4.827187,14.481560,21.239622,112.956169,27.997683,2016-17,110.0,87.3,22.7,103.58,62.753427,44.410118,18.343310,15.446997,0.000000,5.792624,11.585248,0.000000,0.000000,11.585248,0.000000,0.000000,0.000000,0.000000,0.000000,16.528287,39.582931,69.511489,104.267233,425.757868,1.930875,0.000000,303.147326,16.412435,37.652056,27.032246,53.099054,9.654373,17.377872,20.274184,15.446997,15.446997,50.202742,1,30.893995,56.960803,11.585248,19.308747,9.654373,11.585248,1.930875,29,1,117,,,,,,,,,,,,,,,,,,0
1,22016,1610612752,NYK,New York Knicks,0021600001,2016-10-25,NYK @ CLE,0,30.893995,83.993049,8.688936,26.066808,14.481560,19.308747,12.550685,27.997683,40.548368,16.412435,5.792624,5.792624,17.377872,21.239622,84.958486,-27.997683,2016-17,87.3,110.0,-22.7,103.58,62.753427,37.652056,25.101371,8.688936,1.930875,6.758061,14.481560,0.000000,0.000000,9.654373,0.000000,0.000000,0.000000,0.000000,0.000000,16.692412,32.824870,68.546051,100.405484,393.898436,0.965437,0.965437,277.080517,11.585248,29.928558,19.308747,54.064491,20.274184,29.928558,13.516123,15.446997,8.688936,27.032246,0,22.205059,57.926241,17.377872,7.723499,7.723499,13.516123,0.000000,-29,0,88,,,,,,,,,,,,,,,,,,0
2,22016,1610612757,POR,Portland Trail Blazers,0021600002,2016-10-25,POR vs. UTA,1,41.113219,79.063884,13.704406,20.029517,23.192073,23.192073,5.270926,30.571368,35.842294,23.192073,5.270926,3.162555,13.704406,18.975332,119.122918,9.487666,2016-17,121.9,107.2,14.8,94.86,65.359477,46.384145,18.975332,9.487666,0.000000,18.975332,45.329960,0.000000,0.000000,4.216740,0.000000,0.000000,0.000000,0.000000,0.000000,17.552182,20.029517,56.925996,76.955513,381.615012,0.000000,0.000000,268.817204,20.029517,45.329960,21.083702,33.733924,14.758592,21.083702,21.083702,15.812777,4.216740,35.842294,1,27.408813,59.034366,17.921147,13.704406,8.433481,13.704406,4.216740,9,1,113,,,,,,,,,,,,,,,,,,0
3,22016,1610612762,UTA,Utah Jazz,0021600002,2016-10-25,UTA @ POR,0,42.167405,86.443179,8.433481,25.300443,16.866962,16.866962,6.325111,26.354628,32.679739,20.029517,9.487666,5.270926,14.758592,20.029517,109.635252,-9.487666,2016-17,107.2,121.9,-14.8,94.86,56.925996,46.384145,10.541851,16.866962,0.000000,10.541851,21.083702,0.000000,0.000000,7.379296,0.000000,0.000000,0.000000,0.000000,0.000000,17.668143,20.029517,49.546700,69.576218,448.028674,2.108370,1.054185,326.797386,23.192073,46.384145,18.975332,40.059034,11.596036,18.975332,8.433481,4.216740,8.433481,52.709256,0,33.733924,61.142737,14.758592,11.596036,8.433481,21.083702,0.000000,-9,0,104,,,,,,,,,,,,,,,,,,0
4,22016,1610612744,GSW,Golden State Warriors,0021600003,2016-10-25,GSW vs. SAS,0,39.339103,83.595594,6.884343,32.454760,12.785208,17.702596,7.867821,26.553895,34.421715,23.603462,10.818253,5.900865,15.735641,18.686074,98.347758,-28.520850,2016-17,99.1,125.9,-26.8,101.68,81.628639,60.975610,20.653029,22.619984,0.000000,4.917388,10.818253,0.000000,0.000000,7.867821,0.000000,0.000000,0.000000,0.000000,0.000000,16.620771,22.619984,44.256491,65.892998,390.440598,3.933910,0.983478,272.423289,20.653029,37.372148,18.686074,46.223446,19.669552,31.471282,14.752164,3.933910,19.669552,47.206924,1,32.454760,51.140834,17.702596,18.686074,3.933910,12.785208,1.966955,-29,0,100,,,,,,,,,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20581,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,1,40.816327,88.077336,18.259936,49.409237,13.963480,15.037594,6.444683,32.223416,38.668099,27.926960,4.296455,6.444683,9.667025,20.408163,113.856069,7.518797,2023-24,116.3,104.2,12.1,93.10,48.335124,31.149302,17.185822,8.592911,0.000000,7.518797,17.185822,1.074114,4.296455,5.370569,0.000000,7.518797,6.444683,3.222342,7.518797,17.744361,12.889366,64.446831,77.336198,435.016112,2.148228,0.000000,317.937701,16.111708,27.926960,24.704619,60.150376,18.259936,30.075188,13.963480,6.444683,12.889366,38.668099,0,22.556391,38.668099,6.444683,12.889366,13.963480,8.592911,3.222342,7,1,106,2023-24,Final,DAL,BOS,-3.0,1.925926,3.0,1.892857,1.675676,2.240000,213.5,1.909091,1.909091,2.240000,-1.892857,-3.0,5.107143,1
20582,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,1,47.817048,94.594595,15.592516,38.461538,15.592516,22.869023,13.513514,40.540541,54.054054,21.829522,7.276507,2.079002,9.355509,17.671518,126.819127,39.501040,2023-24,126.2,87.8,38.4,96.20,38.461538,20.790021,17.671518,15.592516,0.000000,11.434511,29.106029,3.118503,2.079002,5.197505,7.276507,5.197505,11.434511,8.316008,12.474012,18.326403,27.027027,61.330561,88.357588,362.785863,1.039501,2.079002,241.164241,17.671518,36.382536,30.145530,58.212058,11.434511,21.829522,17.671518,16.632017,11.434511,62.370062,1,32.224532,56.133056,1.039501,7.276507,13.513514,23.908524,1.039501,38,1,122,2023-24,Final,DAL,BOS,1.0,1.925926,-1.0,1.892857,1.980392,1.847458,211.0,1.909091,1.909091,1.980392,1.925926,1.0,39.925926,1
20583,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,0,30.145530,83.160083,14.553015,42.619543,12.474012,13.513514,4.158004,28.066528,32.224532,18.711019,2.079002,5.197505,14.553015,19.750520,87.318087,-39.501040,2023-24,87.8,126.2,-38.4,96.20,45.738046,24.948025,20.790021,11.434511,0.000000,6.237006,17.671518,1.039501,2.079002,3.118503,2.079002,3.118503,5.197505,3.118503,5.197505,17.941788,18.711019,57.172557,73.804574,419.958420,6.237006,2.079002,301.455301,11.434511,28.066528,18.711019,55.093555,20.790021,30.145530,9.355509,2.079002,6.237006,27.027027,0,15.592516,40.540541,4.158004,7.276507,9.355509,7.276507,4.158004,-38,0,84,2023-24,Final,DAL,BOS,1.0,1.925926,-1.0,1.892857,1.980392,1.847458,211.0,1.909091,1.909091,1.847458,-1.892857,1.0,-39.892857,0
20584,42023,1610612742,DAL,Dallas Mavericks,0042300405,2024-06-17,DAL @ BOS,0,38.563244,85.940943,12.119877,40.766858,7.712649,14.323491,7.712649,30.850595,38.563244,19.832525,4.407228,4.407228,14.323491,22.036139,96.959013,-19.832525,2023-24,98.1,115.5,-17.4,90.76,47.377699,18.730718,28.646981,14.323491,0.000000,12.119877,28.646981,3.305421,1.101807,4.407228,2.203614,6.610842,8.814456,4.407228,8.814456,18.367122,15.425297,61.701190,77.126487,347.069193,1.101807,0.000000,228.074041,14.323491,24.239753,24.239753,61.701190,17.628911,33.054209,6.610842,6.610842,9.916263,46.275892,0,26.443367,45.174086,5.509035,13.221684,5.509035,13.221684,5.509035,-18,0,88,2023-24,Final,BOS,DAL,-6.5,1.869565,6.5,1.952381,1.359712,3.250000,211.0,1.892857,1.925926,3.250000,-1.952381,-6.5,-19.952381,0


In [211]:
def create_matchups(df):
    """This function makes each row a matchup between 
    team and opp"""
    df = df.copy()
    
    
    matchups = pd.merge(df, df, on=['GAME_ID'], suffixes=['', '_opp'])
    matchups = matchups.loc[matchups['TEAM_ABBREVIATION'] != matchups['TEAM_ABBREVIATION_opp']]

    matchups = matchups.drop(columns = ['SEASON_opp', 'SEASON_ID_opp',
                             'TEAM_ABBREVIATION_opp', 'GAME_DATE_opp',
                             'MATCHUP_opp', 'HOME_GAME_opp', 'TEAM_NAME_opp',
                                       'TEAM_ID_opp', 'WL_opp']
                 )
    
    matchups
    
    return matchups


matchups = create_matchups(normalized_df)
matchups.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,SEASON,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,contestedShots,contestedShots2pt,contestedShots3pt,deflections,chargesDrawn,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,PTS_OFF_TOV,PTS_2ND_CHANCE,PTS_FB,PTS_PAINT,HOME_GAME,FG2M,FG2A,PTS_2PT_MR,AST_2PM,AST_3PM,UAST_2PM,UAST_3PM,POINT_DIFF,RECORD,TEAM_SCORE,FGM_opp,FGA_opp,FG3M_opp,FG3A_opp,FTM_opp,FTA_opp,OREB_opp,DREB_opp,REB_opp,AST_opp,STL_opp,BLK_opp,TOV_opp,PF_opp,PTS_opp,PLUS_MINUS_opp,E_OFF_RATING_opp,E_DEF_RATING_opp,E_NET_RATING_opp,E_PACE_opp,contestedShots_opp,contestedShots2pt_opp,contestedShots3pt_opp,deflections_opp,chargesDrawn_opp,screenAssists_opp,screenAssistPoints_opp,looseBallsRecoveredOffensive_opp,looseBallsRecoveredDefensive_opp,looseBallsRecoveredTotal_opp,offensiveBoxOuts_opp,defensiveBoxOuts_opp,boxOutPlayerTeamRebounds_opp,boxOutPlayerRebounds_opp,boxOuts_opp,DIST_opp,ORBC_opp,DRBC_opp,RBC_opp,TCHS_opp,SAST_opp,FTAST_opp,PASS_opp,CFGM_opp,CFGA_opp,UFGM_opp,UFGA_opp,DFGM_opp,DFGA_opp,PTS_OFF_TOV_opp,PTS_2ND_CHANCE_opp,PTS_FB_opp,PTS_PAINT_opp,FG2M_opp,FG2A_opp,PTS_2PT_MR_opp,AST_2PM_opp,AST_3PM_opp,UAST_2PM_opp,UAST_3PM_opp,POINT_DIFF_opp,RECORD_opp,TEAM_SCORE_opp
1,22016,1610612739,CLE,Cleveland Cavaliers,21600001,2016-10-25,CLE vs. NYK,1,43.44468,90.75111,12.550685,33.790307,13.516123,18.34331,10.619811,38.617494,49.237304,29.928558,11.585248,4.827187,14.48156,21.239622,112.956169,27.997683,2016-17,110.0,87.3,22.7,103.58,62.753427,44.410118,18.34331,15.446997,0.0,5.792624,11.585248,0.0,0.0,11.585248,0.0,0.0,0.0,0.0,0.0,16.528287,39.582931,69.511489,104.267233,425.757868,1.930875,0.0,303.147326,16.412435,37.652056,27.032246,53.099054,9.654373,17.377872,20.274184,15.446997,15.446997,50.202742,1,30.893995,56.960803,11.585248,19.308747,9.654373,11.585248,1.930875,29,1,117,30.893995,83.993049,8.688936,26.066808,14.48156,19.308747,12.550685,27.997683,40.548368,16.412435,5.792624,5.792624,17.377872,21.239622,84.958486,-27.997683,87.3,110.0,-22.7,103.58,62.753427,37.652056,25.101371,8.688936,1.930875,6.758061,14.48156,0.0,0.0,9.654373,0.0,0.0,0.0,0.0,0.0,16.692412,32.82487,68.546051,100.405484,393.898436,0.965437,0.965437,277.080517,11.585248,29.928558,19.308747,54.064491,20.274184,29.928558,13.516123,15.446997,8.688936,27.032246,22.205059,57.926241,17.377872,7.723499,7.723499,13.516123,0.0,-29,0,88
2,22016,1610612752,NYK,New York Knicks,21600001,2016-10-25,NYK @ CLE,0,30.893995,83.993049,8.688936,26.066808,14.48156,19.308747,12.550685,27.997683,40.548368,16.412435,5.792624,5.792624,17.377872,21.239622,84.958486,-27.997683,2016-17,87.3,110.0,-22.7,103.58,62.753427,37.652056,25.101371,8.688936,1.930875,6.758061,14.48156,0.0,0.0,9.654373,0.0,0.0,0.0,0.0,0.0,16.692412,32.82487,68.546051,100.405484,393.898436,0.965437,0.965437,277.080517,11.585248,29.928558,19.308747,54.064491,20.274184,29.928558,13.516123,15.446997,8.688936,27.032246,0,22.205059,57.926241,17.377872,7.723499,7.723499,13.516123,0.0,-29,0,88,43.44468,90.75111,12.550685,33.790307,13.516123,18.34331,10.619811,38.617494,49.237304,29.928558,11.585248,4.827187,14.48156,21.239622,112.956169,27.997683,110.0,87.3,22.7,103.58,62.753427,44.410118,18.34331,15.446997,0.0,5.792624,11.585248,0.0,0.0,11.585248,0.0,0.0,0.0,0.0,0.0,16.528287,39.582931,69.511489,104.267233,425.757868,1.930875,0.0,303.147326,16.412435,37.652056,27.032246,53.099054,9.654373,17.377872,20.274184,15.446997,15.446997,50.202742,30.893995,56.960803,11.585248,19.308747,9.654373,11.585248,1.930875,29,1,117
5,22016,1610612757,POR,Portland Trail Blazers,21600002,2016-10-25,POR vs. UTA,1,41.113219,79.063884,13.704406,20.029517,23.192073,23.192073,5.270926,30.571368,35.842294,23.192073,5.270926,3.162555,13.704406,18.975332,119.122918,9.487666,2016-17,121.9,107.2,14.8,94.86,65.359477,46.384145,18.975332,9.487666,0.0,18.975332,45.32996,0.0,0.0,4.21674,0.0,0.0,0.0,0.0,0.0,17.552182,20.029517,56.925996,76.955513,381.615012,0.0,0.0,268.817204,20.029517,45.32996,21.083702,33.733924,14.758592,21.083702,21.083702,15.812777,4.21674,35.842294,1,27.408813,59.034366,17.921147,13.704406,8.433481,13.704406,4.21674,9,1,113,42.167405,86.443179,8.433481,25.300443,16.866962,16.866962,6.325111,26.354628,32.679739,20.029517,9.487666,5.270926,14.758592,20.029517,109.635252,-9.487666,107.2,121.9,-14.8,94.86,56.925996,46.384145,10.541851,16.866962,0.0,10.541851,21.083702,0.0,0.0,7.379296,0.0,0.0,0.0,0.0,0.0,17.668143,20.029517,49.5467,69.576218,448.028674,2.10837,1.054185,326.797386,23.192073,46.384145,18.975332,40.059034,11.596036,18.975332,8.433481,4.21674,8.433481,52.709256,33.733924,61.142737,14.758592,11.596036,8.433481,21.083702,0.0,-9,0,104
6,22016,1610612762,UTA,Utah Jazz,21600002,2016-10-25,UTA @ POR,0,42.167405,86.443179,8.433481,25.300443,16.866962,16.866962,6.325111,26.354628,32.679739,20.029517,9.487666,5.270926,14.758592,20.029517,109.635252,-9.487666,2016-17,107.2,121.9,-14.8,94.86,56.925996,46.384145,10.541851,16.866962,0.0,10.541851,21.083702,0.0,0.0,7.379296,0.0,0.0,0.0,0.0,0.0,17.668143,20.029517,49.5467,69.576218,448.028674,2.10837,1.054185,326.797386,23.192073,46.384145,18.975332,40.059034,11.596036,18.975332,8.433481,4.21674,8.433481,52.709256,0,33.733924,61.142737,14.758592,11.596036,8.433481,21.083702,0.0,-9,0,104,41.113219,79.063884,13.704406,20.029517,23.192073,23.192073,5.270926,30.571368,35.842294,23.192073,5.270926,3.162555,13.704406,18.975332,119.122918,9.487666,121.9,107.2,14.8,94.86,65.359477,46.384145,18.975332,9.487666,0.0,18.975332,45.32996,0.0,0.0,4.21674,0.0,0.0,0.0,0.0,0.0,17.552182,20.029517,56.925996,76.955513,381.615012,0.0,0.0,268.817204,20.029517,45.32996,21.083702,33.733924,14.758592,21.083702,21.083702,15.812777,4.21674,35.842294,27.408813,59.034366,17.921147,13.704406,8.433481,13.704406,4.21674,9,1,113
9,22016,1610612744,GSW,Golden State Warriors,21600003,2016-10-25,GSW vs. SAS,0,39.339103,83.595594,6.884343,32.45476,12.785208,17.702596,7.867821,26.553895,34.421715,23.603462,10.818253,5.900865,15.735641,18.686074,98.347758,-28.52085,2016-17,99.1,125.9,-26.8,101.68,81.628639,60.97561,20.653029,22.619984,0.0,4.917388,10.818253,0.0,0.0,7.867821,0.0,0.0,0.0,0.0,0.0,16.620771,22.619984,44.256491,65.892998,390.440598,3.93391,0.983478,272.423289,20.653029,37.372148,18.686074,46.223446,19.669552,31.471282,14.752164,3.93391,19.669552,47.206924,1,32.45476,51.140834,17.702596,18.686074,3.93391,12.785208,1.966955,-29,0,100,46.223446,96.380803,11.801731,23.603462,22.619984,25.570417,20.653029,33.438238,54.091267,24.586939,12.785208,2.950433,13.768686,18.686074,126.868607,28.52085,125.9,99.1,26.8,101.68,62.942565,33.438238,29.504327,18.686074,0.0,4.917388,11.801731,0.0,0.0,11.801731,0.0,0.0,0.0,0.0,0.0,17.466562,39.339103,57.041699,92.446892,463.217939,1.966955,1.966955,333.398899,22.619984,44.256491,23.603462,52.124312,15.735641,22.619984,16.719119,25.570417,23.603462,49.173879,34.421715,72.777341,18.686074,14.752164,8.851298,18.686074,1.966955,29,1,129


In [212]:
def build_team_avg_stats_df(df: pd.DataFrame, span: int = 10) -> pd.DataFrame:
    """
    Calculate the average statistics for each team up to (but not including) the given date.
    
    Args:
    df (pd.DataFrame): Input DataFrame containing team statistics.
    span (int): Number of games to consider for the exponential weighted moving average. Default is 10.
    
    Returns:
    pd.DataFrame: DataFrame with average team statistics.
    """
    df = df.sort_values(['TEAM_ABBREVIATION', 'GAME_DATE']).reset_index(drop=True)

    # Define columns to drop and keep
    drop_cols = ['SEASON_ID', 'SEASON', 'TEAM_ID', 'TEAM_NAME', 'GAME_ID', 'MATCHUP', 
                 'HOME_GAME', 'TEAM_SCORE', 'GAME_DATE', 'POINT_DIFF', 'WL', 
                 'TEAM_SCORE_opp', 'POINT_DIFF_opp', 'RECORD', 'RECORD_opp']
    
    keep_cols = ['SEASON', 'TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP', 
                 'HOME_GAME', 'TEAM_SCORE', 'RECORD', 'POINT_DIFF', 'WL']

    # Select stats columns
    stats = df.drop(columns=drop_cols)
    stats_cols = stats.columns[1:]  # Exclude 'TEAM_ABBREVIATION'

    # Calculate exponential weighted moving average for each stat
    avg_stats = stats.groupby('TEAM_ABBREVIATION')[stats_cols].transform(
        lambda x: x.ewm(span=span).mean()
    )

    # Add matchup info and calculate win percentage
    result = df[keep_cols].copy()
    result = pd.concat([result, avg_stats], axis=1)
    result[f'WIN_PCT_L{span}'] = result.groupby('TEAM_ABBREVIATION')['RECORD'].transform(
        lambda x: x.rolling(window=span).mean()
    )
    result = result.drop(columns='RECORD')

    # Shift stats to avoid look-ahead bias
    shift_cols = [col for col in result.columns if col not in keep_cols + [f'WIN_PCT_L{span}']]
    result[shift_cols] = result.groupby('TEAM_ABBREVIATION')[shift_cols].shift(1)

    # Rename columns
    result = result.add_suffix(f'_L{span}')
    rename_dict = {f'{col}_L{span}': col for col in keep_cols}
    result = result.rename(columns=rename_dict)

    return result

# Usage

start = time.time()
team_stats_ewa = build_team_avg_stats_df(matchups, span=10)
end = time.time()
print(f"Time taken: {end - start} seconds")

Time taken: 0.5773587226867676 seconds


In [266]:
def add_percentage_features(df, span):
    """Add the following features for both team and opp:
    OREB_PCT, DREB_PCT, REB_PCT, TS_PCT, EFG_PCT, AST_RATIO, TOV_PCT, PIE.
    """
    
    df = df.copy()
    
    df[f'FG2_PCT_L{span}'] = df[f'FG2M_L{span}'] / df[f'FG2A_L{span}']
    df[f'FG3_PCT_L{span}'] = df[f'FG3M_L{span}'] / df[f'FG3A_L{span}']
    
    df['OREB_PCT_L{}'.format(span)] = df['OREB_L{}'.format(span)] / (df['OREB_L{}'.format(span)] + df['DREB_opp_L{}'.format(span)])
    df['OREB_PCT_opp_L{}'.format(span)] = df['OREB_opp_L{}'.format(span)] / (df['OREB_opp_L{}'.format(span)] + df['DREB_L{}'.format(span)])

    df['DREB_PCT_L{}'.format(span)] = df['DREB_L{}'.format(span)] / (df['DREB_L{}'.format(span)] + df['OREB_opp_L{}'.format(span)])
    df['DREB_PCT_opp_L{}'.format(span)] = df['DREB_opp_L{}'.format(span)] / (df['DREB_opp_L{}'.format(span)] + df['OREB_L{}'.format(span)])

    df['REB_PCT_L{}'.format(span)] = df['REB_L{}'.format(span)] / (df['REB_L{}'.format(span)] + df['REB_opp_L{}'.format(span)])
    df['REB_PCT_opp_L{}'.format(span)] = df['REB_opp_L{}'.format(span)] / (df['REB_opp_L{}'.format(span)] + df['REB_L{}'.format(span)])

    df['TS_PCT_L{}'.format(span)] = df['PTS_L{}'.format(span)] / ((2*(df['FG2A_L{}'.format(span)] + df['FG3A_L{}'.format(span)]) + 0.44*df['FTA_L{}'.format(span)]))
    
    df['TS_PCT_opp_L{}'.format(span)] = df['PTS_opp_L{}'.format(span)] / ((2*(df['FG2A_opp_L{}'.format(span)] + df['FG3A_opp_L{}'.format(span)]) + 0.44*df['FTA_opp_L{}'.format(span)]))

    df['EFG_PCT_L{}'.format(span)] = (df['FG2M_L{}'.format(span)] + 1.5*df['FG3M_L{}'.format(span)]) / (df['FG2A_L{}'.format(span)]
                                                                    + df['FG3A_L{}'.format(span)])
    df['EFG_PCT_opp_L{}'.format(span)] = (df['FG2M_opp_L{}'.format(span)] + 1.5*df['FG3M_opp_L{}'.format(span)]) / (df['FG2A_opp_L{}'.format(span)] 
                                                                 + df['FG3A_opp_L{}'.format(span)])

    df['AST_RATIO_L{}'.format(span)] = (df['AST_L{}'.format(span)] * 100) / df['E_PACE_L{}'.format(span)]
    df['AST_RATIO_opp_L{}'.format(span)] = (df['AST_opp_L{}'.format(span)] * 100) / df['E_PACE_opp_L{}'.format(span)]

    df['TOV_PCT_L{}'.format(span)] = 100*df['TOV_L{}'.format(span)] / (df['FG2A_L{}'.format(span)] 
                                               + df['FG3A_L{}'.format(span)] 
                                               + 0.44*df['FTA_L{}'.format(span)] 
                                               + df['TOV_L{}'.format(span)])
    
    df['TOV_PCT_opp_L{}'.format(span)] = 100*df['TOV_opp_L{}'.format(span)] / (df['FG2A_opp_L{}'.format(span)] 
                                             + df['FG3A_opp_L{}'.format(span)] 
                                             + 0.44*df['FTA_opp_L{}'.format(span)] 
                                             + df['TOV_opp_L{}'.format(span)])
    
    
    df['PIE_L{}'.format(span)] = ((df['PTS_L{}'.format(span)] + df['FG2M_L{}'.format(span)] + df['FG3M_L{}'.format(span)] + df['FTM_L{}'.format(span)] 
                 - df['FG2A_L{}'.format(span)] - df['FG3A_L{}'.format(span)] - df['FTA_L{}'.format(span)] 
                 + df['DREB_L{}'.format(span)] + df['OREB_L{}'.format(span)]/2
                + df['AST_L{}'.format(span)] + df['STL_L{}'.format(span)] + df['BLK_L{}'.format(span)]/2
                - df['PF_L{}'.format(span)] - df['TOV_L{}'.format(span)]) 
                 / (df['PTS_L{}'.format(span)] + df['PTS_opp_L{}'.format(span)] + df['FG2M_L{}'.format(span)] + df['FG2M_opp_L{}'.format(span)]
                   + df['FG3M_L{}'.format(span)] + df['FG3M_opp_L{}'.format(span)] + df['FTM_L{}'.format(span)] + df['FTM_opp_L{}'.format(span)]
                   - df['FG2A_L{}'.format(span)] - df['FG2A_opp_L{}'.format(span)] - df['FG3A_L{}'.format(span)] - df['FG3A_opp_L{}'.format(span)] 
                    - df['FTA_L{}'.format(span)] - df['FTA_opp_L{}'.format(span)] + df['DREB_L{}'.format(span)] + df['DREB_opp_L{}'.format(span)]
                    + (df['OREB_L{}'.format(span)]+df['OREB_opp_L{}'.format(span)])/2 + df['AST_L{}'.format(span)] + df['AST_opp_L{}'.format(span)]
                    + df['STL_L{}'.format(span)] + df['STL_opp_L{}'.format(span)] + (df['BLK_L{}'.format(span)] + df['BLK_opp_L{}'.format(span)])/2
                    - df['PF_L{}'.format(span)] - df['PF_opp_L{}'.format(span)] - df['TOV_L{}'.format(span)] - df['TOV_opp_L{}'.format(span)]))
        
    return df
  
  
  
team_stats_ewa = add_percentage_features(team_stats_ewa, span=10)

In [267]:
def add_rest_days(df: pd.DataFrame, max_rest_days: int = 7) -> pd.DataFrame:
    """
    Calculate the number of rest days between games for each team.

    Args:
    df (pd.DataFrame): Input DataFrame containing game data.
    max_rest_days (int): Maximum number of rest days to consider. Default is 8.

    Returns:
    pd.DataFrame: DataFrame with an additional 'REST' column indicating the number of rest days.
    """
    # Create a copy to avoid modifying the original DataFrame
    df = df.copy()

    # Ensure 'GAME_DATE' is in datetime format
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

    # Calculate rest days
    df['REST'] = df.groupby(['SEASON', 'TEAM_ABBREVIATION'])['GAME_DATE'].diff().dt.days

    # Cap rest days at the specified maximum
    df['REST'] = df['REST'].clip(upper=max_rest_days)

    # Fill NaN values (first game of the season) with max_rest_days
    df['REST'] = df['REST'].fillna(max_rest_days)

    return df


team_stats_ewa = add_rest_days(team_stats_ewa)

In [268]:
team_stats_ewa

Unnamed: 0,SEASON,TEAM_ABBREVIATION,GAME_DATE,GAME_ID,MATCHUP,HOME_GAME,TEAM_SCORE,POINT_DIFF,WL,FGM_L10,FGA_L10,FG3M_L10,FG3A_L10,FTM_L10,FTA_L10,OREB_L10,DREB_L10,REB_L10,AST_L10,STL_L10,BLK_L10,TOV_L10,PF_L10,PTS_L10,PLUS_MINUS_L10,E_OFF_RATING_L10,E_DEF_RATING_L10,E_NET_RATING_L10,E_PACE_L10,contestedShots_L10,contestedShots2pt_L10,contestedShots3pt_L10,deflections_L10,chargesDrawn_L10,screenAssists_L10,screenAssistPoints_L10,looseBallsRecoveredOffensive_L10,looseBallsRecoveredDefensive_L10,looseBallsRecoveredTotal_L10,offensiveBoxOuts_L10,defensiveBoxOuts_L10,boxOutPlayerTeamRebounds_L10,boxOutPlayerRebounds_L10,boxOuts_L10,DIST_L10,ORBC_L10,DRBC_L10,RBC_L10,TCHS_L10,SAST_L10,FTAST_L10,PASS_L10,CFGM_L10,CFGA_L10,UFGM_L10,UFGA_L10,DFGM_L10,DFGA_L10,PTS_OFF_TOV_L10,PTS_2ND_CHANCE_L10,PTS_FB_L10,PTS_PAINT_L10,FG2M_L10,FG2A_L10,PTS_2PT_MR_L10,AST_2PM_L10,AST_3PM_L10,UAST_2PM_L10,UAST_3PM_L10,FGM_opp_L10,FGA_opp_L10,FG3M_opp_L10,FG3A_opp_L10,FTM_opp_L10,FTA_opp_L10,OREB_opp_L10,DREB_opp_L10,REB_opp_L10,AST_opp_L10,STL_opp_L10,BLK_opp_L10,TOV_opp_L10,PF_opp_L10,PTS_opp_L10,PLUS_MINUS_opp_L10,E_OFF_RATING_opp_L10,E_DEF_RATING_opp_L10,E_NET_RATING_opp_L10,E_PACE_opp_L10,contestedShots_opp_L10,contestedShots2pt_opp_L10,contestedShots3pt_opp_L10,deflections_opp_L10,chargesDrawn_opp_L10,screenAssists_opp_L10,screenAssistPoints_opp_L10,looseBallsRecoveredOffensive_opp_L10,looseBallsRecoveredDefensive_opp_L10,looseBallsRecoveredTotal_opp_L10,offensiveBoxOuts_opp_L10,defensiveBoxOuts_opp_L10,boxOutPlayerTeamRebounds_opp_L10,boxOutPlayerRebounds_opp_L10,boxOuts_opp_L10,DIST_opp_L10,ORBC_opp_L10,DRBC_opp_L10,RBC_opp_L10,TCHS_opp_L10,SAST_opp_L10,FTAST_opp_L10,PASS_opp_L10,CFGM_opp_L10,CFGA_opp_L10,UFGM_opp_L10,UFGA_opp_L10,DFGM_opp_L10,DFGA_opp_L10,PTS_OFF_TOV_opp_L10,PTS_2ND_CHANCE_opp_L10,PTS_FB_opp_L10,PTS_PAINT_opp_L10,FG2M_opp_L10,FG2A_opp_L10,PTS_2PT_MR_opp_L10,AST_2PM_opp_L10,AST_3PM_opp_L10,UAST_2PM_opp_L10,UAST_3PM_opp_L10,WIN_PCT_L10_L10,OREB_PCT_L10,OREB_PCT_opp_L10,DREB_PCT_L10,DREB_PCT_opp_L10,REB_PCT_L10,REB_PCT_opp_L10,TS_PCT_L10,TS_PCT_opp_L10,EFG_PCT_L10,EFG_PCT_opp_L10,AST_RATIO_L10,AST_RATIO_opp_L10,TOV_PCT_L10,TOV_PCT_opp_L10,PIE_L10,REST,FG2_PCT_L10,FG3_PCT_L10
0,2016-17,ATL,2016-10-27,0021600014,ATL vs. WAS,1,114,15,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.0,,
1,2016-17,ATL,2016-10-29,0021600026,ATL @ PHI,0,104,32,1,41.540785,83.081571,11.329305,24.546828,13.217523,16.993958,13.217523,35.876133,49.093656,26.435045,12.273414,6.608761,19.826284,17.938066,107.628399,14.161631,110.800000,90.900000,19.900000,105.920000,67.975831,50.981873,16.993958,19.826284,0.000000,6.608761,14.161631,0.000000,0.000000,5.664653,0.000000,0.000000,0.000000,0.000000,0.000000,16.399169,25.490937,66.087613,90.634441,407.854985,2.832326,0.944109,283.232628,19.826284,41.540785,21.714502,41.540785,11.329305,20.770393,24.546828,18.882175,15.105740,41.540785,30.211480,58.534743,17.938066,15.105740,11.329305,15.105740,0.000000,37.764350,88.746224,5.664653,18.882175,12.273414,16.993958,11.329305,26.435045,37.764350,23.602719,13.217523,3.776435,17.938066,18.882175,93.466767,-14.161631,90.900000,110.800000,-19.900000,105.920000,50.037764,33.043807,16.993958,23.602719,0.000000,7.552870,16.049849,0.000000,0.000000,4.720544,0.000000,0.000000,0.000000,0.000000,0.000000,16.276435,36.820242,44.373112,77.416918,402.190332,4.720544,0.000000,279.456193,19.826284,41.540785,17.938066,47.205438,14.161631,21.714502,23.602719,5.664653,16.993958,45.317221,32.099698,69.864048,17.938066,17.938066,3.776435,13.217523,0.944109,,0.333333,0.240000,0.760000,0.666667,0.565217,0.434783,0.619835,0.505308,0.568182,0.457447,24.957558,22.283534,17.960999,15.712868,0.603133,2.0,0.516129,0.461538
2,2016-17,ATL,2016-10-31,0021600044,ATL vs. SAC,1,106,11,1,41.371539,83.283035,7.797971,21.845209,14.047237,20.066287,9.187626,38.282489,47.470115,28.634431,11.462561,6.753640,15.401309,15.631525,104.588287,23.651352,106.400000,79.515000,26.940000,103.687000,62.446575,47.779856,14.666719,16.481223,0.539957,6.753640,14.472086,0.000000,0.000000,4.168964,0.000000,0.000000,0.000000,0.000000,0.000000,17.115047,26.589712,74.555841,99.640790,398.437551,4.514288,2.044719,276.482761,19.181007,38.671755,22.190532,44.611280,9.417842,19.605856,22.925122,12.816633,17.056762,45.691194,33.573568,61.437827,21.031093,20.836460,7.797971,12.197151,0.000000,32.112748,83.132345,5.248878,18.216201,11.462561,15.746633,7.258015,30.794259,38.052273,19.260532,10.807496,4.399180,18.871266,19.296115,80.936935,-23.651352,79.515000,106.400000,-26.940000,103.687000,54.374445,38.627812,15.746633,18.180619,0.000000,6.098576,13.161957,0.000000,0.000000,3.744115,0.000000,0.000000,0.000000,0.000000,0.000000,17.378391,32.767813,56.145006,85.593553,416.406816,2.664202,0.539957,299.081421,14.861353,36.511928,17.251396,46.620417,16.091957,23.270446,14.940878,7.408705,11.426979,39.831194,26.863871,64.916144,13.471698,14.551612,3.319266,11.347453,1.504763,,0.229795,0.159375,0.840625,0.770205,0.555061,0.444939,0.596301,0.467322,0.543574,0.417854,27.616221,18.575648,14.324999,17.323875,0.674013,2.0,0.546464,0.356965
3,2016-17,ATL,2016-11-02,0021600059,ATL vs. LAL,1,116,-7,0,38.556099,82.877621,9.268453,25.623256,20.541361,31.676599,12.611411,33.778235,46.389646,27.171321,11.459902,6.550657,14.652612,17.720873,106.922012,18.748880,108.168771,87.187043,21.014618,100.604917,59.950883,44.481594,15.469289,19.484966,1.160209,6.550657,14.934242,0.000000,0.000000,3.749036,0.000000,0.000000,0.000000,0.000000,0.000000,17.188786,31.391091,70.960207,100.614129,422.058225,4.792853,2.060067,298.889747,15.656928,34.429675,22.899170,48.447947,11.074458,19.260238,23.757116,14.781581,12.711992,40.720625,29.287646,57.254366,17.181948,18.740179,8.012486,9.805913,0.837312,34.275248,81.112884,7.325422,20.522494,12.297214,17.371052,6.852276,31.812159,38.664435,22.402979,10.649513,5.142674,19.239604,25.773504,88.173132,-18.748880,87.187043,108.168771,-21.014618,100.604917,55.961008,37.752643,18.208364,17.570625,0.000000,10.345482,23.361204,0.000000,0.000000,5.169597,0.000000,0.000000,0.000000,0.000000,0.000000,17.275096,36.341604,60.369062,90.539164,425.686815,2.849178,0.322898,305.286746,16.841649,36.068675,17.433599,45.044209,14.646967,23.126310,16.470550,7.361046,11.857279,38.890930,26.949825,60.590390,14.336003,14.981798,6.171502,10.972411,0.899858,,0.283890,0.168649,0.831351,0.716110,0.545414,0.454586,0.595026,0.519065,0.521134,0.467718,27.007945,22.268274,13.145136,17.815149,0.619403,2.0,0.511536,0.361720
4,2016-17,ATL,2016-11-04,0021600070,ATL @ WAS,0,92,-3,0,38.806855,82.427484,9.690179,27.291406,21.040368,30.402642,11.615848,31.180016,42.795864,27.697586,10.843710,4.708441,15.512032,17.885371,108.344258,10.360425,108.673243,97.865668,10.796683,101.815693,60.103548,44.043898,16.059651,19.068274,1.093906,6.919975,15.069010,0.000000,0.000000,3.461696,0.000000,0.000000,0.000000,0.000000,0.000000,16.966221,30.527130,62.430855,91.477201,418.544312,4.793489,2.013234,297.094541,16.185471,34.776170,22.621384,47.651314,12.164920,19.549451,21.301074,18.125980,12.315159,43.733535,29.116676,55.136078,14.048730,18.252928,8.847997,10.050570,0.561455,38.147888,82.823817,7.755423,20.711784,13.932633,17.966734,8.070025,31.125408,39.195433,22.604599,10.616242,6.291793,17.640014,25.180620,97.983833,-10.360425,97.865668,108.673243,-10.796683,101.815693,58.060026,38.268103,19.791923,15.889022,0.000000,12.623910,27.986135,0.000000,0.000000,4.730179,0.000000,0.000000,0.000000,0.000000,0.000000,17.039893,32.267002,59.436155,86.301156,414.342841,2.542368,0.848384,294.749411,18.559546,36.191126,19.588342,46.632691,18.035715,26.564894,16.731040,9.674912,15.849171,41.874776,30.392465,62.112033,18.459066,14.784966,6.665735,14.623961,0.603395,,0.271771,0.205606,0.794394,0.728229,0.521956,0.478044,0.607883,0.564576,0.529580,0.507410,27.203651,22.201488,13.935047,16.277701,0.561484,2.0,0.528088,0.355063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20411,2023-24,WAS,2024-04-05,0022301117,WAS vs. POR,1,102,-6,0,40.901701,87.226858,11.995294,36.104013,12.707972,16.085276,7.991871,33.783536,41.775408,25.190045,7.008524,3.517840,14.867231,19.765631,106.506668,-5.449256,105.217918,111.254913,-6.020003,103.911397,36.930084,22.359605,14.570479,11.256938,0.526728,8.938928,21.065213,2.032811,2.532314,4.565125,0.646667,5.380549,5.604245,1.412858,6.027216,17.964896,21.613070,55.669216,75.353686,362.703771,2.293803,1.470997,245.121935,17.338704,33.983566,23.563269,53.242255,18.904237,26.934581,14.291737,8.542714,16.223041,52.239687,28.906407,51.122845,5.043783,15.412931,9.052648,12.596009,2.244720,41.384035,88.202628,10.055706,33.429664,19.132148,23.561176,10.699103,34.967108,45.666211,23.734752,8.676136,6.145166,12.750418,15.497701,111.955924,5.449256,111.254913,105.217918,6.020003,103.911397,35.617944,21.098975,14.518968,12.967107,0.390120,6.490848,14.689609,2.593587,3.251673,5.845260,1.894949,4.069677,5.763712,3.435919,5.964626,17.745464,23.070899,52.581224,74.428028,405.487575,3.141781,3.214460,283.592011,18.069172,34.576399,23.314860,53.484046,17.783201,27.107991,17.909421,14.923796,13.638331,51.985651,31.328329,54.772964,10.039680,14.984118,8.043658,15.396029,1.618483,0.4,0.186035,0.240523,0.759477,0.813965,0.477752,0.522248,0.586713,0.599425,0.537671,0.526196,24.241850,22.841337,13.618221,11.453847,0.464505,2.0,0.565430,0.332243
20412,2023-24,WAS,2024-04-07,0022301137,WAS @ TOR,0,122,-8,0,40.347964,87.306859,10.719981,35.516933,14.201159,18.594577,8.712363,33.618362,42.330724,24.957154,7.364416,3.783882,13.975397,19.975608,105.617068,-5.545261,104.778297,110.354019,-5.561821,103.269325,38.366368,23.728119,14.638249,11.746041,0.430959,8.038188,19.227603,2.931118,3.158673,6.089791,1.253611,5.670177,5.672071,1.699364,6.923787,17.986059,22.211668,56.234204,76.324536,361.421001,1.876748,1.928063,244.025484,18.714459,34.506542,21.633727,52.799469,18.727441,27.652411,14.047928,10.793221,15.084696,54.333875,29.627983,51.789926,4.488991,16.595438,7.768972,12.117124,2.379979,41.285990,89.191996,9.857565,32.604310,18.732784,23.624443,11.651890,35.673518,47.325407,24.672110,8.366566,5.933512,12.786849,16.845925,111.162329,5.545261,110.354019,104.778297,5.561821,103.269325,34.938110,20.704266,14.233844,12.601880,0.319189,7.121993,16.003629,2.846545,3.384980,6.231525,2.274932,3.873125,5.621414,3.354596,6.148058,17.897089,24.491217,54.432184,77.197350,412.365361,3.657328,2.811142,289.266872,18.406466,35.534976,22.879522,53.540688,19.078139,28.881071,16.102202,15.108457,11.883154,50.865689,31.428425,56.587686,11.474622,15.882331,8.030213,14.589179,1.324213,0.4,0.196287,0.257385,0.742615,0.803713,0.472145,0.527855,0.577789,0.588850,0.523532,0.518149,24.167055,23.891034,12.767132,11.378873,0.465445,2.0,0.572080,0.301827
20413,2023-24,WAS,2024-04-09,0022301152,WAS @ MIN,0,121,-9,0,39.844794,85.911963,11.373874,35.566760,15.198228,20.094333,8.429787,33.199952,41.629739,25.137392,7.489608,4.885453,13.874710,20.410836,106.261690,-5.838522,106.036788,110.962379,-4.914217,104.813084,38.223488,24.131817,14.091671,11.074573,0.515289,8.040875,19.148087,2.398188,2.909741,5.307929,1.025682,4.639235,4.640785,1.390389,5.664917,17.829682,22.403026,54.794862,74.811505,359.643798,1.698207,2.716310,242.769077,18.402870,33.601272,21.442106,52.309997,19.552295,29.132151,13.934053,11.271111,17.059926,52.589302,28.470920,50.345204,3.835497,15.693007,8.796726,11.866246,2.109942,41.751074,88.593152,9.854829,31.719528,18.743235,23.884305,10.834854,35.857561,46.692415,25.554919,8.309548,5.505437,12.739575,18.012873,112.100213,5.838522,110.962379,106.036788,4.914217,104.813084,34.767805,19.380148,15.387657,12.100178,0.261155,7.779321,17.486408,2.328992,3.908333,6.237324,1.861308,3.656980,5.087397,2.907356,5.518288,17.743873,23.942739,53.645855,75.525625,400.349430,3.155045,2.300026,278.645954,18.313561,35.744209,23.437511,52.753763,18.700426,28.673242,16.753627,13.825641,12.650933,52.354676,31.896245,56.873624,11.015190,16.573732,8.034351,14.376896,1.246133,0.3,0.190343,0.246052,0.753948,0.809657,0.471340,0.528660,0.588168,0.597245,0.529981,0.526886,23.983067,24.381420,12.772662,11.390708,0.465731,2.0,0.565514,0.319789
20414,2023-24,WAS,2024-04-12,0022301172,WAS vs. CHI,1,127,-2,0,40.421952,86.112704,12.861200,36.743977,14.745861,19.462825,8.141454,33.207612,41.349066,25.188850,7.016687,4.708249,13.662982,20.610608,108.450964,-6.376859,108.230099,113.951038,-5.711632,104.352523,38.562134,24.366108,14.196026,10.660901,0.421600,9.067610,21.355101,2.317684,2.736228,5.053911,1.194724,4.506799,4.863597,2.026417,5.701523,17.873021,22.596112,56.386894,76.497215,366.604427,2.100503,2.400200,250.003369,17.723370,33.535964,22.698731,52.576171,19.019339,28.990585,12.822710,11.532765,16.091303,50.138216,27.560752,49.368726,4.382490,14.439619,10.219328,12.375224,2.259611,41.803871,88.484168,11.262815,33.240712,19.957268,25.230189,10.642532,35.382019,46.024551,26.241524,8.398608,5.926570,11.845410,17.226517,114.827823,6.376859,113.951038,108.230099,5.711632,104.352523,34.312635,20.122848,14.189787,12.744388,0.213672,7.787020,18.040129,2.438834,3.553257,5.992091,1.700654,3.169839,4.517946,2.556511,4.870493,17.833034,24.389172,53.669146,76.014904,400.797861,3.292461,2.059604,279.890473,18.716891,35.644807,23.086978,52.761487,17.789060,27.726289,17.618346,14.867191,13.906066,51.723901,30.541056,55.243456,9.012428,16.226803,9.062272,13.540566,1.552859,0.2,0.187059,0.242702,0.757298,0.812941,0.473244,0.526756,0.599876,0.610560,0.544084,0.536088,24.138228,25.146995,12.611286,10.630278,0.460278,3.0,0.558263,0.350022


In [269]:
def make_matchups_2(df):
    
    home_teams = df.loc[df['HOME_GAME'] == 1]
    home_teams = home_teams.add_prefix('HOME_')

    away_teams = df.loc[df['HOME_GAME'] == 0]
    away_teams = away_teams.add_prefix('AWAY_')

    drop_cols = ['AWAY_SEASON', 'AWAY_GAME_DATE',
                            'AWAY_MATCHUP', 'AWAY_HOME_GAME', 'AWAY_POINT_DIFF', 'AWAY_WL',
                            # 'AWAY_ATS_DIFF', 'AWAY_SPREAD', 'AWAY_TEAM_COVERED'
                            ]
    
    away_teams = away_teams.drop(columns=drop_cols)

    full_matchup_ewa = pd.merge(home_teams, away_teams, how = 'inner',
                                left_on = 'HOME_GAME_ID', right_on = 'AWAY_GAME_ID') 
    
    full_matchup_ewa = full_matchup_ewa.rename(columns = {'HOME_SEASON':'SEASON',
                                                          'HOME_GAME_DATE':'GAME_DATE',
                                                          'HOME_GAME_ID':'GAME_ID',
                                                          'HOME_MATCHUP':'MATCHUP'})
    
    full_matchup_ewa = full_matchup_ewa.drop(columns=['AWAY_GAME_ID'])
    
    return full_matchup_ewa


start = time.time()
full_matchups_team_stats_ewa = make_matchups_2(team_stats_ewa)
end = time.time()
print(f"Time taken: {end - start} seconds")


Time taken: 0.18192505836486816 seconds


In [270]:
cols = [x for x in full_matchups_team_stats_ewa.columns if 'RB' in x or 'Rebound' in x]

full_matchups_team_stats_ewa[cols]

full_matchups_team_stats_ewa

Unnamed: 0,SEASON,HOME_TEAM_ABBREVIATION,GAME_DATE,GAME_ID,MATCHUP,HOME_HOME_GAME,HOME_TEAM_SCORE,HOME_POINT_DIFF,HOME_WL,HOME_FGM_L10,HOME_FGA_L10,HOME_FG3M_L10,HOME_FG3A_L10,HOME_FTM_L10,HOME_FTA_L10,HOME_OREB_L10,HOME_DREB_L10,HOME_REB_L10,HOME_AST_L10,HOME_STL_L10,HOME_BLK_L10,HOME_TOV_L10,HOME_PF_L10,HOME_PTS_L10,HOME_PLUS_MINUS_L10,HOME_E_OFF_RATING_L10,HOME_E_DEF_RATING_L10,HOME_E_NET_RATING_L10,HOME_E_PACE_L10,HOME_contestedShots_L10,HOME_contestedShots2pt_L10,HOME_contestedShots3pt_L10,HOME_deflections_L10,HOME_chargesDrawn_L10,HOME_screenAssists_L10,HOME_screenAssistPoints_L10,HOME_looseBallsRecoveredOffensive_L10,HOME_looseBallsRecoveredDefensive_L10,HOME_looseBallsRecoveredTotal_L10,HOME_offensiveBoxOuts_L10,HOME_defensiveBoxOuts_L10,HOME_boxOutPlayerTeamRebounds_L10,HOME_boxOutPlayerRebounds_L10,HOME_boxOuts_L10,HOME_DIST_L10,HOME_ORBC_L10,HOME_DRBC_L10,HOME_RBC_L10,HOME_TCHS_L10,HOME_SAST_L10,HOME_FTAST_L10,HOME_PASS_L10,HOME_CFGM_L10,HOME_CFGA_L10,HOME_UFGM_L10,HOME_UFGA_L10,HOME_DFGM_L10,HOME_DFGA_L10,HOME_PTS_OFF_TOV_L10,HOME_PTS_2ND_CHANCE_L10,HOME_PTS_FB_L10,HOME_PTS_PAINT_L10,HOME_FG2M_L10,HOME_FG2A_L10,HOME_PTS_2PT_MR_L10,HOME_AST_2PM_L10,HOME_AST_3PM_L10,HOME_UAST_2PM_L10,HOME_UAST_3PM_L10,HOME_FGM_opp_L10,HOME_FGA_opp_L10,HOME_FG3M_opp_L10,HOME_FG3A_opp_L10,HOME_FTM_opp_L10,HOME_FTA_opp_L10,HOME_OREB_opp_L10,HOME_DREB_opp_L10,HOME_REB_opp_L10,HOME_AST_opp_L10,HOME_STL_opp_L10,HOME_BLK_opp_L10,HOME_TOV_opp_L10,HOME_PF_opp_L10,HOME_PTS_opp_L10,HOME_PLUS_MINUS_opp_L10,HOME_E_OFF_RATING_opp_L10,HOME_E_DEF_RATING_opp_L10,HOME_E_NET_RATING_opp_L10,HOME_E_PACE_opp_L10,HOME_contestedShots_opp_L10,HOME_contestedShots2pt_opp_L10,HOME_contestedShots3pt_opp_L10,HOME_deflections_opp_L10,HOME_chargesDrawn_opp_L10,HOME_screenAssists_opp_L10,HOME_screenAssistPoints_opp_L10,HOME_looseBallsRecoveredOffensive_opp_L10,HOME_looseBallsRecoveredDefensive_opp_L10,HOME_looseBallsRecoveredTotal_opp_L10,HOME_offensiveBoxOuts_opp_L10,...,AWAY_TCHS_L10,AWAY_SAST_L10,AWAY_FTAST_L10,AWAY_PASS_L10,AWAY_CFGM_L10,AWAY_CFGA_L10,AWAY_UFGM_L10,AWAY_UFGA_L10,AWAY_DFGM_L10,AWAY_DFGA_L10,AWAY_PTS_OFF_TOV_L10,AWAY_PTS_2ND_CHANCE_L10,AWAY_PTS_FB_L10,AWAY_PTS_PAINT_L10,AWAY_FG2M_L10,AWAY_FG2A_L10,AWAY_PTS_2PT_MR_L10,AWAY_AST_2PM_L10,AWAY_AST_3PM_L10,AWAY_UAST_2PM_L10,AWAY_UAST_3PM_L10,AWAY_FGM_opp_L10,AWAY_FGA_opp_L10,AWAY_FG3M_opp_L10,AWAY_FG3A_opp_L10,AWAY_FTM_opp_L10,AWAY_FTA_opp_L10,AWAY_OREB_opp_L10,AWAY_DREB_opp_L10,AWAY_REB_opp_L10,AWAY_AST_opp_L10,AWAY_STL_opp_L10,AWAY_BLK_opp_L10,AWAY_TOV_opp_L10,AWAY_PF_opp_L10,AWAY_PTS_opp_L10,AWAY_PLUS_MINUS_opp_L10,AWAY_E_OFF_RATING_opp_L10,AWAY_E_DEF_RATING_opp_L10,AWAY_E_NET_RATING_opp_L10,AWAY_E_PACE_opp_L10,AWAY_contestedShots_opp_L10,AWAY_contestedShots2pt_opp_L10,AWAY_contestedShots3pt_opp_L10,AWAY_deflections_opp_L10,AWAY_chargesDrawn_opp_L10,AWAY_screenAssists_opp_L10,AWAY_screenAssistPoints_opp_L10,AWAY_looseBallsRecoveredOffensive_opp_L10,AWAY_looseBallsRecoveredDefensive_opp_L10,AWAY_looseBallsRecoveredTotal_opp_L10,AWAY_offensiveBoxOuts_opp_L10,AWAY_defensiveBoxOuts_opp_L10,AWAY_boxOutPlayerTeamRebounds_opp_L10,AWAY_boxOutPlayerRebounds_opp_L10,AWAY_boxOuts_opp_L10,AWAY_DIST_opp_L10,AWAY_ORBC_opp_L10,AWAY_DRBC_opp_L10,AWAY_RBC_opp_L10,AWAY_TCHS_opp_L10,AWAY_SAST_opp_L10,AWAY_FTAST_opp_L10,AWAY_PASS_opp_L10,AWAY_CFGM_opp_L10,AWAY_CFGA_opp_L10,AWAY_UFGM_opp_L10,AWAY_UFGA_opp_L10,AWAY_DFGM_opp_L10,AWAY_DFGA_opp_L10,AWAY_PTS_OFF_TOV_opp_L10,AWAY_PTS_2ND_CHANCE_opp_L10,AWAY_PTS_FB_opp_L10,AWAY_PTS_PAINT_opp_L10,AWAY_FG2M_opp_L10,AWAY_FG2A_opp_L10,AWAY_PTS_2PT_MR_opp_L10,AWAY_AST_2PM_opp_L10,AWAY_AST_3PM_opp_L10,AWAY_UAST_2PM_opp_L10,AWAY_UAST_3PM_opp_L10,AWAY_WIN_PCT_L10_L10,AWAY_OREB_PCT_L10,AWAY_OREB_PCT_opp_L10,AWAY_DREB_PCT_L10,AWAY_DREB_PCT_opp_L10,AWAY_REB_PCT_L10,AWAY_REB_PCT_opp_L10,AWAY_TS_PCT_L10,AWAY_TS_PCT_opp_L10,AWAY_EFG_PCT_L10,AWAY_EFG_PCT_opp_L10,AWAY_AST_RATIO_L10,AWAY_AST_RATIO_opp_L10,AWAY_TOV_PCT_L10,AWAY_TOV_PCT_opp_L10,AWAY_PIE_L10,AWAY_REST,AWAY_FG2_PCT_L10,AWAY_FG3_PCT_L10
0,2016-17,ATL,2016-10-27,0021600014,ATL vs. WAS,1,114,15,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.0,,
1,2016-17,ATL,2016-10-31,0021600044,ATL vs. SAC,1,106,11,1,41.371539,83.283035,7.797971,21.845209,14.047237,20.066287,9.187626,38.282489,47.470115,28.634431,11.462561,6.753640,15.401309,15.631525,104.588287,23.651352,106.400000,79.515000,26.940000,103.687000,62.446575,47.779856,14.666719,16.481223,0.539957,6.753640,14.472086,0.000000,0.000000,4.168964,0.000000,0.000000,0.000000,0.000000,0.000000,17.115047,26.589712,74.555841,99.640790,398.437551,4.514288,2.044719,276.482761,19.181007,38.671755,22.190532,44.611280,9.417842,19.605856,22.925122,12.816633,17.056762,45.691194,33.573568,61.437827,21.031093,20.836460,7.797971,12.197151,0.000000,32.112748,83.132345,5.248878,18.216201,11.462561,15.746633,7.258015,30.794259,38.052273,19.260532,10.807496,4.399180,18.871266,19.296115,80.936935,-23.651352,79.515000,106.400000,-26.940000,103.687000,54.374445,38.627812,15.746633,18.180619,0.000000,6.098576,13.161957,0.000000,0.000000,3.744115,0.000000,...,441.491383,2.867723,1.472555,316.159329,20.074999,41.699939,17.904876,41.234653,13.913336,20.502107,19.183481,17.167208,14.765755,43.404808,30.771843,60.224982,17.441294,17.633462,6.161648,12.363395,0.697584,38.095470,78.284538,6.161648,19.996676,22.632147,29.956758,8.836297,31.585023,40.421319,21.547523,7.673349,5.735431,14.339553,27.593529,104.984735,-3.644599,108.277741,105.443189,2.874751,95.615880,59.991778,41.390255,18.601523,21.818952,0.620167,5.541512,11.509225,0.000000,0.000000,6.355629,0.000000,0.000000,0.000000,0.000000,0.000000,17.297274,30.188979,58.713204,84.988091,410.141805,2.364120,3.139106,296.900856,19.183419,35.654824,18.563267,41.583360,12.788675,19.493072,19.880972,9.921827,13.177481,39.221053,31.933822,58.287862,24.026424,16.664480,4.185459,14.649176,1.549987,,0.231130,0.234809,0.765191,0.768870,0.486463,0.513537,0.598224,0.618467,0.501388,0.525982,25.534751,22.535507,13.259628,13.552804,0.502836,2.0,0.510948,0.318102
2,2016-17,ATL,2016-11-02,0021600059,ATL vs. LAL,1,116,-7,0,38.556099,82.877621,9.268453,25.623256,20.541361,31.676599,12.611411,33.778235,46.389646,27.171321,11.459902,6.550657,14.652612,17.720873,106.922012,18.748880,108.168771,87.187043,21.014618,100.604917,59.950883,44.481594,15.469289,19.484966,1.160209,6.550657,14.934242,0.000000,0.000000,3.749036,0.000000,0.000000,0.000000,0.000000,0.000000,17.188786,31.391091,70.960207,100.614129,422.058225,4.792853,2.060067,298.889747,15.656928,34.429675,22.899170,48.447947,11.074458,19.260238,23.757116,14.781581,12.711992,40.720625,29.287646,57.254366,17.181948,18.740179,8.012486,9.805913,0.837312,34.275248,81.112884,7.325422,20.522494,12.297214,17.371052,6.852276,31.812159,38.664435,22.402979,10.649513,5.142674,19.239604,25.773504,88.173132,-18.748880,87.187043,108.168771,-21.014618,100.604917,55.961008,37.752643,18.208364,17.570625,0.000000,10.345482,23.361204,0.000000,0.000000,5.169597,0.000000,...,390.910283,1.504867,1.342442,274.111195,18.496214,40.504250,17.187492,42.477479,18.760926,27.084907,19.367397,13.982375,10.961241,41.286580,28.287189,55.517212,15.059065,11.197463,4.753880,16.379465,1.995292,38.332848,82.043345,7.858599,25.717127,20.195356,26.910439,10.534585,31.198045,41.732631,21.171437,9.388657,5.693728,15.916734,19.637618,104.719652,6.846773,105.523267,97.138045,8.391213,105.069406,57.489699,35.626926,21.862772,13.260064,0.876078,5.973976,13.452819,0.000000,0.000000,5.355215,0.000000,0.000000,0.000000,0.000000,0.000000,16.384474,33.543155,58.008083,88.950458,391.944220,1.928614,0.710261,273.095821,20.079031,37.759842,18.253817,44.283502,13.154420,19.920369,24.013205,16.939704,14.740755,49.097573,30.474249,56.326218,11.300217,14.526055,6.470667,15.222770,0.969321,,0.251734,0.262134,0.737866,0.748266,0.490329,0.509671,0.555048,0.595244,0.478363,0.515120,15.705859,20.149954,16.349850,14.496026,0.433673,1.0,0.509521,0.277384
3,2016-17,ATL,2016-11-05,0021600084,ATL vs. HOU,1,112,15,1,36.405361,81.431312,8.273845,26.284050,21.281334,31.506131,11.012160,32.333432,43.345592,25.754355,11.007862,4.176047,15.974579,19.305141,102.365900,6.566957,102.193079,96.153087,6.032227,102.764270,59.234888,43.962259,15.272630,17.690650,0.779875,6.298891,13.474013,0.000000,0.000000,4.652662,0.000000,0.000000,0.000000,0.000000,0.000000,16.874766,32.141063,60.347907,90.340917,414.455114,3.690499,2.527652,292.914676,16.454693,37.081953,19.950668,44.349359,11.130517,19.672237,20.101745,15.107228,13.695442,44.833355,28.131515,55.147263,10.834993,17.655548,7.673431,9.623140,0.400276,35.662475,80.894597,6.894505,20.227805,17.579489,22.367143,8.757336,31.748320,40.505656,19.665611,9.480241,5.304862,17.764766,24.233032,95.798943,-6.566957,96.153087,102.193079,-6.032227,102.764270,56.139462,36.840541,19.298921,15.697165,0.000000,12.003925,26.506250,0.000000,0.000000,4.737726,0.000000,...,381.811472,2.560514,1.207752,261.225884,16.239071,34.401107,23.551909,50.537541,13.868949,19.440267,14.237918,15.890219,14.686663,46.823597,25.966390,48.088171,4.666231,14.035138,9.443613,10.922602,3.372325,38.456933,85.126900,11.115560,28.630256,19.957128,25.146585,9.809046,29.826012,39.635058,18.487633,8.388811,3.575625,13.340175,21.036467,107.986553,-4.082833,108.311790,111.825396,-3.478665,99.263168,60.495041,32.348649,28.146392,16.154889,1.454740,8.409868,20.353922,0.000000,0.000000,5.660276,0.000000,0.000000,0.000000,0.000000,0.000000,17.018295,34.372027,56.224665,87.887319,448.028003,2.136594,1.421811,326.902339,14.954718,31.903989,23.502215,53.222911,20.254206,31.065315,14.226729,13.373927,11.976013,39.814176,27.341373,56.496644,14.624718,8.547852,9.055689,17.784870,1.736210,,0.283665,0.235164,0.764836,0.716335,0.524465,0.475535,0.617776,0.595563,0.549847,0.517048,24.669172,18.624867,13.934749,12.179296,0.522458,3.0,0.539975,0.375154
4,2016-17,ATL,2016-11-09,0021600111,ATL vs. CHI,1,115,8,1,38.770463,81.195721,8.787859,25.472793,18.665488,27.978361,11.277618,33.343506,44.621123,25.221784,9.656957,4.439491,17.300847,18.316516,104.994272,7.494945,105.576120,97.078335,8.493423,102.579270,63.634832,41.257439,22.377393,16.932450,0.818654,10.246985,22.318425,0.000000,0.000000,3.422879,0.000000,0.000000,0.000000,0.000000,0.000000,16.770133,27.764450,63.403949,89.723512,414.454768,3.881062,1.658180,292.883301,18.141293,35.535295,20.629170,45.660427,12.200400,21.423846,18.904519,13.721670,11.024549,46.977814,29.982604,55.722928,12.463021,16.770192,7.736748,12.305076,0.700741,35.879585,85.176607,8.773719,28.206328,16.966438,20.988687,10.296256,29.921848,40.218104,20.231084,9.893862,4.550243,16.355197,23.320462,97.499327,-7.494945,97.078335,105.576120,-8.493423,102.579270,56.614799,37.445499,19.169300,16.145623,0.190225,11.216518,25.263817,0.000000,0.000000,3.756280,0.000000,...,457.977617,2.849031,2.004409,335.114813,22.267306,44.200324,17.897828,43.753325,14.186741,23.047404,15.454195,17.200896,17.863432,49.275730,33.050742,66.331778,16.474445,16.979367,5.891775,15.245323,1.044641,41.498054,89.943832,9.944008,27.423561,10.575044,14.714269,9.035164,30.748539,39.783703,28.138348,8.149530,5.652287,12.365019,22.003008,103.515160,-6.384624,103.816715,109.644578,-5.811733,96.653208,58.792068,41.991581,16.800487,15.749698,1.144693,10.828824,24.568917,0.000000,0.000000,6.510614,0.000000,0.000000,0.000000,0.000000,0.000000,17.537181,31.597803,62.659823,91.778239,430.426383,4.343202,0.775160,313.011812,17.900601,39.299930,23.597453,50.643902,15.314691,25.594195,17.203343,10.105115,11.452066,45.433387,31.554046,62.520270,17.136049,18.838756,8.893926,11.679734,0.615172,,0.327133,0.202176,0.797824,0.672867,0.559853,0.440147,0.583276,0.555452,0.496343,0.516656,23.942850,29.112689,12.909778,11.366669,0.528170,2.0,0.498264,0.326991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10203,2023-24,WAS,2024-03-31,0022301081,WAS vs. MIA,1,107,-12,0,40.379089,88.903937,11.767930,35.271148,12.539259,16.562368,9.987654,34.153461,44.141115,25.640600,5.858698,4.479710,15.429188,19.562424,105.065367,-6.893507,103.229971,110.087913,-6.826919,101.637022,38.313359,23.380442,14.932917,11.415323,0.657810,7.500672,17.632513,2.072243,2.680347,4.752590,0.876795,4.819459,5.484839,1.758139,5.696253,18.387778,24.000192,58.499196,80.574127,378.794407,2.729175,1.530788,256.916002,18.336946,36.250861,22.042640,52.651183,18.607038,27.325935,13.039459,11.611164,11.243777,50.231598,28.611159,53.632790,6.585092,15.251884,9.887436,12.495207,1.380745,41.469763,89.611630,10.110538,32.500479,18.908810,23.383297,10.284290,34.581324,44.865613,24.286542,8.395869,6.753092,12.038609,16.828580,111.958874,6.893507,110.087913,103.229971,6.826919,101.637022,40.283219,24.931785,15.351434,13.993293,0.151436,8.972910,20.332873,2.235019,3.519471,5.754490,1.824177,...,416.475585,4.488518,1.995455,297.341620,17.456380,34.404240,24.906462,56.359867,15.448674,23.226494,22.465208,8.947082,16.539942,49.131261,28.871091,53.638535,7.933943,15.326225,11.659079,12.676395,1.098762,39.710305,85.655977,12.212150,36.501112,11.708861,15.410067,8.361345,35.879647,44.240992,26.636052,4.673651,4.227458,15.585476,18.074445,103.341621,-9.588232,103.825015,112.672744,-8.839979,96.535031,45.228913,27.299252,17.929661,10.953122,0.407608,7.321403,16.743730,2.230876,2.476718,4.707595,0.707676,3.915491,4.326253,2.174718,4.623167,18.707184,20.238267,56.466273,74.997127,428.490404,4.619713,2.528220,311.187053,15.842819,28.788375,23.863461,56.858529,16.792001,25.743406,12.847820,12.295183,10.627251,44.385797,27.498155,49.154865,10.130748,16.226365,9.570283,10.523727,2.026028,0.6,0.199470,0.195297,0.804703,0.800530,0.495158,0.504842,0.594330,0.580270,0.540382,0.534888,28.768012,27.592110,9.348472,14.428073,0.548093,2.0,0.538253,0.362296
10204,2023-24,WAS,2024-04-02,0022301095,WAS vs. MIL,1,117,4,1,40.030444,88.124200,11.551384,35.501568,13.056596,17.222357,9.395493,34.062622,43.458115,25.698953,5.842431,3.840042,15.421084,19.851773,104.668867,-7.738044,102.769976,111.326474,-8.531115,102.066654,37.291350,22.101481,15.189869,10.563586,0.538209,7.885165,18.447581,1.870297,2.367837,4.238133,0.717377,4.118019,4.662421,1.613303,4.835396,18.371469,22.783374,56.604237,77.462748,380.027592,2.757437,1.776938,259.329876,18.674284,36.827627,21.356566,51.295023,17.321842,26.028912,13.815501,11.423120,11.996657,50.888790,28.479060,52.622632,5.737453,15.625668,9.488321,11.971603,1.479351,40.922813,87.479446,10.719811,32.885007,19.841474,24.201719,9.463370,34.587517,44.050887,23.716961,8.792424,6.923858,12.297324,17.090516,112.406911,7.738044,111.326474,102.769976,8.531115,102.066654,37.854102,23.545586,14.308516,14.246260,0.123902,8.740073,19.782841,1.828652,3.404043,5.232694,1.492509,...,392.158082,4.236081,1.751265,272.514561,16.661345,30.973763,26.539208,57.656684,14.013870,22.437341,14.945688,12.448810,13.908908,46.180315,29.500197,50.690525,12.446649,15.851674,10.857688,13.011161,2.243918,40.731872,92.045069,13.309746,38.184221,18.302354,22.290859,10.439860,34.905671,45.345531,25.976762,6.918597,4.215533,11.644141,18.579329,113.075844,-3.228574,109.915310,114.382781,-4.510717,99.867103,38.597493,22.262115,16.335378,14.949482,0.685899,9.087273,21.161172,2.775518,2.394641,5.170159,1.380380,9.601536,10.912306,4.761330,10.981916,18.410110,25.187568,54.521464,77.512367,427.029103,3.679509,1.967445,302.100343,16.259836,34.504768,24.472033,57.503928,16.620313,22.887900,14.806016,14.295604,11.486008,46.316157,27.422126,53.860849,7.924217,14.057290,11.398980,12.556516,1.316150,0.5,0.206821,0.222887,0.777113,0.793179,0.500856,0.499144,0.622577,0.583171,0.564523,0.514821,27.560893,26.011330,11.670192,10.259409,0.515260,3.0,0.581967,0.360823
10205,2023-24,WAS,2024-04-03,0022301103,WAS vs. LAL,1,120,-5,0,40.836812,88.098866,10.999253,35.583248,13.090857,17.015247,8.547289,34.577941,43.125230,25.498765,6.328292,3.485880,14.509398,19.682628,105.763734,-5.643073,104.066344,110.667115,-6.580004,102.727263,38.423722,22.039338,16.384383,10.707095,0.440352,8.687673,20.253878,1.874270,2.281348,4.155617,0.586945,4.745395,5.018802,1.319975,5.332340,18.232371,21.737184,57.665443,77.655725,372.512469,2.600111,1.797885,252.602142,18.547215,35.636124,22.289929,52.461474,17.612685,25.596718,13.195739,10.034243,12.911688,53.677222,29.837559,52.515619,5.554347,16.396919,8.623239,12.547162,1.726418,41.222905,89.291474,10.662902,35.162559,18.298094,22.897648,10.838999,35.007401,45.846399,24.737202,8.569909,6.697055,12.125608,16.907378,111.406807,5.643073,110.667115,104.066344,6.580004,102.727263,36.819994,22.532826,14.287169,13.204152,0.273388,7.322982,16.529987,2.356237,2.957139,5.313376,1.909197,...,391.433078,3.706270,2.297574,271.917184,17.371198,31.577860,24.998653,53.336741,17.754988,26.352878,13.879723,10.645309,15.080171,51.623058,29.226409,51.155290,6.442092,17.252794,10.386480,11.081157,2.214856,42.716236,92.201109,12.273036,35.846610,12.833772,16.795745,9.293802,32.113693,41.407495,26.912279,8.978930,4.521373,12.112710,18.850378,110.539280,-5.276123,108.137091,114.659942,-6.508771,104.087291,34.071284,19.561388,14.509896,14.931790,0.291753,8.944617,20.639599,2.242336,2.650683,4.893019,1.854114,4.772016,6.448813,3.179878,6.626130,17.981642,23.009555,49.839051,70.765368,395.527219,3.072044,2.056612,275.474063,16.939784,35.005366,25.665113,57.050266,20.463731,28.782500,17.123782,14.718020,16.700852,52.135188,30.443200,56.354500,8.104910,16.126927,10.099278,13.441699,1.887709,0.8,0.211294,0.198550,0.801450,0.788706,0.526909,0.473091,0.643809,0.576349,0.576278,0.529850,27.173987,25.855489,13.553954,10.843583,0.536738,1.0,0.571327,0.389177
10206,2023-24,WAS,2024-04-05,0022301117,WAS vs. POR,1,102,-6,0,40.901701,87.226858,11.995294,36.104013,12.707972,16.085276,7.991871,33.783536,41.775408,25.190045,7.008524,3.517840,14.867231,19.765631,106.506668,-5.449256,105.217918,111.254913,-6.020003,103.911397,36.930084,22.359605,14.570479,11.256938,0.526728,8.938928,21.065213,2.032811,2.532314,4.565125,0.646667,5.380549,5.604245,1.412858,6.027216,17.964896,21.613070,55.669216,75.353686,362.703771,2.293803,1.470997,245.121935,17.338704,33.983566,23.563269,53.242255,18.904237,26.934581,14.291737,8.542714,16.223041,52.239687,28.906407,51.122845,5.043783,15.412931,9.052648,12.596009,2.244720,41.384035,88.202628,10.055706,33.429664,19.132148,23.561176,10.699103,34.967108,45.666211,23.734752,8.676136,6.145166,12.750418,15.497701,111.955924,5.449256,111.254913,105.217918,6.020003,103.911397,35.617944,21.098975,14.518968,12.967107,0.390120,6.490848,14.689609,2.593587,3.251673,5.845260,1.894949,...,401.023758,1.827875,1.480809,281.222848,18.692242,37.847301,20.014787,50.009198,16.046293,24.189309,13.421060,15.669448,12.840509,51.315256,29.669652,58.915340,7.701852,15.229953,7.703519,13.515332,1.470378,40.974518,85.246086,11.929423,34.067267,18.270159,22.982731,8.701491,33.318162,42.019653,26.493042,9.752117,6.860348,12.911405,16.303873,112.148617,12.176426,112.630910,99.155766,13.471849,99.470523,42.481971,29.663713,12.818258,16.262290,0.461566,10.290257,23.627001,2.325673,2.381438,4.707111,0.936410,5.746194,6.417896,3.092682,6.682604,17.935103,19.853459,55.752840,73.765026,389.475725,3.986758,2.272210,272.712539,16.425936,31.249868,23.962142,52.569269,19.417984,32.181121,19.376095,11.042061,15.932827,49.211598,29.045095,51.178818,8.459916,16.762472,9.326692,11.581474,2.006986,0.2,0.257700,0.208746,0.791254,0.742300,0.514615,0.485385,0.538865,0.620962,0.495969,0.550632,23.926713,26.634063,14.547646,11.925203,0.419921,2.0,0.503598,0.320831


In [272]:
full_matchups_team_stats_ewa[['HOME_FG2M_L10' ,'HOME_FG2A_L10', 'HOME_FG2_PCT_L10']]

Unnamed: 0,HOME_FG2M_L10,HOME_FG2A_L10,HOME_FG2_PCT_L10
0,,,
1,33.573568,61.437827,0.546464
2,29.287646,57.254366,0.511536
3,28.131515,55.147263,0.510116
4,29.982604,55.722928,0.538066
...,...,...,...
10203,28.611159,53.632790,0.533464
10204,28.479060,52.622632,0.541194
10205,29.837559,52.515619,0.568165
10206,28.906407,51.122845,0.565430


In [252]:
def engineer_features(df, keep_original=False):
    
    df = df.copy()
    # List of features to create relative metrics for
    relative_features = [col[5:] for col in df.columns if col.startswith('HOME_') and col.endswith('_L10')]
    relative_features = list(set(relative_features) - set(['HOME_GAME', 'TEAM_SCORE', 'POINT_DIFF', 'WL']))

    # Create relative metrics
    for feature in relative_features:
        df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']


    # Matchup-specific metrics
    df['HOME_OFF_VS_DEF'] = df['HOME_E_OFF_RATING_L10'] - df['AWAY_E_DEF_RATING_L10']
    df['AWAY_OFF_VS_DEF'] = df['AWAY_E_OFF_RATING_L10'] - df['HOME_E_DEF_RATING_L10']

    df['HOME_OFF_VS_DEF'] = df['HOME_E_OFF_RATING_L10'] - df['AWAY_E_DEF_RATING_L10']
    df['AWAY_OFF_VS_DEF'] = df['AWAY_E_OFF_RATING_L10'] - df['HOME_E_DEF_RATING_L10']

    df['HOME_OREB_OPPORTUNITY_RATIO'] = df['HOME_ORBC_L10'] / df['AWAY_DRBC_L10']
    df['AWAY_OREB_OPPORTUNITY_RATIO'] = df['AWAY_ORBC_L10'] / df['HOME_DRBC_L10']
    # Rest advantage
    df['REST_ADVANTAGE'] = df['HOME_REST'] - df['AWAY_REST']

    df['HOME_2ND_CHANCE_EFFICIENCY'] = df['HOME_PTS_2ND_CHANCE_L10'] / df['HOME_ORBC_L10']
    df['AWAY_2ND_CHANCE_EFFICIENCY'] = df['AWAY_PTS_2ND_CHANCE_L10'] / df['AWAY_ORBC_L10']
    

    # Creating interaction terms
    interaction_features = ['E_OFF_RATING_L10', 'E_DEF_RATING_L10', 'TS_PCT_L10', 'EFG_PCT_L10', 'AST_RATIO_L10', 'TOV_PCT_L10', 'REB_PCT_L10']
    for feature in interaction_features:
        df[f'HOME_{feature}_INTERACTION'] = df[f'HOME_{feature}'] * df[f'HOME_{feature}_opp'] if f'HOME_{feature}_opp' in df.columns else df[f'HOME_{feature}']
        df[f'AWAY_{feature}_INTERACTION'] = df[f'AWAY_{feature}'] * df[f'AWAY_{feature}_opp'] if f'AWAY_{feature}_opp' in df.columns else df[f'AWAY_{feature}']


    df['HOME_ORBC_TS_INTERACTION'] = df['HOME_ORBC_L10'] * df['HOME_TS_PCT_L10']
    df['AWAY_ORBC_TS_INTERACTION'] = df['AWAY_ORBC_L10'] * df['AWAY_TS_PCT_L10']
    df['HOME_DRBC_OPP_TS_INTERACTION'] = df['HOME_DRBC_L10'] * df['AWAY_TS_PCT_L10']
    df['AWAY_DRBC_OPP_TS_INTERACTION'] = df['AWAY_DRBC_L10'] * df['HOME_TS_PCT_L10']
    

    teams = ['HOME', 'AWAY']
    for team in teams:
        # Rebounding Efficiency Ratios
        df[f'{team}_OREB_EFFICIENCY'] = df[f'{team}_OREB_L10'] / df[f'{team}_ORBC_L10']
        df[f'{team}_DREB_EFFICIENCY'] = df[f'{team}_DREB_L10'] / df[f'{team}_DRBC_L10']
        df[f'{team}_OVERALL_REB_EFFICIENCY'] = (df[f'{team}_OREB_L10'] + df[f'{team}_DREB_L10']) / df[f'{team}_RBC_L10']
        
        # Box Out Effectiveness
        df[f'{team}_BOX_OUT_TO_CHANCE_RATIO'] = (df[f'{team}_boxOutPlayerRebounds_L10'] + df[f'{team}_boxOutPlayerTeamRebounds_L10']) / df[f'{team}_RBC_L10']
        
        # Rebounding Opportunity Ratios
        df[f'{team}_OREB_CHANCE_RATIO'] = df[f'{team}_ORBC_L10'] / df[f'{team}_RBC_L10']
        df[f'{team}_DREB_CHANCE_RATIO'] = df[f'{team}_DRBC_L10'] / df[f'{team}_RBC_L10']
        
        # Rebounding Conversion Rates
        df[f'{team}_OREB_CONVERSION_RATE'] = df[f'{team}_OREB_L10'] / df[f'{team}_ORBC_L10']
        df[f'{team}_DREB_CONVERSION_RATE'] = df[f'{team}_DREB_L10'] / df[f'{team}_DRBC_L10']
        
        # Rebounding Chance Differentials
        df[f'{team}_NET_REB_CHANCES'] = df[f'{team}_RBC_L10'] - df[f'{team}_RBC_opp_L10']
        df[f'{team}_NET_OREB_CHANCES'] = df[f'{team}_ORBC_L10'] - df[f'{team}_ORBC_opp_L10']
        df[f'{team}_NET_DREB_CHANCES'] = df[f'{team}_DRBC_L10'] - df[f'{team}_DRBC_opp_L10']
        
        # Box Out Impact
        df[f'{team}_BOX_OUT_EFFECTIVENESS'] = (df[f'{team}_boxOutPlayerRebounds_L10'] + df[f'{team}_boxOutPlayerTeamRebounds_L10']) / df[f'{team}_RBC_L10']

    
    # df = pd.get_dummies(df, columns=['HOME_TEAM_ABBREVIATION', 'AWAY_TEAM_ABBREVIATION'])

    if not keep_original:
        columns_to_drop = [col for col in df.columns if col.startswith('HOME_') or col.startswith('AWAY_')]
        columns_to_keep = [col for col in df.columns if col.startswith('HOME_TEAM_ABBREVIATION') or col.startswith('AWAY_TEAM_ABBREVIATION')]
        columns_to_keep += ['HOME_TEAM_SCORE', 'AWAY_TEAM_SCORE', 'HOME_POINT_DIFF', 'HOME_WL']
        columns_to_drop = list(set(columns_to_drop) - set(columns_to_keep))
        df = df.drop(columns=columns_to_drop)

    high_corr_pairs = find_high_correlations(df, threshold=0.7)
    # Dropping highly correlated features
    drop_cols = [pair[0] for pair in high_corr_pairs]
    df = df.drop(columns=drop_cols)


    return df


final_df = engineer_features(full_matchups_team_stats_ewa)

final_df

  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f'REL_{feature}'] = df[f'HOME_{feature}'] - df[f'AWAY_{feature}']
  df[f

Unnamed: 0,SEASON,HOME_TEAM_ABBREVIATION,GAME_DATE,GAME_ID,MATCHUP,HOME_TEAM_SCORE,HOME_POINT_DIFF,AWAY_TEAM_ABBREVIATION,AWAY_TEAM_SCORE,REL_FG3A_opp_L10,REL_chargesDrawn_opp_L10,REL_boxOuts_L10,REL_UFGA_L10,REL_E_NET_RATING_opp_L10,REL_FGA_opp_L10,REL_BLK_L10,REL_SAST_L10,REL_DFGA_L10,REL_chargesDrawn_L10,REL_PTS_OFF_TOV_opp_L10,REL_screenAssistPoints_opp_L10,REL_DIST_opp_L10,REL_deflections_L10,REL_contestedShots2pt_L10,REL_PTS_2ND_CHANCE_opp_L10,REL_defensiveBoxOuts_opp_L10,REL_FTM_L10,REL_OREB_L10,REL_DREB_opp_L10,REL_looseBallsRecoveredDefensive_L10,REL_RBC_opp_L10,REL_FGA_L10,REL_DREB_PCT_L10,REL_FTM_opp_L10,REL_PTS_FB_opp_L10,REL_DREB_L10,REL_AST_2PM_opp_L10,REL_FTAST_L10,REL_boxOutPlayerRebounds_opp_L10,REL_REB_PCT_opp_L10,REL_CFGM_opp_L10,REL_AST_RATIO_L10,REL_FG2A_L10,REL_looseBallsRecoveredTotal_opp_L10,REL_TCHS_L10,REL_offensiveBoxOuts_L10,REL_boxOutPlayerRebounds_L10,REL_CFGA_L10,REL_STL_opp_L10,REL_screenAssistPoints_L10,REL_PTS_FB_L10,REL_UFGA_opp_L10,REL_DFGM_opp_L10,REL_looseBallsRecoveredDefensive_opp_L10,REL_UAST_3PM_L10,REL_UFGM_opp_L10,REL_PTS_2PT_MR_opp_L10,REL_BLK_opp_L10,REL_SAST_opp_L10,REL_FTAST_opp_L10,REL_RBC_L10,REL_looseBallsRecoveredOffensive_L10,REL_TCHS_opp_L10,REL_contestedShots_opp_L10,REL_PTS_2PT_MR_L10,REL_looseBallsRecoveredTotal_L10,REL_E_PACE_L10,REL_deflections_opp_L10,REL_UAST_2PM_L10,REL_UAST_2PM_opp_L10,REL_looseBallsRecoveredOffensive_opp_L10,REL_offensiveBoxOuts_opp_L10,REL_UAST_3PM_opp_L10,REST_ADVANTAGE
0,2016-17,ATL,2016-10-27,0021600014,ATL vs. WAS,114,15,WAS,99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
1,2016-17,ATL,2016-10-31,0021600044,ATL vs. SAC,106,11,SAC,95,-1.780474,-0.620167,0.000000,3.376627,-29.814751,4.847807,3.265719,1.646565,-0.896251,-1.049177,-4.940094,1.652732,0.081117,-2.081138,8.250117,-2.513122,0.000000,-10.367962,-0.307124,-0.790764,0.000000,0.605462,-0.697911,0.075434,-11.169586,-1.750502,9.486928,-2.112868,0.572165,0.000000,-0.068597,-4.322066,2.081470,1.212844,-2.611514,-43.053833,0.000000,0.000000,-3.028184,3.134148,0.908426,2.291008,5.037057,3.303282,0.000000,-0.697584,-1.311872,-10.554726,-1.336251,0.300082,-2.599149,25.618207,0.000000,6.265011,-5.617332,3.589799,-2.380645,8.071120,-3.638333,-0.166244,-3.301722,0.000000,0.000000,-0.045225,0.0
2,2016-17,ATL,2016-11-02,0021600059,ATL vs. LAL,116,-7,LAL,123,-5.194633,-0.876078,0.000000,5.970468,-29.405831,-0.930461,3.149246,3.287987,-7.824669,0.741598,-7.542655,9.908386,0.890622,5.116328,6.556475,-9.578658,0.000000,2.352849,2.115681,0.614113,0.000000,1.588706,-0.410920,0.093486,-7.898142,-2.883476,4.125041,0.455743,0.717625,0.000000,-0.055084,-3.237382,11.302087,1.737153,-0.185618,31.147942,0.000000,0.000000,-6.074575,1.260857,0.036600,1.750752,0.760707,1.492548,0.000000,-1.157980,-0.820218,3.035786,-0.551054,0.920564,-0.387363,14.819947,0.000000,33.742595,-1.528691,2.122882,-1.236450,-4.464489,4.310560,-6.573553,-4.250359,0.000000,0.000000,-0.069463,1.0
3,2016-17,ATL,2016-11-05,0021600084,ATL vs. HOU,112,15,HOU,97,-8.402451,-1.454740,0.000000,-6.188182,-2.553562,-4.232303,0.383559,1.129985,0.231970,-0.273526,0.978371,6.152328,-0.112854,3.999616,0.755642,-3.472423,0.000000,2.618497,-0.798768,1.922307,0.000000,1.221270,-3.507336,0.022042,-2.377639,2.600380,0.431006,4.177477,1.319899,0.000000,0.007531,1.553974,0.392413,7.059091,-0.922550,32.643642,0.000000,0.000000,2.680846,1.091430,-6.214719,-0.991221,-7.961212,-1.661152,0.000000,-2.972049,-4.348432,4.543245,1.729237,-0.324071,-0.543885,-1.038928,0.000000,-41.483863,-4.355578,6.168762,-1.954104,3.501102,-0.457724,-1.299462,-2.716511,0.000000,0.000000,-1.032943,-2.0
4,2016-17,ATL,2016-11-09,0021600111,ATL vs. CHI,115,8,CHI,107,0.782766,-0.954468,0.000000,1.907101,-2.681690,-4.767225,-0.796662,1.032031,-1.623559,0.640678,1.282853,0.694900,-0.982443,3.386693,-4.379090,3.355720,0.000000,-3.789634,-3.671660,-0.826692,0.000000,-2.271867,-6.893204,-0.033761,6.391394,2.019022,-2.310825,-6.836438,-0.346229,0.000000,0.033904,-1.441523,0.644753,-10.608851,-2.754334,-43.522849,0.000000,0.000000,-8.665029,1.744331,9.333210,-6.838883,-0.901588,2.362727,0.000000,-0.343900,-4.176946,-3.178005,-1.102044,-1.992653,0.193968,-12.466350,0.000000,-33.114626,-2.177270,-4.011423,-3.057538,5.926061,0.395924,-2.940247,2.638460,0.000000,0.000000,0.588577,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10203,2023-24,WAS,2024-03-31,0022301081,WAS vs. MIA,107,-12,MIA,119,-4.000633,-0.256172,-1.272929,-3.708684,15.666898,3.955653,1.426330,-1.759343,4.099442,0.437630,5.136578,3.589144,-0.306698,-3.075947,1.809396,1.435832,0.894730,-2.101593,1.047457,-1.298323,-0.248069,5.363077,-2.039989,-0.036135,7.199949,1.623339,-0.298758,-0.959650,-0.464668,1.641504,-0.000772,3.059222,-3.540394,-0.005746,1.046895,-37.681178,-0.001824,-1.269567,1.846620,3.722218,-0.580459,-5.296165,-3.493010,1.007481,1.042752,0.281983,-1.295745,-0.557455,2.525634,-1.207034,0.756556,7.565270,-0.454402,-23.922216,-4.945694,-1.348852,-0.702471,5.101991,3.040171,-0.181188,4.612119,0.004143,1.116502,-0.363096,0.0
10204,2023-24,WAS,2024-04-02,0022301095,WAS vs. MIL,117,4,MIL,113,-5.299214,-0.561997,-1.861433,-6.361661,13.041833,-4.565624,-0.574310,-1.478644,3.591571,0.117225,3.055345,-1.378331,0.013714,-3.800107,-4.189027,-1.312885,-5.491075,-3.146371,0.293868,-0.318154,-0.233814,-0.399473,-0.536076,0.005468,1.539120,2.558180,-2.336837,0.181911,0.025673,-1.464141,0.004243,1.478198,-2.382295,1.932107,0.062536,-12.130490,-1.630399,-2.660928,5.853864,1.873828,1.358735,-1.912251,-5.099971,1.264579,1.009402,-0.764567,-1.287258,0.782603,2.708325,-0.362840,1.419400,-2.913315,-0.491838,-25.214571,-0.743391,-6.709196,-0.725652,2.199552,-0.703221,-1.039558,2.449736,-0.946866,0.112129,0.394082,-1.0
10205,2023-24,WAS,2024-04-03,0022301103,WAS vs. LAL,120,-5,LAL,125,-0.684051,-0.018365,-3.622156,-0.875267,13.088775,-2.909636,-1.996543,-1.106158,-0.756160,0.139274,-1.133834,-4.109611,0.214440,-1.241542,-1.902132,-0.139487,-0.204818,-4.784889,-0.055938,2.893708,0.480907,6.776126,3.122779,-0.040106,5.464323,-3.490018,-2.936685,-0.864391,-0.499689,0.205876,0.042201,0.669394,-2.352178,1.360329,0.420357,-18.920609,-1.152590,-3.780371,4.058265,-0.409021,5.957899,-2.168483,-1.445309,-2.390370,0.306456,-0.488438,-2.051389,0.911000,2.175683,0.157634,1.058470,1.933413,-1.374236,5.475760,2.748710,-0.887746,-0.893328,-1.360028,-1.727638,1.466006,0.900304,0.113901,0.055083,-0.316415,0.0
10206,2023-24,WAS,2024-04-05,0022301117,WAS vs. POR,102,-6,POR,108,-0.637603,-0.071446,0.758330,3.233056,-7.451846,2.956542,0.079238,0.465928,2.745272,0.092150,-1.466674,-8.937392,-0.189639,-2.766505,1.393359,3.881735,-1.676517,1.460093,-3.574997,1.648946,-0.072263,0.663003,-2.218624,-0.031777,0.861989,-2.294496,0.800371,-1.778354,-0.009812,0.343237,0.036863,1.643236,0.315137,-7.792495,1.138149,-38.319988,-0.803645,-1.296252,-3.863735,-1.075981,5.634458,3.382532,0.914778,-1.634783,0.870236,0.774342,-0.647281,1.579765,-0.715182,-0.844977,0.942250,-1.933949,-0.563949,16.011849,-6.864027,-2.658069,-0.636212,4.440874,-3.295183,-0.919323,3.814555,0.267914,0.958539,-0.388504,0.0


In [279]:
full_matchups_team_stats_ewa[[x for x in full_matchups_team_stats_ewa.columns if 'WIN_PCT' in x]]

full_matchups_team_stats_ewa[['HOME_PTS_L10', 'HOME_E_OFF_RATING_L10']]

print(full_matchups_team_stats_ewa.shape)
high_corr_pairs = find_high_correlations(full_matchups_team_stats_ewa, threshold=0.7)
# Dropping highly correlated features
drop_cols = [pair[0] for pair in high_corr_pairs]
final_df = full_matchups_team_stats_ewa.drop(columns=drop_cols)


print(final_df.shape)

(10208, 289)
(10208, 127)


In [280]:
def train_and_evaluate_model(X_train, y_train, X_test, y_test):
    model = xgb.XGBRegressor(random_state=42, n_jobs=-1)  # n_jobs=-1 uses all available cores
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    return model, mse, mae, r2


def season_to_string(s):
    return str(s) + "-" + str(s+1)[-2:]

def time_series_cv(df, start_season=2016, end_season=2023):
    results = []
    
    for test_season in range(start_season + 2, end_season + 1):
        
        print(f"Training on seasons {start_season}-{test_season-1}, testing on season {test_season}")
        
        # Split data
        train_data = df[df['SEASON'] < season_to_string(test_season)]
        test_data = df[df['SEASON'] == season_to_string(test_season)]
        
        # Prepare features and target
        features = [col for col in df.columns if col not in ['SEASON', 'GAME_DATE', 'GAME_ID', 'HOME_WL', 'MATCHUP', 'HOME_TEAM_SCORE', 'AWAY_TEAM_SCORE', 'HOME_POINT_DIFF'
                                                             , 'HOME_TEAM_ABBREVIATION', 'AWAY_TEAM_ABBREVIATION'
                                                             ]]
        X_train = train_data[features].astype(float)
        y_train_home = train_data['HOME_TEAM_SCORE']
        y_train_away = train_data['AWAY_TEAM_SCORE']
        
        X_test = test_data[features].astype(float)
        y_test_home = test_data['HOME_TEAM_SCORE']
        y_test_away = test_data['AWAY_TEAM_SCORE']
        
        # Train and evaluate home score model
        home_model, home_mse, home_mae, home_r2 = train_and_evaluate_model(X_train, y_train_home, X_test, y_test_home)
        
        # Train and evaluate away score model
        away_model, away_mse, away_mae, away_r2 = train_and_evaluate_model(X_train, y_train_away, X_test, y_test_away)
        
        results.append({
            'Test Season': test_season,
            'Home MSE': home_mse,
            'Home MAE': home_mae,
            'Home R2': home_r2,
            'Away MSE': away_mse,
            'Away MAE': away_mae,
            'Away R2': away_r2
        })
        
        # Optionally, save feature importance
        feature_importance = pd.DataFrame({
            'feature': features,
            'importance_home': home_model.feature_importances_,
            'importance_away': away_model.feature_importances_
        })
        feature_importance.to_csv(f'feature_importance_season_{test_season}.csv', index=False)
    
    return pd.DataFrame(results)

# Main execution
# df = pd.read_csv('your_data.csv')  # Load your data
# df = prepare_data(df)
# df = engineer_features(df)

results = time_series_cv(final_df)
print(results)

# Optionally, you can save the results
# results.to_csv('model_performance_by_season.csv', index=False)

Training on seasons 2016-2017, testing on season 2018
Training on seasons 2016-2018, testing on season 2019
Training on seasons 2016-2019, testing on season 2020
Training on seasons 2016-2020, testing on season 2021
Training on seasons 2016-2021, testing on season 2022
Training on seasons 2016-2022, testing on season 2023
   Test Season    Home MSE   Home MAE   Home R2    Away MSE   Away MAE  \
0         2018  188.655940  10.684888 -0.179263  197.187114  11.140750   
1         2019  180.804318  10.650202 -0.136876  172.649366  10.465782   
2         2020  183.521204  10.704563 -0.173522  177.000815  10.429149   
3         2021  162.563262  10.143618 -0.053239  167.877496  10.284065   
4         2022  169.407775  10.338518 -0.179088  163.421309  10.065522   
5         2023  164.931441  10.328378  0.006049  162.709375  10.158811   

    Away R2  
0 -0.258666  
1 -0.160861  
2 -0.122366  
3 -0.037349  
4 -0.135367  
5  0.024271  


In [277]:
fi_2023 = pd.read_csv('feature_importance_season_2023.csv')
fi_2023.sort_values(['importance_home'], ascending=False).head(30)

Unnamed: 0,feature,importance_home,importance_away
17,HOME_E_OFF_RATING_L10,0.05182,0.011681
159,AWAY_E_PACE_L10,0.027823,0.018855
260,AWAY_WIN_PCT_L10_L10,0.019965,0.021255
121,HOME_WIN_PCT_L10_L10,0.018823,0.018782
20,HOME_E_PACE_L10,0.014518,0.019199
125,HOME_DREB_PCT_opp_L10,0.012451,0.000762
234,AWAY_boxOuts_opp_L10,0.008876,0.0
90,HOME_looseBallsRecoveredTotal_opp_L10,0.007721,0.003149
18,HOME_E_DEF_RATING_L10,0.007453,0.027698
263,AWAY_DREB_PCT_L10,0.007043,0.008359


In [281]:
pd.options.display.max_rows = 200
fi_2023 = pd.read_csv('feature_importance_season_2023.csv')
fi_2023.sort_values(['importance_home'], ascending=False)

Unnamed: 0,feature,importance_home,importance_away
74,AWAY_E_PACE_L10,0.053607,0.036188
73,AWAY_E_DEF_RATING_L10,0.032104,0.008495
14,HOME_E_PACE_L10,0.024326,0.036574
48,HOME_DRBC_opp_L10,0.019031,0.007658
24,HOME_TCHS_L10,0.017849,0.009797
86,AWAY_DFGM_L10,0.014097,0.007208
31,HOME_PTS_FB_L10,0.012316,0.006305
89,AWAY_PTS_PAINT_L10,0.01175,0.00577
96,AWAY_OREB_opp_L10,0.011572,0.00854
12,HOME_PLUS_MINUS_L10,0.01106,0.00643


In [None]:
def create_interactions(df, features_to_interact):
    for i, feature1 in enumerate(features_to_interact):
        for feature2 in features_to_interact[i+1:]:
            df[f'{feature1}_{feature2}_interaction'] = df[feature1] * df[feature2]
    return df

def train_and_evaluate(X, y, interactions=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    if interactions:
        X_train = create_interactions(X_train, interactions)
        X_test = create_interactions(X_test, interactions)
    
    model = xgb.XGBRegressor(random_state=42)
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return mse, feature_importance

# Assume df is your dataframe and 'target' is your target variable
# df = pd.read_csv('your_data.csv')
# X = df.drop('target', axis=1)
# y = df['target']

# Train model without interactions
mse_without, importance_without = train_and_evaluate(X, y)

print("MSE without interactions:", mse_without)
print("\nTop 10 features without interactions:")
print(importance_without.head(10))

# Select top features for interactions
top_features = importance_without['feature'].head(5).tolist()

# Train model with interactions
mse_with, importance_with = train_and_evaluate(X, y, interactions=top_features)

print("\nMSE with interactions:", mse_with)
print("\nTop 10 features with interactions:")
print(importance_with.head(10))

print(f"\nMSE Improvement: {mse_without - mse_with}")