# Import libraries

In [1]:
!pip install nba_api #live updated nba stats


Collecting nba_api
  Downloading nba_api-1.11.3-py3-none-any.whl.metadata (5.8 kB)
Downloading nba_api-1.11.3-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.0/319.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.11.3


In [1]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.static import teams

# Get NBA team metadata for future steps (to be joined into the DataFrame later)

In [2]:
# Static team records exposed by nba_api, loaded into a DataFrame.
nba_teams = teams.get_teams()
teams_df = pd.DataFrame(data=nba_teams)

# Preview only the identifying columns.
preview_cols = ['id', 'full_name', 'abbreviation']
teams_df[preview_cols].head()


Unnamed: 0,id,full_name,abbreviation
0,1610612737,Atlanta Hawks,ATL
1,1610612738,Boston Celtics,BOS
2,1610612739,Cleveland Cavaliers,CLE
3,1610612740,New Orleans Pelicans,NOP
4,1610612741,Chicago Bulls,CHI


# Extract Season Data (Single Season for Testing)

In [3]:
season = "2023-24"

# Query the league game log for the chosen season, regular season only.
gamelog_params = {
    "season": season,
    "season_type_all_star": "Regular Season",
}
gamelog = leaguegamelog.LeagueGameLog(**gamelog_params)

# The first returned frame is the game log table used for the rest of the notebook.
frames = gamelog.get_data_frames()
games_df = frames[0]


In [4]:
# Peek at the first five rows of the raw game log.
games_df.head(n=5)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22023,1610612743,DEN,Denver Nuggets,22300061,2023-10-24,DEN vs. LAL,W,240,48,...,33,42,29,9,6,12,15,119,12,1
1,22023,1610612744,GSW,Golden State Warriors,22300062,2023-10-24,GSW vs. PHX,L,240,36,...,31,49,19,11,6,11,23,104,-4,1
2,22023,1610612747,LAL,Los Angeles Lakers,22300061,2023-10-24,LAL @ DEN,L,240,41,...,31,44,23,5,4,12,18,107,-12,1
3,22023,1610612756,PHX,Phoenix Suns,22300062,2023-10-24,PHX @ GSW,W,240,42,...,43,60,23,5,7,19,22,108,4,1
4,22023,1610612740,NOP,New Orleans Pelicans,22300071,2023-10-25,NOP @ MEM,W,240,40,...,41,52,22,8,5,21,21,111,7,1


We will not add STL or BLK, as they will be reflected in the defensive metrics later.

In [5]:
# Columns retained for modelling: game/team identifiers, the outcome, and
# box-score totals.
cols_to_keep = [
    'GAME_ID',
    'GAME_DATE',
    'TEAM_ID',
    'TEAM_ABBREVIATION',
    'MATCHUP',
    'WL',
    'PTS',
    'REB',
    'OREB',
    'DREB',
    'AST',
    'TOV',
    'FGA',
    'FGM',
    'FTA',
    'FTM'
]

# .copy() makes games_df an independent frame rather than a slice of the raw
# log, so the column assignments in later cells (GAME_DATE, HOME, WIN, ...)
# do not raise SettingWithCopyWarning or risk writing to a temporary object.
games_df = games_df[cols_to_keep].copy()


# Data Set Feature Engineering

Convert dates and home/away games

In [6]:
# Parse the date strings into proper datetimes.
games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])

# MATCHUP reads "AAA vs. BBB" for the home side and "AAA @ BBB" for the visitor,
# so a literal "vs." substring marks a home game.
is_home = games_df['MATCHUP'].str.contains('vs.', regex=False)
games_df['HOME'] = is_home.astype('int64')  # 1 home, 0 away


Create your target variable (wins vs loss)

In [7]:
# Binary target: 1 when the team's result is 'W', 0 for anything else.
games_df['WIN'] = games_df['WL'].eq('W').astype('int64')


**Key step**: sort games by team and date for week-by-week prediction later

In [8]:
# Order each team's games chronologically and renumber the index from 0.
games_df = games_df.sort_values(by=['TEAM_ID', 'GAME_DATE'], ignore_index=True)


In [9]:
# GAME_DATE was already parsed with pd.to_datetime earlier in the notebook, so
# converting it again here was a redundant no-op. Verify the dtype instead.
assert pd.api.types.is_datetime64_any_dtype(games_df['GAME_DATE']), \
    "GAME_DATE should already be a datetime column"

games_df.dtypes  # check to make sure data is correct

Unnamed: 0,0
GAME_ID,object
GAME_DATE,datetime64[ns]
TEAM_ID,int64
TEAM_ABBREVIATION,object
MATCHUP,object
WL,object
PTS,int64
REB,int64
OREB,int64
DREB,int64


**Add** points allowed (the opponent's points in the same game)

In [10]:
# Points allowed = the opponent's PTS in the same game.
# A complete game appears as exactly two rows sharing a GAME_ID, so the
# opponent's score is the game's combined total minus the team's own points.
# This is vectorised with groupby/transform rather than re-filtering the whole
# frame once per GAME_ID, which scaled quadratically with the number of games.
pts_by_game = games_df.groupby('GAME_ID')['PTS']
rows_in_game = pts_by_game.transform('size')
opponent_pts = pts_by_game.transform('sum') - games_df['PTS']

# Games that do not have exactly two rows are left as NaN, as before; the
# explicit float cast keeps the column dtype float64 even when nothing is NaN.
games_df['PTS_ALLOWED'] = opponent_pts.where(rows_in_game == 2).astype('float64')


In [11]:
# Spot-check scored vs. allowed points for the first ten rows.
games_df.loc[:, ['PTS', 'PTS_ALLOWED']].head(10)


Unnamed: 0,PTS,PTS_ALLOWED
0,110,116.0
1,120,126.0
2,127,110.0
3,127,113.0
4,130,121.0
5,123,105.0
6,117,126.0
7,120,119.0
8,109,117.0
9,126,120.0


# Check Feature Engineering

In [13]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
games_df.info()

# Preview the key engineered columns as the cell's rich output.
games_df[['TEAM_ABBREVIATION', 'GAME_DATE', 'PTS', 'WIN']].head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2460 entries, 0 to 2459
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   GAME_ID            2460 non-null   object        
 1   GAME_DATE          2460 non-null   datetime64[ns]
 2   TEAM_ID            2460 non-null   int64         
 3   TEAM_ABBREVIATION  2460 non-null   object        
 4   MATCHUP            2460 non-null   object        
 5   WL                 2460 non-null   object        
 6   PTS                2460 non-null   int64         
 7   REB                2460 non-null   int64         
 8   OREB               2460 non-null   int64         
 9   DREB               2460 non-null   int64         
 10  AST                2460 non-null   int64         
 11  TOV                2460 non-null   int64         
 12  FGA                2460 non-null   int64         
 13  FGM                2460 non-null   int64         
 14  FTA     

Unnamed: 0,TEAM_ABBREVIATION,GAME_DATE,PTS,WIN
0,ATL,2023-10-25,110,0
1,ATL,2023-10-27,120,0
2,ATL,2023-10-29,127,1
3,ATL,2023-10-30,127,1
4,ATL,2023-11-01,130,1
5,ATL,2023-11-04,123,1
6,ATL,2023-11-06,117,0
7,ATL,2023-11-09,120,1
8,ATL,2023-11-11,109,0
9,ATL,2023-11-14,126,1


# Save 2023-2024 Data Set

In [14]:
# Derive the file name from `season` so the saved file always matches the data
# that was actually downloaded ("2023-24" -> "nba_games_2023_24.csv").
output_path = f"nba_games_{season.replace('-', '_')}.csv"
games_df.to_csv(output_path, index=False)
