In [None]:
import sys
!{sys.executable} -m pip install nba_api
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib

In [2]:
import pandas as pd
import numpy as np

##################################################################

Play-by-play

##################################################################

Example uses Pacers team ID to find games

In [3]:
from nba_api.stats.static import teams

nba_teams = teams.get_teams()

pacers = [team for team in nba_teams if team['abbreviation'] == 'IND'][0]
pacers_id = pacers['id']
print(f'pacers_id: {pacers_id}')

pacers_id: 1610612754


In [4]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType

gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=pacers_id,
                            season_nullable=Season.default,
                            season_type_nullable=SeasonType.regular)  

games_dict = gamefinder.get_normalized_dict()
games = games_dict['LeagueGameFinderResults']
game_ids = [game["GAME_ID"] for game in games]

In [6]:
game_ids

['0022401172',
 '0022401168',
 '0022401148',
 '0022401142',
 '0022401119',
 '0022401104',
 '0022401090',
 '0022401079',
 '0022401061',
 '0022401055',
 '0022401037',
 '0022401024',
 '0022401011',
 '0022400998',
 '0022400986',
 '0022400972',
 '0022400959',
 '0022400942',
 '0022400933',
 '0022400914',
 '0022400899',
 '0022400882',
 '0022400868',
 '0022400854',
 '0022400837',
 '0022400822',
 '0022400813',
 '0022400788',
 '0022400771',
 '0022400765',
 '0022400742',
 '0022400733',
 '0022400716',
 '0022400707',
 '0022400686',
 '0022400664',
 '0022400633',
 '0022400621',
 '0022400588',
 '0022400572',
 '0022400554',
 '0022400543',
 '0022400525',
 '0022400510',
 '0022400495',
 '0022400480',
 '0022400463',
 '0022400450',
 '0022400437',
 '0022400420',
 '0022400410',
 '0022400403',
 '0022400389',
 '0022400369',
 '0022401216',
 '0022401209',
 '0022400349',
 '0022400336',
 '0022400320',
 '0022400054',
 '0022400306',
 '0022400045',
 '0022400288',
 '0022400279',
 '0022400272',
 '0022400029',
 '00224002

In [8]:
from nba_api.stats.endpoints import playbyplayv3

df_list = [playbyplayv3.PlayByPlayV3(id).get_data_frames()[0] for id in game_ids]

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [None]:
df = pd.concat(df_list)
df = df[['gameId', 'actionNumber', 'clock', 'period', 'teamId', 'personId', 
         'playerNameI', 'xLegacy', 'yLegacy', 'shotDistance', 'shotResult', 'isFieldGoal', 
         'scoreHome', 'scoreAway', 'location', 'actionType', 'subType', 'shotValue', 'actionId']]
df.shape[0]

In [None]:
df = df[df["actionType"].isin(["Made Shot", "Missed Shot", "Free Throw"])]
df.shape

In [None]:
import matplotlib.pyplot as plt

# Create the scatter plot
plt.figure(figsize=(7, 7))  # Set figure size
plt.scatter(df['xLegacy'], df['yLegacy'], c='blue', alpha=0.6, edgecolors='black')

# Labels and title
plt.xlabel("X Coordinate (Court)")
plt.ylabel("Y Coordinate (Court)")
plt.title("NBA Shot Locations")

# Display the plot
plt.show()

CHECKPOINT 1

In [None]:
df.to_csv("pbp_final/base.csv", index=False)

Calculate scoreDiff

In [9]:
# data_v3_ft = pd.read_csv("base.csv")
data_v3_ft =  pd.read_csv("../data_ft/data_v1_ft.csv")

In [10]:
def fill_scores(group):
    # Replace NaN in the first row with 0
    if pd.isna(group.iloc[0]['scoreHome']):
        group.iloc[0, group.columns.get_loc('scoreHome')] = 0.0
    if pd.isna(group.iloc[0]['scoreAway']):
        group.iloc[0, group.columns.get_loc('scoreAway')] = 0.0
    # Forward-fill the rest
    group['scoreHome'] = group['scoreHome'].ffill()
    group['scoreAway'] = group['scoreAway'].ffill()
    return group

# Apply the function to each gameId group
data_v3_ft = data_v3_ft.groupby('gameId', group_keys=False).apply(fill_scores)

In [11]:
data_v3_ft["scoreDiff"] = np.where(
    data_v3_ft["location"] == "h",
    data_v3_ft["scoreHome"] - data_v3_ft["scoreAway"],
    data_v3_ft["scoreAway"] - data_v3_ft["scoreHome"]
)

# print(data_v3_ft[['gameId', 'actionNumber', 'scoreHome', 'scoreAway', 'scoreDiff', 'actionType', 'location']].head(20))

In [12]:
def shift_score_diff(group):
    group = group.copy()
    group['scoreDiff'] = group['scoreDiff'].shift(1)  # Shift up to reflect "before shot"
    group.iloc[0, group.columns.get_loc('scoreDiff')] = 0.0      # First row should be 0
    return group

data_v3_ft = data_v3_ft.groupby('gameId', group_keys=False).apply(shift_score_diff)

def mirror_diff_on_location_switch(group):
    group = group.copy()
    # Compare current row's location to previous row's
    group['locationChanged'] = group['location'] != group['location'].shift(1)
    
    # Mirror scoreDiff only if location changed and scoreDiff != 0
    group['scoreDiff'] = np.where(
        (group['locationChanged']) & (group['scoreDiff'] != 0),
        -group['scoreDiff'],
        group['scoreDiff']
    )
    
    group.drop(columns='locationChanged', inplace=True)  # optional cleanup
    return group

data_v3_ft = data_v3_ft.groupby('gameId', group_keys=False).apply(mirror_diff_on_location_switch)

In [None]:
data_v3_ft.to_csv("data_v3_ft.csv", index=False)

In [13]:
# data_pbp = pd.read_csv("data_v3_ft.csv")
data_pbp = data_v3_ft

In [14]:
data_pbp = data_pbp[data_pbp["actionType"].isin(["Made Shot", "Missed Shot"])]

In [17]:
def clock_to_seconds(clock_str):
    parts = clock_str.replace('PT', '').replace('S', '').split('M')
    minutes = int(parts[0].replace('M', ''))
    seconds = float(parts[1])
    return minutes * 60 + seconds

data_pbp['secondsRemaining'] = data_pbp['clock'].apply(clock_to_seconds)

# Define clutchFlag
data_pbp['clutchFlag'] = data_pbp.apply(
    lambda row: 1 if (
        row['period'] in [4, 5] and
        row['secondsRemaining'] <= 300 and
        abs(row['scoreDiff']) <= 5
    ) else 0,
    axis=1
)

data_pbp['clutchFlag'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pbp['secondsRemaining'] = data_pbp['clock'].apply(clock_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pbp['clutchFlag'] = data_pbp.apply(


0    13955
1      509
Name: clutchFlag, dtype: int64

In [18]:
data_pbp.drop(columns=["period", "clock", "secondsRemaining"], inplace=True)
data_pbp.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pbp.drop(columns=["period", "clock", "secondsRemaining"], inplace=True)


Index(['gameId', 'actionNumber', 'teamId', 'teamTricode', 'personId',
       'playerNameI', 'xLegacy', 'yLegacy', 'shotDistance', 'shotResult',
       'isFieldGoal', 'scoreHome', 'scoreAway', 'location', 'actionType',
       'subType', 'shotValue', 'actionId', 'scoreDiff', 'clutchFlag'],
      dtype='object')

In [19]:
data_pbp["shotResult"] = data_pbp["shotResult"].replace({'Made': 1, 'Missed': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pbp["shotResult"] = data_pbp["shotResult"].replace({'Made': 1, 'Missed': 0})


In [20]:
data_pbp.head()

Unnamed: 0,gameId,actionNumber,teamId,teamTricode,personId,playerNameI,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,location,actionType,subType,shotValue,actionId,scoreDiff,clutchFlag
0,22401172,7,1610612753,ORL,202709,C. Joseph,227,21,0,0,1,0.0,0.0,v,Missed Shot,Jump Shot,3,3,0.0,0
1,22401172,9,1610612753,ORL,1629048,G. Bitadze,7,-11,1,1,1,0.0,2.0,v,Made Shot,Alley Oop Dunk Shot,2,5,0.0,0
2,22401172,11,1610612754,IND,1630167,O. Toppin,-86,46,10,1,1,2.0,2.0,h,Made Shot,Driving Floating Bank Jump Shot,2,6,-2.0,0
3,22401172,12,1610612753,ORL,203914,G. Harris,108,256,28,0,1,2.0,2.0,v,Missed Shot,Jump Shot,3,7,0.0,0
4,22401172,14,1610612754,IND,1631097,B. Mathurin,71,270,28,0,1,2.0,2.0,h,Missed Shot,Pullup Jump shot,3,9,0.0,0


In [22]:
data_pbp.to_csv("pbp_pacers.csv", index=False)

##################################################################

SHOT CHART

##################################################################