In [1]:
import pandas as pd
from nba_api.stats.static import players

players_act = players.get_active_players()
player_ids = [player["id"] for player in players_act]

In [3]:
player_ids = player_ids[:100]

In [None]:
from nba_api.stats.endpoints import playerprofilev2

profiles = [playerprofilev2.PlayerProfileV2(player_id=id).get_data_frames()[0] for id in player_ids]
df = pd.concat(profiles)
df = df[["PLAYER_ID", "SEASON_ID", "GP", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT"]]
df.head()

In [28]:
df.to_csv("player_profiles", index=False)

Attempt to use PlayerCareerStats endpoint. Produces same results as previous approach.

In [None]:
# from nba_api.stats.endpoints import playercareerstats

# stats = [playercareerstats.PlayerCareerStats(player_id=id).get_data_frames()[0] for id in player_ids]
# df_stats = pd.concat(stats)
# df_stats.head()

Tried to limit player shooting percentage to last season, but every player can still have multiple items - they are split by teams (e.g. if a player played for multiple teams during season). This creates problems when joining shooting data with pbp data. Most likely necessary to aggregate this data. How??

In [None]:
# df = df[df["SEASON_ID"] == "2024-25"]
# df = df[["PLAYER_ID", "FG_PCT", "FG3_PCT"]]
# df.shape
# df.head()

In [1]:
import pandas as pd

df_profiles = pd.read_csv("player_profiles")
df_profiles = df_profiles[df_profiles["SEASON_ID"] == "2024-25"]
df_profiles

Unnamed: 0,PLAYER_ID,SEASON_ID,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT
6,1630173,2024-25,44,927,122,235,0.519,9,30,0.300
17,203500,2024-25,49,618,70,134,0.522,0,1,0.000
25,1628389,2024-25,66,2272,444,913,0.486,54,174,0.310
30,1630534,2024-25,55,1482,222,441,0.503,83,208,0.399
34,1630583,2024-25,53,1325,263,540,0.487,96,256,0.375
...,...,...,...,...,...,...,...,...,...,...
3472,1629027,2024-25,63,2281,463,1147,0.404,173,533,0.325
3494,1627826,2024-25,66,2152,475,763,0.623,0,0,0.000
3495,1641783,2024-25,65,1481,179,429,0.417,73,218,0.335
3500,1628427,2024-25,8,67,6,12,0.500,2,5,0.400


In [5]:
sum_cols = ['GP', 'MIN', 'FGM', 'FGA', 'FG3M', 'FG3A']

# Group by PLAYER_ID and SEASON_ID
grouped = df_profiles.groupby(['PLAYER_ID', 'SEASON_ID'], as_index=False)
sum_df = grouped[sum_cols].sum()

# Now compute weighted FG_PCT and FG3_PCT
def weighted_pct(group):
    fg_pct = (group['FG_PCT'] * group['FGA']).sum() / group['FGA'].sum() if group['FGA'].sum() != 0 else 0
    fg3_pct = (group['FG3_PCT'] * group['FG3A']).sum() / group['FG3A'].sum() if group['FG3A'].sum() != 0 else 0
    return pd.Series({'FG_PCT': fg_pct, 'FG3_PCT': fg3_pct})

# Apply the weighted average function
weighted_df = grouped.apply(weighted_pct).reset_index()

# Merge summed and weighted data
final_df = pd.merge(sum_df, weighted_df, on=['PLAYER_ID', 'SEASON_ID'])
final_df

Unnamed: 0,PLAYER_ID,SEASON_ID,GP,MIN,FGM,FGA,FG3M,FG3A,index,FG_PCT,FG3_PCT
0,2544,2024-25,58,2023,555,1073,132,344,0,0.517,0.384
1,101108,2024-25,67,1885,196,469,115,306,1,0.418,0.376
2,200768,2024-25,33,623,39,111,28,83,2,0.351,0.337
3,201142,2024-25,56,2066,533,1013,139,334,3,0.526,0.416
4,201143,2024-25,52,1427,160,382,95,262,4,0.419,0.363
...,...,...,...,...,...,...,...,...,...,...,...
527,1642461,2024-25,15,59,5,15,1,8,527,0.333,0.125
528,1642484,2024-25,8,23,2,10,1,7,528,0.200,0.143
529,1642502,2024-25,6,21,0,2,0,1,529,0.000,0.000
530,1642505,2024-25,17,76,8,22,7,16,530,0.364,0.438


In [7]:
final_df = final_df[["PLAYER_ID", "FG_PCT", "FG3_PCT"]]

Simple aggregation - probably not useful.

In [None]:
agg_dict = {
    'GP': 'sum',
    'MIN': 'sum',
    'FGM': 'sum',
    'FGA': 'sum',
    'FG3M': 'sum',
    'FG3A': 'sum',
    'FG_PCT': 'mean',
    'FG3_PCT': 'mean'
}

# Group by PLAYER_ID and SEASON_ID, then aggregate
agg_df = df.groupby(['PLAYER_ID', 'SEASON_ID'], as_index=False).agg(agg_dict)

Weighted aggregation - shooting pctg weighted by number of games.

In [None]:
sum_cols = ['GP', 'MIN', 'FGM', 'FGA', 'FG3M', 'FG3A']

# Group by PLAYER_ID and SEASON_ID
grouped = df.groupby(['PLAYER_ID', 'SEASON_ID'], as_index=False)

# Sum the numeric stats
sum_df = grouped[sum_cols].sum()

# Now compute weighted FG_PCT and FG3_PCT
def weighted_pct(group):
    fg_pct = (group['FG_PCT'] * group['FGA']).sum() / group['FGA'].sum() if group['FGA'].sum() != 0 else 0
    fg3_pct = (group['FG3_PCT'] * group['FG3A']).sum() / group['FG3A'].sum() if group['FG3A'].sum() != 0 else 0
    return pd.Series({'FG_PCT': fg_pct, 'FG3_PCT': fg3_pct})

# Apply the weighted average function
weighted_df = grouped.apply(weighted_pct).reset_index()

# Merge summed and weighted data
final_df = pd.merge(sum_df, weighted_df, on=['PLAYER_ID', 'SEASON_ID'])


In [6]:
shots_df = pd.read_csv("data_v1")
shots_df.head()

Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerNameI,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,location,actionType,subType,shotValue,actionId
0,22400203,7,PT11M48.00S,1,1610612752,NYK,1628384,O. Anunoby,-232,10,0,Missed,1,,,v,Missed Shot,Jump Shot,3,3
1,22400203,10,PT11M29.00S,1,1610612754,IND,1626167,M. Turner,-235,32,24,Missed,1,,,h,Missed Shot,Jump Shot,3,5
2,22400203,12,PT11M15.00S,1,1610612752,NYK,1628404,J. Hart,-16,11,2,Made,1,0.0,2.0,v,Made Shot,Driving Reverse Layup Shot,2,7
3,22400203,15,PT10M56.00S,1,1610612754,IND,1626167,M. Turner,-40,107,11,Made,1,2.0,2.0,h,Made Shot,Jump Shot,2,9
4,22400203,17,PT10M35.00S,1,1610612752,NYK,1628384,O. Anunoby,17,26,3,Missed,1,,,v,Missed Shot,Driving Layup Shot,2,10


Join player shooting PCT in current season with shot data.

In [10]:
test = pd.merge(shots_df, final_df, how="left", left_on="personId", right_on="PLAYER_ID")
test = test.drop(columns=["PLAYER_ID"])
test.head()

Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerNameI,xLegacy,yLegacy,...,isFieldGoal,scoreHome,scoreAway,location,actionType,subType,shotValue,actionId,FG_PCT,FG3_PCT
0,22400203,7,PT11M48.00S,1,1610612752,NYK,1628384,O. Anunoby,-232,10,...,1,,,v,Missed Shot,Jump Shot,3,3,0.465,0.364
1,22400203,10,PT11M29.00S,1,1610612754,IND,1626167,M. Turner,-235,32,...,1,,,h,Missed Shot,Jump Shot,3,5,0.473,0.393
2,22400203,12,PT11M15.00S,1,1610612752,NYK,1628404,J. Hart,-16,11,...,1,0.0,2.0,v,Made Shot,Driving Reverse Layup Shot,2,7,0.534,0.335
3,22400203,15,PT10M56.00S,1,1610612754,IND,1626167,M. Turner,-40,107,...,1,2.0,2.0,h,Made Shot,Jump Shot,2,9,0.473,0.393
4,22400203,17,PT10M35.00S,1,1610612752,NYK,1628384,O. Anunoby,17,26,...,1,,,v,Missed Shot,Driving Layup Shot,2,10,0.465,0.364


Data_v2 contains additionally player shooting pctg. This is calculated within season - could extend to multiple seasons. Would leave this depending on classification results.

In [11]:
test.to_csv("data_v2", index=False)