In [62]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

# Load the player-stats workbook from the path relative to the notebook.
data = pd.read_excel(io='datasets/nba_player_data.xlsx')

In [82]:
# Preview ten random rows. A fixed random_state makes the preview identical on
# every Restart & Run All instead of showing a different sample each time.
data.sample(10, random_state=42)

Unnamed: 0,Year,Season_type,PLAYER_ID,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,AST_TOV,STL_TOV,season_start_year
4631,2018-19,Regular%20Season,201160,Jason Smith,1610612740,NO,20,191,21,59,0.356,9,26,0.346,14,16,0.875,16,36,52,14,3,7,13,30,65,1.08,0.23,2018
9320,2024-25,Regular%20Season,1642505,Alex Ducas,1610612760,OKC,21,125,12,30,0.4,10,21,0.476,2,2,1.0,9,17,26,5,5,0,4,12,36,1.25,1.25,2024
3886,2017-18,Regular%20Season,1626191,Chris McCullough,1610612764,WAS,19,90,18,42,0.429,1,8,0.125,9,14,0.643,7,18,25,4,0,5,2,8,46,2.0,0.0,2017
3718,2017-18,Regular%20Season,1626181,Norman Powell,1610612761,TOR,70,1062,150,374,0.401,53,186,0.285,32,39,0.821,14,105,119,89,37,16,66,111,385,1.35,0.56,2017
6221,2020-21,Regular%20Season,1629607,Jared Harper,1610612752,NYK,8,16,0,4,0.0,0,1,0.0,3,4,0.75,0,2,2,1,0,0,3,1,3,0.33,0.0,2020
203,2012-13,Regular%20Season,101131,Jason Maxiell,1610612765,DET,72,1789,204,457,0.446,0,0,0.0,90,145,0.621,135,274,409,54,32,95,81,172,498,0.67,0.4,2012
8202,2023-24,Regular%20Season,201566,Russell Westbrook,1610612746,LAC,68,1529,301,663,0.454,42,154,0.273,110,160,0.688,93,250,343,306,74,23,144,121,754,2.13,0.51,2023
5143,2019-20,Regular%20Season,202738,Isaiah Thomas,1610612764,WAS,40,925,174,426,0.408,78,189,0.413,62,76,0.816,11,57,68,146,13,6,77,77,488,1.9,0.17,2019
360,2012-13,Regular%20Season,202397,Ish Smith,1610612749,MIL,52,516,56,159,0.352,8,27,0.296,3,7,0.429,12,49,61,87,21,9,37,26,123,2.35,0.57,2012
6419,2020-21,Playoffs,1629035,Carsen Edwards,1610612738,BOS,2,5,2,3,0.667,1,2,0.5,0,0,0.0,0,1,1,0,0,0,0,0,5,0.0,0.0,2020


In [76]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(9623, 29)

# Data cleaning & analysis preparation

In [None]:
# Remove the RANK and EFF columns when they are present.
# errors='ignore' keeps this cell idempotent: re-running it, or loading a
# workbook that already lacks these columns, no longer raises KeyError.
data = data.drop(columns=['RANK', 'EFF'], errors='ignore')

In [70]:
# Season labels look like '2018-19'; keep the first four characters and store
# them as an integer start year.
data['season_start_year'] = data['Year'].str.slice(0, 4).astype(int)

In [73]:
# Collapse the 'NOP' and 'NOH' team codes into the single code 'NO'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on data['TEAM'] mutates a temporary Series, which warns on recent pandas and
# leaves `data` unchanged once Copy-on-Write is active.
data['TEAM'] = data['TEAM'].replace(to_replace=['NOP', 'NOH'], value='NO')

In [None]:
# Rename the URL-encoded label 'Regular%20Season' to 'Regular'.
# Assign the result back rather than using inplace=True on the selected column:
# the inplace form acts on a temporary Series and does not update `data` under
# Copy-on-Write, which would leave any later == 'Regular' filter empty.
data['Season_type'] = data['Season_type'].replace('Regular%20Season', 'Regular')

In [53]:
# Split the frame by Season_type. The 'Regular' label is the cleaned form of
# 'Regular%20Season', so the rename cell must have run before this one.
rs_df = data.loc[data['Season_type'].eq('Regular')]
playoffs_df = data.loc[data['Season_type'].eq('Playoffs')]

In [54]:
# Column labels currently present in `data`.
data.columns

Index(['Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')

In [98]:
# Columns that get summed per player-season in the analysis below.
# REB is not listed; OREB and DREB are included separately.
total_cols = [
    'MIN',
    'FGM', 'FGA',      # field goals made / attempted
    'FG3M', 'FG3A',    # three-pointers made / attempted
    'FTM', 'FTA',      # free throws made / attempted
    'OREB', 'DREB',
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PTS',
]

## Which player stats are correlated with each other?

In [103]:
# Build per-minute rates for every player-season and plot how the resulting
# stats correlate with each other.

MIN_MINUTES = 50       # player-seasons with fewer total minutes are excluded
TS_FTA_WEIGHT = 0.475  # free-throw weight in the TRU% denominator
                       # NOTE(review): 0.44 is the commonly published weight - confirm

# Sum the counting stats per player-season. Grouping on PLAYER_ID as well as
# PLAYER keeps rows with different ids apart even when the names match.
# NOTE(review): `data` holds both regular-season and playoff rows, so these
# totals combine the two - confirm that is intended.
season_totals = (
    data.groupby(['PLAYER', 'PLAYER_ID', 'Year'])[total_cols]
    .sum()
    .reset_index()
)

# Apply the minutes floor BEFORE dividing so rows with zero minutes never reach
# the division, and drop PLAYER_ID in the same chain so it stays out of the
# numeric correlation matrix. drop() returns a new frame, so the column
# assignments below do not write into a slice of `season_totals`.
data_per_min = (
    season_totals[season_totals['MIN'] >= MIN_MINUTES]
    .drop(columns='PLAYER_ID')
)

# Convert every counting stat except MIN itself to a per-minute rate. Columns
# are chosen by name rather than by position (columns[4:]) so a change to the
# group keys or to total_cols cannot silently shift which columns are divided.
rate_cols = [col for col in total_cols if col != 'MIN']
for col in rate_cols:
    data_per_min[col] = data_per_min[col] / data_per_min['MIN']

# Shooting and efficiency ratios. Numerator and denominator are both per-minute
# values, so the minutes cancel. A zero denominator (e.g. no three-point
# attempts) yields NaN or inf for that row.
data_per_min['FG%'] = data_per_min['FGM'] / data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M'] / data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM'] / data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A'] / data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS'] / data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M'] / data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA'] / data_per_min['FGA']
data_per_min['TRU%'] = 0.5 * data_per_min['PTS'] / (
    data_per_min['FGA'] + TS_FTA_WEIGHT * data_per_min['FTA']
)
data_per_min['AST_TOV'] = data_per_min['AST'] / data_per_min['TOV']

# Pairwise correlation of all numeric columns, shown as a heatmap.
fig = px.imshow(
    data_per_min.select_dtypes(include=["number"]).corr(),
    title='Correlation between per-minute player stats',
)
fig.show()


0.9202866754424455

## How are minutes played distributed?

## How has the game changed over the past 10 years?