In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
nba_dat = pd.read_csv('Data_Scripting_Cleaning/Full_data/Training_Sets/nba_szn_train.csv')
all_nba_dat = nba_dat[nba_dat['all_nba_c_year'] == 1]
min_year = nba_dat.groupby('Tm').min('year')
min_year.reset_index(inplace=True)
min_year = min_year.rename(columns = {'index':'Tm','year':'num_year'})
min_year = min_year.loc[:,['Tm','num_year']]
min_year.loc[:,'num_year'] = 2023-min_year['num_year']
nba_dat = nba_dat.merge(min_year, on = 'Tm', how = 'outer')
all_nba_dat = nba_dat[nba_dat['all_nba_c_year'] == 1]
all_nba_dat.loc[:,'avg_all_nba'] = all_nba_dat['all_nba_c_year']/all_nba_dat['num_year']

url = 'all_nba_dat.json'
url2 = 'nba_dat.json'

all_nba_dat.to_json(url, orient='records')
nba_dat.to_json(url2, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_nba_dat.loc[:,'avg_all_nba'] = all_nba_dat['all_nba_c_year']/all_nba_dat['num_year']


First we can do some EDA on the data we have. First we can look at which teams make the most All-NBA teams in order to determine whether `Tm` should be feature

In [3]:
#Now I will use altair to make a bar chart
#the y-axis will be the 30 NBA teams and the x-axis will be the count of all-nba teams their players have made
#the bars will be stacked based on which team made
alt.Chart(url).mark_bar().encode(
    y = alt.Y('count(all_nba_c_year):Q', title = 'Count of All-NBA Teams'),
    x = alt.X('Tm:O', title = 'NBA Team', sort = '-y'),
    color = alt.Color('all_nba_tm:O', title = 'All-NBA Team', scale = alt.Scale(scheme = 'viridis'))
    ).properties(title = 'All-NBA Counts 1980-2023')

Clearly from this plot we see that some teams have much more all-NBA teams then others. However this may be due to the fact that some teams have been in the league much longer than others. We may not plot the same chart, but we will divide the count by the number of years for the team in the league. This can be done by subtracting the 2023 from the minimum year listed in the dataset

In [4]:
#Now I will recreate the above bar chart but divide each of the counts by the num_year column
alt.Chart(url).mark_bar().encode(
    y = alt.Y('sum(avg_all_nba):Q', title = 'Average Number of All-NBA Teams per Year'),
    x = alt.X('Tm:O', title = 'NBA Team', sort = '-y'),
    color = alt.Color('all_nba_tm:O', title = 'All-NBA Team', scale = alt.Scale(scheme = 'viridis'))).properties(title = 'Average All-NBA per year 1980-2023')

We see that we don't have much of a change here, with the lakers still in the number one position of having roughly 1.2 all-NBA players per year. However, we do see some of the newer teams like the Toronto Raptors (TOR) and Orlando Magic (ORL) moving up a bit. Regardless this indicates that using `Tm` as an explanatory variable in our models may be useful. This makes intuitive sense as these awards are voted on by the media which tends to have some bias for larger market and/or historically more successful teams like the Lakers or Spurs. We will retain the traded Tm designation `TOT`, as I believe it is important to note that traded players do not tend to make these All-NBA teams very often, which may be a useful explanatory variable.

We may also look at which position each team tends to send to the All-NBA teams. From our chart below, we see that the Lakers tend to send more guards than any other position. This is most likely due to some of their historically good players like Kobe Bryant and Magic Johnson who each won multiple All-NBA awards.

In [5]:
#Now I will recreate the above bar chart but divide each of the counts by the num_year column
alt.Chart(url).mark_bar().encode(
    y = alt.Y('sum(avg_all_nba):Q', title = 'Average Number of All-NBA Teams per Year'),
    x = alt.X('Tm:O', title = 'NBA Team', sort = '-y'),
    color = alt.Color('Position:O', title = 'Position', scale = alt.Scale(scheme = 'viridis'))).properties(title = 'Average All-NBA Players per year 1980-2023')

We may now also look at the age-make up of these All-NBA players

In [6]:
alt.Chart(url).transform_density(
    'Age',
    as_=['Age', 'density'],
    #extent=[18, 40],
    groupby=['Position']
).mark_area(orient='horizontal').encode(
    y='Age:Q',
    color='Position:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False),
    ),
    column=alt.Column(
        'Position:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=200
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

We see that in general the age-distribution across positions is quite similar, however we may want to consider how these players compare to those who did not make an All-NBA team. 

In [7]:
alt.data_transformers.disable_max_rows()

alt.Chart(url2).transform_density(
    'Age',
    as_=['Age', 'density'],
    #extent=[18, 43],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='Age:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

We see a pattern here where among those who made an All-NBA team, they tend to skew slightly older than those who did not make an All-NBA team.

Now Let us examine some of the basic box-score statistics and see how those vary across players. 

In [8]:
alt.Chart(url2).transform_density(
    'PTS',
    as_=['PTS', 'density'],
    #extent=[3, 44],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='PTS:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

Here we see a clear pattern. Players who tend to score more not only make all-NBA teams more, but the more they score the higher team they will go to. 

We may repeat these charts for both rebounds and assists.

In [9]:
alt.Chart(url2).transform_density(
    'TRB',
    as_=['TRB', 'density'],
    #extent=[0, 24],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='TRB:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

We again see that players who make the teams follow a different distribution for TRB (total number of rebounds) than player who do not. For assists we have:

In [10]:
alt.Chart(url2).transform_density(
    'AST',
    as_=['AST', 'density'],
    #extent=[0, 17],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='AST:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

A similar pattern emerges here as well. 

One advanced stat that has been widely used by pundits and fans alike has been true shooting percentage. This evaulates the offensive efficiency of a player by accounting for the fact that 3 pointers are 1.5 times as valuable as 2 pointers. Let us see how this differs across these groups. 

In [11]:
alt.Chart(url2).transform_density(
    'TS%',
    as_=['TS%', 'density'],
    #extent=[0.4, .8],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='TS%:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

Here we see a similar pattern as points per game, which is not surprising since more efficient players will also be tasked with scoring more. We see almost every player who makes an All-NBA team is above average in true shooting percentage. 

Examining `VORP` (value over replacement player), we have one of our most distinct differences. The higher the vorp, the higher the All-NBA team according to this plot

In [12]:
alt.Chart(url2).transform_density(
    'VORP',
    as_=['VORP', 'density'],
    #extent=[-4, 14],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='VORP:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

We may also look at which seeds these players tend to belong to.

In [13]:
alt.Chart(url2).transform_density(
    'seed',
    as_=['seed', 'density'],
    #extent=[-4, 14],
    groupby=['all_nba_tm']
).mark_area(orient='horizontal').encode(
    y='seed:Q',
    color='all_nba_tm:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=False)
       
    ),
    column=alt.Column(
        'all_nba_tm:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=1,
        ),
    )
).properties(
    width=150
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

We see that unsurprisingly, All-NBA players tend to come from lower seed teams (higher win counts).