This notebook follows the ML lifecycle of predicting All-NBA selections for NBA players. I will use a variety of techniques, including multinomial logistic regression, random forests, XGBoost, and a neural network.

First we must examine our data. We can begin with the data from Basketball-Reference showing the All-NBA selections for each year.

In [3]:
import pandas as pd
import numpy as np
import altair as alt
import sklearn as sk
import time

In [2]:
# Read the All-NBA selections CSV and preview its first five rows
all_nba = pd.read_csv(filepath_or_buffer='bbref_data/all_nba_dat.csv')
all_nba.head(n=5)

NameError: name 'pd' is not defined

In [18]:
# Remove the trailing position letter (C, F, or G) that follows each player's name.
# An end-anchored regex is used instead of slicing off the last two characters, so
# re-running this cell cannot truncate names that were already cleaned, and entries
# that carry no position suffix are left untouched.
player_cols = ['P1', 'P2', 'P3', 'P4', 'P5']
for col in player_cols:
    # astype(str) is kept so every entry is a plain Python string before matching
    all_nba[col] = all_nba[col].astype(str).str.replace(r'\s[CFG]$', '', regex=True)
all_nba.head()

Unnamed: 0,Season,Lg,Tm,Voting,P1,P2,P3,P4,P5
0,2022-23,NBA,1st,(V),Joel Embiid,Giannis Antetokounmpo,Jayson Tatum,Luka Dončić,Shai Gilgeous-Alexander
1,2022-23,NBA,2nd,(V),Nikola Jokić,Jimmy Butler,Jaylen Brown,Donovan Mitchell,Stephen Curry
2,2022-23,NBA,3rd,(V),Domantas Sabonis,LeBron James,Julius Randle,De'Aaron Fox,Damian Lillard
3,2021-22,NBA,1st,(V),Nikola Jokić,Giannis Antetokounmpo,Jayson Tatum,Luka Dončić,Devin Booker
4,2021-22,NBA,2nd,(V),Joel Embiid,Kevin Durant,DeMar DeRozan,Ja Morant,Stephen Curry


In [19]:
# Reshape from wide (five player columns per row) to long (one row per player).
# melt() stores the source column name (P1..P5) in 'variable' and the player name
# in 'value'; the latter and the 'Tm' column are then given descriptive names.
all_nba_melt = (
    all_nba
    .melt(id_vars=['Season', 'Tm'], value_vars=['P1', 'P2', 'P3', 'P4', 'P5'])
    .rename(columns={'value': 'Player', 'Tm': 'All-NBA'})
)

Unnamed: 0,Season,All-NBA,variable,Player
0,2022-23,1st,P1,Joel Embiid
1,2022-23,2nd,P1,Nikola Jokić
2,2022-23,3rd,P1,Domantas Sabonis
3,2021-22,1st,P1,Nikola Jokić
4,2021-22,2nd,P1,Joel Embiid


In [20]:
# Translate the roster slot labels into positions: P1 -> C, P2/P3 -> F, P4/P5 -> G,
# then rename the column that holds them from 'variable' to 'Position'.
all_nba_melt = (
    all_nba_melt
    .replace('P1', 'C')
    .replace(['P2', 'P3'], 'F')
    .replace(['P4', 'P5'], 'G')
    .rename(columns={'variable': 'Position'})
)


In [1]:
# Turn a season label such as '2022-23' into a single integer year: take the first
# four characters (the starting year) and add one, then rename the column to 'year'.
all_nba_melt['Season'] = all_nba_melt['Season'].astype(str).str.slice(0, 4).astype(int) + 1
all_nba_melt = all_nba_melt.rename(columns={'Season': 'year'})
all_nba_melt.head()

NameError: name 'all_nba_melt' is not defined

Now we can scrape Basketball-Reference for the player data from these seasons.

In [22]:

# Scrape the per-game stats table for every season from 1980 through 2023.
# The 2023 page is fetched first so its rows lead the combined frame.
url = "https://www.basketball-reference.com/leagues/NBA_2023_per_game.html"

# read_html returns a list with one dataframe per <table> found on the page
tables = pd.read_html(url)

# Keep the first table on the page and tag its rows with the season
df = tables[0]
df['year'] = 2023
years = list(range(1980, 2023))

# Collect one dataframe per season here and concatenate once after the loop;
# concatenating inside the loop re-copies the accumulated frame on every pass.
all_data = [df]

for year in years:
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    tables = pd.read_html(url)

    # Keep the first table on the page and tag its rows with the season
    df_temp = tables[0]
    df_temp['year'] = year
    all_data.append(df_temp)
    # Pause between requests (presumably to stay under the site's rate limit)
    time.sleep(20)

# Row order and index values match the previous incremental concat (index is not reset)
df = pd.concat(all_data)

In [23]:
# Write the scraped per-game stats to a CSV file in the working directory
df.to_csv(path_or_buf="bbref_basic_stat_dat.csv")

In [24]:
# Display the combined per-game dataframe (all scraped seasons, tagged by 'year')
df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
0,1,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,...,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2,2023
1,2,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,...,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6,2023
2,3,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,...,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4,2023
3,4,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,...,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9,2023
4,5,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,...,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
838,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
839,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
840,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022


In [25]:
# Scrape the advanced stats table for every season from 1980 through 2023.
# The 2023 page is fetched first so its rows lead the combined frame.
url = "https://www.basketball-reference.com/leagues/NBA_2023_advanced.html"

# read_html returns a list with one dataframe per <table> found on the page
tables = pd.read_html(url)

# Keep the first table on the page and tag its rows with the season
df_adv = tables[0]
df_adv['year'] = 2023
years = list(range(1980, 2023))

# Collect one dataframe per season here and concatenate once after the loop;
# concatenating inside the loop re-copies the accumulated frame on every pass.
all_data = [df_adv]

for year in years:
    url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)
    tables = pd.read_html(url)

    # Keep the first table on the page and tag its rows with the season
    df_temp = tables[0]
    df_temp['year'] = year
    all_data.append(df_temp)
    # Pause between requests (presumably to stay under the site's rate limit)
    time.sleep(20)

# Row order and index values match the previous incremental concat (index is not reset)
df_adv = pd.concat(all_data)

In [26]:
# Write the scraped advanced stats to a CSV file in the working directory
df_adv.to_csv(path_or_buf="bbref_advanced_stat_dat.csv")