In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above; consider narrowing the filter once the notebook is stable.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
from datetime import date
import numpy as np

import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above; consider narrowing the filter once the notebook is stable.
warnings.simplefilter(action='ignore', category=FutureWarning)

# NBA Player Stats per Season
***

### Web Scraper

In [2]:
# Scrape the per-player season totals table from basketball-reference.com for
# every season from 1989 through the current calendar year, and stack the
# seasons into `nba_df` (indexed by player name, with a `Year` column).
#
# NOTE(review): this issues one request per season back-to-back; confirm the
# site's rate limits allow that before re-running the whole range.

# one dataframe per season; concatenated once after the loop so the combined
# frame is not re-copied on every iteration
season_frames = []

# iterates through seasons >= 1989
for year in range(1989, date.today().year + 1):

    url = 'https://www.basketball-reference.com/leagues/NBA_' + str(year) + '_totals.html'
    page = requests.get(url, timeout=30)
    # fail here, with the HTTP status, instead of later on a half-parsed page
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser') # parses html from page

    head = soup.find(class_ = 'thead')
    if head is None:
        raise ValueError('no header row (class "thead") found at ' + url)

    # the header text is newline-separated; the slice drops the leading empty
    # entry, the first label and the trailing empty entry, so the remaining
    # names start at 'Player' and line up with the <td> cells collected below
    column_names_clean = head.text.replace('\n', ',').split(',')[2:-1]

    # one list of cell texts per player row
    players = [
        [td.text for td in row.find_all('td')]
        for row in soup.find_all(class_ = 'full_table')
    ]

    df = pd.DataFrame(players, columns = column_names_clean).set_index('Player')

    df['Year'] = year

    # remove the literal '*' characters from player names; regex=False makes
    # the pattern a plain string on every pandas version ('*' alone is not a
    # valid regular expression)
    df.index = df.index.str.replace('*', '', regex = False)

    season_frames.append(df)

nba_df = pd.concat(season_frames)

In [3]:
# Persist the scraped seasons; the file name embeds the current calendar year.
csv_name = f'89_to_{date.today().year}_nba_stats.csv'
nba_df.to_csv(csv_name, header = True)

### EDA / Cleaning

In [4]:
# Preview the first ten rows of the scraped season totals.
nba_df.head(10)

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kareem Abdul-Jabbar,C,41,LAL,74,74,1695,313,659,0.475,0,...,103,231,334,74,38,85,95,196,748,1989
Mark Acres,C,26,BOS,62,0,632,55,114,0.482,1,...,59,87,146,19,19,6,23,94,137,1989
Michael Adams,PG,26,DEN,77,77,2787,468,1082,0.433,166,...,71,212,283,490,166,11,180,149,1424,1989
Mark Aguirre,SF,29,TOT,80,76,2597,586,1270,0.461,51,...,146,240,386,278,45,36,208,229,1511,1989
Danny Ainge,PG,29,TOT,73,54,2377,480,1051,0.457,116,...,71,184,255,402,93,8,145,186,1281,1989
Mark Alarie,PF,25,WSB,74,5,1141,206,431,0.478,13,...,103,152,255,63,25,22,62,160,498,1989
Steve Alford,PG,24,TOT,66,3,906,148,324,0.457,20,...,10,62,72,92,45,3,45,57,366,1989
Randy Allen,SG,24,SAC,7,0,43,8,19,0.421,0,...,3,4,7,0,1,1,2,7,17,1989
Greg Anderson,PF,24,SAS,82,56,2401,460,914,0.503,0,...,255,421,676,61,102,103,180,221,1127,1989
Michael Anderson,PG,22,SAS,36,12,730,73,175,0.417,1,...,44,45,89,153,44,3,84,64,204,1989


In [5]:
# Row and column counts of the scraped frame.
nba_df.shape

(15443, 29)

In [6]:
# Column dtypes: every scraped column is still text (object); only Year is numeric.
nba_df.dtypes

Pos     object
Age     object
Tm      object
G       object
GS      object
MP      object
FG      object
FGA     object
FG%     object
3P      object
3PA     object
3P%     object
2P      object
2PA     object
2P%     object
eFG%    object
FT      object
FTA     object
FT%     object
ORB     object
DRB     object
TRB     object
AST     object
STL     object
BLK     object
TOV     object
PF      object
PTS     object
Year     int64
dtype: object

In [7]:
# List the column labels that the casts below refer to.
nba_df.columns

Index(['Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year'],
      dtype='object')

In [8]:
# Cast Age and every counting stat from scraped text to 32-bit integers.
count_columns = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
                 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
nba_df = nba_df.astype(dict.fromkeys(count_columns, 'int32'))

In [9]:
# Keep only player-seasons at or above the 25th percentile of minutes played
# (the percentile is taken over all scraped seasons pooled together).
bot_quartile_minutes = nba_df['MP'].describe()['25%']

has_enough_minutes = nba_df['MP'] >= bot_quartile_minutes
nba_df = nba_df[has_enough_minutes]

Players who did not attempt a 3-pointer in a given season have an empty string (`''`) as their `3P%` value. Because a percentage of 0 would be misleading for them, we replace these empty strings with NaN.

In [10]:
# Blank cells become NaN so the percentage columns can be parsed as floats.
percent_columns = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']

nba_df = nba_df.replace('', np.nan).astype(dict.fromkeys(percent_columns, 'float64'))

In [11]:
# Count missing values per column after the empty-string -> NaN replacement.
nba_df.isna().sum()

Pos       0
Age       0
Tm        0
G         0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%     936
2P        0
2PA       0
2P%       0
eFG%      0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
Year      0
dtype: int64

In [12]:
# Row and column counts after the minutes filter and dtype conversions.
nba_df.shape

(11586, 29)

# All NBA Data
***

In [13]:
# Load the All-NBA selections from a CSV expected in the working directory.
# NOTE(review): the notebook does not show how this file was produced — record its source.
all_nba_df = pd.read_csv('all_nba_results.csv')

### EDA / Cleaning

In [14]:
# Preview the first ten rows of the raw All-NBA table.
all_nba_df.head(10)

Unnamed: 0,Season,Lg,Tm,Voting,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,2020-21,NBA,1st,(V),Nikola Jokić C,Giannis Antetokounmpo F,Kawhi Leonard F,Stephen Curry G,Luka Dončić G
1,2020-21,NBA,2nd,(V),Joel Embiid C,Julius Randle F,LeBron James F,Chris Paul G,Damian Lillard G
2,2020-21,NBA,3rd,(V),Rudy Gobert C,Jimmy Butler F,Paul George F,Bradley Beal G,Kyrie Irving G
3,2019-20,NBA,1st,(V),Anthony Davis C,Giannis Antetokounmpo F,LeBron James F,James Harden G,Luka Dončić G
4,2019-20,NBA,2nd,(V),Nikola Jokić C,Kawhi Leonard F,Pascal Siakam F,Damian Lillard G,Chris Paul G
5,2019-20,NBA,3rd,(V),Rudy Gobert C,Jayson Tatum F,Jimmy Butler F,Ben Simmons G,Russell Westbrook G
6,2018-19,NBA,1st,(V),Nikola Jokić C,Giannis Antetokounmpo F,Paul George F,James Harden G,Stephen Curry G
7,2018-19,NBA,2nd,(V),Joel Embiid C,Kevin Durant F,Kawhi Leonard F,Damian Lillard G,Kyrie Irving G
8,2018-19,NBA,3rd,(V),Rudy Gobert C,LeBron James F,Blake Griffin F,Kemba Walker G,Russell Westbrook G
9,2017-18,NBA,1st,(V),Anthony Davis C,LeBron James F,Kevin Durant F,Damian Lillard G,James Harden G


In [15]:
# Convert a season label such as '2020-21' into the calendar year in which the
# season ended (2021), stored as int32.
# The end year is the four-digit start year plus one. Gluing the century digits
# onto the two-digit suffix instead would turn '1999-00' into 1900.
all_nba_df['Season'] = all_nba_df['Season'].str.slice(0, 4).astype('int32') + 1

In [16]:
# Confirm that Season is now an integer column.
all_nba_df.dtypes

Season         int32
Lg            object
Tm            object
Voting        object
Unnamed: 4    object
Unnamed: 5    object
Unnamed: 6    object
Unnamed: 7    object
Unnamed: 8    object
dtype: object

In [None]:
# Relabel any season stored as 1900 to 2000 (a no-op when no such row exists).
#
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on the Series returned by `all_nba_df.Season` is
# chained assignment, which is not guaranteed to modify the DataFrame, and the
# FutureWarning pandas raises about it is silenced at the top of this notebook.
all_nba_df['Season'] = all_nba_df['Season'].replace(1900, 2000)

In [17]:
# Keep only seasons from 1989 onward, the range covered by the scraped stats.
is_wanted_season = all_nba_df['Season'] >= 1989
all_nba_df = all_nba_df[is_wanted_season]

In [18]:
# Inspect the frame after the season filter.
all_nba_df

Unnamed: 0,Season,Lg,Tm,Voting,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,2021,NBA,1st,(V),Nikola Jokić C,Giannis Antetokounmpo F,Kawhi Leonard F,Stephen Curry G,Luka Dončić G
1,2021,NBA,2nd,(V),Joel Embiid C,Julius Randle F,LeBron James F,Chris Paul G,Damian Lillard G
2,2021,NBA,3rd,(V),Rudy Gobert C,Jimmy Butler F,Paul George F,Bradley Beal G,Kyrie Irving G
3,2020,NBA,1st,(V),Anthony Davis C,Giannis Antetokounmpo F,LeBron James F,James Harden G,Luka Dončić G
4,2020,NBA,2nd,(V),Nikola Jokić C,Kawhi Leonard F,Pascal Siakam F,Damian Lillard G,Chris Paul G
...,...,...,...,...,...,...,...,...,...
94,1990,NBA,2nd,(V),Hakeem Olajuwon C,Larry Bird F,Tom Chambers F,Kevin Johnson G,John Stockton G
95,1990,NBA,3rd,(V),David Robinson C,Chris Mullin F,James Worthy F,Clyde Drexler G,Joe Dumars G
96,1989,NBA,1st,(V),Hakeem Olajuwon C,Charles Barkley F,Karl Malone F,Magic Johnson G,Michael Jordan G
97,1989,NBA,2nd,(V),Patrick Ewing C,Tom Chambers F,Chris Mullin F,Kevin Johnson G,John Stockton G


In [19]:
# The league and voting columns are not used below; discard them.
all_nba_df = all_nba_df.drop(columns = ['Lg', 'Voting'])

In [20]:
# Inspect the frame after dropping the Lg and Voting columns.
all_nba_df

Unnamed: 0,Season,Tm,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,2021,1st,Nikola Jokić C,Giannis Antetokounmpo F,Kawhi Leonard F,Stephen Curry G,Luka Dončić G
1,2021,2nd,Joel Embiid C,Julius Randle F,LeBron James F,Chris Paul G,Damian Lillard G
2,2021,3rd,Rudy Gobert C,Jimmy Butler F,Paul George F,Bradley Beal G,Kyrie Irving G
3,2020,1st,Anthony Davis C,Giannis Antetokounmpo F,LeBron James F,James Harden G,Luka Dončić G
4,2020,2nd,Nikola Jokić C,Kawhi Leonard F,Pascal Siakam F,Damian Lillard G,Chris Paul G
...,...,...,...,...,...,...,...
94,1990,2nd,Hakeem Olajuwon C,Larry Bird F,Tom Chambers F,Kevin Johnson G,John Stockton G
95,1990,3rd,David Robinson C,Chris Mullin F,James Worthy F,Clyde Drexler G,Joe Dumars G
96,1989,1st,Hakeem Olajuwon C,Charles Barkley F,Karl Malone F,Magic Johnson G,Michael Jordan G
97,1989,2nd,Patrick Ewing C,Tom Chambers F,Chris Mullin F,Kevin Johnson G,John Stockton G


In [21]:
# '1st' / '2nd' / '3rd' -> 1 / 2 / 3: keep the leading digit and store it as int32.
team_digit = all_nba_df['Tm'].str.slice(stop = 1)
all_nba_df['Tm'] = team_digit.astype('int32')

In [22]:
# Inspect the frame after converting the team label to an integer.
all_nba_df

Unnamed: 0,Season,Tm,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,2021,1,Nikola Jokić C,Giannis Antetokounmpo F,Kawhi Leonard F,Stephen Curry G,Luka Dončić G
1,2021,2,Joel Embiid C,Julius Randle F,LeBron James F,Chris Paul G,Damian Lillard G
2,2021,3,Rudy Gobert C,Jimmy Butler F,Paul George F,Bradley Beal G,Kyrie Irving G
3,2020,1,Anthony Davis C,Giannis Antetokounmpo F,LeBron James F,James Harden G,Luka Dončić G
4,2020,2,Nikola Jokić C,Kawhi Leonard F,Pascal Siakam F,Damian Lillard G,Chris Paul G
...,...,...,...,...,...,...,...
94,1990,2,Hakeem Olajuwon C,Larry Bird F,Tom Chambers F,Kevin Johnson G,John Stockton G
95,1990,3,David Robinson C,Chris Mullin F,James Worthy F,Clyde Drexler G,Joe Dumars G
96,1989,1,Hakeem Olajuwon C,Charles Barkley F,Karl Malone F,Magic Johnson G,Michael Jordan G
97,1989,2,Patrick Ewing C,Tom Chambers F,Chris Mullin F,Kevin Johnson G,John Stockton G


In [23]:
# Each name cell ends with a space and a one-letter position (e.g. ' C');
# trim those last two characters from all five player columns.
for name_column in ['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8']:
    all_nba_df[name_column] = all_nba_df[name_column].str.slice(stop = -2)

In [24]:
# Join the five player columns into one comma-separated string per row, as the
# first step toward one row per player via explode().
name_columns = ['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8']

joined_names = all_nba_df[name_columns[0]]
for name_column in name_columns[1:]:
    joined_names = joined_names + ',' + all_nba_df[name_column]

all_nba_df['name_list'] = list(joined_names)

In [25]:
# The individual player columns are now redundant with name_list; discard them.
all_nba_df = all_nba_df.drop(
    columns = ['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8']
)

In [26]:
# Split each comma-separated string into a Python list of names (one list per row).
all_nba_df['name_list'] = all_nba_df['name_list'].str.split(',')

In [27]:
# create new rows for each name in list
# (each exploded row repeats its source row's index label, Season and Tm)
all_nba_df = all_nba_df.explode('name_list')

In [28]:
# Inspect the frame after exploding to one row per selected player.
all_nba_df

Unnamed: 0,Season,Tm,name_list
0,2021,1,Nikola Jokić
0,2021,1,Giannis Antetokounmpo
0,2021,1,Kawhi Leonard
0,2021,1,Stephen Curry
0,2021,1,Luka Dončić
...,...,...,...
98,1989,3,Robert Parish
98,1989,3,Terry Cummings
98,1989,3,Dominique Wilkins
98,1989,3,Dale Ellis


In [29]:
# Align the column names with nba_df (Player, Year) and give the team number a clearer label.
column_renames = {'Season': 'Year', 'Tm': 'all_nba_team', 'name_list': 'Player'}
all_nba_df = all_nba_df.rename(columns = column_renames)

In [30]:
# set player as index to match other Dataframe
# (nba_df is indexed by 'Player' too, and the merge below joins on that name)
all_nba_df = all_nba_df.set_index('Player')

In [31]:
# Inspect the frame after indexing by player.
all_nba_df

Unnamed: 0_level_0,Year,all_nba_team
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Nikola Jokić,2021,1
Giannis Antetokounmpo,2021,1
Kawhi Leonard,2021,1
Stephen Curry,2021,1
Luka Dončić,2021,1
...,...,...
Robert Parish,1989,3
Terry Cummings,1989,3
Dominique Wilkins,1989,3
Dale Ellis,1989,3


In [32]:
# Left join on player name and season: every row of nba_df is kept, and only
# player-seasons present in all_nba_df receive a team number.
merge_keys = ['Player', 'Year']
merged_df = nba_df.merge(all_nba_df, how = 'left', on = merge_keys)

In [33]:
# Inspect the merged frame; player-seasons without a selection show NaN in all_nba_team.
merged_df

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,all_nba_team
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kareem Abdul-Jabbar,C,41,LAL,74,74,1695,313,659,0.475,0,...,231,334,74,38,85,95,196,748,1989,
Mark Acres,C,26,BOS,62,0,632,55,114,0.482,1,...,87,146,19,19,6,23,94,137,1989,
Michael Adams,PG,26,DEN,77,77,2787,468,1082,0.433,166,...,212,283,490,166,11,180,149,1424,1989,
Mark Aguirre,SF,29,TOT,80,76,2597,586,1270,0.461,51,...,240,386,278,45,36,208,229,1511,1989,
Danny Ainge,PG,29,TOT,73,54,2377,480,1051,0.457,116,...,184,255,402,93,8,145,186,1281,1989,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Delon Wright,SG,29,ATL,77,8,1452,122,269,0.454,44,...,170,220,188,93,19,45,56,342,2022,
Thaddeus Young,PF,33,TOT,52,1,845,141,272,0.518,17,...,128,208,104,54,18,52,81,322,2022,
Trae Young,PG,23,ATL,76,76,2652,711,1544,0.460,233,...,234,284,737,72,7,303,128,2155,2022,
Omer Yurtseven,C,23,MIA,56,12,706,130,247,0.526,1,...,209,294,49,17,20,41,84,299,2022,


In [34]:
# Player-seasons with no All-NBA selection came out of the left join as NaN; label them 0.
merged_df['all_nba_team'] = merged_df['all_nba_team'].fillna(0)

In [35]:
# Inspect the merged frame after filling the missing team numbers.
merged_df

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,all_nba_team
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kareem Abdul-Jabbar,C,41,LAL,74,74,1695,313,659,0.475,0,...,231,334,74,38,85,95,196,748,1989,0.0
Mark Acres,C,26,BOS,62,0,632,55,114,0.482,1,...,87,146,19,19,6,23,94,137,1989,0.0
Michael Adams,PG,26,DEN,77,77,2787,468,1082,0.433,166,...,212,283,490,166,11,180,149,1424,1989,0.0
Mark Aguirre,SF,29,TOT,80,76,2597,586,1270,0.461,51,...,240,386,278,45,36,208,229,1511,1989,0.0
Danny Ainge,PG,29,TOT,73,54,2377,480,1051,0.457,116,...,184,255,402,93,8,145,186,1281,1989,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Delon Wright,SG,29,ATL,77,8,1452,122,269,0.454,44,...,170,220,188,93,19,45,56,342,2022,0.0
Thaddeus Young,PF,33,TOT,52,1,845,141,272,0.518,17,...,128,208,104,54,18,52,81,322,2022,0.0
Trae Young,PG,23,ATL,76,76,2652,711,1544,0.460,233,...,234,284,737,72,7,303,128,2155,2022,0.0
Omer Yurtseven,C,23,MIA,56,12,706,130,247,0.526,1,...,209,294,49,17,20,41,84,299,2022,0.0


In [36]:
# Row and column counts of the merged frame (nba_df's columns plus all_nba_team).
merged_df.shape

(11586, 30)