### Getting the Data of NBA Players 

In [200]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

#Reference - https://betterprogramming.pub/a-step-by-step-guide-to-web-scraping-nba-data-with-python-jupyter-beautifulsoup-and-pandas-7e2d334d4195

In [201]:
# Season to scrape, identified by the year in which it ends (2018 = the 2017-18 season).
# Re-run the notebook with 2018, 2019, 2020, 2021 and 2022 to produce every per-season file.
season_end_year = 2018

# Per-game player statistics page for that season on basketball-reference.com.
url = f'https://www.basketball-reference.com/leagues/NBA_{season_end_year}_per_game.html'

In [202]:
# Download the season page. The timeout (seconds) keeps the cell from hanging
# forever if the server never answers.
page = requests.get(url, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx answer instead of letting the cells
# below parse an error page.
page.raise_for_status()

# Displaying the response shows its status; <Response [200]> means the request succeeded.
page

<Response [200]>

In [203]:
#page.content  #Helps to see the content of page

In [204]:
# Parse the raw response body with Python's built-in HTML parser so the
# document can be searched by tag and class in the cells below.
html_bytes = page.content
soup = BeautifulSoup(html_bytes, features='html.parser')

In [205]:
#print(soup.prettify()) #It will display the content in more structured tree format making it easier to read 

In [206]:
# Collect every element carrying the CSS class "full_table".
# Despite the name, `table` is a list of matches (one per data row), not a single <table>.
table = soup.find_all(attrs={"class": "full_table"})

In [207]:
#table

In [208]:
# First element whose CSS class is "thead". As the next cell's output shows, this
# is a <tr class="thead"> header row holding one <th> per column.
head = soup.find(attrs={"class": "thead"})

In [209]:
# Show the markup of the header row to see how the column labels are stored.
head

<tr class="thead">
<th aria-label="Rk" class="ranker sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank">Rk</th>
<th aria-label="Player" class="sort_default_asc center" data-stat="player">Player</th>
<th aria-label="Pos" class="sort_default_asc center" data-stat="pos" data-tip="Position">Pos</th>
<th aria-label="Age" class="sort_default_asc center" data-stat="age" data-tip="Player's age on February 1 of the season">Age</th>
<th aria-label="Tm" class="sort_default_asc center" data-stat="team_id" data-tip="Team">Tm</th>
<th aria-label="Games" class="center" data-stat="g" data-tip="Games">G</th>
<th aria-label="Games Started" class="center" data-stat="gs" data-tip="Games Started">GS</th>
<th aria-label="Minutes Played Per Game" class="hide_non_quals center" data-stat="mp_per_g" data-tip="Minutes Played Per Game">MP</th>
<th aria-label="Field Goals Per Game" class="hide_non_quals center" data-stat="fg_per_g" data-tip="Field Goals Per Game">FG</th>
<th aria

In [210]:
# Text content of the header row: the column labels separated by newlines.
head.text

'\nRk\nPlayer\nPos\nAge\nTm\nG\nGS\nMP\nFG\nFGA\nFG%\n3P\n3PA\n3P%\n2P\n2PA\n2P%\neFG%\nFT\nFTA\nFT%\nORB\nDRB\nTRB\nAST\nSTL\nBLK\nTOV\nPF\nPTS\n'

In [211]:
# Column labels for the player table, read from the <th> cells of the header row.
# Reading each cell directly does not depend on the newlines between tags and
# cannot be broken by a label that contains a comma.
# The first header ("Rk", the rank) is skipped, so the list starts at "Player".
column_names_clean = [th.get_text(strip=True) for th in head.find_all("th")][1:]

column_names_clean

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [212]:
# One inner list per scraped row, holding the text of each <td> cell in
# document order. These are the values that get paired with column_names_clean.
players = [
    [cell.text for cell in row.find_all("td")]
    for row in table
]
    

In [213]:
# Build the per-season frame from the scraped rows, then key it by player name.
# All values are still strings at this point (they come from cell text).
df = pd.DataFrame(data=players, columns=column_names_clean)
df = df.set_index("Player")

In [214]:
# Preview the scraped season table (pandas truncates long frames in the display).
df

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Álex Abrines,SG,24,OKC,75,8,15.1,1.5,3.9,.395,1.1,...,.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7
Quincy Acy,PF,27,BRK,70,8,19.4,1.9,5.2,.356,1.5,...,.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9
Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,.629,0.0,...,.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9
Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,.512,0.0,...,.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9
Arron Afflalo,SG,32,ORL,53,3,12.9,1.2,3.1,.401,0.5,...,.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cody Zeller,C,25,CHO,33,0,19.0,2.6,4.7,.545,0.1,...,.718,2.0,3.3,5.4,0.9,0.4,0.6,1.0,2.5,7.1
Tyler Zeller,C,28,TOT,66,34,16.8,2.8,5.1,.560,0.2,...,.722,1.7,3.0,4.6,0.7,0.2,0.5,0.7,1.9,6.7
Paul Zipser,SF,23,CHI,54,12,15.3,1.5,4.3,.346,0.7,...,.760,0.2,2.2,2.4,0.9,0.4,0.3,0.8,1.6,4.0
Ante Žižić,C,21,CLE,32,2,6.7,1.5,2.1,.731,0.0,...,.724,0.8,1.1,1.9,0.2,0.1,0.4,0.3,0.9,3.7


In [215]:
# Tag every row with the season's end year.
# The year is read from the page name in `url` ("NBA_<year>_per_game.html") so the
# label always matches the page that was actually scraped and does not need to be
# edited by hand for each season. An unexpected URL shape fails loudly here.
season_page_name = url.rsplit('/', 1)[-1]           # e.g. 'NBA_2018_per_game.html'
df['year'] = int(season_page_name.split('_')[1])    # e.g. 2018

In [217]:
# Keep only the team and season columns; the player name stays in the index.
# Selecting the two wanted columns gives the same result as dropping every
# statistic column by name, without a long label list to maintain, and the cell
# can be re-run safely (dropping already-removed columns raises a KeyError).
df = df[['Tm', 'year']]

In [218]:
# Confirm that only the team and season columns remain.
df.head()

Unnamed: 0_level_0,Tm,year
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Álex Abrines,OKC,2018
Quincy Acy,BRK,2018
Steven Adams,OKC,2018
Bam Adebayo,MIA,2018
Arron Afflalo,ORL,2018


In [219]:
# Save this season's player/team table.
# The file name is built from the season stored in the `year` column
# (e.g. 2018 -> 'Data/2017_2018_nba_players_data.csv'), so repeating the notebook
# for another season cannot overwrite a different season's file by mistake.
season_end = int(df['year'].iloc[0])

# Make sure the output folder exists before writing (no-op when it already does).
os.makedirs('Data', exist_ok=True)

df.to_csv(f'Data/{season_end - 1}_{season_end}_nba_players_data.csv', header=True)

In [247]:
# Load the five per-season files and the Twitter-handle file.
# The order of the paths fixes which frame each name refers to:
#   df1 = 2020-21, df2 = 2021-22, df3 = 2019-20, df4 = 2018-19, df5 = 2017-18,
#   df6 = players' Twitter handles.
csv_paths = [
    'Data/2020_2021_nba_players_data.csv',
    'Data/2021_2022_nba_players_data.csv',
    'Data/2019_2020_nba_players_data.csv',
    'Data/2018_2019_nba_players_data.csv',
    'Data/2017_2018_nba_players_data.csv',
    'Data/players_handles.csv',
]
df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in csv_paths)

In [248]:
# Column names, non-null counts and dtypes of the 2020-21 season file.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Player  540 non-null    object
 1   Tm      540 non-null    object
 2   year    540 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 12.8+ KB


In [249]:
# Column names, non-null counts and dtypes of the 2021-22 season file.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Player     597 non-null    object
 1   TwitterID  590 non-null    object
 2   Tm         597 non-null    object
 3   year       597 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 18.8+ KB


In [312]:
#source: https://pandas.pydata.org/docs/user_guide/merging.html
# Outer-join every season on the player name, then attach the Twitter handles.
# The joins are built from the oldest seasons outwards; each step keeps players
# that appear on either side.
#
# Suffixes are only applied when both sides share a non-key column name, so
# (judging by the column names in the output below) they take effect in two steps only:
#   * 2018-19 with 2017-18        -> Tm2019/year2019 and Tm2018/year2018
#   * 2021-22 with the older ones -> Tm2022/year2022 and Tm2020/year2020
# The 2020-21 columns therefore stay as plain 'Tm'/'year' and are renamed in a later cell.
# NOTE(review): the suffix pairs passed in the other steps look mismatched
# ('2020','2022' and '2021','2021'); they are kept unchanged because they appear to have no effect here.

seasons_2018_2019 = df4.merge(df5, on='Player', how='outer', suffixes=('2019', '2018'))

seasons_2018_2020 = df3.merge(seasons_2018_2019, on='Player', how='outer',
                              suffixes=('2020', '2022'))

seasons_2018_2020_2022 = df2.merge(seasons_2018_2020, on='Player', how='outer',
                                   suffixes=('2022', '2020'))

seasons_2018_2022 = df1.merge(seasons_2018_2020_2022, on='Player', how='outer',
                              suffixes=('2021', '2021'))

df = df6.merge(seasons_2018_2022, on='Player', how='outer', suffixes=('2021', '2021'))

In [310]:
# Check the (rows, columns) shape of the merged frame.
df.shape

(1980, 13)

In [314]:
# Check the column names produced by the merge.
df.head()

Unnamed: 0,Player,Twitter,Tm,year,TwitterID,Tm2022,year2022,Tm2020,year2020,Tm2019,year2019,Tm2018,year2018
0,Alexis Ajinça,AjincaAlexis42,,,,,,,,,,,
1,Morris Almond,FreeMoAlmond,,,,,,,,,,,
2,Giannis Antetokounmpo,Giannis_An34,MIL,2021.0,Giannis_An34,MIL,2022.0,MIL,2020.0,MIL,2019.0,MIL,2018.0
3,Ömer Aşık,AsikOmer,,,,,,,,,,TOT,2018.0
4,Gustavo Ayón,Gustavo_Ayon15,,,,,,,,,,,


In [300]:
# Give the per-season columns their final names.
# The merge cell leaves the 2020-21 columns unsuffixed ('Tm'/'year') and names the
# 2021-22 and 2019-20 columns 'Tm2022'/'year2022' and 'Tm2020'/'year2020'; those
# are mapped to the names used in the saved data set. The older '_x'/'_y' keys
# (pandas' default merge suffixes) are kept so the cell still works on a frame
# merged without explicit suffixes. Keys that are absent are ignored by rename().
# NOTE(review): 'team_2022' is inconsistent with the other 'Tm_*' names, but it
# is the name shown in the saved output, so it is preserved here.
df = df.rename(columns={
    'Tm': 'Tm_2021', 'year': 'year_2021',
    'Tm2022': 'team_2022', 'year2022': 'year_2022',
    'Tm2020': 'Tm_2020', 'year2020': 'year_2020',
    'Tm_x': 'team_2022', 'year_x': 'year_2022',
    'Tm_y': 'Tm_2020', 'year_y': 'year_2020',
})

In [304]:
# Inspect the last 100 rows of the merged frame (pandas truncates the display).
df.tail(100)

Unnamed: 0,Player,Twitter,Tm,year,TwitterID,Tm2022,year2022,Tm2020,year2020,Tm2019,year2019,Tm2018,year2018
1880,Rex Walters,,,,,,,,,,,,
1881,Bill Walton,BillWalton,,,,,,,,,,,
1882,John Warren,,,,,,,,,,,,
1883,T.J. Warren,TonyWarrenJr,IND,2021.0,,,,IND,2020.0,PHO,2019.0,PHO,2018.0
1884,Hakim Warrick,hdubb21,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,Timofey Mozgov,,,,,,,,,,,BRK,2018.0
1976,Johnny O'Bryant,,,,,,,,,,,CHO,2018.0
1977,Willie Reed,,,,,,,,,,,TOT,2018.0
1978,Josh Smith,,,,,,,,,,,NOP,2018.0


In [271]:
# Keep only the players that have a Twitter handle; rows whose 'Twitter' value
# is missing are removed. Original row labels are left untouched.
df = df.dropna(subset=['Twitter'], axis=0)

In [272]:
# Remove the 'TwitterID' column, which only carries handles from the 2021-22
# file; the 'Twitter' column from the handles file is the one that is kept.
df = df.drop(columns=['TwitterID'])

In [273]:
# Re-check the (rows, columns) shape after removing the rows without a handle
# and the TwitterID column.
df.shape

(1795, 12)

In [274]:
# Preview the first rows of the cleaned, merged frame before saving it.
df.head()

Unnamed: 0,Player,Twitter,Tm_2021,year_2021,team_2022,year_2022,Tm_2020,year_2020,Tm2019,year2019,Tm2018,year2018
0,Alexis Ajinça,AjincaAlexis42,,,,,,,,,,
1,Morris Almond,FreeMoAlmond,,,,,,,,,,
2,Giannis Antetokounmpo,Giannis_An34,MIL,2021.0,MIL,2022.0,MIL,2020.0,MIL,2019.0,MIL,2018.0
3,Ömer Aşık,AsikOmer,,,,,,,,,TOT,2018.0
4,Gustavo Ayón,Gustavo_Ayon15,,,,,,,,,,


In [278]:
# Write the merged 2017-2022 player/team/handle table to disk, with a header row.
merged_csv_path = 'Data/2017_2022_nba_players_data.csv'
df.to_csv(merged_csv_path, header=True)