In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Show every column when a wide DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

In [3]:
# Resolve a chromedriver binary for the installed Chrome (webdriver_manager reuses
# its cached copy when one exists), then launch a Chrome window driven by it.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))




[WDM] - Current google-chrome version is 113.0.5672
[WDM] - Get LATEST chromedriver version for 113.0.5672 google-chrome
[WDM] - Driver [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/113.0.5672.63/chromedriver] found in cache


In [6]:
# NBA player "traditional" stats: per-game mode, regular season, sorted by PTS.
STATS_URL = "https://www.nba.com/stats/players/traditional?PerMode=PerGame&sort=PTS&dir=-1&SeasonType=Regular+Season"
driver.get(STATS_URL)

# NOTE: two steps the scrape depends on are NOT automated in this notebook:
#   1) dismissing the "Accept cookies" pop-up, and
#   2) switching the table to show all 11 pages (all 539 rows).
#
# After this cell ran, I did both by hand in the Chromedriver window:
# clicked "Accept cookies", then toggled the table to show all pages.

In [7]:
# Parse a snapshot of the page source as the browser holds it right now.
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

In [12]:
# Count the tables carrying this class: the count is 1, so soup.find() is
# sufficient and soup.find_all() is not needed to pick the table out.
matching_tables = soup.find_all("table", class_="Crom_table__p1iZz")
len(matching_tables)

1

In [13]:
# Select the stats table by its CSS class.
# soup.find() returns None when nothing matches (for example, the table had not
# rendered when the page source was captured, or the class name no longer
# matches). Fail here with a clear message instead of letting a later cell die
# with "'NoneType' object has no attribute 'find'".
table = soup.find("table", {"class" : "Crom_table__p1iZz"})
if table is None:
    raise RuntimeError(
        'No <table class="Crom_table__p1iZz"> found in the parsed page source; '
        "make sure the stats table is visible in the browser, then rebuild `soup`."
    )

In [46]:
# Column names live in the <th> (table-header) cells inside the table's <thead>.
thead = table.find("thead")
table_headers = thead.find_all("th")

# Keep the text of each header cell, dropping any header whose text contains "RANK".
cleaned_headers = [
    header_cell.text
    for header_cell in table_headers
    if "RANK" not in header_cell.text
]

print("Total columns:", len(cleaned_headers))

Total columns: 30


In [35]:
# Each <tr> (table-row) element inside the table's <tbody> is one row of data.
tbody = table.find("tbody")
table_rows = tbody.find_all("tr")

In [51]:
# Sanity check on a single row before processing all of them:
# take the first row, collect its <td> (table-data) cells, and read each cell's text.
first_item = table_rows[0]
first_item_td = first_item.find_all("td")
first_item_data = [cell.text for cell in first_item_td]

# Display the extracted values to confirm the right data is being scraped.
first_item_data

['1',
 'Joel Embiid',
 'PHI',
 '29',
 '66',
 '43',
 '23',
 '34.6',
 '33.1',
 '11.0',
 '20.1',
 '54.8',
 '1.0',
 '3.0',
 '33.0',
 '10.0',
 '11.7',
 '85.7',
 '1.7',
 '8.4',
 '10.2',
 '4.2',
 '3.4',
 '1.0',
 '1.7',
 '3.1',
 '56.2',
 '39.0',
 '1.0',
 '6.4']

In [52]:
# Collect the <td> cells of every row; td_in_rows is a list of lists of cells.
td_in_rows = [row.find_all("td") for row in table_rows]

# Same extraction as the single-row check, applied to every row:
# the outer level walks the rows, the inner level reads the text of each cell.
table_data = [
    [cell.text for cell in row_cells]
    for row_cells in td_in_rows
]
print("Total players:", len(table_data))
print("Total data items for each player:", len(table_data[0]))

# The header extraction reported 30 columns; the number of data items per player
# printed here should match that count. It does, which is what we want.

Total players: 539
Total data items for each player: 30


In [55]:
def _to_numeric_if_possible(column):
    """Return `column` converted to numbers when every value parses, else unchanged.

    Parameters
    ----------
    column : pandas.Series
        One column of scraped cell text.

    Returns
    -------
    pandas.Series
        The numeric version of `column` if no value failed to parse; otherwise the
        original text column (e.g. player names, team codes).
    """
    converted = pd.to_numeric(column, errors="coerce")
    # errors="coerce" turns unparseable text into NaN, so any NaN means "not numeric".
    return converted if converted.notna().all() else column


# The leading header cell appears to be blank (its values are the 1, 2, 3, ...
# ranking), which would leave a column with no usable name; label it explicitly.
column_names = [name if name.strip() else "Rank" for name in cleaned_headers]

# Every scraped value is text. Convert the columns that are entirely numeric so
# that sorting, comparisons and aggregations work on numbers rather than strings,
# and keep the result in a variable so later cells can use it.
player_stats = (
    pd.DataFrame(data=table_data, columns=column_names)
    .apply(_to_numeric_if_possible)
)

player_stats

Unnamed: 0,Unnamed: 1,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,1,Joel Embiid,PHI,29,66,43,23,34.6,33.1,11.0,20.1,54.8,1.0,3.0,33.0,10.0,11.7,85.7,1.7,8.4,10.2,4.2,3.4,1.0,1.7,3.1,56.2,39.0,1.0,6.4
1,2,Luka Doncic,DAL,24,66,33,33,36.2,32.4,10.9,22.0,49.6,2.8,8.2,34.2,7.8,10.5,74.2,0.8,7.8,8.6,8.0,3.6,1.4,0.5,2.5,56.8,36.0,10.0,1.9
2,3,Damian Lillard,POR,32,58,27,31,36.3,32.2,9.6,20.7,46.3,4.2,11.3,37.1,8.8,9.6,91.4,0.8,4.0,4.8,7.3,3.3,0.9,0.3,1.9,49.1,16.0,2.0,1.8
3,4,Shai Gilgeous-Alexander,OKC,24,68,33,35,35.5,31.4,10.4,20.3,51.0,0.9,2.5,34.5,9.8,10.9,90.5,0.9,4.0,4.8,5.5,2.8,1.6,1.0,2.8,50.4,3.0,0.0,2.2
4,5,Giannis Antetokounmpo,MIL,28,63,47,16,32.1,31.1,11.2,20.3,55.3,0.7,2.7,27.5,7.9,12.3,64.5,2.2,9.6,11.8,5.7,3.9,0.8,0.8,3.1,54.8,46.0,6.0,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,535,Alondes Williams,BKN,23,1,1,0,5.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,-0.8,0.0,0.0,-5.0
535,535,Deonte Burton,SAC,29,2,1,1,3.2,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
536,535,Frank Jackson,UTA,25,1,0,1,5.1,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,3.9,0.0,0.0,-2.0
537,535,Michael Foster Jr.,PHI,20,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
