In [1]:
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as cond
from selenium.webdriver.support.ui import Select, WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Let webdriver_manager download (or reuse from its cache) a chromedriver
# build, then start a Chrome session backed by that driver binary.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\pjame\.wdm\drivers\chromedriver\win32\98.0.4758.102]


---

## Extract Our Data Table

In [2]:
# Reuse the browser instance that the setup cell (first code cell) created.
# Calling webdriver.Chrome() again here would open a second browser window
# and rebind `driver`, leaving the first session running with no handle
# left to quit it. The bare call would also bypass the chromedriver that
# webdriver_manager installed for the first session.
driver

In [3]:
# Navigate to our target page: the NBA "traditional" player stats listing.
# NOTE(review): the PerMode=Totals query parameter presumably requests
# totals rather than per-game averages — confirm against the site.
url = "https://www.nba.com/stats/players/traditional/?PerMode=Totals"
driver.get(url)

In [4]:
# Block for up to 20 seconds until the pagination <select> can be clicked;
# on timeout the wait raises with the message given below.
pagination_locator = (By.CSS_SELECTOR, "select.stats-table-pagination__select")
pagination_clickable = cond.element_to_be_clickable(pagination_locator)
drop_down = WebDriverWait(driver, 20).until(
    pagination_clickable, message="Select Pagination - Timeout")

In [None]:
# Wrap the pagination <select> element and choose its "All" entry
# (expected to put every row on a single page — confirm on the live site).
pagination = Select(drop_down)
pagination.select_by_visible_text("All")

In [5]:
# Locate the stats <table>, waiting up to 20 seconds for it to be present
# in the DOM; on timeout the wait raises with the message given below.
table_locator = (By.CSS_SELECTOR, 'div.nba-stat-table table')
table_present = cond.presence_of_element_located(table_locator)
data_table = WebDriverWait(driver, 20).until(
    table_present, message="Cannot find data table!")

In [6]:
# Find the columns that are displayed (filter out the hidden columns).
# Only columns in this list will be kept in our final result later.
#
# find_elements_by_css_selector() was deprecated in Selenium 4.0 and removed
# in 4.3; find_elements(By.CSS_SELECTOR, ...) is the supported equivalent.
#
# NOTE: despite its name, `hidden_columns_html` holds the *visible* header
# cells — the selector excludes every <th> carrying the `hidden` attribute.
hidden_columns_html = data_table.find_elements(
    By.CSS_SELECTOR,
    'div.nba-stat-table__overflow:first-of-type thead tr th:not([hidden])')

# This is the list of columns we will use in our final result
data_columns = [col.text for col in hidden_columns_html]

# We will use "RANK" as the name of our 1st column, so we can name it here:
data_columns[0] = 'RANK'

data_columns



['RANK',
 'PLAYER',
 'TEAM',
 'AGE',
 'GP',
 'W',
 'L',
 'MIN',
 'PTS',
 'FGM',
 'FGA',
 'FG%',
 '3PM',
 '3PA',
 '3P%',
 'FTM',
 'FTA',
 'FT%',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'TOV',
 'STL',
 'BLK',
 'PF',
 'FP',
 'DD2',
 'TD3',
 '+/-']

In [7]:
# Pass the table's outer HTML to pd.read_html(), which parses the markup
# into a list of DataFrames; we keep the first one.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html() is deprecated since pandas 2.1 (file-like input works on
# older versions as well).
table_html = data_table.get_attribute('outerHTML')
df = pd.read_html(StringIO(table_html))[0]

In [14]:
# Preview the first rows of the parsed table to sanity-check the scrape
df.head()

Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,REB RANK,AST RANK,TOV RANK,STL RANK,BLK RANK,PF RANK,FP RANK,DD2 RANK,TD3 RANK,+/- RANK
0,1,DeMar DeRozan,CHI,32,55,36,19,1960,1547,566,...,,,,,,,,,,
1,2,Trae Young,ATL,23,53,26,27,1824,1475,495,...,,,,,,,,,,
2,3,Giannis Antetokounmpo,MIL,27,49,31,18,1604,1443,496,...,,,,,,,,,,
3,4,Jayson Tatum,BOS,23,56,32,24,2020,1439,500,...,,,,,,,,,,
4,5,Stephen Curry,GSW,33,54,40,14,1881,1393,452,...,,,,,,,,,,


In [15]:
# At this point we can release our driver; we won't need it any more.
# The table has already been parsed into `df`, and the remaining cells
# work only on that DataFrame and on `data_columns`.
driver.quit()

---

## Clean Up Our Data

1. Rename the `Unnamed: 0` column to `RANK`
    1. This column's header cell in the HTML has no text
    2. By default, pandas names a column that has no header `Unnamed: X`, where `X` is the column's position
    3. Since this is the first column (position 0), pandas names it `Unnamed: 0`

In [16]:
# Rebind `df` to the renamed frame instead of using inplace=True: in-place
# mutation silently changes the object an earlier cell displayed, and it
# brings no performance benefit. The first, header-less column becomes RANK.
df = df.rename(columns={'Unnamed: 0': 'RANK'})

2. Keep only the columns that are visible on the page (the `data_columns` list built earlier)

In [17]:
# Restrict the frame to the columns listed in `data_columns` (in that
# order), then show the result as the cell's output.
df = df.loc[:, data_columns]
df

Unnamed: 0,RANK,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,1,DeMar DeRozan,CHI,32,55,36,19,1960,1547,566,...,287,283,129,47,17,120,2378.9,5,0,232
1,2,Trae Young,ATL,23,53,26,27,1824,1475,495,...,206,493,217,53,5,85,2418.7,28,0,55
2,3,Giannis Antetokounmpo,MIL,27,49,31,18,1604,1443,496,...,551,296,162,48,68,156,2734.2,31,4,308
3,4,Jayson Tatum,BOS,23,56,32,24,2020,1439,500,...,469,233,162,55,40,130,2474.3,18,0,394
4,5,Stephen Curry,GSW,33,54,40,14,1881,1393,452,...,286,342,175,74,21,114,2359.2,10,2,474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,564,Sam Dekker,TOR,27,1,1,0,1,0,0,...,0,0,0,0,0,0,0.0,0,0,-5
588,564,Trayvon Palmer,DET,27,1,0,1,17,0,0,...,2,0,1,0,0,2,1.4,0,0,-12
589,564,Tyler Hall,NYK,24,1,1,0,2,0,0,...,0,0,0,0,0,0,0.0,0,0,-5
590,564,Xavier Sneed,UTA,24,2,2,0,8,0,0,...,2,0,0,0,0,2,2.4,0,0,-6


---

## Export our Data

In [19]:
# Write the cleaned table to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
# (The original used an f-string with no placeholders; a plain literal
# produces the same path.)
df.to_csv("nba-stats.csv", index=False)