In [20]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os

In [21]:
# Make sure the folder that receives the exported csv files exists
os.makedirs(name='historical-data/', exist_ok=True)

In [22]:
# Create function that cleans up PLAYER string
def player_cleaning(player):
    """Break a newline-separated row string into its non-empty, trimmed fields.

    Parameters
    ----------
    player : str
        Text of one table row, with fields separated by newlines.

    Returns
    -------
    list of str
        The fields in their original order, stripped of surrounding
        whitespace, with empty entries removed.
    """
    trimmed = (piece.strip() for piece in player.split("\n"))
    return [piece for piece in trimmed if piece]

# Create function that scrapes NBA.com data, exports as csv
def nba_stats(url, csv_name):
    """Scrape the stats table shown at ``url`` and export it to ``csv_name``.

    A Chrome session is opened, the cookie banner is accepted when it shows
    up, the first option of the table's dropdown is selected, and the
    rendered page is parsed with BeautifulSoup. The table headers (minus any
    containing "RANK") and the rows of the first ``tbody`` are written to a
    csv file without an index column.

    Parameters
    ----------
    url : str
        Page to load in the browser.
    csv_name : str
        Path of the csv file to write.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the dropdown or its first option does not become clickable in time.
    """
    # Local import: the name is not part of the notebook's import cell.
    from selenium.common.exceptions import TimeoutException

    dropdown_xpath = "//body/main[1]/div[1]/div[1]/div[2]/div[1]/div[1]/nba-stat-table[1]/div[1]/div[1]/div[1]/select[1]"

    # Open chromedriver to input url
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(url)

        # Accept all cookies in NBA.com if applicable. The step is optional,
        # so a banner that never becomes clickable must not abort the scrape.
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-accept-btn-handler"]'))).click()
        except TimeoutException:
            pass

        # Display all players from dropdown menu
        WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, dropdown_xpath))).click()
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, dropdown_xpath + "/option[1]"))).click()

        # Extract HTML using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always quit chromedriver, even when a wait above raised, so failed
        # scrapes do not leave browser processes behind.
        driver.quit()

    # Extract headers as list
    headers = [th.getText() for th in soup.find('div', {'class':'nba-stat-table__overflow'}).findAll('th')]
    headers = [i for i in headers if "RANK" not in i]

    # Extract players as list of lists
    player_rows = [tr.getText().strip() for tr in soup.tbody.findAll('tr')]
    player_rows = [player_cleaning(player) for player in player_rows]

    # Make sure length of headers = length of each row in player_rows
    headers = headers[0:len(player_rows[0])]

    # Create dataframe with headers and players, export as csv
    df = pd.DataFrame(player_rows, columns = headers)
    df.to_csv(csv_name, index=False)

In [23]:
# Create function that scrapes NBA stats from year 1 to year 2, exports as individual csv files
def season_scraper(link, suffix, year1, year2):
    """Scrape one csv per season for every season starting in ``[year1, year2)``.

    ``link`` is used as a template: its season value (the seven characters
    following "Season=", e.g. "2020-21") is replaced by each season in turn,
    and ``nba_stats`` is called with the resulting url.

    Parameters
    ----------
    link : str
        Template url containing "Season=20" followed by the rest of a
        season string such as "20-21".
    suffix : str
        Label inserted into the csv file name.
    year1 : int
        Start year of the first season to scrape.
    year2 : int
        End year of the last season to scrape; the final season scraped
        starts in ``year2 - 1``.

    Notes
    -----
    Files are written as "historical-data/nba_<suffix><YY>.csv", where YY is
    the two-digit end year of the season.
    """
    # Everything before the season value, and everything after it. The
    # template's own season digits ("20-21" after "Season=20") are dropped.
    pieces = link.split("Season=20")
    url_head = pieces[0] + "Season="
    url_tail = pieces[1][5:]

    # Iterate over the real start years so that year1 is honoured; the
    # previous offset-from-zero loop always began at season 2000-01.
    for start_year in range(year1, year2):
        end_tag = str((start_year + 1) % 100).zfill(2)
        season = str(start_year) + "-" + end_tag
        csv_name = "historical-data/nba_" + suffix + end_tag + ".csv"

        nba_stats(url_head + season + url_tail, csv_name)

In [24]:
# Define year 1 and year 2 (both are handed to season_scraper below)
year1, year2 = 2000, 2022

# Scrape traditional stats, then advanced stats, from year 1 to year 2
for stats_link, stats_label in (
    ("https://www.nba.com/stats/players/traditional/?sort=GP&dir=-1&Season=2020-21&SeasonType=Regular%20Season",
     "traditional"),
    ("https://www.nba.com/stats/players/advanced/?sort=GP&dir=-1&Season=2020-21&SeasonType=Regular%20Season",
     "advanced"),
):
    season_scraper(stats_link, stats_label, year1, year2)

In [25]:
# Create function that converts numbers to strings
def str_convert(number):
    """Return ``number`` as a string, prefixed with "0" when it is below 10.

    Parameters
    ----------
    number : int
        Value to convert.

    Returns
    -------
    str
        ``str(number)``, with a leading "0" added if ``number < 10``.
    """
    text = str(number)
    return "0" + text if number < 10 else text

In [26]:
# Predictor variables taken from the following season's stats
cols = ['AGE', 'MIN', 'PTS', 'FG%', '3P%', 'FT%', 'USG%']


def _load_season(season):
    """Read one season's traditional and advanced csv files, joined on PLAYER.

    ``season`` is the number used in the file names (converted with
    ``str_convert``). Only USG% and PIE are taken from the advanced stats.
    """
    tag = str_convert(season)
    traditional = pd.read_csv("historical-data/nba_traditional" + tag + ".csv", index_col=0)
    advanced = pd.read_csv("historical-data/nba_advanced" + tag + ".csv", index_col=0)
    return traditional.merge(advanced[['PLAYER', 'USG%', 'PIE']], on='PLAYER')


# One dataframe per pair of consecutive seasons
nba_dataframes = []

for season in range(1, 22):
    current = _load_season(season)
    following = _load_season(season + 1)

    # Keep only players present in both seasons; columns coming from the
    # following season receive the suffix '2'
    paired = current.merge(
        following[['PLAYER', 'GP', 'TEAM'] + cols],
        on='PLAYER',
        how='inner',
        suffixes=('', '2'),
    )

    # Label the season pair and add the points-per-game columns
    paired['YEAR1'] = 2000 + season
    paired['YEAR2'] = 2001 + season
    paired['PPG'] = paired['PTS']
    paired['PPG2'] = paired['PTS2']
    paired['d_PPG'] = paired['PPG2'] - paired['PPG']

    nba_dataframes.append(paired)

# Stack every season pair into a single dataframe
nba = pd.concat(nba_dataframes)

In [27]:
# Read the traditional and advanced stats files numbered 22
trad22 = pd.read_csv("historical-data/nba_traditional22.csv", index_col=0)
adv22 = pd.read_csv("historical-data/nba_advanced22.csv", index_col=0)

# Join them on PLAYER and expose PTS under the name PPG as well
nba22 = (
    trad22
    .merge(adv22[['PLAYER', 'USG%', 'PIE']], on='PLAYER')
    .assign(PPG=lambda merged: merged['PTS'])
)

In [28]:
# Write both dataframes to csv files in the export folder
for frame, destination in (
    (nba, "historical-data/nba2000s.csv"),
    (nba22, "historical-data/nba22.csv"),
):
    frame.to_csv(destination)