In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
def get_headers(per_game_table):
    """Return the column names of a scraped stats table.

    Args:
        per_game_table: a parsed table element exposing BeautifulSoup's
            ``find`` / ``find_all`` API.

    Returns:
        list[str]: the text of every ``th`` cell found inside the table's
        ``thead``, in document order.
    """
    table_head = per_game_table.find("thead")
    return [header_cell.text for header_cell in table_head.find_all("th")]

def get_cleaned_rows(per_game_table):
    """Extract the data rows of a scraped stats table as lists of strings.

    Each returned row is the text of the row's first ``th`` cell followed by
    the text of each of its ``td`` cells, in document order.

    Rows that contain no ``td`` cells are skipped. Such rows (for example a
    header row repeated inside ``tbody``) would otherwise yield a one-element
    list that ``pd.DataFrame`` pads with missing values, leaving junk records
    in the result.

    Args:
        per_game_table: a parsed table element exposing BeautifulSoup's
            ``find`` / ``find_all`` API.

    Returns:
        list[list[str]]: one list of cell texts per data row.
    """
    table_body = per_game_table.find("tbody")

    cleaned_rows = []
    for row in table_body.find_all("tr"):
        data_cells = row.find_all("td")
        if not data_cells:
            # No data cells: not a data row, so leave it out entirely.
            continue
        cleaned_rows.append([row.find("th").text] + [cell.text for cell in data_cells])

    return cleaned_rows

In [3]:
def scrape_url(driver, url):
    """Load ``url`` in the browser and return its ``totals_stats`` table.

    Args:
        driver: a Selenium WebDriver used to fetch and render the page.
        url: address of the page to scrape.

    Returns:
        pd.DataFrame: one row per table row returned by ``get_cleaned_rows``,
        with columns named by ``get_headers``. Cell values are the raw cell
        text (strings).

    Raises:
        ValueError: if the rendered page contains no ``<table>`` whose id is
            ``totals_stats``.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    totals_table = soup.find("table", {"id": "totals_stats"})
    if totals_table is None:
        # Fail here with the offending URL rather than with an opaque
        # "'NoneType' object has no attribute 'find'" inside a helper.
        raise ValueError("No table with id 'totals_stats' found at " + str(url))

    column_names = get_headers(totals_table)
    cleaned_rows = get_cleaned_rows(totals_table)

    return pd.DataFrame(cleaned_rows, columns=column_names)

In [4]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Scrape one totals table per season (2007 through 2014 inclusive) and tag
# every row with the season year it came from.
dataframes = []
try:
    for year in range(2007, 2014 + 1):
        url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_totals.html"

        df = scrape_url(driver, url)
        df['Season'] = year

        dataframes.append(df)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so a failed run does not leave a Chrome process behind.
    driver.quit()




[WDM] - Current google-chrome version is 113.0.5672
[WDM] - Get LATEST chromedriver version for 113.0.5672 google-chrome
[WDM] - Driver [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/113.0.5672.63/chromedriver] found in cache


In [5]:
# Pair every season with the one scraped just before it and attach the prior
# season's games (G) and minutes (MP) to the current season's rows, matching on
# player name. The inner join keeps only players present in both seasons;
# pandas suffixes the overlapping G/MP columns with _x (current) and _y (prior).
add_lastGP = [
    pd.merge(current_season,
             prior_season[['Player', 'G', 'MP']],
             on='Player',
             how='inner')
    for prior_season, current_season in zip(dataframes, dataframes[1:])
]

In [6]:
# Stack the per-season merged frames into a single table.
nba = pd.concat(add_lastGP)

# The scraped totals are strings. Convert them to floats once, up front, so
# every derived column is numeric. (Scaling through np.where against the raw
# string column would otherwise produce object columns mixing str and float.)
nba = nba.astype({'G_x': float, 'G_y': float, 'MP_x': float, 'MP_y': float})

# Minutes per game for the current (_x) and prior (_y) season, computed from
# the unscaled game counts.
nba['MP'] = nba['MP_x'] / nba['G_x']
nba['lastMP'] = nba['MP_y'] / nba['G_y']

# Scale GP for 2011-12 lockout season (only 66 games played)
FULL_SEASON_GAMES = 82
LOCKOUT_SEASON_GAMES = 66
LOCKOUT_SEASON = 2012
lockout_scale = FULL_SEASON_GAMES / LOCKOUT_SEASON_GAMES

# Current-season games are scaled for lockout-season rows; prior-season games
# are scaled for rows of the season right after it.
nba['G_x'] = np.where(nba['Season'] == LOCKOUT_SEASON, nba['G_x'] * lockout_scale, nba['G_x'])
nba['G_y'] = np.where(nba['Season'] == LOCKOUT_SEASON + 1, nba['G_y'] * lockout_scale, nba['G_y'])

nba = nba[['Player', 'Tm', 'Age', 'G_y', 'G_x', 'lastMP', 'MP', 'Season']]
nba.columns = ['Player', 'Team', 'age', 'lastGP', 'GP', 'lastMP', 'MP', 'Season']

# Indicator columns: `tech` is 1 for rows whose team is one of the four listed
# (presumably the teams treated as wearable-tech adopters - confirm), and
# `time` is 1 for seasons from 2014 onward.
nba['tech'] = nba.Team.isin(['SAS', 'DAL', 'HOU', 'NYK']).astype(int)
nba['time'] = (nba.Season >= 2014).astype(int)

nba.to_csv("nba_wearables.csv")