In [11]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [12]:
def get_headers(per_game_table):
    """Return the column labels found in the second row of the table's <thead>.

    Labels are taken from the text of each <th> in that row, left to right.
    A label that already appears among the labels produced so far is emitted
    with an "OPP_" prefix, so the second occurrence of a name can be told
    apart from the first.

    Parameters
    ----------
    per_game_table
        Parsed table element exposing ``find`` / ``find_all``.

    Returns
    -------
    list of str
        One label per <th>, in document order.
    """
    header_row = per_game_table.find("thead").find_all("tr")[1]

    labels = []
    for cell in header_row.find_all("th"):
        label = cell.text
        # Compare against the labels as already emitted (prefixed ones included).
        labels.append("OPP_" + label if label in labels else label)

    return labels

def get_cleaned_rows(per_game_table):
    """Return the text of every row in the table's <tbody> as a list of lists.

    Each inner list starts with the text of the row's first <th>, followed by
    the text of each of its <td> cells in document order.

    Parameters
    ----------
    per_game_table
        Parsed table element exposing ``find`` / ``find_all``.

    Returns
    -------
    list of list of str
        One inner list per <tr> in the body.
    """
    cleaned = []
    for tr in per_game_table.find("tbody").find_all("tr"):
        data_cells = tr.find_all("td")
        # The leading cell of a row is a <th>, so it is read separately.
        cleaned.append([tr.find("th").text, *(td.text for td in data_cells)])

    return cleaned

In [13]:
def scrape_url(driver, url, team):
    """Load `url` in the browser and return its game-log table as a DataFrame.

    Parameters
    ----------
    driver
        Selenium WebDriver used to fetch the page.
    url : str
        Address of the page to load.
    team
        Value written to the added 'Team' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> of the table body, with columns named by
        `get_headers`, plus a constant 'Team' column.

    Raises
    ------
    ValueError
        If the loaded page contains no <table id="wnba_tgl_basic">.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    per_game_table = soup.find("table", {"id": "wnba_tgl_basic"})
    if per_game_table is None:
        # find() returns None when nothing matches; fail here with the URL
        # rather than with an AttributeError on None further down.
        raise ValueError(
            "No table with id 'wnba_tgl_basic' found at " + str(url)
        )

    column_names = get_headers(per_game_table)
    cleaned_rows = get_cleaned_rows(per_game_table)

    df = pd.DataFrame(cleaned_rows, columns=column_names)
    df['Team'] = team

    return df

In [14]:
# Read the team lookup file and build a {Team: Abbreviation} dictionary from it.
teams_df = pd.read_csv("wnba_teams.csv")
teams = pd.Series(
    teams_df['Abbreviation'].to_numpy(),
    index=teams_df['Team'],
).to_dict()

In [15]:
# Start one Chrome session and reuse it for every team's page.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# One DataFrame per team, in the iteration order of `teams`.
dataframes = []
try:
    for t, abbreviation in teams.items():
        url = "https://www.basketball-reference.com/wnba/teams/" + abbreviation + "/2023/gamelog/"

        dataframes.append(scrape_url(driver, url, t))
finally:
    # Always shut the browser down, even if a page fails to load or parse,
    # so a failed run does not leave a Chrome/chromedriver process behind.
    driver.quit()




[WDM] - Current google-chrome version is 113.0.5672
[WDM] - Get LATEST chromedriver version for 113.0.5672 google-chrome
[WDM] - Driver [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/113.0.5672.63/chromedriver] found in cache


In [21]:
# Stack every team's scraped game log into one frame with a fresh 0..n-1 index.
wnba_logs = pd.concat(dataframes).reset_index(drop=True)

# Parse the counting stats to integers once, into a separate frame. The scraped
# columns in `wnba_logs` are left untouched so the "made-attempted" display
# strings further down are still built from the values exactly as scraped.
box_stats = ['FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'TRB', 'TOV']
count_cols = ['Tm', 'OPP_Opp'] + box_stats + ['OPP_' + stat for stat in box_stats]
counts = wnba_logs[count_cols].astype(int)

# (prefix of this side's columns, prefix of the other side's columns,
#  column holding this side's points)
sides = [('', 'OPP_', 'Tm'), ('OPP_', '', 'OPP_Opp')]

# Rebound totals for both sides first: each side's rebound rates below need
# the other side's numbers.
for side, other, points in sides:
    counts[side + 'DRB'] = counts[side + 'TRB'] - counts[side + 'ORB']
    counts[side + 'REB'] = counts[side + 'ORB'] + counts[side + 'DRB']

for side, other, points in sides:
    fga = counts[side + 'FGA']
    fta = counts[side + 'FTA']
    tov = counts[side + 'TOV']

    # POSS formula exactly as originally written; it yields a float.
    possessions = 0.96 * (fga + tov + 0.44 * fta - counts[side + 'ORB'])
    wnba_logs[side + 'POSS'] = possessions

    # Divide by the unrounded float. Casting POSS to int first (as this cell
    # used to) truncates the denominator and skews PPP and TO%.
    wnba_logs[side + 'PPP'] = counts[points] / possessions
    wnba_logs[side + 'TO%'] = 100 * tov / possessions

    wnba_logs[side + 'FTA%'] = 100 * fta / fga

    # Share of the available rebounds of each kind taken by this side.
    wnba_logs[side + 'ORB%'] = 100 * counts[side + 'ORB'] / (counts[side + 'ORB'] + counts[other + 'DRB'])
    wnba_logs[side + 'DRB%'] = 100 * counts[side + 'DRB'] / (counts[side + 'DRB'] + counts[other + 'ORB'])
    wnba_logs[side + 'REB%'] = 100 * counts[side + 'REB'] / (counts[side + 'REB'] + counts[other + 'REB'])

    # Splits and Percentages
    for made, attempted in (('FG', 'FGA'), ('3P', '3PA'), ('FT', 'FTA')):
        made_col = side + made
        attempted_col = side + attempted
        wnba_logs[made_col + '%'] = 100 * counts[made_col] / counts[attempted_col]
        # The attempts column is replaced by a "made-attempted" display string.
        wnba_logs[attempted_col] = (wnba_logs[made_col].astype(str) + "-" +
                                    wnba_logs[attempted_col].astype(str))

wnba_logs['Final'] = wnba_logs['Tm'].astype(str) + "-" + wnba_logs['OPP_Opp'].astype(str)
wnba_logs['REB'] = counts['REB'].astype(str) + "-" + counts['OPP_REB'].astype(str)

wnba_logs['Date'] = pd.to_datetime(wnba_logs['Date'])
# Replace the team name with its abbreviation; an unknown name raises KeyError.
wnba_logs['Team'] = wnba_logs['Team'].apply(lambda name: teams[name])
wnba_logs['Opponent'] = wnba_logs['Opp']

selected_cols = ['Date', 'Team', 'Opponent', 'W/L', 'Final',
                 'POSS', 'OPP_POSS',
                 'PPP', 'OPP_PPP', 'FTA%', 'OPP_FTA%',
                 'ORB%', 'DRB%', 'REB%', 'REB',
                 'TO%', 'OPP_TO%',
                 'FGA', 'FG%', 'OPP_FGA', 'OPP_FG%',
                 '3PA', '3P%', 'OPP_3PA', 'OPP_3P%',
                 'FTA', 'FT%', 'OPP_FTA', 'OPP_FT%']

wnba_logs = wnba_logs[selected_cols]

# Round each numeric column exactly once: 2 decimals for the per-possession
# scoring columns, 1 decimal for everything else. Rounding to 2 and then to 1
# can move a value such as 12.348 to 12.4 instead of 12.3.
numeric_cols = wnba_logs.select_dtypes(include='number').columns
wnba_logs = wnba_logs.round(
    {col: (2 if col in ('PPP', 'OPP_PPP') else 1) for col in numeric_cols}
)

In [22]:
# Save the final game-log table; the row index is written as the first column.
wnba_logs.to_csv(path_or_buf="wnba_logs.csv", index=True)