In [77]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None
# Show every row and the full, untruncated contents of each cell when displaying a DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [78]:
def extract_player(action):
    """Return the player named in a play-by-play action as 'First Last'.

    The name is read from the first whitespace-separated token that contains
    a comma, e.g. "Foul by LAST_NAME,FIRST_NAME". The token is split on the
    comma, its parts are reversed, joined with spaces and title-cased.
    Actions with no such token (e.g. a team timeout) give an empty string.
    """
    comma_tokens = [token for token in action.split() if "," in token]
    if not comma_tokens:
        return ""

    # 'LAST,FIRST' -> ['LAST', 'FIRST'] -> 'First Last'
    name_parts = comma_tokens[0].split(",")
    full_name = " ".join(name_parts[::-1]).title()

    # Outlier: the whitespace split drops the "VAN" of a two-word surname,
    # so the full name is restored by hand
    if "Soest" in full_name:
        return 'Peyton Van Soest'

    return full_name

In [79]:
def clean_row(row, chicago, quarter):
    """Flatten one play-by-play table row into a list of attributes.

    Parameters
    ----------
    row
        Parsed HTML element for a single play (uaa_pbp passes BeautifulSoup
        <tr> tags); it is queried with find / find_all.
    chicago : str
        'Home' or 'Away' - which side Chicago is on in this game.
    quarter
        Quarter label; stored in the result unchanged.

    Returns
    -------
    list
        [quarter, time, score, team, action, substitution, player], where
        substitution is 1 for "enters the game" / "goes to the bench" actions
        and 0 otherwise, and player comes from extract_player(action).
    """
    time = row.find('td', {'class' : 'time'}).string

    # Score spans are joined with "-" in document order; rows without any
    # score span fall back to "0-0"
    scores = row.find_all('span', {'class' : ['v-score', 'h-score']})
    score = '-'.join(s.string for s in scores) if scores else "0-0"

    # Identify the team responsible for the action from the logo's alt text
    action_team = row.find('img')['alt']
    if action_team == "Chicago":
        team = chicago
    else:
        team = 'Away' if chicago == 'Home' else 'Home'

    try:
        action = row.find('span', {'class' : 'text'}).string.strip()
    except AttributeError:
        # No usable text span (find() or .string returned None):
        # timeouts are contained in bold tags instead
        action = row.find('strong').string
    # Collapse every run of whitespace (including newlines) to a single space
    action = ' '.join(action.split())

    # All substitution actions end in "enters the game" or "goes to the bench".
    # Match those phrases rather than the bare substring "the", which also
    # occurs inside unrelated words.
    is_substitution = ("enters the game" in action) or ("goes to the bench" in action)
    substitution = int(is_substitution)

    player = extract_player(action)

    return [quarter, time, score, team, action, substitution, player]

In [80]:
def uaa_pbp(d, url, chicago, opponent):
    """Scrape a play-by-play page and return its actions as a DataFrame.

    Parameters
    ----------
    d
        Browser driver; used for d.get(url) and d.page_source.
    url : str
        Address of the play-by-play page.
    chicago : str
        'Home' or 'Away'; copied into every row and passed on to clean_row.
    opponent : str
        Opponent name; copied into every row.

    Returns
    -------
    pandas.DataFrame
        One row per action with columns Date, Chicago, Opponent, Quarter,
        Time, Score, Team, Action, Substitution, Player. The first four
        tables with role="presentation" are read as quarters 1 to 4.
    """
    d.get(url)
    soup = BeautifulSoup(d.page_source, 'html.parser')

    tables = soup.find_all('table', { 'role' : 'presentation' })
    rows_by_quarter = [tables[idx].find_all('tr', {'class' : 'row'}) for idx in range(4)]

    # The game date is the span inside the second <h1> on the page
    date = soup.find_all('h1')[1].find('span').text
    game_info = [date, chicago, opponent]

    records = []
    for number, rows in enumerate(rows_by_quarter, start=1):
        for r in rows:
            records.append(game_info + clean_row(r, chicago, number))

    columns = ['Date', 'Chicago', 'Opponent', 'Quarter', 'Time', 'Score',
               'Team', 'Action', 'Substitution', 'Player']
    return pd.DataFrame(records, columns=columns)

In [81]:
def generate_lineups(df, team, starters):
    """Return the on-court lineup, as a comma-separated string, for each row of df.

    The lineup starts as `starters` and only changes on rows where `team` is
    responsible for the action and the action is a substitution
    ("... enters the game" / "... goes to the bench").

    Parameters
    ----------
    df : pandas.DataFrame
        One game's play-by-play. The Action, Quarter, Date, Team and Time
        columns are read as df.Column[x] for x in range(len(df)).
        Side effect: the Action of one hard-coded duplicate substitution row
        may be overwritten in place.
    team : str
        Value of df.Team that identifies the team whose lineup is tracked.
    starters : list of str
        Starting players. This list is left unmodified.

    Returns
    -------
    list of str
        The starting lineup followed by one lineup string per row from index 1
        onward (a single entry when df is empty).
    """
    starters_copy = starters.copy()
    # Track the lineup on a private copy so the caller's list is never mutated
    current = list(starters)

    lineups = [", ".join(starters)]

    for x in range(1, len(df)):
        action = df.Action[x]

        # Hard-coded reset to the starters for the January 28, 2022 game, applied
        # on every row of an even-numbered quarter. `current` is rebound to the
        # saved list itself (not a copy), so after the first rebinding later
        # substitutions edit that saved list and repeating the assignment on the
        # following rows changes nothing.
        # NOTE(review): confirm this one-time reset is the intended behavior
        # before replacing it with a copy.
        if (df.Quarter[x] % 2 == 0) and (df.Date[x] == "January 28, 2022"):
            current = starters_copy

        # Lineup doesn't change if other team is responsible for action
        if df.Team[x] != team:
            lineups.append( ", ".join(current) )
            continue

        # Change lineup on substitution plays
        player = extract_player(action)

        # Outlier: the same "POWE,MARISSA enters the game" action three rows
        # earlier means this row is rewritten as a Van Soest substitution.
        # NOTE(review): `player` was extracted before the rewrite, so it still
        # names the original player - confirm that is intended.
        if (action == "POWE,MARISSA enters the game") and (action == df.Action[x-3]):
            action = "VAN SOEST,PEYTON enters the game"
            df.loc[x, 'Action'] = action
        elif x > 5: # only rows with index greater than 5
            # Outlier: fixed lineup for one specific moment of one specific game
            if (df.Date[x] == 'January 28, 2022') and (df.Time[x] == "02:45") and (df.Quarter[x] == 4):
                lineups.append("Kati Heller, Mallory Brodnik, Sophia North, Peyton Van Soest, Grace Hynes")
                continue

        if ("game" in action) and (player not in current): # sub in
            current.append(player)
        elif ("bench" in action) and (player in current): # sub out
            current.remove(player)

        lineups.append( ", ".join(current) )

    return lineups

In [1]:
def points_scored(action):
    """Return the points scored on a play, judged from the action text.

    3 for a made three ("made 3"), 1 for a made free throw ("made free"),
    2 for any other made shot, and 0 when nothing was made.
    """
    if "made" not in action:
        return 0
    if "made 3" in action:
        return 3
    if "made free" in action:
        return 1
    return 2

In [82]:
def game_scraper(d, box_url):
    """Scrape a single game, starting from its box score page.

    Parameters
    ----------
    d
        Browser driver; used for d.get(...) and d.page_source, and handed on
        to uaa_pbp.
    box_url : str
        Address of the game's box score page.

    Returns
    -------
    pandas.DataFrame
        The play-by-play frame built by uaa_pbp with an extra 'Lineup' column
        from generate_lineups.
    """
    d.get(box_url)
    soup = BeautifulSoup(d.page_source, 'html.parser')

    # First logo tag belongs to away team : if it is Chicago's logo, then Chicago is the away team
    uc_logo = "https://cdn.prestosports.com/action/cdn/logos/rpi/137/mbkb.png"
    first_logo = soup.find('div', {'class' : 'team-logo'}).find('img')['src']
    chicago = "Away" if first_logo == uc_logo else "Home"

    # The opponent is the first listed team name that is not 'Chicago'
    teams = [t.text.strip() for t in soup.find_all('tbody')[0].find_all('th', {'class' : 'name'})]
    opponent = [t for t in teams if t != 'Chicago'][0]

    # Keep the first third of the player-name links; the first five of those
    # are used as the starters.
    # NOTE(review): presumably the page repeats the names and lists Chicago's
    # first - confirm against the box score markup.
    player_names = [a.string for a in soup.find_all('a', {'class' : 'player-name'})]
    chicago_players = player_names[0:len(player_names) // 3]
    starters = chicago_players[0:5]

    # Scrape play-by-play data from the page linked by the element with id "pbp"
    pbp_url = "https://athletics.uchicago.edu" + soup.find('a', {'id' : 'pbp'})['href']
    df = uaa_pbp(d, pbp_url, chicago, opponent)

    # Add lineup column based on home/away status
    df['Lineup'] = generate_lineups(df, chicago, starters)

    return df

In [83]:
def uaa_scraper(schedule_url):
    """Scrape every game linked from a schedule page.

    Parameters
    ----------
    schedule_url : str
        Address of the season schedule page.

    Returns
    -------
    list of pandas.DataFrame
        One play-by-play frame per game, in the order the links appear.
    """
    # Local import: only this function needs the Service wrapper (Selenium 4+)
    from selenium.webdriver.chrome.service import Service

    # Pass the driver path through a Service object; handing it to Chrome()
    # positionally is deprecated in Selenium 4
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(schedule_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Every second 'link' anchor, starting from the second one, is used as a game link
        box_score_anchors = soup.find_all('a', {'class' : 'link'})[1::2]
        game_links = ["https://athletics.uchicago.edu" + a['href'] for a in box_score_anchors]

        return [game_scraper(driver, link) for link in game_links]
    finally:
        # Always close the browser, even when a game fails to scrape
        driver.quit()

In [84]:
# Scrape every game linked from the 2021-22 schedule page (opens a Chrome session)
SCHEDULE_URL = "https://athletics.uchicago.edu/sports/wbkb/2021-22/schedule"
game_logs = uaa_scraper(SCHEDULE_URL)




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [85]:
# Stack the per-game frames into a single season-long play-by-play table
uchicago = pd.concat(game_logs).reset_index(drop=True)

# One row per distinct date (with its row count), ordered chronologically
dates = uchicago.groupby('Date').count().reset_index().iloc[:, 0:2]
dates['Date'] = pd.to_datetime(dates['Date'])
dates = dates.sort_values(by=['Date']).reset_index(drop=True)

# Game number = chronological position of the row's own date (1 = earliest).
# Looking the number up per row, instead of handing numbers out by row
# position, keeps the labels correct even when the games were not scraped in
# date order.
game_number_by_date = {day: number for number, day in enumerate(dates['Date'], start=1)}
game_numbers = pd.to_datetime(uchicago['Date']).map(game_number_by_date).tolist()
uchicago['Game'] = game_numbers

# Boundary flag per row: 1 = first row of a game, -1 = last row of a game,
# 0 = neither. "First" wins when a row is both first and last of its game.
# Comparing against the shifted column treats the table's edges as game
# changes, so the very last row is flagged as an end (-1) and a one-row table
# is handled too.
starts_game = uchicago['Game'] != uchicago['Game'].shift(1)
ends_game = uchicago['Game'] != uchicago['Game'].shift(-1)
boundaries = np.select([starts_game, ends_game], [1, -1], default=0)
uchicago['Boundary'] = boundaries

# Sort the names so the same group of players always yields the same string
uchicago['Lineup'] = [", ".join(sorted(lineup.split(", "))) for lineup in uchicago.Lineup.values]

# The row index is written out as the first CSV column
uchicago.to_csv("w_uchicago_pbp.csv")

In [87]:
# Reload the saved play-by-play table, dropping the first column of the file
saved_pbp = pd.read_csv("w_uchicago_pbp.csv")
uchicago = saved_pbp.iloc[:, 1:]