In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

pd.options.mode.chained_assignment = None
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

In [2]:
# Manual lineup corrections: (opponent, game clock, period, on-court five).
# A play whose Opponent/Time/Half match the first three fields has its lineup
# replaced wholesale by the listed players.
_LINEUP_OVERRIDES = (
    ('North Central (Ill.)', '07:53', 2,
     "Lindsey Carter, Grace Hynes, Bella Alfaro, Marissa Powe, Isabelle Herrera"),
    ('North Central (Ill.)', '04:52', 4,
     "Grace Hynes, Ashley Gao, Ellie Gross, Marissa Powe, Lindsey Carter"),
    ('North Park', '07:14', 2,
     "Ellie Gross, Grace Hynes, Sophia North, Marissa Powe, Ashley Gao"),
    ('Colorado College', '04:29', 3,
     "Grace Hynes, Ashley Gao, Sophia North, Lindsey Carter, Bella Alfaro"),
    ('Colorado College', '16:23', 2,
     "Alec Bryan, Eamonn Kenah, Joe Berry, Arrish Bhandal, Thomas Kurowski"),
    ('Eureka', '05:00', 2,
     "Marissa Powe, Grace Hynes, Ashley Gao, Bella Alfaro, Sophia North"),
    ('Wheaton (Ill.)', '07:06', 4,
     "Ellie Gross, Marissa Powe, Sophia North, Mallory Brodnik, Bella Alfaro"),
    ('Wis. Lutheran', '09:59', 4,
     "Ellie Gross, Mallory Brodnik, Grace Hynes, Sophia North, Marissa Powe"),
    ('Ill. Wesleyan', '02:26', 3,
     "Elliot Paschal, Bryce Hopkins, Thomas Kurowski, Alex Battist, Joe Berry"),
)

# Row label that is never allowed to change the lineup.
# NOTE(review): presumably a one-off fix for a single bad play-by-play row -- confirm which game.
_IGNORED_ROW_INDEX = 342


# Iterate through each action and determine the lineup (only changes on substitution actions)
def generate_lineups(df, team, starters):
    """Return one comma-separated lineup string per row of ``df``.

    Parameters
    ----------
    df : DataFrame with Opponent, Time, Half, Team and Action columns.
        Rows are read by label ``x`` for ``x`` in ``range(len(df))``.
    team : value compared against ``df.Team`` to decide whether a row belongs
        to the team whose lineup is being tracked.
    starters : iterable of player names on the floor for the first row.
        It is copied, never mutated.

    Returns
    -------
    list of str, the same length as ``df``; element 0 is always the starters.
    """
    initial_five = list(starters)
    # Work on a private copy so substitutions never leak into the caller's list
    current = list(starters)

    lineups = [", ".join(current)]

    for x in range(1, len(df)):
        opponent = df.Opponent[x]

        # Hard-coded corrections take precedence over everything else for that row
        forced = next(
            (names for opp, clock, period, names in _LINEUP_OVERRIDES
             if opponent == opp and df.Time[x] == clock and df.Half[x] == period),
            None,
        )
        if forced is not None:
            current = forced.split(", ")
            lineups.append(", ".join(current))
            continue

        # The starters return when period 3 begins -- DIFFERENT FOR TEAMS
        if df.Half[x] == 3 and df.Half[x - 1] == 2:
            current = list(initial_five)

        # Lineup doesn't change if other team is responsible for action
        if (df.Team[x] != team) or (df.index[x] == _IGNORED_ROW_INDEX):
            lineups.append(", ".join(current))
            continue

        # Change lineup on substitution plays
        action = df.Action[x]
        player = extract_player(action)

        if ("game" in action) and (player not in current):  # sub in
            current.append(player)
        elif ("bench" in action) and (player in current):  # sub out
            current.remove(player)

        lineups.append(", ".join(current))

    return lineups

In [3]:
# Pull the player's name out of a play-by-play action string
# Ex. "Foul by LAST_NAME,FIRST_NAME" -> "First_Name Last_Name"
def extract_player(action):
    """Return the title-cased name from the first comma-containing token of ``action``.

    The token is split on commas and its parts are joined in reverse order.
    An empty string is returned when no whitespace-separated token contains
    a comma (e.g. a team timeout).
    """
    name_tokens = [token for token in action.split() if "," in token]

    # Ex. team timeout: nothing that looks like LAST,FIRST
    if not name_tokens:
        return ""

    parts = name_tokens[0].split(",")
    full_name = " ".join(parts[::-1]).title()

    # outlier due to accents
    return 'Kerem Öztürk' if "Kerem" in full_name else full_name

In [4]:
# Extract attributes from the HTML of a play-by-play action,
# given the status of the Chicago team (home or away) and the period number
def clean_row(row, chicago, half):
    """Turn one play-by-play table row into a flat record.

    Parameters
    ----------
    row : parsed HTML row exposing ``find`` / ``find_all`` (a BeautifulSoup tag).
    chicago : 'Home' or 'Away' -- the side Chicago is playing on.
    half : period number, stored in the record unchanged.

    Returns
    -------
    list: ``[half, time, score, team, action, substitution, player]``
    """
    time = row.find('td', {'class' : 'time'}).string

    # Rows without score spans are reported as 0-0
    scores = row.find_all('span', {'class' : ['v-score', 'h-score']})
    if len(scores) == 0:
        score = "0-0"
    else:
        score = '-'.join([i.string for i in scores])

    # Identify the team responsible for the action from the logo's alt text
    action_team = row.find('img')['alt']
    if "Chicago" in action_team:
        team = chicago
    else:
        team = 'Away' if chicago == 'Home' else 'Home'

    try:
        action = row.find('span', {'class' : 'text'}).string.strip()
    except AttributeError:
        # No usable text span (find() or .string gave None): timeouts are contained in bold tags
        action = row.find('strong').string
    # Collapse every run of whitespace (including newlines) to a single space
    action = ' '.join(action.split())

    # all substitution actions end in "enters the game" or "goes to the bench"
    # NOTE(review): any other action containing "the" is flagged too -- confirm none exist
    substitution = 1 if "the" in action else 0

    player = extract_player(action)

    return [half, time, score, team, action, substitution, player]

In [5]:
# Scrape play-by-play data from url, return dataframe of game actions
def uaa_pbp(d, url, chicago, opponent):
    """Load a play-by-play page and return its actions as a DataFrame.

    Parameters
    ----------
    d : browser driver exposing ``get(url)`` and ``page_source``.
    url : address of the play-by-play page.
    chicago : 'Home' or 'Away'; copied into every row and passed to ``clean_row``.
    opponent : opponent name copied into every row.

    Returns
    -------
    DataFrame with columns Date, Chicago, Opponent, Half, Time, Score, Team,
    Action, Substitution, Player -- one row per ``tr.row`` element, with periods
    numbered from 1 in page order.

    Raises
    ------
    IndexError
        If the page has fewer than two ``table[role=presentation]`` elements.
    """
    columns = ['Date', 'Chicago', 'Opponent', 'Half', 'Time', 'Score',
               'Team', 'Action', 'Substitution', 'Player']
    max_periods = 4

    d.get(url)
    soup = BeautifulSoup(d.page_source, 'html.parser')

    period_tables = soup.find_all('table', { 'role' : 'presentation' })
    if len(period_tables) < 2:
        raise IndexError(
            "expected at least 2 play-by-play tables at %s, found %d" % (url, len(period_tables))
        )

    date = soup.find_all('h1')[1].find('span').text

    # At most four period tables are read; a page with exactly three no longer raises IndexError.
    # NOTE(review): tables beyond the fourth are ignored as before -- confirm whether
    # overtime periods can appear there.
    records = []
    for period, table in enumerate(period_tables[:max_periods], start=1):
        for r in table.find_all('tr', {'class' : 'row'}):
            records.append([date, chicago, opponent] + clean_row(r, chicago, period))

    return pd.DataFrame(records, columns=columns)

In [6]:
# Work out how many points an action/play was worth
def points_scored(action):
    """Return 3, 1, 2 or 0 depending on which "made ..." phrase ``action`` contains.

    The phrases are checked in order, so the generic "made" only counts as a
    two-pointer when neither "made 3" nor "made free" is present.
    """
    point_values = (
        ("made 3", 3),
        ("made free", 1),
        ("made", 2),
    )
    for phrase, value in point_values:
        if phrase in action:
            return value
    return 0

In [7]:
def _strip_ranking(name):
    """Drop the first space-separated token of ``name`` when the name starts with '#'."""
    return " ".join(name.split(" ")[1:]) if name.startswith("#") else name


def _tally_box_stats(team_plays):
    """Return [OREB, DREB, STL, BLK, PF, PTS] for one team's plays (action counts plus the Points sum)."""
    phrases = ("offensive rebound", "defensive rebound", "Steal", "Block", "Foul")
    counts = [team_plays[team_plays.Action.str.contains(phrase)].shape[0] for phrase in phrases]
    return counts + [team_plays.Points.sum()]


# Scrape single game from box score url
def game_scraper(driver, url):
    """Scrape one game's play-by-play and team totals.

    Parameters
    ----------
    driver : browser driver exposing ``get(url)`` and ``page_source``.
    url : box-score address; everything after ``view=`` is replaced to reach
        the team-stats page.

    Returns
    -------
    list ``[plays, all_stats]``: the play-by-play DataFrame (with Lineup and
    Points columns added) and a one-row-per-team DataFrame of totals.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # First logo tag belongs to away team : if it is Chicago's logo, then Chicago is the away team
    # NOTE(review): the logo path ends in mbkb.png -- confirm women's pages serve the same file
    uc_logo = "https://cdn.prestosports.com/action/cdn/logos/rpi/137/mbkb.png"
    first_logo = soup.find('div', {'class' : 'team-logo'}).find('img')['src']
    chicago = "Away" if first_logo == uc_logo else "Home"

    teams = [_strip_ranking(t.text.strip())
             for t in soup.find_all('tbody')[0].find_all('th', {'class' : 'name'})]
    opponent = [t for t in teams if "Chicago" not in t][0]

    # Only the first third of the player links is kept (presumably Chicago's roster -- TODO confirm)
    player_links = [a.string for a in soup.find_all('a', {'class' : 'player-name'})]
    chicago_players = player_links[0:int(len(player_links) / 3)]

    # Scrape play-by-play data using function defined above
    pbp_url = "https://athletics.uchicago.edu" + soup.find('a', {'id' : 'pbp'})['href']
    plays = uaa_pbp(driver, pbp_url, chicago, opponent)

    # Add lineup column based on home/away status; the first five listed players start
    plays['Lineup'] = generate_lineups(plays, chicago, chicago_players[0:5])

    # Add column for points scored for each play
    plays['Points'] = plays.Action.map(points_scored)

    # Separate dataframe by team
    chicago_plays = plays[(plays.Chicago == plays.Team)]
    opponent_plays = plays[(plays.Chicago != plays.Team)]

    # Create new dataframe with aggregate stats derived from the play-by-play
    plays_stats = pd.DataFrame({
        'Stats': ['OREB', 'DREB', 'STL', 'BLK', 'PF', 'PTS'],
        'Chicago': _tally_box_stats(chicago_plays),
        'Opponent': _tally_box_stats(opponent_plays),
    })
    plays_stats.columns = ['Stats', 'Chicago', plays.Opponent[0]]

    # Scrape team stats page
    driver.get( url.split("view=")[0] + "view=teamstats" )
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Pull out names of stats and actual values
    stats_box = soup.find_all('div', {'class': 'stats-box half'})[1]
    stats = [i.text for i in stats_box.find_all('th')]
    values = [i.text.strip() for i in stats_box.find_all('td')]

    # Create new dataframe with aggregate stats; cell values alternate team 1, team 2
    team_stats = pd.DataFrame()
    team_stats['Stats'] = stats[3:-1]
    team_stats['Team 1'] = [values[i-1] for i in range(1, len(values), 2)]
    team_stats['Team 2'] = [values[i] for i in range(1, len(values), 2)]

    # Header cells 1 and 2 carry the team names; each is cleaned from its own cell
    team1 = _strip_ranking(stats[1])
    team2 = _strip_ranking(stats[2])
    team_stats.columns = ['Stats', team1, team2]

    # Concatenate two dataframes of aggregate stats
    df = pd.concat([team_stats, plays_stats]).reset_index(drop=True)

    # Flip to one row per team with one column per stat
    all_stats = df.transpose().iloc[1:, :]
    all_stats = all_stats.reset_index(drop=False)
    all_stats.columns = ['Team'] + list(df.Stats)
    all_stats['Opponent'] = opponent

    all_stats['Date'] = plays.Date.values[0]

    return [plays, all_stats]

In [8]:
# Module-level switch read by uaa_scraper: when truthy, a 'Game' number column is added
plays = True

def uaa_scraper(schedule_url, num_games):
    """Scrape play-by-play data for the first ``num_games`` games on a schedule page.

    Parameters
    ----------
    schedule_url : address of the season schedule page.
    num_games : number of game links (taken from the top of the list) to scrape.

    Returns
    -------
    DataFrame of all plays, concatenated in schedule order. When the
    module-level ``plays`` flag is truthy a 'Game' column numbers the games
    from 1, starting a new game whenever the Date changes between rows.
    """
    # Local import: Selenium 4 takes the driver path through a Service object
    from selenium.webdriver.chrome.service import Service

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(schedule_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Every second 'link' anchor, starting with the second, is used as a box-score link
        box_links = soup.find_all('a', {'class' : 'link'})[1::2]
        game_links = ["https://athletics.uchicago.edu" + i['href'] + "?view=plays" for i in box_links]
        game_links = game_links[0:num_games]

        # game_scraper returns [plays, all_stats]; keep the play-by-play part
        games = pd.concat([game_scraper(driver, g)[0] for g in game_links])
    finally:
        # Always close the browser, even when a page fails to scrape
        driver.quit()

    if plays:
        # Compare each row's Date to the previous row's, by position
        # (the concatenated frame has repeated index labels)
        dates = pd.Series(games.Date.to_numpy())
        games['Game'] = dates.ne(dates.shift()).cumsum().to_numpy()

    return games


# Schedule pages for the 2022-23 men's (mbkb) and women's (wbkb) seasons
m_season = "https://athletics.uchicago.edu/sports/mbkb/2022-23/schedule"
w_season = "https://athletics.uchicago.edu/sports/wbkb/2022-23/schedule"

# Full-season scrape, left disabled:
# season = uaa_scraper(m_season, 5)

In [9]:
# season.to_csv("season_plays23.csv")

In [10]:
def single_scraper(url):
    """Open a Chrome session, scrape one game with ``game_scraper`` and close the browser.

    Parameters
    ----------
    url : box-score address handed to ``game_scraper``.

    Returns
    -------
    Whatever ``game_scraper`` returns: the list ``[plays, all_stats]``.
    """
    # Local import: Selenium 4 takes the driver path through a Service object
    from selenium.webdriver.chrome.service import Service

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        return game_scraper(driver, url)
    finally:
        # Always close the browser, even when scraping raises
        driver.quit()

## Overview

In [11]:
# Box score (play-by-play view) of the game to add to the season files
url = "https://athletics.uchicago.edu/sports/wbkb/2022-23/boxscores/20221230_df01.xml?view=plays"

# NOTE: despite the name, df is the two-element list returned by single_scraper:
# df[0] is the play-by-play frame and df[1] the team-totals frame
df = single_scraper(url)




[WDM] - Current google-chrome version is 108.0.5359
[WDM] - Get LATEST chromedriver version for 108.0.5359 google-chrome
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_mac64.zip
[WDM] - Driver has been saved in cache [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/108.0.5359.71]
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [12]:
# Load the running season overview; the first CSV column is the index written by to_csv
season = pd.read_csv("w_season_overview23.csv").iloc[:, 1:]

# Append the freshly scraped game's team totals (df[1])
season = pd.concat([season, df[1]])

# Short aliases for the box-score columns: (alias, source column, optional?)
# Optional sources are only aliased when the column is present.
column_aliases = (
    ('AST', 'Assists', False),
    ('REB', 'Rebounds', False),
    ('TOV', 'Turnovers', False),
    ('PTS OFF TOV', 'Points off Turnovers', False),
    ('2ND CHANCE', '2nd Chance Points', False),
    ('PAINT', 'Points in the Paint', True),
    ('FASTBREAK', 'Fastbreak Points', True),
    ('BENCH', 'Bench Points', False),
)
for alias, source, optional in column_aliases:
    if optional and source not in season.columns:
        continue
    season[alias] = season[source]

# Shooting columns hold "made-attempted" strings; split them into integer columns
shooting_columns = (('FG', 'Field Goal'), ('3P', '3 Point'), ('FT', 'Free Throw'))
for prefix, source in shooting_columns:
    season[prefix + 'M'] = season[source].map(lambda s : int( s.split("-")[0] ))
    season[prefix + 'A'] = season[source].map(lambda s : int( s.split("-")[1] ))

# Percentages to one decimal place. Round AFTER scaling to 100 so the stored value
# carries no binary floating-point residue (e.g. 56.99999999999999 instead of 57.0).
for prefix, _ in shooting_columns:
    season[prefix + '%'] = (100 * season[prefix + 'M'] / season[prefix + 'A']).round(1)

season['Chicago'] = (season.Team == "Chicago")

# Discard leftover index columns picked up from earlier CSV round-trips
season = season[season.columns.drop(list(season.filter(regex='Unnamed')))]

season

Unnamed: 0,Team,Field Goal,Field Goal %,3 Point,3 Point %,Free Throw,Free Throw %,Rebounds,Assists,Turnovers,...,FG%,3P%,FT%,Chicago,Points in the Paint,Fastbreak Points,Largest Lead,Time of Largest Lead,PAINT,FASTBREAK
0,Chicago,28-60,46.7%,7-19,36.8%,5-8,62.5%,40,13,17,...,46.7,36.8,62.5,True,,,,,,
1,North Central (Ill.),18-64,28.1%,6-19,31.6%,6-8,75.0%,36,11,17,...,28.1,31.6,75.0,False,,,,,,
2,Chicago,19-53,35.8%,1-13,7.7%,25-30,83.3%,53,8,14,...,35.8,7.7,83.3,True,28.0,6.0,36.0,4th-03:31,28.0,6.0
3,Benedictine (Ill.),9-56,16.1%,1-23,4.3%,17-25,68.0%,33,5,11,...,16.1,4.3,68.0,False,16.0,0.0,0.0,-,16.0,0.0
4,Colorado College,24-68,35.3%,2-23,8.7%,22-25,88.0%,36,13,16,...,35.3,8.7,88.0,False,38.0,9.0,0.0,-,38.0,9.0
5,Chicago,28-56,50.0%,9-18,50.0%,18-26,69.2%,36,14,18,...,50.0,50.0,69.2,True,32.0,11.0,21.0,3rd-06:19,32.0,11.0
6,Carroll (Wis.),22-67,32.8%,6-26,23.1%,11-15,73.3%,34,13,15,...,32.8,23.1,73.3,False,32.0,8.0,7.0,1st-05:27,32.0,8.0
7,Chicago,26-51,51.0%,6-17,35.3%,12-16,75.0%,39,20,22,...,51.0,35.3,75.0,True,32.0,12.0,17.0,3rd-06:40,32.0,12.0
8,Carthage,23-49,46.9%,9-20,45.0%,17-23,73.9%,26,11,8,...,46.9,45.0,73.9,False,24.0,5.0,6.0,,24.0,5.0
9,Chicago,24-53,45.3%,12-25,48.0%,20-22,90.9%,30,15,8,...,45.3,48.0,90.9,True,18.0,6.0,10.0,,18.0,6.0


In [13]:
# Write the overview back to the file it was read from, dropping fully identical rows first
season.drop_duplicates().to_csv("w_season_overview23.csv")

## Plays

In [14]:
def time_to_seconds(time):
    """Convert a "MM:SS" clock string into a total number of seconds."""
    fields = time.split(":")
    minutes = int(fields[0])
    seconds = int(fields[1])
    return minutes * 60 + seconds

In [15]:
# WBB
game = df[0]

# Renumber the periods: the counter advances whenever the clock reads higher than on the previous row
clock_readings = game.Time.values
quarters = [1]
for earlier, later in zip(clock_readings, clock_readings[1:]):
    clock_went_up = time_to_seconds(later) > time_to_seconds(earlier)
    quarters.append(quarters[-1] + 1 if clock_went_up else quarters[-1])

game['Half'] = quarters

# Append this game to the stored season plays (first CSV column is the saved index)
season_plays = pd.concat([pd.read_csv("w_season_plays23.csv").iloc[:, 1:], game])

# Number the games: the counter advances whenever the Date differs from the previous row's
play_dates = season_plays.Date.values
games = [1]
for row in range(1, len(play_dates)):
    date_changed = play_dates[row] != play_dates[row - 1]
    games.append(games[-1] + 1 if date_changed else games[-1])

season_plays['Game'] = games

season_plays

Unnamed: 0,Date,Chicago,Opponent,Half,Time,Score,Team,Action,Substitution,Player,Lineup,Points,Game,length
0,"November 12, 2022",Home,North Central (Ill.),1,09:51,0-0,Home,"HYNES,GRACE missed layup",0,Grace Hynes,"Mallory Brodnik, Marissa Powe, Ellie Gross, Gr...",0,1,
1,"November 12, 2022",Home,North Central (Ill.),1,09:45,0-0,Away,"MCCLURE,MEGAN defensive rebound",0,Megan Mcclure,"Mallory Brodnik, Marissa Powe, Ellie Gross, Gr...",0,1,
2,"November 12, 2022",Home,North Central (Ill.),1,09:33,0-0,Away,"SMITH,MITRESE missed jump shot",0,Mitrese Smith,"Mallory Brodnik, Marissa Powe, Ellie Gross, Gr...",0,1,
3,"November 12, 2022",Home,North Central (Ill.),1,09:29,0-0,Home,"POWE,MARISSA defensive rebound",0,Marissa Powe,"Mallory Brodnik, Marissa Powe, Ellie Gross, Gr...",0,1,
4,"November 12, 2022",Home,North Central (Ill.),1,09:19,0-0,Home,"BRODNIK,MALLORY missed layup",0,Mallory Brodnik,"Mallory Brodnik, Marissa Powe, Ellie Gross, Gr...",0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,"December 30, 2022",Home,Knox,4,01:10,42-72,Home,"HERRERA,ISABELLE defensive rebound",0,Isabelle Herrera,"Isabelle Herrera, Bella Alfaro, Caroline Workm...",0,11,
461,"December 30, 2022",Home,Knox,4,00:56,42-72,Home,"WORKMAN,CAROLINE missed 3-pt. jump shot",0,Caroline Workman,"Isabelle Herrera, Bella Alfaro, Caroline Workm...",0,11,
462,"December 30, 2022",Home,Knox,4,00:55,42-72,Away,TEAM defensive rebound,0,,"Isabelle Herrera, Bella Alfaro, Caroline Workm...",0,11,
463,"December 30, 2022",Home,Knox,4,00:34,42-72,Away,"BUNGER,MADI missed jump shot",0,Madi Bunger,"Isabelle Herrera, Bella Alfaro, Caroline Workm...",0,11,


In [16]:
# Write the combined plays back to the file they were read from
season_plays.to_csv("w_season_plays23.csv")

In [98]:
# MBB
stored_plays = pd.read_csv("season_plays23.csv")

# Remove any stored 'Ill. Wesleyan' rows, then append the scraped game (df[0])
season_plays = pd.concat([
    stored_plays.loc[stored_plays.Opponent != 'Ill. Wesleyan'],
    df[0],
])

# Number the games: the counter advances whenever the Date differs from the previous row's
play_dates = season_plays.Date.values
games_list = [1]
for row in range(1, len(play_dates)):
    date_changed = play_dates[row] != play_dates[row - 1]
    games_list.append(games_list[-1] + 1 if date_changed else games_list[-1])

season_plays['Game'] = games_list

# Discard leftover index columns picked up from earlier CSV round-trips
season_plays = season_plays.drop(columns=list(season_plays.filter(regex='Unnamed')))

season_plays

Unnamed: 0,Date,Chicago,Opponent,Half,Time,Score,Team,Action,Substitution,Player,Lineup,Points,Game
0,"November 10, 2022",Home,Lake Forest,1,19:44,2-0,Away,"SIEGIEN,FRANK made jump shot",0,Frank Siegien,"Alec Bryan, Ezra Moos, Thomas Kurowski, Elliot...",2,1
1,"November 10, 2022",Home,Lake Forest,1,19:44,2-0,Away,"Assist by BULL,ELIJAH",0,Elijah Bull,"Alec Bryan, Ezra Moos, Thomas Kurowski, Elliot...",0,1
2,"November 10, 2022",Home,Lake Forest,1,19:16,2-0,Home,"Turnover by BRYAN,ALEC",0,Alec Bryan,"Alec Bryan, Ezra Moos, Thomas Kurowski, Elliot...",0,1
3,"November 10, 2022",Home,Lake Forest,1,18:50,5-0,Away,"DEITCH,NOAH made 3-pt. jump shot",0,Noah Deitch,"Alec Bryan, Ezra Moos, Thomas Kurowski, Elliot...",3,1
4,"November 10, 2022",Home,Lake Forest,1,18:50,5-0,Away,"Assist by BULL,ELIJAH",0,Elijah Bull,"Alec Bryan, Ezra Moos, Thomas Kurowski, Elliot...",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,"December 20, 2022",Home,Ill. Wesleyan,3,00:04,66-63,Away,"WILLIAMS,HAKIM defensive rebound",0,Hakim Williams,"Elliot Paschal, Bryce Hopkins, Thomas Kurowski...",0,10
537,"December 20, 2022",Home,Ill. Wesleyan,3,00:04,66-63,Home,"Foul by PASCHAL,ELLIOT",0,Elliot Paschal,"Elliot Paschal, Bryce Hopkins, Thomas Kurowski...",0,10
538,"December 20, 2022",Home,Ill. Wesleyan,3,00:04,67-63,Away,"WILLIAMS,HAKIM made free throw",0,Hakim Williams,"Elliot Paschal, Bryce Hopkins, Thomas Kurowski...",1,10
539,"December 20, 2022",Home,Ill. Wesleyan,3,00:04,67-63,Away,"WILLIAMS,HAKIM missed free throw",0,Hakim Williams,"Elliot Paschal, Bryce Hopkins, Thomas Kurowski...",0,10


In [99]:
# Write the combined plays back to the file they were read from, dropping fully identical rows first.
# NOTE(review): drop_duplicates ignores the index, so two genuinely identical plays (same
# time, score and action text) collapse into one row -- confirm that is acceptable.
season_plays.drop_duplicates().to_csv("season_plays23.csv")