In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
mbb_urls = {"NYU" : "https://gonyuathletics.com/sports/mens-basketball/stats/2022-23",
            "WashU" : "https://washubears.com/sports/mens-basketball/stats?path=mbball",
            "Brandeis" : "https://www.brandeisjudges.com/sports/mbkb/2022-23/teams/brandeis?view=gamelog",
            "Case Western" : "https://athletics.case.edu/sports/mbkb/2022-23/teams/casewesternreserve",
            "Carnegie Mellon" : "https://athletics.cmu.edu/sports/mbkb/2022-23/teams/carnegiemellon?view=gamelog",
            "Emory" : "https://www.emoryathletics.com/sports/mbkb/2022-23/schedule",
            "Rochester" : "https://uofrathletics.com/sports/mens-basketball/stats/2022-23"}

wbb_urls = [u.replace("mbkb", "wbkb").replace("mens", "womens") for u in list(mbb_urls.values())]
wbb_urls = dict(map(lambda i, j : (i, j), list(mbb_urls.keys()), wbb_urls))

In [19]:
def fg_helper(splits):
    fgm = sum([int(i.split("-")[0]) for i in splits])
    fga = sum([int(i.split("-")[1]) for i in splits])
    
    return str(fgm) + "-" + str(fga)

def totals_helper(df):
    poss = (0.96*(sum(df['FGA'].astype(int)) + 
                  sum(df['TO'].astype(int)) + 
                  0.44*sum(df['FTA'].astype(int))) - 
            sum(df['OREB'].astype(int))) / len(df)
    opp_poss = (0.96*(sum(df['OPP FGA'].astype(int)) + 
                      sum(df['OPP TO'].astype(int)) + 
                      0.44*sum(df['OPP FTA'].astype(int))) - 
                sum(df['OPP OREB'].astype(int))) / len(df)
    ppp = round(sum(df['PTS'].astype(int)) / (poss * len(df)), 1)
    fgs = fg_helper( list(df['FGM/A']) )
    threes = fg_helper( list(df['3FG/A']) )
    opp_fgs = fg_helper( list(df['OPP FGM/A']) )
    opp_threes = fg_helper( list(df['OPP 3PM/A']) )
    orb = 100 * round(sum(df['OREB'].astype(int)) / (sum(df['OREB'].astype(int)) + sum(df['OPP DREB'].astype(int))), 3)
    drb = 100 * round(sum(df['DREB'].astype(int)) / (sum(df['DREB'].astype(int)) + sum(df['OPP OREB'].astype(int))), 3)
    reb = 100 * round(sum(df['REB'].astype(int)) / (sum(df['REB'].astype(int)) + sum(df['OPP REB'].astype(int))), 3)
    to = 100 * round(sum(df['TO'].astype(int)) / (poss * len(df)), 3)
    opp_to = 100 * round(sum(df['OPP TO'].astype(int)) / (opp_poss * len(df)), 3)

    return ["Total", "", sum(df['Margin']), 
            round(poss, 1), ppp, fgs, threes, opp_fgs, opp_threes,
            sum(df['REB'].astype(int)), sum(df['OPP REB'].astype(int)), 
            round(orb, 1), round(drb, 1), round(reb, 1), 
            round(to, 1), round(opp_to, 1)]

In [22]:
def game_by_game_df(driver, soup, school):
    table = soup.find_all('div', {"class" : "sidearm-table-overflow-on-x-large"})
    team = [t for t in table if "Team" in t.find("caption").text][0]
    opponents = [t for t in table if "Opponents" in t.find("caption").text][0]

    t_columns = [i.text for i in team.find("tr") if i.text != "\n"]
    t_entries = [[i.text for i in row.find_all("td")] for row in team.find_all("tr")[1:]]
    o_columns = [i.text for i in opponents.find("tr") if i.text != "\n"]
    o_entries = [[i.text for i in row.find_all("td")] for row in opponents.find_all("tr")[1:]]

    team_df = pd.DataFrame(t_entries[:-2], columns=t_columns)
    opponents_df = pd.DataFrame(o_entries[:-2], columns=o_columns)
    game_log = team_df.join(opponents_df, rsuffix=" OPP")

    game_log['Margin'] = (game_log['Score'].apply(lambda i : int(i.split("-")[0])) - 
                     game_log['Score'].apply(lambda i : int(i.split("-")[1])))
    game_log['FGA'] = game_log['FGM/A'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FGA'] = game_log['FGM/A OPP'].apply(lambda i : int(i.split("-")[1]))
    game_log['FTA'] = game_log['FTM/A'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FTA'] = game_log['FTM/A OPP'].apply(lambda i : int(i.split("-")[1]))
    game_log['OREB'] = game_log['OFF']
    game_log['DREB'] = game_log['DEF']
    game_log['OPP OREB'] = game_log['OFF OPP']
    game_log['OPP DREB'] = game_log['DEF OPP']
    game_log['OPP TO'] = game_log['TO OPP']
    game_log['POSS'] = 0.96*(game_log['FGA'] + 
                        game_log['TO'].astype(int) + 
                        0.44*game_log['FTA']) - game_log['OFF'].astype(int)
    game_log['OPP POSS'] = 0.96*(game_log['OPP FGA'] + 
                            game_log['TO OPP'].astype(int) + 
                            0.44*game_log['OPP FTA']) - game_log['OFF OPP'].astype(int)
    game_log['PPP'] = round(game_log['PTS'].astype(int) / game_log['POSS'], 1)
    game_log['REB'] = game_log['TOT']
    game_log['OPP REB'] = game_log['TOT OPP']
    game_log['ORB%'] = 100 * round(game_log['OFF'].astype(int) / 
                              (game_log['OFF'].astype(int) + game_log['DEF OPP'].astype(int)), 3)
    game_log['DRB%'] = 100 * round(game_log['DEF'].astype(int) / 
                              (game_log['DEF'].astype(int) + game_log['OFF OPP'].astype(int)), 3)
    game_log['REB%'] = 100 * round(game_log['TOT'].astype(int) / 
                              (game_log['TOT'].astype(int) + game_log['TOT OPP'].astype(int)), 3)
    game_log['TO%'] = 100 * round(game_log['TO'].astype(int) / game_log['POSS'], 3)
    game_log['OPP TO%'] = 100 * round(game_log['TO OPP'].astype(int) / game_log['OPP POSS'], 3)
    game_log['POSS'] = game_log['POSS'].round(1)
    game_log['OPP POSS'] = game_log['OPP POSS'].round(1)
    
    game_log['OPP FGM/A'] = game_log['FGM/A OPP']
    game_log['OPP 3PM/A'] = game_log['3FG/A OPP']
    
    totals_row = totals_helper(game_log)
    
    game_log = game_log[['Opponent', 'W/L', 'Margin', 
                         'POSS', 'PPP', 'FGM/A', '3FG/A', 'OPP FGM/A', 'OPP 3PM/A',
                         'REB', 'OPP REB', 'ORB%', 'DRB%', 'REB%', 
                         'TO%', 'OPP TO%']]
    
    game_log.loc[len(game_log)] = totals_row
    game_log['Team'] = school

    return game_log

In [5]:
def opp_helper(soup, school, opp=True):
    if school == "Case Western":
        school = "Case Western Reserve"
        
    if opp:
        opponent = [t for t in soup.find_all("div", {"class" : "player-stats"})[0].find_all("table") 
                    if t.find("h2").text != school][0]
    else:
        opponent = [t for t in soup.find_all("div", {"class" : "player-stats"})[0].find_all("table") 
                    if t.find("h2").text == school][0]

    columns = [[th.text for th in tr.find_all("th")] for tr in opponent.find_all("tr")][0][2:]
    entries = [i.text.strip() for i in opponent.find("tr", {"class" : "totals"}).find_all("td")][1:]
    
    return pd.DataFrame([entries], columns=columns)

def game_log_df(driver, soup, school):
    if school == "Carnegie Mellon":
        columns = [th.text for th in soup.find(id="gamelog").find("tr").find_all("th")]
        rows = soup.find(id="gamelog").find_all("tr")[1:]
        entries = [[td.text.strip().replace("\t", "") for td in tr.find_all("td")] for tr in rows]
    else:
        columns = [i.text for i in soup.find_all("div", {"class" : "stats-box full clearfix"})[0].find_all("th")]
        rows = soup.find_all("div", {"class" : "stats-box full clearfix"})[0].find_all("tr")
        entries = [[td.text.strip().replace("\t", "") for td in r.find_all("td")] for r in rows][1:]
        
        if school == "Case Western":
            rows = [r for r in rows if not str_list("Buffalo State", [i.text for i in r.find_all("td")])]

    game_log = pd.DataFrame(entries, columns=columns)
    game_log = game_log[~(game_log.Score == "")]
    game_log = game_log.reset_index(drop=True)

    game_urls = [urls[school].split("/teams/")[0] + 
                 r.find("a")['href'].split("..")[1] 
                 for r in rows[0:game_log.shape[0] + 1] if r.find_all("a") != []]
    
    game_soups = []
    for url in game_urls:
        driver.get(url)
        game_soups.append( BeautifulSoup(driver.page_source, 'html.parser') )

    opponent_df = pd.concat([opp_helper(s, school) for s in game_soups])
    opponent_df = opponent_df.reset_index(drop=True)
    opponent_df['Opponent'] = game_log['Opponent']

    game_log = game_log.join(opponent_df, rsuffix=" OPP")

    game_log['W/L'] = game_log['Score'].apply(lambda i : i.split(",")[0])
    game_log['Margin'] = game_log.Score.apply(lambda i : eval(i.split(",")[1]))
    game_log['FGA'] = game_log['fg'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FGA'] = game_log['FGM-A'].apply(lambda i : int(i.split("-")[1]))
    game_log['FTA'] = game_log['ft'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FTA'] = game_log['FTM-A'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP OREB'] = game_log['OREB']
    game_log['OPP DREB'] = game_log['DREB']
    game_log['OREB'] = game_log['off']
    game_log['DREB'] = game_log['def']
    game_log['OPP TO'] = game_log['TO']
    game_log['TO'] = game_log['to']
    game_log['PTS'] = game_log['pts']
    game_log['POSS'] = 0.96*(game_log['FGA'] + 
                        game_log['to'].astype(int) + 
                        0.44*game_log['FTA']) - game_log['off'].astype(int)
    game_log['OPP POSS'] = 0.96*(game_log['OPP FGA'] + 
                                 game_log['TO'].astype(int) + 
                                 0.44*game_log['OPP FTA']) - game_log['OREB'].astype(int)
    game_log['PPP'] = round(game_log['pts'].astype(int) / game_log['POSS'], 1)
    game_log['FGM/A'] = game_log['fg']
    game_log['3FG/A'] = game_log['3pt']
    game_log['OPP REB'] = game_log['REB'].astype(int)
    game_log['REB'] = game_log['reb'].astype(int)
    game_log['ORB%'] = 100 * round(game_log['off'].astype(int) / 
                              (game_log['off'].astype(int) + game_log['DREB'].astype(int)), 3)
    game_log['DRB%'] = 100 * round(game_log['def'].astype(int) / 
                              (game_log['def'].astype(int) + game_log['OREB'].astype(int)), 3)
    game_log['REB%'] = 100 * round(game_log['REB'].astype(int) / 
                              (game_log['REB'].astype(int) + game_log['OPP REB'].astype(int)), 3)
    game_log['TO%'] = 100 * round(game_log['to'].astype(int) / game_log['POSS'], 3)
    game_log['OPP TO%'] = 100 * round(game_log['TO'].astype(int) / game_log['OPP POSS'], 3)
    game_log['POSS'] = game_log['POSS'].round(1)
    game_log['OPP POSS'] = game_log['OPP POSS'].round(1)
    
    totals_row = totals_helper(game_log)
    
    game_log = game_log[['Opponent', 'W/L', 'Margin', 
                         'POSS', 'PPP', 'FGM/A', '3FG/A', 
                         'REB', 'OPP REB', 'ORB%', 'DRB%', 'REB%', 
                         'TO%', 'OPP TO%']]
    
    game_log.loc[len(game_log)] = totals_row
    game_log['Team'] = school

    return game_log

In [29]:
def html_df(driver, soup, school):
    events = soup.find_all("div", {"class" : "event-row"})

    results = [e.find("div", {"class" : "result"}).text.strip() for e in events]
    results = [r for r in results if r != ""]

    opponents = [e.find("div", {"class" : "opponent"}).text.strip() for e in events]
    if Team == "Men's Team":
        opponents = [o.replace("\n", "") for o in opponents if o != "Oglethorpe"]
    opponents = opponents[0:len(results)]

    if Team == "Women's Team":
        game_links = [e.find("div", {"class" : "links"}).find_all("a")[1]['href'] for e in [e for e in events[0:len(results)+1] if not str_list("Cancelled", [i.text for i in e])]]
    else:
        game_links = [e.find("div", {"class" : "links"}).find_all("a")[1]['href'] for e in events[0:len(results)]]
    game_links = ["https://www.emoryathletics.com/" + l for l in game_links]

    game_soups = []
    for url in game_links:
        driver.get(url)
        game_soups.append( BeautifulSoup(driver.page_source, 'html.parser') )

    team_df = pd.concat([opp_helper(s, "Emory", opp=False) for s in game_soups])
    team_df = team_df.reset_index(drop=True)
    team_df['Opponent'] = opponents
    opponent_df = pd.concat([opp_helper(s, "Emory") for s in game_soups])
    opponent_df = opponent_df.reset_index(drop=True)
    opponent_df['Opponent'] = opponents

    game_log = team_df.join(opponent_df, rsuffix=" OPP")

    game_log['W/L'] = [r.split(",")[0] for r in results]
    game_log['Margin'] = [eval(r.split(", ")[1]) for r in results]
    game_log['FGA'] = game_log['FGM-A'].apply(lambda i : int(i.split("-")[1]))
    game_log['FTA'] = game_log['FTM-A'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FGM'] = game_log['FGM-A OPP'].apply(lambda i : int(i.split("-")[0]))
    game_log['OPP FGA'] = game_log['FGM-A OPP'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP 3PM'] = game_log['3PM-A OPP'].apply(lambda i : int(i.split("-")[0]))
    game_log['OPP 3PA'] = game_log['3PM-A OPP'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP FGM/A'] = game_log['OPP FGM'].astype(str) + "-" + game_log['OPP FGA'].astype(str)
    game_log['OPP 3PM/A'] = game_log['OPP 3PM'].astype(str) + "-" + game_log['OPP 3PA'].astype(str)
    game_log['OPP FTA'] = game_log['FTM-A OPP'].apply(lambda i : int(i.split("-")[1]))
    game_log['OPP OREB'] = game_log['OREB OPP']
    game_log['OPP DREB'] = game_log['DREB OPP']
    game_log['OPP TO'] = game_log['TO OPP']
    game_log['POSS'] = 0.96*(game_log['FGA'] + 
                            game_log['TO'].astype(int) + 
                            0.44*game_log['FTA']) - game_log['OREB'].astype(int)
    game_log['OPP POSS'] = 0.96*(game_log['OPP FGA'] + 
                                 game_log['TO OPP'].astype(int) + 
                                 0.44*game_log['OPP FTA']) - game_log['OREB OPP'].astype(int)
    game_log['PPP'] = round(game_log['PTS'].astype(int) / game_log['POSS'], 1)
    game_log['FGM/A'] = game_log['FGM-A']
    game_log['3FG/A'] = game_log['3PM-A']
    game_log['OPP REB'] = game_log['REB OPP'].astype(int)
    game_log['ORB%'] = 100 * round(game_log['OREB'].astype(int) / 
                              (game_log['OREB'].astype(int) + game_log['DREB OPP'].astype(int)), 3)
    game_log['DRB%'] = 100 * round(game_log['DREB'].astype(int) / 
                              (game_log['DREB'].astype(int) + game_log['OREB OPP'].astype(int)), 3)
    game_log['REB%'] = 100 * round(game_log['REB'].astype(int) / 
                              (game_log['REB'].astype(int) + game_log['OPP REB'].astype(int)), 3)
    game_log['TO%'] = 100 * round(game_log['TO'].astype(int) / game_log['POSS'], 3)
    game_log['OPP TO%'] = 100 * round(game_log['TO OPP'].astype(int) / game_log['OPP POSS'], 3)
    game_log['POSS'] = game_log['POSS'].round(1)
    game_log['OPP POSS'] = game_log['OPP POSS'].round(1)
    
    totals_row = totals_helper(game_log)
    
    game_log = game_log[['Opponent', 'W/L', 'Margin', 
                         'POSS', 'PPP', 'FGM/A', '3FG/A', 'OPP FGM/A', 'OPP 3PM/A',
                         'REB', 'OPP REB', 'ORB%', 'DRB%', 'REB%', 
                         'TO%', 'OPP TO%']]
    
    game_log.loc[len(game_log)] = totals_row
    game_log['Team'] = school

    return game_log

Unnamed: 0,Opponent,W/L,Margin,POSS,PPP,FGM/A,3FG/A,OPP FGM/A,OPP 3PM/A,REB,OPP REB,ORB%,DRB%,REB%,TO%,OPP TO%,Team
0,Piedmont,W,18,76.1,1.1,31-68,14-36,21-65,6-25,47,48,29.8,68.8,49.5,18.4,23.8,Emory
1,Sewanee,W,25,79.6,1.3,42-77,11-23,28-74,11-34,49,35,40.0,71.4,58.3,21.4,19.3,Emory
2,Guilford,L,14,64.3,0.9,19-55,10-31,27-67,9-23,38,46,30.0,59.1,45.2,26.4,18.3,Emory
3,Covenant,W,34,70.9,1.4,33-63,11-26,22-59,9-32,43,23,53.6,73.7,65.2,21.2,25.8,Emory
4,Maryville (Tenn.),W,9,72.1,1.1,31-51,3-13,26-60,5-27,36,22,25.0,88.2,62.1,30.5,17.8,Emory
5,Bates,W,34,81.1,1.1,33-88,10-23,16-65,7-34,54,37,40.4,79.5,59.3,13.6,23.8,Emory
6,Colby,W,14,63.3,1.3,29-58,8-20,24-65,12-37,39,30,41.4,67.5,56.5,20.5,19.8,Emory
7,LaGrange,W,18,74.8,1.1,33-66,11-26,23-66,5-22,47,31,37.1,79.1,60.3,24.1,14.7,Emory
8,Connecticut Col.,W,8,70.8,1.0,23-64,5-25,25-73,4-20,43,49,28.9,63.8,46.7,15.5,20.6,Emory
9,Berry,L,5,73.7,1.0,23-76,6-26,23-57,9-24,46,50,34.5,68.4,47.9,10.8,27.6,Emory


In [7]:
def scout_scraper(team):
    global Team
    global urls
    global soups
    
    Team = team
    
    if team == "Women's Team":
        urls = wbb_urls
    else:
        urls = mbb_urls
    
    d = webdriver.Chrome(ChromeDriverManager().install())

    soups = []
    for url in list(urls.values()):
        d.get(url)
        soups.append( BeautifulSoup(d.page_source, 'html.parser') )
        
    game_logs = {"NYU" : game_by_game_df(d, soups[0], "NYU"),
                 "WashU" : game_by_game_df(d, soups[1], "WashU"),
                 "Brandeis" : game_log_df(d, soups[2], "Brandeis"),
                 "Case Western" : game_log_df(d, soups[3], "Case Western"),
                 "Carnegie Mellon" : game_log_df(d, soups[4], "Carnegie Mellon"),
                 "Emory" : html_df(d, soups[5], "Emory"),
                 "Rochester" : game_by_game_df(d, soups[6], "Rochester")}

    d.quit()
    
    return game_logs

In [140]:
m_game_logs = scout_scraper("Men's Team")
w_game_logs = scout_scraper("Women's Team")




[WDM] - Current google-chrome version is 108.0.5359
[WDM] - Get LATEST chromedriver version for 108.0.5359 google-chrome
[WDM] - Driver [/Users/jeremydumalig/.wdm/drivers/chromedriver/mac64/108.0.5359.71/chromedriver] found in cache
  d = webdriver.Chrome(ChromeDriverManager().install())


In [141]:
pd.concat(list(m_game_logs.values())).to_csv("mbb_uaa_scout.csv")
pd.concat(list(w_game_logs.values())).to_csv("wbb_uaa_scout.csv")