In [4]:
#This is to add some of the data we've discovered we need after starting modeling
#For now, it's just playoff game designations and what lineup is currently on the floor for each team.
import os
import random
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [13]:
def get_href(row):
    """Return the URL of the box-score link contained in a schedule-table row.

    Args:
        row: Selenium element for one ``<tr>`` of the schedule table.

    Returns:
        The ``href`` attribute of the first ``<a>`` inside the row's
        ``td[data-stat="box_score_text"]`` cell.

    Raises:
        Whatever ``find_element`` raises when the cell or the link is absent.
    """
    link = row.find_element(
        By.CSS_SELECTOR, 'td[data-stat="box_score_text"]'
    ).find_element(By.TAG_NAME, "a")
    return link.get_attribute("href")

In [43]:
# Starting with playoff designations, going to scrape from the playoff series page on basketball-reference
class playoff_schedule:
    """Scrape basketball-reference playoff schedules and label every game.

    Each scraped game becomes one row of ``self.df`` carrying the season year,
    the playoff round, the game number within its series, both teams, both
    scores, the box-score URL, attendance, arena and start time.

    Attributes:
        teams: 16x16 float matrix of per-season bookkeeping. Off-diagonal entry
            ``[i, j]`` counts the games seen so far between teams ``i`` and
            ``j``; diagonal entry ``[i, i]`` counts how many distinct opponents
            team ``i`` has faced, which is used as its current round.
        team_keys: maps a team name to its row/column index in ``teams``.
        df: accumulated schedule rows for every season scraped so far.
        options: ChromeOptions used for every browser session this object opens.
    """

    # Chrome command-line switches, added to the options in exactly this order.
    _CHROME_ARGUMENTS = (
        "--disable-blink-features=AutomationControlled",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-extensions",
        "--disable-dev-shm-usage",
        "--disable-popup-blocking",
        "--disable-notifications",
        "--disable-infobars",
        "--headless",
    )

    def __init__(self):
        """Create empty bookkeeping structures and the shared Chrome options."""
        self.teams = np.zeros(shape = [16, 16]) #matrix of all teams in the playoffs
        self.team_keys = {} #dictionary to map team names to matrix indices
        self.df = pd.DataFrame(data=[], columns=["Year", "Round", "Game Number", "Home Team", "Away Team",
                                                 "Home Score", "Away Score", "url", "attendance", "arena", "time"])

        head_options = webdriver.ChromeOptions()
        for argument in self._CHROME_ARGUMENTS:
            head_options.add_argument(argument)
        head_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        head_options.add_experimental_option('useAutomationExtension', False)
        # NOTE(review): value 2 presumably tells Chrome to block image loading - confirm.
        prefs = {"profile.managed_default_content_settings.images": 2}
        head_options.add_experimental_option("prefs", prefs)
        self.options = head_options

    def get_playoff_game_number(self, home_team, away_team):
        """Record one game between two teams and return its round and game number.

        Teams are registered in ``self.team_keys`` the first time they appear.
        The first game between a pair of teams counts as the start of a new
        series, which advances the round counter of both teams.

        Assumes games are supplied in the order they were played and that two
        teams meet in at most one series per season - TODO confirm for the
        scraped page.

        Args:
            home_team: name of the home team.
            away_team: name of the away team.

        Returns:
            ``(round, game_number)`` where ``round`` is the home team's round
            counter and ``game_number`` is the 1-based count of games between
            the two teams. Both are numpy floats because ``self.teams`` is a
            float matrix.

        Raises:
            IndexError: if more teams are registered than ``self.teams`` has rows.
        """
        for team in (home_team, away_team):
            if team not in self.team_keys:
                self.team_keys[team] = len(self.team_keys)

        home_index = self.team_keys[home_team]
        away_index = self.team_keys[away_team]

        cur_game = self.teams[home_index, away_index] + 1
        if cur_game == 1: #We are in a new round
            self.teams[home_index, home_index] += 1
            self.teams[away_index, away_index] += 1

        # Keep the pair count symmetric so home/away order does not matter.
        self.teams[home_index, away_index] += 1
        self.teams[away_index, home_index] += 1
        pf_round = self.teams[home_index, home_index]

        return pf_round, cur_game

    def get_playoff_year_schedule(self, year):
        """Scrape one season's playoff schedule page and append its games to ``self.df``.

        A fresh Chrome session is opened for the call and is always shut down
        before returning, whether scraping succeeded, timed out, or failed with
        another exception.

        Args:
            year: season year used in the page URL.

        Raises:
            Any Selenium error other than a ``TimeoutException`` raised while
            waiting for or parsing the table. A timeout there is reported with
            ``print`` and swallowed.
        """
        driver = webdriver.Chrome(options=self.options)
        try:
            url = f"https://www.basketball-reference.com/playoffs/NBA_{year}_games.html"
            driver.get(url)
            try:
                schedule_table = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.ID, "schedule"))
                    )
                for row in schedule_table.find_elements(By.TAG_NAME, "tr"):
                    data = row.find_elements(By.TAG_NAME, "td")
                    if len(data) <= 9:
                        continue # rows with too few cells are not game rows
                    # Named start_time / pf_round so the `time` module and the
                    # `round` builtin are not shadowed.
                    start_time = data[0].text
                    visitor = data[1].text
                    visitor_score = data[2].text
                    home = data[3].text
                    home_score = data[4].text
                    boxscore = get_href(row)
                    attendance = data[7].text
                    arena = data[9].text
                    pf_round, game_no = self.get_playoff_game_number(home, visitor)
                    self.df.loc[len(self.df)] = [year, pf_round, game_no, home, visitor,
                                                 home_score, visitor_score, boxscore, attendance, arena, start_time]
            except TimeoutException:
                print(f"{year} did not work")
        finally:
            # Single cleanup point: runs on success, on timeout and on any other error.
            driver.quit()

    def run_loop(self, years):
        """Scrape every season in ``years``, resetting the per-season bookkeeping first.

        Args:
            years: iterable of season years passed to ``get_playoff_year_schedule``.
        """
        for year in years:
            self.teams = np.zeros(shape = [16, 16]) #reset for each year
            self.team_keys = {}
            self.get_playoff_year_schedule(year)
            print(f"{year} done")


In [44]:
# Scrape the 2012 through 2024 postseasons (the stop value of range is exclusive).
years = list(range(2012, 2025))
pf = playoff_schedule()
pf.run_loop(years)

2012 done
2013 done
2014 done
2015 done
2016 done
2017 done
2018 done
2019 done
2020 done
2021 done
2022 done
2023 done
2024 done


In [50]:
# Playoff game IDs are the row index shifted by 15410, so they start after the
# regular-season game IDs already in use and stay unique.
pf.df["game_id"] = pf.df.index + 15410
pf.df.to_csv("../data/PlayoffScheduleData.csv", index=False)

In [1]:
#OKAY, now on to lineups!
#Never mind, gotta get box score and play by play for all the playoff games...
#I'm pretty much just gonna copy and paste the code we used from milestone 2 for box score and pbp data

# Maps a full team name to its abbreviation. Later cells use the abbreviation to
# build the HTML ids of each team's box-score table.
team_to_abbrev = {
    "New York Knicks": "NYK",
    "Dallas Mavericks": "DAL",
    "Los Angeles Lakers": "LAL",
    "Oklahoma City Thunder": "OKC",
    "Golden State Warriors": "GSW",
    "Cleveland Cavaliers": "CLE",
    "Washington Wizards": "WAS",
    "Orlando Magic": "ORL",
    "Indiana Pacers": "IND",
    "Charlotte Bobcats": "CHA",
    "Minnesota Timberwolves": "MIN",
    "San Antonio Spurs": "SAS",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "New Jersey Nets": "NJN",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Toronto Raptors": "TOR",
    "Atlanta Hawks": "ATL",
    "Detroit Pistons": "DET",
    "New Orleans Hornets": "NOH",
    "Memphis Grizzlies": "MEM",
    "Denver Nuggets": "DEN",
    "Houston Rockets": "HOU",
    "Boston Celtics": "BOS",
    "Utah Jazz": "UTA",
    "Los Angeles Clippers": "LAC",
    "Chicago Bulls": "CHI",
    "Philadelphia 76ers": "PHI",
    "Brooklyn Nets": "BRK",
    "New Orleans Pelicans": "NOP",
    "Charlotte Hornets": "CHO"
} # Used to convert the team name to the abbreviation used by basketball-reference

In [34]:
class box_score_scraping:
    """Scrape the basic box score of a single game and append it to a CSV.

    The Selenium driver is supplied by the caller so that many games can reuse
    one browser session.

    Attributes:
        home_team: home team abbreviation, used to build the table's HTML id.
        away_team: away team abbreviation, used to build the table's HTML id.
        game_id: identifier written to the ``Game_ID`` column of every row.
        player_dict: column-name -> values mapping filled by ``get_box_score``.
        driver: Selenium driver used to load the page.
        url: address of the game's box-score page.
    """

    # File that append_data() writes to when no explicit path is given.
    DEFAULT_OUTPUT_PATH = "../data/PlayoffBoxScoreData.csv"
    # 1-based position of the table row whose text holds the stat column names.
    _HEADER_ROW = 2
    # Players found in rows numbered up to this value are flagged as starters.
    # NOTE(review): presumably two header rows followed by five starters - confirm
    # against the page layout.
    _LAST_STARTER_ROW = 7

    def __init__(self, driver, game_id, url, away_team, home_team):
        """Store the driver and the identifiers of the game to scrape."""
        self.home_team = home_team
        self.away_team = away_team
        self.game_id = game_id
        self.player_dict = {}
        self.driver = driver
        self.url = url

    def _collect_team_rows(self, table, is_home, player_dict, col_names, read_header=False):
        """Append every player row of one team's table to ``player_dict``.

        Args:
            table: Selenium element wrapping the team's box-score table.
            is_home: value stored in the ``Home`` column for these players.
            player_dict: column-name -> list mapping, extended in place.
            col_names: stat column names, in cell order. Filled in place from
                the table's header row when ``read_header`` is true.
            read_header: read the stat column names from this table.
        """
        for row_number, row in enumerate(table.find_elements(By.TAG_NAME, "tr"), start=1):
            if read_header and row_number == self._HEADER_ROW: #These are the column names
                # Drop the first header word because the player name is stored instead.
                col_names.extend(row.text.split()[1:])
                for name in col_names:
                    player_dict[name] = []

            header_cells = row.find_elements(By.CSS_SELECTOR, "th")
            if not header_cells:
                continue #No row header at all, so this cannot be a player row
            player_id = header_cells[0].get_attribute("data-append-csv")
            if player_id is None:
                continue #This is one of the descriptor or title rows, not the things we want
            data = row.find_elements(By.TAG_NAME, "td")
            if len(data) == 1: #DID NOT PLAY, WE DON'T COUNT THEM
                continue

            player_dict["Player_ID"].append(player_id)
            player_dict["Home"].append(is_home)
            player_dict["Player_Name"].append(header_cells[0].text)
            player_dict["Starter"].append(row_number <= self._LAST_STARTER_ROW)

            for position, name in enumerate(col_names):
                player_dict[name].append(data[position].text)

    def get_box_score(self):
        """Load the game page and fill ``self.player_dict`` with both teams' rows.

        Sleeps for a random 0.5-3 seconds before requesting the page, then waits
        up to 5 seconds for the home team's table to appear. Stat column names
        are taken from the home table and reused for the away table.
        ``self.player_dict`` is only replaced once both tables parsed.

        Raises:
            TimeoutException: the home team's table did not appear in time.
            Other Selenium errors raised while locating either table or while
            reading rows (e.g. the away table is missing).
        """
        time.sleep(random.uniform(0.5, 3))
        self.driver.get(self.url)

        home_table_id = f"div_box-{self.home_team}-game-basic"
        away_table_id = f"div_box-{self.away_team}-game-basic"

        WebDriverWait(self.driver, 5, poll_frequency=0.1).until(
            EC.presence_of_element_located((By.ID, home_table_id))
            )
        home_table = self.driver.find_element(By.ID, home_table_id)
        away_table = self.driver.find_element(By.ID, away_table_id)

        player_dict = {"Game_ID": self.game_id, "Player_Name": [], "Player_ID": [], "Starter": [], "Home": []}
        col_names = []
        self._collect_team_rows(home_table, True, player_dict, col_names, read_header=True)
        self._collect_team_rows(away_table, False, player_dict, col_names)

        self.player_dict = player_dict

    def append_data(self, path=None):
        """Append the scraped rows to a CSV, writing the header when the file is new.

        The DataFrame index is written as the first column, as before, so
        appended rows keep the same layout as rows already in the file.

        Args:
            path: CSV file to append to. Defaults to ``DEFAULT_OUTPUT_PATH``.
        """
        if path is None:
            path = self.DEFAULT_OUTPUT_PATH
        df = pd.DataFrame.from_dict(self.player_dict)
        if df.empty:
            return #Nothing scraped, so there is nothing to append
        # A missing or empty file has no header yet. Without one, a later
        # read_csv would treat the first player row as the column names.
        write_header = not os.path.exists(path) or os.path.getsize(path) == 0
        df.to_csv(path, mode = 'a', header=write_header)

In [45]:
# Scrape the box score of every playoff game that is not yet in the box-score
# CSV, so the cell can be re-run to resume after an interruption.

# Chrome switches for the scraping session, added in exactly this order.
options = webdriver.ChromeOptions()
for flag in (
    "--headless",
    "--disable-blink-features=AutomationControlled",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--disable-notifications",
    "--disable-infobars",
):
    options.add_argument(flag)
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

BOX_SCORE_PATH = "../data/PlayoffBoxScoreData.csv"

df = pd.read_csv("../data/PlayoffScheduleData.csv")
# On a first run the box-score file does not exist (or is empty); treat that as
# "nothing scraped yet" instead of crashing in read_csv.
if os.path.exists(BOX_SCORE_PATH) and os.path.getsize(BOX_SCORE_PATH) > 0:
    already_done = pd.read_csv(BOX_SCORE_PATH)["Game_ID"]
else:
    already_done = pd.Series([], dtype="int64")

remaining_df = df.loc[~df["game_id"].isin(already_done)]

driver = webdriver.Chrome(options=options)

games = 0    # games scraped and saved during this run
error = 0    # consecutive failures since the last success or cool-down
success = 0  # consecutive successes since the last failure or restart (not read in this cell)

try:
    for index, row in remaining_df.iterrows():
        # An unmapped team name raises KeyError here and stops the run on purpose:
        # it is a gap in team_to_abbrev, not a transient scraping failure.
        bss = box_score_scraping(driver, row["game_id"], row["url"], team_to_abbrev[row["Away Team"]], team_to_abbrev[row["Home Team"]])
        try:
            bss.get_box_score()
            bss.append_data()
            error = 0
            games += 1
            success += 1
            print(games)

            if (games % 20 == 0): #restart the driver every so often, it seems to be slowing down heavily
                driver.quit()
                driver = webdriver.Chrome(options=options)
                success = 0

        except Exception as e:
            success = 0
            error += 1
            print(e)
            print(row["url"])

            if (error > 5): # More than 5 failures in a row, let it rest
                print("chilling")
                driver.quit()
                time.sleep(10)
                error = 0
                driver = webdriver.Chrome(options=options)
finally:
    # Always release the browser, including when the loop aborts part-way
    # (unmapped team, keyboard interrupt, failed driver restart).
    driver.quit()

In [37]:
# Manual cleanup cell: quits whatever browser session `driver` currently refers to.
# NOTE(review): presumably kept for closing the browser after an interrupted run - confirm it is still needed.
driver.quit()