In [16]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re
import random
import time
from datetime import datetime, date

In [64]:
class pbp_scraping:
    """Scrape one game's play-by-play table into a column-oriented dict.

    ``pbp_dict`` maps each output column name to a list with one entry per
    recorded play (``game_id`` is stored once as a scalar).  The key order of
    ``pbp_dict`` is the column order of any DataFrame built from it, so it
    must not be rearranged.
    """

    def __init__(self, driver, game_id, url):
        """Store the Selenium driver, the game id and the play-by-play URL.

        Args:
            driver: Selenium WebDriver used to load ``url``.
            game_id: Identifier stored under the ``"game_id"`` key.
            url: Address of the play-by-play page to scrape.
        """
        self.url = url
        # Gonna convert to dataframe later
        self.pbp_dict = {
            "game_id": game_id,
            "time": [],
            "quarter": [],
            "player_id": [],
            "play_type": [],
            "secondary_player_id": [],
            "away score": [],
            "home score": [],
            "rebound": [],
            "shot_type": [],
            "distance": [],
            "shot_score": [],
            "away_lineup_id": [],
            "home_lineup_id": [],
        }

        self.driver = driver
        # Label attached to every play until a "Start of ..." row replaces it.
        self.quarter = "1"

    def get_player_id(self, play_col):
        """Return the ids of the players linked inside a play cell.

        Ids are the file-name stem of each ``/players/`` link, e.g.
        'https://www.basketball-reference.com/players/c/curryst01.html'
        yields ``"curryst01"``.

        Args:
            play_col: Selenium element for the cell describing the play.

        Returns:
            ``(player_1, player_2)``.  ``player_1`` is ``"Team"`` when the cell
            has no player link; ``player_2`` is ``np.nan`` unless the cell
            holds exactly two player links.
        """
        links = play_col.find_elements(By.XPATH, ".//a[contains(@href, '/players/')]")
        player_1 = "Team"
        player_2 = np.nan

        if len(links) >= 1:
            href = links[0].get_attribute("href")
            player_1 = href.split("/")[-1].split(".")[0]

        # Only filled when there are exactly two links (same as before).
        if len(links) == 2:
            href = links[1].get_attribute("href")
            player_2 = href.split("/")[-1].split(".")[0]

        return player_1, player_2

    def play_type(self, away_text, home_text):
        """Classify a row from the text of its away and home play cells.

        A cell whose text is exactly ``" "`` counts as blank.  When both cells
        have text the home cell wins.

        Args:
            away_text: Text of the away team's play cell.
            home_text: Text of the home team's play cell.

        Returns:
            ``(label, col_num)`` where ``label`` is one of ``"Rebound"``,
            ``"Shot Made"``, ``"Shot Missed"``, ``"Turnover"``, ``"Foul"``,
            ``"Enters"``, ``"Timeout"`` or ``"No Play"``, and ``col_num`` is the
            index of the cell that was classified (1 away, 5 home, 0 when both
            cells are blank).
        """
        col_num = 0
        # Start from the blank marker so that a row with two blank cells is
        # reported as "No Play" instead of raising UnboundLocalError.
        text = " "
        if away_text != " ":
            text = away_text
            col_num = 1
        if home_text != " ":
            text = home_text
            col_num = 5

        try:
            if text == " ":
                return "No Play", col_num
            words = text.split()

            # The checks index fixed word positions; a text too short for the
            # position being tested raises IndexError and falls through to
            # "No Play" below.
            # Rebound check
            if words[1] == "rebound":
                return "Rebound", col_num
            # Shot made check
            if words[2] == "makes":
                # ADD FREE THROW CHECK
                return "Shot Made", col_num
            # Shot missed check
            if words[2] == "misses":
                return "Shot Missed", col_num
            # Turnover
            if words[0] == "Turnover":
                return "Turnover", col_num
            # Foul
            if words[1] == "foul" or words[2] == "foul":
                return "Foul", col_num
            # Enters
            if words[2] == "enters":
                return "Enters", col_num
            # Timeout
            if words[3] == "timeout":
                return "Timeout", col_num

            return "No Play", col_num

        except IndexError:
            return "No Play", col_num

    def get_shot_data(self, shot_text):
        """Extract shot details from a made/missed shot cell.

        Args:
            shot_text: Element exposing the play description via ``.text``.

        Returns:
            ``(shot_score, shot_type, distance)``: the 4th and 5th
            whitespace-separated words of the text, and the matched
            ``"<digits> ft"`` substring (suffix included) or ``np.nan`` when
            the text has no such substring.
        """
        description = shot_text.text
        shot_info = description.split()
        shot_score = shot_info[3]
        shot_type = shot_info[4]
        distance_check = re.search(r"(\d+) ft", description)
        # group(0) keeps the " ft" suffix; the stored format is left as is.
        distance = distance_check.group(0) if distance_check else np.nan

        return shot_score, shot_type, distance

    def get_rebound_type(self, rebound_text):
        """Return the first word of a rebound cell's text.

        Args:
            rebound_text: Element exposing the play description via ``.text``.
        """
        return rebound_text.text.split()[0]  # Offensive or Defensive

    def append_to_dict(self, timer, player_id, play_type, secondary_player_id,
                       away_score, home_score, rebound,
                       shot_type, distance, shot_score):
        """Append one play to every per-play list in ``pbp_dict``.

        The current ``self.quarter`` is recorded with the play, and both
        lineup-id columns receive the placeholder ``-1``.
        """
        record = {
            "time": timer,
            "quarter": self.quarter,
            "player_id": player_id,
            "play_type": play_type,
            "secondary_player_id": secondary_player_id,
            "rebound": rebound,
            "shot_type": shot_type,
            "distance": distance,
            "shot_score": shot_score,
            "away score": away_score,
            "home score": home_score,
            "away_lineup_id": -1,
            "home_lineup_id": -1,
        }
        for column, value in record.items():
            self.pbp_dict[column].append(value)
        #Tada!

    def scrape_table(self):
        """Load ``self.url`` and record every recognised play in ``pbp_dict``.

        Rows with six ``td`` cells are treated as plays; rows with two cells
        whose second cell starts with ``"Start of"`` update ``self.quarter``
        to the last two words of that cell.  Sleeps a random 0.1-1 s before
        requesting the page.
        """
        time.sleep(random.uniform(0.1, 1))

        self.driver.get(self.url)
        play_table = self.driver.find_element(By.ID, "all_pbp")
        for row in play_table.find_elements(By.TAG_NAME, "tr"):
            cols = row.find_elements(By.TAG_NAME, "td")

            # Check that it is a play
            if len(cols) == 6:
                play_type, col_num = self.play_type(cols[1].text, cols[5].text)
                if play_type == "No Play":
                    # Move to the next row if we don't have a play. This
                    # really only happens in extenuating circumstances and
                    # REALLY weird plays.
                    continue
                play_cell = cols[col_num]
                player_id, secondary_player_id = self.get_player_id(play_cell)
                timer = cols[0].text
                # The score cell is split on "-": away first, home second.
                scores = cols[3].text.split("-")
                away_score = scores[0]
                home_score = scores[1]

                shot_score, shot_type, distance = np.nan, np.nan, np.nan
                if play_type in ["Shot Made", "Shot Missed"]:
                    shot_score, shot_type, distance = self.get_shot_data(play_cell)

                rebound = np.nan
                if play_type == "Rebound":
                    rebound = self.get_rebound_type(play_cell)

                self.append_to_dict(timer, player_id, play_type, secondary_player_id,
                                    away_score, home_score, rebound,
                                    shot_type, distance, shot_score)

            # Check whether a new period has started
            elif len(cols) == 2:
                quarter_text = cols[1].text
                if quarter_text.startswith("Start of"):
                    parts = quarter_text.split()
                    self.quarter = f"{parts[-2]} {parts[-1]}"
            

            


In [None]:
# Build the Chrome options for this run (a visible browser: no --headless).
head_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--disable-notifications",
    "--disable-infobars",
):
    head_options.add_argument(chrome_flag)
head_options.add_experimental_option("excludeSwitches", ["enable-automation"])
head_options.add_experimental_option('useAutomationExtension', False)
# Content setting 2 for images: pages are loaded without images.
prefs = {"profile.managed_default_content_settings.images": 2}
head_options.add_experimental_option("prefs", prefs)


driver = webdriver.Chrome(options=head_options)

games = 0

# From here on the browser is open, so make sure it is always shut down,
# even if reading a CSV fails or the run is interrupted.
try:
    #GET THE GAME IDS WE HAVEN'T DONE YET
    # A set keeps the "already scraped" membership test O(1) per game.
    done_ids = set(pd.read_csv("../data/PBPData.csv")["game_id"].unique().tolist())
    all_game_ids = pd.read_csv("../data/BoxScoreData.csv")["Game_ID"].unique().tolist()
    game_ids = [str(gid) for gid in all_game_ids if gid not in done_ids and gid > 10000]
    game_ids.sort(reverse=True)

    df = pd.read_csv("../data/ScheduleData.csv")
    # game_ids are strings at this point; compare against a string view of
    # the schedule ids so the lookup matches whatever dtype the CSV column has.
    schedule_ids = df["game_id"].astype(str)

    for i in game_ids:
        try:
            # Get the url, need to change it up a bit though: insert "pbp"
            # after "/boxscores" to reach the play-by-play page.
            fake_url = df.loc[schedule_ids == i, "url"].values[0]
            splitted = fake_url.split("/boxscores")
            url = f"{splitted[0]}/boxscores/pbp{splitted[1]}"

            pbp_class = pbp_scraping(driver, i, url)
            pbp_class.scrape_table()
            new_df = pd.DataFrame.from_dict(pbp_class.pbp_dict)
            # Append without a header: the file already exists (read above).
            new_df.to_csv("../data/PBPData.csv", mode='a', header=False, index=False)
            games += 1
            print(i)
        except Exception as e:
            # Best effort: report the failing game and move on to the next.
            print(f"Error on game {i}: {e}")
finally:
    driver.quit()

In [107]:
# Shut down the Selenium browser session held in `driver`.
# NOTE(review): presumably a manual cleanup cell for when the scraping cell
# is interrupted before reaching its own driver.quit() - confirm.
driver.quit()

In [25]:
sched = pd.read_csv("../data/ScheduleData.csv")


def _game_date_from_url(link):
    """Build a date from the first 8 characters (YYYYMMDD) of the URL's last path segment."""
    stamp = link.split("/")[-1]
    year, month, day = stamp[0:4], stamp[4:6], stamp[6:8]
    return date(int(year), int(month), int(day))


# Parse each game's date out of its URL, then store it as a datetime column.
sched["date"] = pd.to_datetime(sched["url"].apply(_game_date_from_url))

In [31]:
# (first day, last day, label) for each season window; both ends inclusive.
# NOTE(review): presumably the regular-season date ranges - confirm.
season_windows = (
    ("2020-12-15", "2021-05-16", 2021),
    ("2021-10-19", "2022-04-10", 2022),
    ("2022-10-15", "2023-04-09", 2023),
    ("2023-10-15", "2024-04-14", 2024),
)
for first_day, last_day, season_label in season_windows:
    in_window = (sched.date >= first_day) & (sched.date <= last_day)
    sched.loc[in_window, "Season"] = season_label


In [35]:
# Keep only the games that received a Season label.
sched = sched.loc[sched.Season.notna()]

In [None]:
# Build the Chrome options for this run (headless browser).
head_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",
    "--disable-blink-features=AutomationControlled",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--disable-notifications",
    "--disable-infobars",
):
    head_options.add_argument(chrome_flag)
head_options.add_experimental_option("excludeSwitches", ["enable-automation"])
head_options.add_experimental_option('useAutomationExtension', False)
# Content setting 2 for images: pages are loaded without images.
prefs = {"profile.managed_default_content_settings.images": 2}
head_options.add_experimental_option("prefs", prefs)


driver = webdriver.Chrome(options=head_options)

games = 0

# From here on the browser is open, so make sure it is always shut down,
# even if reading a CSV fails or the run is interrupted.
try:
    pbp_ids = pd.read_csv("../data/PlayByPlay.csv")["game_id"]

    # Keep only games whose final margin is at most 10 points.
    sched = sched.loc[np.abs(sched["Away Score"].astype(int) - sched["Home Score"].astype(int)) <= 10]

    # Skip games whose id already appears in PlayByPlay.csv.
    game_ids = sched.loc[~sched.game_id.isin(pbp_ids)]["game_id"]

    df = pd.read_csv("../data/ScheduleData.csv")

    print(len(game_ids))
    for i in game_ids:
        try:
            # Get the url, need to change it up a bit though: insert "pbp"
            # after "/boxscores" to reach the play-by-play page.
            fake_url = df.loc[df["game_id"] == i]["url"].values[0]
            splitted = fake_url.split("/boxscores")
            url = f"{splitted[0]}/boxscores/pbp{splitted[1]}"

            pbp_class = pbp_scraping(driver, i, url)
            pbp_class.scrape_table()
            new_df = pd.DataFrame.from_dict(pbp_class.pbp_dict)
            # Append without a header: the file already exists (read above).
            new_df.to_csv("../data/PlayByPlay.csv", mode='a', header=False, index=False)
            games += 1
            print(i)
        except Exception as e:
            # Best effort: report the failing game and move on to the next.
            print(f"Error on game {i}: {e}")
finally:
    driver.quit()

In [50]:
# Shut down the Selenium browser session held in `driver`.
# NOTE(review): presumably a manual cleanup cell for when the scraping cell
# is interrupted before reaching its own driver.quit() - confirm.
driver.quit()

In [65]:
# Now for playoffs

# Build the Chrome options for this run (headless browser).
head_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",
    "--disable-blink-features=AutomationControlled",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--disable-notifications",
    "--disable-infobars",
):
    head_options.add_argument(chrome_flag)
head_options.add_experimental_option("excludeSwitches", ["enable-automation"])
head_options.add_experimental_option('useAutomationExtension', False)
# Content setting 2 for images: pages are loaded without images.
prefs = {"profile.managed_default_content_settings.images": 2}
head_options.add_experimental_option("prefs", prefs)


driver = webdriver.Chrome(options=head_options)

# Initialise the counter here so this cell does not depend on an earlier
# cell having defined it (otherwise `games += 1` raises NameError after the
# rows have already been written).
games = 0

# From here on the browser is open, so make sure it is always shut down,
# even if reading the CSV fails or the run is interrupted.
try:
    df = pd.read_csv("../data/PlayoffScheduleData.csv")

    playoff_game_ids = df["game_id"]

    for i in playoff_game_ids:
        # Reset per game so the error report never shows a previous game's url.
        url = None
        try:
            # Insert "pbp" after "/boxscores" to reach the play-by-play page.
            fake_url = df.loc[df["game_id"] == i]["url"].values[0]
            splitted = fake_url.split("/boxscores")
            url = f"{splitted[0]}/boxscores/pbp{splitted[1]}"

            pbp_class = pbp_scraping(driver, i, url)
            pbp_class.scrape_table()
            new_df = pd.DataFrame.from_dict(pbp_class.pbp_dict)
            # Rows are appended without a header line.
            new_df.to_csv("../data/PlayoffPBPData.csv", mode='a', header=False, index=False)
            games += 1
            print(i)
        except Exception as e:
            # Best effort: report the failing game and move on to the next.
            print(url)
            print(f"Error on game {i}: {e}")
finally:
    driver.quit()

15410
15411
15412
15413
15414
15415
15416
15417
15418
15419
15420
15421
15422
15423
15424
15425
15426
15427
15428
15429
15430
15431
15432
15433
15434
15435
15436
15437
15438
15439
15440
15441
15442
15443
15444
15445
15446
15447
15448
15449
15450
15451
15452
15453
15454
15455
15456
15457
15458
15459
15460
15461
15462
15463
15464
15465
15466
15467
15468
15469
15470
15471
15472
15473
15474
15475
15476
15477
15478
15479
15480
15481
15482
15483
15484
15485
15486
15487
15488
15489
15490
15491
15492
15493
15494
15495
15496
15497
15498
15499
15500
15501
15502
15503
15504
15505
15506
15507
15508
15509
15510
15511
15512
15513
15514
15515
15516
15517
15518
15519
15520
15521
15522
15523
15524
15525
15526
15527
15528
15529
15530
15531
15532
15533
15534
15535
15536
15537
15538
15539
15540
15541
15542
15543
15544
15545
15546
15547
15548
15549
15550
15551
15552
15553
15554
15555
15556
15557
15558
15559
15560
15561
15562
15563
15564
15565
15566
15567
15568
15569
15570
15571
15572
15573
15574
15575
1557