In [30]:
import time
from functools import reduce

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Notebook-wide pandas settings:
#   - silence the chained-assignment (SettingWithCopy) warning everywhere
#   - never truncate the column list when a frame is displayed
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)

In [31]:
def time_to_seconds(time):
    """Convert a clock string to seconds as a float.

    ``"M:SS"`` style input is read as minutes and seconds (only the first
    two colon-separated fields contribute); input without a colon is read
    as a plain number of seconds.
    """
    if ':' not in time:
        return float(time)

    fields = list(map(float, time.split(":")))
    return fields[0]*60 + fields[1]

In [32]:
def filter_player(player_list):
    """Reduce the text tokens of one box-score row to the player's name tokens.

    Parameters
    ----------
    player_list : list of str
        Space-split text of a roster row, e.g. ``['S.', 'Curry', 'PG']``
        (assumed shape -- the page markup is not visible from this notebook).

    Returns
    -------
    list of str
        ``["Klay Thompson"]`` or ``["Tristan Thompson"]`` for the two
        hard-coded Thompsons (returned with a full name so they can be told
        apart); otherwise a new list with initials (any token containing
        ``"."``) and position tags removed. The input list is left untouched.
    """
    # Position tags are matched as whole tokens. A substring test would also
    # discard names that merely contain the letter "C" (e.g. "Curry").
    positions = {'PG', 'SG', 'SF', 'PF', 'C'}
    thompsons = {'K.': "Klay Thompson", 'T.': "Tristan Thompson"}

    if len(player_list) >= 2 and player_list[1] == "Thompson":
        if player_list[0] in thompsons:
            return [thompsons[player_list[0]]]

    # Build a new list instead of calling remove() while iterating, which
    # silently skips the element that follows every removed one.
    return [token for token in player_list
            if "." not in token and token not in positions]

def roster_from_soup(soup, home=False, away=False):
    """Extract one team's player names from a parsed box-score page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page.
    home : bool
        When true the second matching table is read; otherwise the first.
    away : bool
        Not read by the function; the first table is already the default.
        Kept so call sites can spell out which side they want.

    Returns
    -------
    list of str
        First name token kept by ``filter_player`` for every row that has
        more than one space-separated token.

    Raises
    ------
    IndexError
        If the page does not contain the requested table.
    """
    table_index = 1 if home else 0

    tables = soup.find_all('table', {'class' : 'Table Table--align-right Table--fixed Table--fixed-left'})
    table_rows = tables[table_index].find_all('tr', {'class' : 'Table__TR'})

    roster = []
    for row in table_rows:
        tokens = row.text.split(" ")
        if len(tokens) <= 1:
            # single-token rows are not treated as player rows
            continue
        names = filter_player(tokens)
        if names:
            # a row whose tokens were all filtered out is skipped rather than
            # raising IndexError
            roster.append(names[0])

    # outlier: on a roster containing Embiid / Maxey / Thybulle, "Curry" and
    # "Green" are replaced by full names. Each swap happens only if that name
    # is actually present, so a box score without one of them no longer
    # raises ValueError.
    if any(name in ('Embiid', 'Maxey', 'Thybulle') for name in roster):
        for short_name, full_name in (("Curry", "Seth Curry"), ("Green", "Danny Green")):
            if short_name in roster:
                roster.remove(short_name)
                roster.append(full_name)

    return roster

In [33]:
def rosters_from_box(driver, box_link):
    """Load a box-score page in ``driver`` and return ``(away_roster, home_roster)``."""
    driver.get(box_link)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    # The away roster is extracted first, then the home roster.
    return roster_from_soup(page, away=True), roster_from_soup(page, home=True)

In [34]:
def player_status(df, name):
    """Add a 0/1 column named ``name`` flagging the team's plays that mention the player.

    A row is flagged when ``team == action_team`` and the ``action`` text
    contains ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``team``, ``action_team`` and ``action`` columns.
        Modified in place (the new column is added to it).
    name : str
        Player name, matched as a literal substring.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    own_play = df.team == df.action_team
    # regex=False: names such as "Jr." must not be read as regular expressions
    # (where "." matches any character).
    # na=False: a missing action counts as "not mentioned" rather than NaN.
    mentioned = df.action.str.contains(name, regex=False, na=False)
    df[name] = np.where(own_play & mentioned, 1, 0)
    return df

In [35]:
def quarter_scraper(driver, q, rosters):
    """Parse the play-by-play rows currently shown in ``driver`` for one period.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser already showing the period to scrape; only ``page_source``
        is read.
    q : str
        Period label. A regulation period is its number as a string
        (``"1"`` .. ``"4"``). An overtime label contains ``"OT"``; any digits
        in it give the overtime number (``"2OT"`` -> 2) and a bare ``"OT"``
        is treated as the first overtime.
    rosters : sequence
        ``rosters[0]`` is the away roster. An action mentioning any of its
        names is attributed to ``'Away'``; every other action is attributed
        to ``'Home'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``clock, game_time, quarter, action, action_team, away, home``.
        ``game_time`` is seconds elapsed since tip-off, using 12-minute
        periods and 5-minute overtimes.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rows = soup.find_all('tr', {'class' : 'playByPlay__tableRow'})

    # Period length and the game time already elapsed when the period starts.
    if "OT" in q:
        ot_digits = ''.join(ch for ch in q if ch.isdigit())
        ot_number = int(ot_digits) if ot_digits else 1
        period_length = 5*60
        period_start = 12*60*4 + period_length*(ot_number - 1)
    else:
        period_length = 12*60
        period_start = period_length*(int(q) - 1)

    away_roster = rosters[0]
    columns = ['clock', 'game_time', 'quarter', 'action', 'action_team', 'away', 'home']
    records = []

    for row in rows:
        clock_cell = row.find('td', {'class' : 'playByPlay__time'})
        action_cell = row.find('td', {'class' : 'playByPlay__text'})
        away_cell = row.find('td', {'class' : 'playByPlay__score--away'})
        home_cell = row.find('td', {'class' : 'playByPlay__score--home'})
        if None in (clock_cell, action_cell, away_cell, home_cell):
            # a row lacking any of the four cells is skipped instead of
            # failing on None.text
            continue

        clock = clock_cell.text
        action = action_cell.text

        # the clock counts down, so elapsed time in the period is length - clock
        game_time = period_start + (period_length - time_to_seconds(clock))

        action_team = 'Away' if any(name in action for name in away_roster) else 'Home'

        records.append({
            'clock': clock,
            'game_time': game_time,
            'quarter': q,
            'action': action,
            'action_team': action_team,
            'away': away_cell.text,
            'home': home_cell.text,
        })

    # explicit columns keep the schema stable even when no rows were found
    return pd.DataFrame(records, columns=columns)

In [36]:
def ot_scraper(d, df, rosters):
    """Append play-by-play rows for any overtime periods to ``df``.

    Each candidate overtime tab is clicked in turn; a tab that cannot be
    found or clicked is skipped.

    Parameters
    ----------
    d : selenium WebDriver
        Browser showing the play-by-play page.
    df : pandas.DataFrame
        Rows scraped so far.
    rosters : sequence
        Passed through to ``quarter_scraper``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no overtime tab was available, otherwise a new
        frame with the overtime rows concatenated after ``df``.
    """
    ot_buttons = [
        # NOTE(review): absolute path, presumably the first overtime tab
        # (fifth item of the period nav) -- confirm against the live page.
        "//body/div[@id='espnfitt']/div[@id='DataWrapper']/div[@id='fitt-analytics']/div[1]/div[4]/div[2]/div[1]/div[5]/div[1]/div[1]/section[2]/div[1]/nav[1]/ul[1]/li[5]/button[1]",
        "//button[contains(text(),'2 OT')]",
        "//button[contains(text(),'3 OT')]",
        "//button[contains(text(),'4 OT')]",
    ]

    ot_frames = []
    for ot_number, path in enumerate(ot_buttons, start=1):
        try:
            d.find_element(By.XPATH, path).click()
        except WebDriverException:
            # no usable tab for this overtime period
            continue
        # The label carries the overtime number ("1OT", "2OT", ...) so that
        # quarter_scraper can compute game_time; a bare "OT" cannot be parsed.
        ot_frames.append(quarter_scraper(d, str(ot_number) + "OT", rosters))

    if not ot_frames:
        return df
    return pd.concat([df] + ot_frames)

In [37]:
def game_scraper(d, url, rosters, home_away):
    """Scrape one game's play-by-play and flag the tracked team's player involvement.

    Parameters
    ----------
    d : selenium WebDriver
        Browser used to load ``url`` and click through the period tabs.
    url : str
        Play-by-play page of the game.
    rosters : sequence
        ``(away_roster, home_roster)``.
    home_away : str
        ``'vs'`` means the tracked team is the home side; any other value
        means it is the away side.

    Returns
    -------
    pandas.DataFrame
        All scraped rows plus a ``team`` column (``'Home'`` / ``'Away'``) and
        one 0/1 column per player on the tracked team's roster.
    """
    d.get(url)
    time.sleep(1)  # fixed pause after navigation before the tabs are clicked

    quarter_frames = []
    for q in ['1st', '2nd', '3rd', '4th']:
        d.find_element(By.XPATH, "//button[contains(text(),'" + q + "')]").click()
        quarter_frames.append(quarter_scraper(d, q[0], rosters))
    df = pd.concat(quarter_frames)

    # pd.concat never extends a frame in place, so overtime rows only exist in
    # what ot_scraper returns. A None return leaves the regulation rows as-is.
    with_overtime = ot_scraper(d, df, rosters)
    if with_overtime is not None:
        df = with_overtime

    if home_away == 'vs':
        team, team_roster = 'Home', rosters[1]
    else:
        team, team_roster = 'Away', rosters[0]

    df['team'] = team
    for player in team_roster:
        df = player_status(df, player)

    return df

In [38]:
def link_scraper(driver, link, home_away):
    """Scrape one game given its ``/game/`` link.

    The box-score and play-by-play URLs are derived from ``link`` by swapping
    the ``/game/`` path segment. Rosters are read from the box score first and
    then handed to ``game_scraper`` together with the play-by-play URL.
    """
    box_link = link.replace("/game/", "/boxscore/")
    pbp_link = link.replace("/game/", "/playbyplay/")

    rosters = rosters_from_box(driver, box_link)
    return game_scraper(driver, pbp_link, rosters, home_away)

In [39]:
def season_scraper(url):
    """Scrape every linked game on a team schedule page.

    Parameters
    ----------
    url : str
        Schedule page URL.

    Returns
    -------
    list of pandas.DataFrame
        One frame per game, as returned by ``link_scraper``, in page order.
    """
    # NOTE(review): recent Selenium 4 releases expect the driver path to be
    # wrapped in a Service object rather than passed positionally -- confirm
    # against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(url)
        time.sleep(2)  # fixed pause after navigation before reading the page
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # (game_link, home/away status) for every schedule row after the first
        link_status = []
        for g in soup.find_all('tr', {'class' : 'Table__TR'})[1:]:
            if "Postponed" in g.text:
                continue
            link_span = g.find('span', {'class' : 'ml4'})
            status_span = g.find('span', {'class' : 'pr2'})
            anchor = link_span.find('a') if link_span is not None else None
            if anchor is None or status_span is None or not anchor.get('href'):
                # a row without a game link or status cannot be scraped
                continue
            link_status.append((anchor['href'], status_span.text))

        return [link_scraper(driver, game_link, home_away)
                for game_link, home_away in link_status]
    finally:
        # always close the browser, even when a scrape fails part-way
        driver.quit()

In [40]:
def game_index(df, index):
    """Tag every row of ``df`` with a game number.

    Writes ``index`` into the ``'Game'`` column of ``df`` in place (creating
    or overwriting it) and returns the same frame.
    """
    column = 'Game'
    df[column] = index
    return df

In [41]:
# Scrape that writes warriors.csv; left commented out -- the next cell loads the saved CSV.
# gsw = season_scraper("https://www.espn.com/nba/team/schedule/_/name/gs/seasontype/2")

# gsw = [game_index(g, i+1) for i, g in enumerate(gsw)]
# warriors = reduce(lambda left, right: pd.merge(left, right, how='outer'), gsw)
# warriors = warriors.fillna(0)
# warriors.rename(columns={'Klay Thompson':'Thompson'}, inplace=True)

# warriors.to_csv("warriors.csv")

In [45]:
# Load the saved play-by-play table, then drop the first CSV column
# (presumably the index that to_csv wrote out -- confirm).
warriors = pd.read_csv("warriors.csv")
warriors = warriors.iloc[:, 1:]

In [43]:
# Assisted baskets by the tracked team: the action mentions "assists" and was
# attributed to the same side as `team`.
# regex=False / na=False: match the word literally and treat missing actions
# as non-matches.
is_team_assist = (
    warriors.action.str.contains("assists", regex=False, na=False)
    & (warriors.team == warriors.action_team)
)

# .copy() makes this an independent frame, so the column assignments below do
# not rely on the chained-assignment warning being switched off.
warriors_ast = warriors.loc[is_team_assist].copy()

# Assister: text after the first "(" up to " assists".
# Assisted: text before " makes".
# The .str accessors yield NaN instead of raising when an action does not
# follow that pattern.
warriors_ast['Assister'] = warriors_ast.action.str.split("(").str[1].str.split(" assists").str[0]
warriors_ast['Assisted'] = warriors_ast.action.str.split(" makes").str[0]
warriors_ast['Duo'] = warriors_ast.Assister + " to " + warriors_ast.Assisted

# Number of assisted baskets per passer -> scorer pair, most frequent first.
duos = warriors_ast.Duo.value_counts().reset_index()
duos.columns = ['Duo', 'Count']

duos.head(5)

Unnamed: 0,Duo,Count
0,Draymond Green to Andrew Wiggins,68
1,Jordan Poole to Andrew Wiggins,65
2,Draymond Green to Stephen Curry,63
3,Draymond Green to Jordan Poole,55
4,Stephen Curry to Kevon Looney,54
