In [288]:
import requests
from lxml import html
from dateutil import parser
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings.
# Every option is spelled with its full 'display.' prefix: set_option treats a
# short name as a search pattern, and a bare 'precision' is ambiguous on pandas
# versions that also define 'styler.format.precision'.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 2)
pd.set_option('display.expand_frame_repr', True)

In [298]:
# Full team name (as scraped from the schedule table) -> three-letter
# abbreviation. The abbreviation is lower-cased further down to build the
# id of each team's box-score table.
team_name_abbrev = dict([
    ('Atlanta Hawks', 'ATL'),
    ('Boston Celtics', 'BOS'),
    ('Brooklyn Nets', 'BRK'),
    ('Charlotte Hornets', 'CHO'),
    ('Cleveland Cavaliers', 'CLE'),
    ('Chicago Bulls', 'CHI'),
    ('Dallas Mavericks', 'DAL'),
    ('Denver Nuggets', 'DEN'),
    ('Detroit Pistons', 'DET'),
    ('Golden State Warriors', 'GSW'),
    ('Houston Rockets', 'HOU'),
    ('Indiana Pacers', 'IND'),
    ('Los Angeles Clippers', 'LAC'),
    ('Los Angeles Lakers', 'LAL'),
    ('Memphis Grizzlies', 'MEM'),
    ('Miami Heat', 'MIA'),
    ('Milwaukee Bucks', 'MIL'),
    ('Minnesota Timberwolves', 'MIN'),
    ('New Orleans Pelicans', 'NOP'),
    ('New York Knicks', 'NYK'),
    ('Oklahoma City Thunder', 'OKC'),
    ('Orlando Magic', 'ORL'),
    ('Philadelphia 76ers', 'PHI'),
    ('Phoenix Suns', 'PHO'),
    ('Portland Trail Blazers', 'POR'),
    ('Sacramento Kings', 'SAC'),
    ('San Antonio Spurs', 'SAS'),
    ('Toronto Raptors', 'TOR'),
    ('Utah Jazz', 'UTA'),
    ('Washington Wizards', 'WAS'),
])

In [234]:
def get_monthly_schedule(year, month):
    """
    Scrape one month of the NBA schedule from basketball-reference.com.

    year:  value substituted into the 'NBA_{year}' part of the page URL
    month: lower-case month name substituted into the page URL (e.g. 'april')

    Returns a DataFrame with one row per game and the columns DATE
    ('YYYY-MM-DD'), ROAD_TEAM, ROAD_TM, ROAD_PTS, HOME_TEAM, HOME_TM, HOME_PTS
    and BOX_SCORE_URL. Cells the page leaves empty (for example the score of a
    game without a result) are missing values instead of breaking the frame.

    Raises requests.HTTPError when the server answers with an error status.
    """
    BBALLREF = 'https://www.basketball-reference.com'
    request_timeout = 30  # seconds; without one a stalled connection hangs the cell

    def first(node, path):
        # First XPath match relative to `node`, or None when nothing matches.
        matches = node.xpath(path)
        return matches[0] if matches else None

    url = f'{BBALLREF}/leagues/NBA_{year}_games-{month}.html'
    page = requests.get(url, timeout=request_timeout)
    page.raise_for_status()  # do not try to parse an error page as a schedule
    tree = html.fromstring(page.content)

    # Read the table row by row so every value stays attached to its own game.
    # Only rows whose date cell holds a link are games; header rows have none.
    games = []
    for row in tree.xpath('//tr[*[@data-stat="date_game"]/a/text()]'):
        game_date = parser.parse(first(row, '*[@data-stat="date_game"]/a/text()'))
        box_score = first(row, '*[@data-stat="box_score_text"]/a/@href')
        games.append({
            'DATE':           game_date.strftime('%Y-%m-%d'),
            'ROAD_TEAM':      first(row, '*[@data-stat="visitor_team_name"]/a/text()'),
            'ROAD_PTS':       first(row, '*[@data-stat="visitor_pts"]/text()'),
            'HOME_TEAM':      first(row, '*[@data-stat="home_team_name"]/a/text()'),
            'HOME_PTS':       first(row, '*[@data-stat="home_pts"]/text()'),
            # The page links are site-relative; make them absolute.
            'BOX_SCORE_URL':  BBALLREF + box_score if box_score else None,
        })

    # Passing the column list keeps the frame well-formed even with zero games.
    sched = pd.DataFrame(games, columns=['DATE', 'ROAD_TEAM', 'ROAD_PTS',
                                         'HOME_TEAM', 'HOME_PTS', 'BOX_SCORE_URL'])
    sched['ROAD_TM'] = sched['ROAD_TEAM'].map(team_name_abbrev)
    sched['HOME_TM'] = sched['HOME_TEAM'].map(team_name_abbrev)

    return sched[['DATE', 'ROAD_TEAM', 'ROAD_TM', 'ROAD_PTS',
                  'HOME_TEAM', 'HOME_TM', 'HOME_PTS', 'BOX_SCORE_URL']]

In [235]:
def get_daily_schedule(date):
    """
    Return the games played on a single day.

    date: a date string, normally 'YYYY-MM-DD' (any form dateutil can parse
          is accepted and normalised before filtering)

    Returns the rows of that month's schedule whose DATE equals the given day,
    re-indexed from 0.
    """

    # Get month and year from date
    parsed_date = parser.parse(date)
    month = parsed_date.strftime('%B').lower()
    year = parsed_date.year

    # October-December games are looked up on the following year's page.
    # (The previous check compared the month against a list holding the single
    # string 'october, november, december', so it could never match.)
    if parsed_date.month >= 10:
        year += 1

    sched = get_monthly_schedule(str(year), month)

    # Compare against the same 'YYYY-MM-DD' form the DATE column is stored in.
    day = parsed_date.strftime('%Y-%m-%d')
    return sched[sched['DATE'] == day].reset_index(drop=True)
    

In [296]:
class BoxScore:
    """
    Extracts one team's box-score table from a parsed game page.

    tree: parsed page exposing .xpath() (e.g. the result of html.fromstring)

    Subclasses only choose `table_kind`, the suffix of the table's id
    attribute ('box_<team>_<table_kind>'); the extraction itself is shared.
    """

    table_kind = None

    def __init__(self, tree):
        self.tree = tree
        self.table = None  # XPath of the team's table; set by get()

    def get(self, team_name):
        """
        Build the box score of one team as a DataFrame.

        team_name: team identifier exactly as it appears in the table id

        Returns a DataFrame with a PLAYER_NAME column (players with stats, then
        'Team Totals') followed by one column per table header. Values are the
        raw cell strings, except MP which is converted to minutes as a number.

        Raises IndexError when the page holds no such table.
        """
        if self.table_kind is None:
            raise NotImplementedError('use BasicBoxScore or AdvancedBoxScore')
        self.table = f'//*[@id="box_{team_name}_{self.table_kind}"]'

        active_players, _inactive_players = self._get_players()

        header = self.table + '/thead/tr[2]/th'
        col_names = self.tree.xpath(header + '/text()')
        data_stats = self.tree.xpath(header + '/@data-stat')
        if not col_names or not data_stats:
            # Same exception type the unguarded pop(0) used to raise, but
            # with a message that says what is actually missing.
            raise IndexError(f'no box score table found at {self.table}')

        box_score = {'PLAYER_NAME': active_players}
        # [1:] skips the first header cell, which is the player-name column.
        for col, stat in zip(col_names[1:], data_stats[1:]):
            box_score[col] = self._get_col(stat)

        # Empty cells come back as None; store them as NaN.
        box_score = pd.DataFrame(box_score).fillna(value=np.nan)
        box_score['MP'] = box_score['MP'].apply(self._format_time)

        return box_score

    def _get_col(self, col_name):
        """Return the text of every <td> in the table carrying this data-stat."""
        cells = self.tree.xpath(self.table + f'//td[@data-stat="{col_name}"]')
        return [cell.text for cell in cells]

    def _get_players(self):
        """
        Split the listed players into (active_players, inactive_players).

        A body row with exactly one text-bearing <td> is treated as a player
        without stats. 'Team Totals' is appended to the active list so it lines
        up with the totals row picked up by _get_col.
        """
        inactive_players = []
        for row in self.tree.xpath(self.table + '/tbody/tr'):
            player = row.xpath('th/a/text()')
            stats = row.xpath('td/text()')
            # `player` is empty for rows without a player link; skip those.
            if len(stats) == 1 and player:
                inactive_players.append(player[0])

        active_players = self.tree.xpath(self.table + '//th[@data-stat="player"]/a/text()')
        for player in inactive_players:
            active_players.remove(player)
        active_players.append('Team Totals')

        return active_players, inactive_players

    @staticmethod
    def _format_time(MP):
        """
        Convert minutes played to a number.

        'MM:SS' becomes fractional minutes (float); a plain number string such
        as '240' becomes an int; a missing value stays NaN.
        """
        if pd.isna(MP):
            return np.nan
        if ':' in MP:
            (m, s) = MP.split(':')
            return int(m) + int(s) / 60
        return int(MP)


class BasicBoxScore(BoxScore):
    """Box score read from the table with id 'box_<team>_basic'."""

    table_kind = 'basic'


class AdvancedBoxScore(BoxScore):
    """Box score read from the table with id 'box_<team>_advanced'."""

    table_kind = 'advanced'

In [258]:
def get_box_scores(date, team_name, url):
    """
    Download one game's box-score page and extract one team's tables.

    date:      not used by this function; kept so existing calls keep working
    team_name: team abbreviation (e.g. 'CHO'); lower-cased to build table ids
    url:       address of the game's box-score page

    Returns (basic, adv): the team's basic and advanced box scores as
    DataFrames. `basic` additionally gets the 'USG%' column copied from `adv`.

    Raises requests.HTTPError when the server answers with an error status.
    """
    request_timeout = 30  # seconds; without one a stalled connection hangs the cell

    page = requests.get(url, timeout=request_timeout)
    page.raise_for_status()  # do not try to parse an error page as a box score
    tree = html.fromstring(page.content)

    team_id = team_name.lower()
    basic = BasicBoxScore(tree).get(team_id)
    adv = AdvancedBoxScore(tree).get(team_id)

    # The assignment aligns on the frames' default integer index, so this
    # assumes both tables list the players in the same order.
    basic['USG%'] = adv['USG%']

    return basic, adv

In [254]:
# CSV file that get_daily_box_scores appends basic box-score rows to.
basic_box_score = './basic_box_score.csv'
# Path for advanced box scores. NOTE(review): no code visible in this notebook
# reads or writes this file - confirm whether it is still needed.
adv_box_score = './adv_box_score.csv'

In [265]:
def get_daily_box_scores(sched):
    """
    Scrape the basic box score of every game in `sched` and append the rows
    to the CSV file named by the module-level `basic_box_score`.

    sched: DataFrame with DATE, ROAD_TM, HOME_TM and BOX_SCORE_URL columns,
           one row per game

    For each game the road team's rows are written first, then the home
    team's, without header or index. Prints one progress line per game.
    Returns None.

    NOTE: get_box_scores downloads the page on every call, so each game's
    page is fetched twice (once per team).
    """
    # Output column order; identical for every game, so built once.
    column_order = ['DATE', 'PLAYER_NAME', 'OWN_TEAM', 'OPP_TEAM', 'VENUE', 'MP',
                    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
                    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-', 'USG%']

    for _, game in sched.iterrows():
        game_date = game['DATE']
        road_team = game['ROAD_TM']
        home_team = game['HOME_TM']
        box_score_url = game['BOX_SCORE_URL']

        road_basic, _road_adv = get_box_scores(game_date, road_team, box_score_url)
        home_basic, _home_adv = get_box_scores(game_date, home_team, box_score_url)

        # Tag each team's rows with the game context: (frame, own, opponent, venue).
        sides = (
            (road_basic, road_team, home_team, 'R'),
            (home_basic, home_team, road_team, 'H'),
        )
        for frame, own_team, opp_team, venue in sides:
            frame['DATE'] = game_date
            frame['OWN_TEAM'] = own_team
            frame['OPP_TEAM'] = opp_team
            frame['VENUE'] = venue

        basic = pd.concat([road_basic, home_basic])[column_order]

        # Let pandas open the file itself: it then controls newline handling
        # and encoding, which a plain open(path, 'a') leaves to the platform
        # (doubled line endings on Windows, locale-dependent encoding).
        basic.to_csv(basic_box_score, mode='a', header=False, index=False)

        print(f'Grabbed {road_team} vs {home_team} box score!')
    print('All done!')

In [304]:
# Fetch the games played on one day; the bare name on the last line makes the
# notebook display the resulting frame.
sched = get_daily_schedule('2018-04-03')
sched

Unnamed: 0,DATE,ROAD_TEAM,ROAD_TM,ROAD_PTS,HOME_TEAM,HOME_TM,HOME_PTS,BOX_SCORE_URL
0,2018-04-03,Charlotte Hornets,CHO,114,Chicago Bulls,CHI,120,https://www.basketball-reference.com/boxscores...
1,2018-04-03,Toronto Raptors,TOR,106,Cleveland Cavaliers,CLE,112,https://www.basketball-reference.com/boxscores...
2,2018-04-03,Portland Trail Blazers,POR,109,Dallas Mavericks,DAL,115,https://www.basketball-reference.com/boxscores...
3,2018-04-03,Indiana Pacers,IND,104,Denver Nuggets,DEN,107,https://www.basketball-reference.com/boxscores...
4,2018-04-03,Washington Wizards,WAS,104,Houston Rockets,HOU,120,https://www.basketball-reference.com/boxscores...
5,2018-04-03,San Antonio Spurs,SAS,110,Los Angeles Clippers,LAC,113,https://www.basketball-reference.com/boxscores...
6,2018-04-03,Atlanta Hawks,ATL,98,Miami Heat,MIA,101,https://www.basketball-reference.com/boxscores...
7,2018-04-03,Boston Celtics,BOS,102,Milwaukee Bucks,MIL,106,https://www.basketball-reference.com/boxscores...
8,2018-04-03,Orlando Magic,ORL,97,New York Knicks,NYK,73,https://www.basketball-reference.com/boxscores...
9,2018-04-03,Golden State Warriors,GSW,111,Oklahoma City Thunder,OKC,107,https://www.basketball-reference.com/boxscores...


In [305]:
# Scrape and append the box score of every game in `sched`; one progress line
# is printed per game.
# NOTE(review): the saved output of this cell lists 13 games while the schedule
# table saved in the previous cell shows 10 rows - the two outputs appear to
# come from different runs; re-run the notebook top to bottom to confirm.
get_daily_box_scores(sched)

Grabbed CHO vs CHI box score!
Grabbed TOR vs CLE box score!
Grabbed POR vs DAL box score!
Grabbed IND vs DEN box score!
Grabbed WAS vs HOU box score!
Grabbed SAS vs LAC box score!
Grabbed ATL vs MIA box score!
Grabbed BOS vs MIL box score!
Grabbed ORL vs NYK box score!
Grabbed GSW vs OKC box score!
Grabbed BRK vs PHI box score!
Grabbed SAC vs PHO box score!
Grabbed LAL vs UTA box score!
All done!
