In [232]:
import requests
from lxml import html
from dateutil import parser
import numpy as np
import pandas as pd

# Notebook-wide pandas display configuration.
# Option names are spelled out in full: the bare key 'precision' is matched as
# a pattern and is ambiguous (or removed) in recent pandas releases, where it
# raises OptionError instead of setting the float display precision.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 2)

In [233]:
# Maps a team's full name (as it appears in the schedule table) to the
# three-letter abbreviation used to build the box-score table ids.
team_name_abbrev = {
    'Atlanta Hawks':           'ATL',
    'Boston Celtics':          'BOS',
    'Brooklyn Nets':           'BRK',
    'Charlotte Hornets':       'CHO',
    'Chicago Bulls':           'CHI',
    'Cleveland Cavaliers':     'CLE',
    'Dallas Mavericks':        'DAL',
    'Denver Nuggets':          'DEN',
    'Detroit Pistons':         'DET',
    'Golden State Warriors':   'GSW',
    'Houston Rockets':         'HOU',
    'Indiana Pacers':          'IND',
    'Los Angeles Clippers':    'LAC',
    'Los Angeles Lakers':      'LAL',
    'Memphis Grizzlies':       'MEM',
    'Miami Heat':              'MIA',
    'Milwaukee Bucks':         'MIL',
    'Minnesota Timberwolves':  'MIN',
    'New Orleans Pelicans':    'NOP',
    'New York Knicks':         'NYK',
    'Oklahoma City Thunder':   'OKC',
    'Orlando Magic':           'ORL',
    'Philadelphia 76ers':      'PHI',
    'Phoenix Suns':            'PHO',
    # The team name is spelled "Trail Blazers" (two words); without this key
    # every Portland game maps to NaN.
    'Portland Trail Blazers':  'POR',
    # Legacy one-word spelling kept so existing lookups keep working.
    'Portland Trailblazers':   'POR',
    'Sacramento Kings':        'SAC',
    'San Antonio Spurs':       'SAS',
    'Toronto Raptors':         'TOR',
    'Utah Jazz':               'UTA',
    'Washington Wizards':      'WAS',
}

In [234]:
def get_monthly_schedule(year, month):
    """Scrape one month of the NBA schedule from basketball-reference.com.

    Parameters
    ----------
    year : str or int
        Season identifier used in the URL (``NBA_{year}_games-...``).
    month : str
        Lower-case month name used in the URL (e.g. ``'january'``).

    Returns
    -------
    pandas.DataFrame
        One row per game with columns DATE ('YYYY-MM-DD'), ROAD_TEAM,
        ROAD_TM, ROAD_PTS, HOME_TEAM, HOME_TM, HOME_PTS and BOX_SCORE_URL.
        Points are kept as the scraped text. Fields missing from a row
        (e.g. the score / box-score link of a game without one) are None/NaN.

    Raises
    ------
    requests.HTTPError
        If the schedule page responds with an error status.
    """
    BBALLREF = 'https://www.basketball-reference.com'
    url = f'{BBALLREF}/leagues/NBA_{year}_games-{month}.html'

    page = requests.get(url, timeout=30)
    # Fail loudly on 404/429 instead of parsing an error page into an empty
    # or misaligned table.
    page.raise_for_status()
    tree = html.fromstring(page.content)

    def first_or_none(row, expr):
        """Return the first XPath match relative to `row`, or None."""
        found = row.xpath(expr)
        return found[0] if found else None

    # Parse row by row so every field of a game stays aligned even when a
    # cell is missing. Only rows carrying a linked game date are games;
    # header rows have no such link and are skipped.
    records = []
    for row in tree.xpath('//tr[*[@data-stat="date_game"]/a]'):
        records.append({
            'DATE':          first_or_none(row, '*[@data-stat="date_game"]/a/text()'),
            'ROAD_TEAM':     first_or_none(row, '*[@data-stat="visitor_team_name"]/a/text()'),
            'ROAD_PTS':      first_or_none(row, '*[@data-stat="visitor_pts"]/text()'),
            'HOME_TEAM':     first_or_none(row, '*[@data-stat="home_team_name"]/a/text()'),
            'HOME_PTS':      first_or_none(row, '*[@data-stat="home_pts"]/text()'),
            'BOX_SCORE_URL': first_or_none(row, '*[@data-stat="box_score_text"]/a/@href'),
        })

    sched = pd.DataFrame(records, columns=['DATE', 'ROAD_TEAM', 'ROAD_PTS',
                                           'HOME_TEAM', 'HOME_PTS', 'BOX_SCORE_URL'])
    sched['ROAD_TM'] = sched['ROAD_TEAM'].map(team_name_abbrev)
    sched['HOME_TM'] = sched['HOME_TEAM'].map(team_name_abbrev)
    sched = sched[['DATE', 'ROAD_TEAM', 'ROAD_TM', 'ROAD_PTS',
                   'HOME_TEAM', 'HOME_TM', 'HOME_PTS', 'BOX_SCORE_URL']]

    # The scraped hrefs are site-relative; leave missing links untouched.
    sched['BOX_SCORE_URL'] = sched['BOX_SCORE_URL'].apply(
        lambda href: BBALLREF + href if isinstance(href, str) else href
    )

    def format_date(date):
        """Normalise the scraped date text to 'YYYY-MM-DD'."""
        return parser.parse(date).strftime('%Y-%m-%d')

    sched['DATE'] = sched['DATE'].apply(format_date)

    return sched

In [235]:
def get_daily_schedule(date):
    """Return the schedule rows for a single day.

    Parameters
    ----------
    date : str
        The day to look up, with format 'YYYY-MM-DD'.

    Returns
    -------
    pandas.DataFrame
        The rows of that month's schedule whose DATE equals `date`,
        re-indexed from 0 (empty if no game matches).
    """
    # Get month and year from date
    parsed_date = parser.parse(date)
    month = parsed_date.strftime('%B').lower()
    year = parsed_date.strftime('%Y')

    # Schedule pages are keyed by the year the season ends in, so games played
    # in October-December belong to the following year's page. (The previous
    # check compared against a single 'october, november, december' string
    # and therefore never matched.)
    if parsed_date.month in (10, 11, 12):
        year = str(int(year) + 1)  # Increment year

    sched = get_monthly_schedule(year, month)

    # Compare against the normalised form so the filter matches the
    # 'YYYY-MM-DD' strings stored in the DATE column.
    target = parsed_date.strftime('%Y-%m-%d')
    return sched[sched['DATE'] == target].reset_index(drop=True)
    

In [238]:
class BoxScore:
    """Base scraper for one team's box-score table inside a parsed game page.

    Subclasses set ``self.table`` (an XPath prefix selecting the table) in
    their ``get`` method and then use the helpers defined here.
    """

    # Text of a row's first cell that marks a listed player as not having
    # played. NOTE(review): only 'Did Not Play' was handled originally; the
    # other reasons are assumed to be rendered the same way - confirm against
    # live pages.
    _INACTIVE_REASONS = frozenset({
        'Did Not Play',
        'Did Not Dress',
        'Not With Team',
        'Player Suspended',
    })

    def __init__(self, tree):
        """Store the parsed page; `tree` must expose an ``xpath`` method."""
        self.tree = tree
        self.table = None  # set by the subclass's get() before scraping

    def _get_col(self, col_name):
        """Return the text of every ``td`` with the given data-stat in the table."""
        subtree = self.tree.xpath(self.table + f'//td[@data-stat="{col_name}"]')
        return [el.text for el in subtree]

    def _get_players(self):
        """Split the table's listed players into those who played and those who did not.

        Returns
        -------
        (list of str, list of str)
            ``(active_players, inactive_players)`` in table order;
            ``'Team Totals'`` is appended to the active list to label the
            totals row.
        """
        active_players = []
        inactive_players = []
        for row in self.tree.xpath(self.table + '/tbody/tr'):
            player = row.xpath('th/a/text()')
            if not player:
                # Rows without a linked player name carry no player.
                continue
            stats = row.xpath('td/text()')
            if stats and stats[0] in self._INACTIVE_REASONS:
                inactive_players.append(player[0])
            else:
                active_players.append(player[0])
        active_players.append('Team Totals')

        return active_players, inactive_players

    @staticmethod
    def _format_time(MP):
        """Convert minutes played ('MM:SS' or 'MMM') to a number of minutes.

        Missing values (None) become NaN and numeric input is returned
        unchanged, so the function is safe to apply to a column that has
        already been NaN-filled.
        """
        if MP is None:
            return float('nan')
        if isinstance(MP, (int, float)):
            return MP
        if len(MP.split(':')) > 1:
            (m, s) = MP.split(':')
            return int(m) + int(s) / 60
        else:
            return int(MP)

    def _scrape_table(self):
        """Build the raw DataFrame for ``self.table``.

        The result has a PLAYER_NAME column followed by one column per stat
        header (still text, except MP which is converted to minutes).
        """
        active_players, _ = self._get_players()

        header = self.table + '/thead/tr[2]/th'
        # The first header cell is the player-name column; drop it.
        col_names = self.tree.xpath(header + '/text()')[1:]
        data_stats = self.tree.xpath(header + '/@data-stat')[1:]

        box_score = {'PLAYER_NAME': active_players}
        for col, stat in zip(col_names, data_stats):
            box_score[col] = self._get_col(stat)

        box_score = pd.DataFrame(box_score).fillna(value=np.nan)
        box_score['MP'] = box_score['MP'].apply(self._format_time)
        return box_score


class BasicBoxScore(BoxScore):
    """Scraper for a team's basic box-score table."""

    def get(self, team_name):
        """Return the basic box score for `team_name` as a DataFrame.

        `team_name` is interpolated into the table id
        ``box_{team_name}_basic``. MP and the percentage columns are cast to
        float, '+/-' is left as scraped, and all remaining stat columns are
        cast to int.
        """
        self.table = f'//*[@id="box_{team_name}_basic"]'

        box_score = self._scrape_table()

        float_cols = ['MP', 'FG%', '3P%', 'FT%']
        untouched_cols = ['PLAYER_NAME', '+/-']

        for col in box_score.columns:
            if col in float_cols:
                box_score[col] = box_score[col].astype('float')
            elif col not in untouched_cols:
                box_score[col] = box_score[col].astype('int')

        return box_score


class AdvancedBoxScore(BoxScore):
    """Scraper for a team's advanced box-score table."""

    def get(self, team_name):
        """Return the advanced box score for `team_name` as a DataFrame.

        `team_name` is interpolated into the table id
        ``box_{team_name}_advanced``. Every column except PLAYER_NAME is cast
        to float.
        """
        self.table = f'//*[@id="box_{team_name}_advanced"]'

        box_score = self._scrape_table()

        for col in box_score.columns:
            if col != 'PLAYER_NAME':
                box_score[col] = box_score[col].astype('float')

        return box_score

In [242]:
def get_box_scores(date, team_name, url):
    """Download a game page and return one team's basic and advanced box scores.

    Parameters
    ----------
    date : str
        Not used by this function; kept so existing callers keep working.
    team_name : str
        Team identifier interpolated into the box-score table ids.
    url : str
        Address of the game's box-score page.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(basic, adv)``; the advanced table's 'USG%' column is also copied
        onto `basic` (aligned on the row index).

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    """
    page = requests.get(url, timeout=30)
    # Surface HTTP errors here rather than as obscure parsing failures later.
    page.raise_for_status()
    tree = html.fromstring(page.content)

    basic = BasicBoxScore(tree).get(team_name)
    adv = AdvancedBoxScore(tree).get(team_name)

    basic['USG%'] = adv['USG%']

    return basic, adv

In [246]:
def get_daily_box_scores(sched):
    """Print the road and home basic box scores for every game in `sched`.

    `sched` is a DataFrame with DATE, ROAD_TM, HOME_TM and BOX_SCORE_URL
    columns. Nothing is returned; each game's two basic tables are printed.
    """
    for _, game in sched.iterrows():
        played_on = game['DATE']
        # Table ids are looked up with lower-cased abbreviations.
        visitors = game['ROAD_TM'].lower()
        hosts = game['HOME_TM'].lower()
        page_url = game['BOX_SCORE_URL']

        # Only the basic tables are printed; the advanced ones are discarded.
        road_basic, _ = get_box_scores(played_on, visitors, page_url)
        home_basic, _ = get_box_scores(played_on, hosts, page_url)

        print(road_basic, '\n', home_basic)

In [247]:
# Look up the games dated 2018-01-13 and print the resulting schedule table.
sched = get_daily_schedule('2018-01-13')
print(sched)

         DATE              ROAD_TEAM ROAD_TM ROAD_PTS             HOME_TEAM  \
0  2018-01-13        Detroit Pistons     DET      105         Chicago Bulls   
1  2018-01-13  Oklahoma City Thunder     OKC      101     Charlotte Hornets   
2  2018-01-13     Los Angeles Lakers     LAL      107      Dallas Mavericks   
3  2018-01-13       Sacramento Kings     SAC      105  Los Angeles Clippers   
4  2018-01-13         Denver Nuggets     DEN       80     San Antonio Spurs   
5  2018-01-13  Golden State Warriors     GSW      127       Toronto Raptors   
6  2018-01-13          Brooklyn Nets     BRK      113    Washington Wizards   

  HOME_TM HOME_PTS                                      BOX_SCORE_URL  
0     CHI      107  https://www.basketball-reference.com/boxscores...  
1     CHO       91  https://www.basketball-reference.com/boxscores...  
2     DAL      101  https://www.basketball-reference.com/boxscores...  
3     LAC      126  https://www.basketball-reference.com/boxscores...  
4     S

In [248]:
# Download and print both teams' basic box scores for every game in `sched`.
get_daily_box_scores(sched)

        PLAYER_NAME      MP  FG  FGA   FG%  3P  3PA   3P%  FT  FTA   FT%  ORB  \
0     Avery Bradley   37.98  10   19  0.53   6   10  0.60   0    0   NaN    0   
1    Reggie Bullock   35.37   3    9  0.33   1    1  1.00   0    0   NaN    1   
2    Andre Drummond   33.52   8   13  0.61   0    0   NaN   5    6  0.83    8   
3     Tobias Harris   30.15   3   14  0.21   0    6  0.00   1    2  0.50    0   
4         Ish Smith   26.78   5   10  0.50   0    1  0.00   2    2  1.00    0   
5      Luke Kennard   22.65   4    8  0.50   1    4  0.25   0    0   NaN    1   
6     Dwight Buycks   21.22   4   12  0.33   1    3  0.33   1    1  1.00    1   
7  Anthony Tolliver   18.32   3    5  0.60   3    4  0.75   1    2  0.50    1   
8     Eric Moreland   14.02   1    2  0.50   0    0   NaN   1    1  1.00    1   
9       Team Totals  240.00  41   92  0.45  12   29  0.41  11   14  0.79   13   

   DRB  TRB  AST  STL  BLK  TOV  PF  PTS  +/-   USG%  
0    3    3    1    2    1    4   3   26   -4   26.4 

            PLAYER_NAME      MP  FG  FGA   FG%  3P  3PA   3P%  FT  FTA   FT%  \
0          De'Aaron Fox   35.35   6   17  0.35   1    4  0.25   4    5  0.80   
1   Willie Cauley-Stein   32.52  10   16  0.62   1    1  1.00   2    4  0.50   
2        Garrett Temple   26.18   2    5  0.40   0    2  0.00   0    0   NaN   
3           George Hill   19.43   1    4  0.25   0    2  0.00   0    0   NaN   
4         Zach Randolph   19.27   3   10  0.30   0    1  0.00   2    2  1.00   
5     Bogdan Bogdanovic   25.47   5    9  0.56   1    2  0.50   2    3  0.67   
6       Skal Labissiere   21.67   5    8  0.62   2    2  1.00   2    4  0.50   
7           Buddy Hield   21.47   5    8  0.62   0    2  0.00   0    0   NaN   
8          Kosta Koufos   17.77   3    4  0.75   0    0   NaN   0    2  0.00   
9          Vince Carter   14.45   2    3  0.67   1    1  1.00   1    2  0.50   
10       Justin Jackson    6.43   1    1  1.00   0    0   NaN   0    0   NaN   
11          Team Totals  240.00  43   85

                PLAYER_NAME      MP  FG  FGA   FG%  3P  3PA   3P%  FT  FTA  \
0   Rondae Hollis-Jefferson   38.40   9   15  0.60   0    1  0.00   4    5   
1         Spencer Dinwiddie   35.17   4   16  0.25   2   10  0.20   3    3   
2           DeMarre Carroll   32.73   6   13  0.46   1    6  0.17   0    0   
3              Allen Crabbe   32.60   5   15  0.33   3   10  0.30   0    0   
4                Quincy Acy   25.43   2    5  0.40   2    5  0.40   2    2   
5              Caris LeVert   35.47   5   16  0.31   1    7  0.14   1    2   
6             Jarrett Allen   28.85   6   10  0.60   0    0   NaN   4    5   
7                Joe Harris   21.33   3    6  0.50   2    3  0.67   3    3   
8             Jahlil Okafor    7.72   1    2  0.50   0    0   NaN   0    0   
9              Nik Stauskas    7.32   1    1  1.00   1    1  1.00   0    0   
10              Team Totals  265.00  42   99  0.42  12   43  0.28  17   20   

     FT%  ORB  DRB  TRB  AST  STL  BLK  TOV  PF  PTS  +/-   USG