In [409]:
import requests
from lxml import html
from dateutil import parser
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
# Spell out the full option key: abbreviated keys are resolved by pattern
# matching and raise an OptionError when more than one option matches.
pd.set_option('display.precision', 1)

In [71]:
# Maps a full team name, as scraped from the schedule page, to the
# three-letter abbreviation used for that team elsewhere in the notebook.
team_name_abbrev = {
    'Atlanta Hawks':           'ATL',
    'Boston Celtics':          'BOS',
    'Brooklyn Nets':           'BRK',
    'Charlotte Hornets':       'CHO',
    'Cleveland Cavaliers':     'CLE',
    'Chicago Bulls':           'CHI',
    'Dallas Mavericks':        'DAL',
    'Denver Nuggets':          'DEN',
    'Detroit Pistons':         'DET',
    'Golden State Warriors':   'GSW',
    'Houston Rockets':         'HOU',
    'Indiana Pacers':          'IND',
    'Los Angeles Clippers':    'LAC',
    'Los Angeles Lakers':      'LAL',
    'Memphis Grizzlies':       'MEM',
    'Miami Heat':              'MIA',
    'Milwaukee Bucks':         'MIL',
    'Minnesota Timberwolves':  'MIN',
    'New Orleans Pelicans':    'NOP',
    'New York Knicks':         'NYK',
    'Oklahoma City Thunder':   'OKC',
    'Orlando Magic':           'ORL',
    'Philadelphia 76ers':      'PHI',
    'Phoenix Suns':            'PHO',
    # The franchise name is written "Trail Blazers" (two words); without this
    # key a lookup of the scraped name finds nothing.
    'Portland Trail Blazers':  'POR',
    # Earlier one-word spelling, kept so existing lookups keep working.
    'Portland Trailblazers':   'POR',
    'Sacramento Kings':        'SAC',
    'San Antonio Spurs':       'SAS',
    'Toronto Raptors':         'TOR',
    'Utah Jazz':               'UTA',
    'Washington Wizards':      'WAS',
}

In [222]:
def get_monthly_schedule(year, month):
    """
    Scrape one monthly schedule page from basketball-reference.com.

    year:  value substituted into the URL as NBA_{year}
           (presumably the year in which the season ends -- confirm)
    month: month name substituted into the URL, e.g. 'january'

    Returns a DataFrame with one row per listed game and the columns
    DATE ('YYYY-MM-DD'), ROAD_TEAM, ROAD_TM, ROAD_PTS, HOME_TEAM, HOME_TM,
    HOME_PTS, BOX_SCORE (absolute URL). Points are kept as the strings found
    on the page; a blank score or a missing box score link becomes None
    instead of shifting the other columns out of alignment.

    Raises requests.HTTPError when the page request is not successful.
    """
    BBALLREF = 'https://www.basketball-reference.com'

    url = f'{BBALLREF}/leagues/NBA_{year}_games-{month}.html'
    page = requests.get(url, timeout=30)
    page.raise_for_status()  # do not try to parse an error page
    tree = html.fromstring(page.content)

    def cell_text(row, stat):
        # Text of the row's cell tagged with `stat`; None when blank or absent
        text = row.xpath(f'string(*[@data-stat="{stat}"])').strip()
        return text or None

    # Parse row by row so every value of a game comes from the same <tr>.
    # Only rows whose visitor cell holds a team link are games; header rows
    # carry plain column titles and are skipped by this predicate.
    records = []
    for row in tree.xpath('//tr[*[@data-stat="visitor_team_name"]/a]'):
        date_text = cell_text(row, 'date_game')
        if date_text is None:
            continue

        href = row.xpath('*[@data-stat="box_score_text"]/a/@href')
        records.append({
            'DATE':       parser.parse(date_text).strftime('%Y-%m-%d'),
            'ROAD_TEAM':  cell_text(row, 'visitor_team_name'),
            'ROAD_PTS':   cell_text(row, 'visitor_pts'),
            'HOME_TEAM':  cell_text(row, 'home_team_name'),
            'HOME_PTS':   cell_text(row, 'home_pts'),
            'BOX_SCORE':  BBALLREF + href[0] if href else None,
        })

    # Passing `columns` keeps the schema stable even when no games were found
    sched = pd.DataFrame(records, columns=['DATE', 'ROAD_TEAM', 'ROAD_PTS',
                                           'HOME_TEAM', 'HOME_PTS', 'BOX_SCORE'])
    sched['ROAD_TM'] = sched['ROAD_TEAM'].map(team_name_abbrev)
    sched['HOME_TM'] = sched['HOME_TEAM'].map(team_name_abbrev)

    return sched[['DATE', 'ROAD_TEAM', 'ROAD_TM', 'ROAD_PTS',
                  'HOME_TEAM', 'HOME_TM', 'HOME_PTS', 'BOX_SCORE']]

In [223]:
def get_daily_schedule(date):
    """
    Return the scheduled games for a single day.

    date: a string with format 'YYYY-MM-DD'

    Returns the rows of that month's schedule (see get_monthly_schedule)
    whose DATE equals the given day, re-indexed from 0.
    """

    # Get month and year from date
    parsed_date = parser.parse(date)
    month = parsed_date.strftime('%B').lower()
    year = parsed_date.year

    # October-December dates are looked up under the following year's page.
    # The month number is compared because a membership test against a
    # single comma-joined string never matches.
    if parsed_date.month >= 10:
        year += 1  # Increment year

    sched = get_monthly_schedule(str(year), month)

    # Compare against the normalised form so the filter matches the DATE
    # column even if `date` was written in another parseable format
    target = parsed_date.strftime('%Y-%m-%d')
    return sched.query('DATE == @target').reset_index(drop=True)
    

In [224]:
# Fetch the schedule for a single example day.
sched = get_daily_schedule('2018-01-13')

In [225]:
# Show the day's schedule as the cell output.
sched

Unnamed: 0,DATE,ROAD_TEAM,ROAD_TM,ROAD_PTS,HOME_TEAM,HOME_TM,HOME_PTS,BOX_SCORE
0,2018-01-13,Detroit Pistons,DET,105,Chicago Bulls,CHI,107,https://www.basketball-reference.com/boxscores...
1,2018-01-13,Oklahoma City Thunder,OKC,101,Charlotte Hornets,CHO,91,https://www.basketball-reference.com/boxscores...
2,2018-01-13,Los Angeles Lakers,LAL,107,Dallas Mavericks,DAL,101,https://www.basketball-reference.com/boxscores...
3,2018-01-13,Sacramento Kings,SAC,105,Los Angeles Clippers,LAC,126,https://www.basketball-reference.com/boxscores...
4,2018-01-13,Denver Nuggets,DEN,80,San Antonio Spurs,SAS,112,https://www.basketball-reference.com/boxscores...
5,2018-01-13,Golden State Warriors,GSW,127,Toronto Raptors,TOR,125,https://www.basketball-reference.com/boxscores...
6,2018-01-13,Brooklyn Nets,BRK,113,Washington Wizards,WAS,119,https://www.basketball-reference.com/boxscores...


In [None]:
def get_basic_box_score(url, road_team, home_team):
    """
    Placeholder for a basic box score scraper; not implemented yet.

    The three arguments are accepted but ignored, and the function
    always returns None.
    """
    return None

In [168]:
# Download and parse one example box score page.
url = 'https://www.basketball-reference.com/boxscores/201710170CLE.html'
page = requests.get(url, timeout=30)
page.raise_for_status()  # stop here instead of parsing an error page
tree = html.fromstring(page.content)

In [206]:
#player_name = tree.xpath('//*[@id="box_bos_basic"]//th/text()')

In [367]:
class BasicBoxScore:
    """
    Reads one team's basic box score table out of a parsed box score page.

    team_name: team abbreviation; it is lower-cased and used to build the
               table id 'box_{team}_basic'
    tree:      parsed page exposing an lxml-style .xpath() method
    """

    def __init__(self, team_name, tree):
        self.team_name = team_name.lower()
        self.tree = tree
        self.table = f'//*[@id="box_{self.team_name}_basic"]'

    def get_col(self, col_name):
        """
        Return the text of every <td> in the table tagged with
        data-stat == col_name, in document order. Empty cells yield None.
        """
        subtree = self.tree.xpath(self.table + f'//td[@data-stat="{col_name}"]')
        return [el.text for el in subtree]

    def get_players(self):
        """
        Return the names of the players who have a stat line, followed by
        the literal label 'Team Totals'.

        A body row counts only if it contains a minutes-played ('mp') cell,
        the same kind of cell get_col collects. Rows without one (section
        headers, and players listed with a reason instead of stats, whatever
        the wording) are skipped, so the result lines up with get_col.
        """
        players = []
        for row in self.tree.xpath(self.table + '/tbody/tr'):
            if not row.xpath('td[@data-stat="mp"]'):
                continue  # no stat line in this row

            linked_name = row.xpath('th[@data-stat="player"]/a/text()')
            if linked_name:
                players.append(linked_name[0])
            else:
                # Keep a slot for the row even if the name is not a link,
                # otherwise names and stat columns would drift apart
                players.append(row.xpath('string(th[@data-stat="player"])').strip())

        players.append('Team Totals')

        return players


In [421]:
# Select the 'CLE' basic box score table from the page parsed into `tree`.
box = BasicBoxScore('CLE', tree)

In [422]:
# Player names first, then one list per stat column, read in table order.
PLAYER_NAME = box.get_players()
(MP, FG, FGA, FG_PCT, FG3, FG3A, FG3_PCT, FT, FTA, FT_PCT,
 ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS, PM) = [
    box.get_col(stat_key)
    for stat_key in (
        'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
        'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
    )
]

In [423]:
# Collect the scraped columns under their output column names.
box_score = dict(
    PLAYER_NAME=PLAYER_NAME,
    MP=MP,
    FG=FG,
    FGA=FGA,
    FG_PCT=FG_PCT,
    FG3=FG3,
    FG3A=FG3A,
    FG3_PCT=FG3_PCT,
    FT=FT,
    FTA=FTA,
    FT_PCT=FT_PCT,
    ORB=ORB,
    DRB=DRB,
    TRB=TRB,
    AST=AST,
    STL=STL,
    BLK=BLK,
    TOV=TOV,
    PF=PF,
    PTS=PTS,
    PM=PM,
)

In [424]:
# Turn the dict of column lists into a table (one row per PLAYER_NAME entry).
# Note: this rebinds `box_score` from a dict to a DataFrame.
box_score = pd.DataFrame(box_score)
# Replace missing values (e.g. None where a scraped cell had no text) with NaN.
box_score.fillna(value=np.nan, inplace=True)

In [425]:
def format_time(MP):
    """
    Convert a minutes-played value to a number of minutes.

    MP: 'MM:SS' text (e.g. '41:12'), whole-minute text (e.g. '240'),
        an already numeric value, or None.

    Returns a float for 'MM:SS' text, an int for whole-minute text,
    NaN for None or blank text, and any non-string value unchanged.
    Raises ValueError for text that is not in one of those two forms.
    """
    if MP is None:
        return float('nan')
    if not isinstance(MP, str):
        # Already numeric (or NaN): pass through so applying the conversion
        # to a column a second time is harmless
        return MP

    text = MP.strip()
    if not text:
        return float('nan')

    if ':' in text:
        minutes, seconds = text.split(':')
        return int(minutes) + int(seconds) / 60
    return int(text)

In [426]:
# Replace the scraped MP values with numeric minutes via format_time.
box_score['MP'] = box_score['MP'].apply(format_time)

In [427]:
# Show the assembled box score table as the cell output.
box_score

Unnamed: 0,PLAYER_NAME,MP,FG,FGA,FG_PCT,FG3,FG3A,FG3_PCT,FT,FTA,FT_PCT,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PM
0,LeBron James,41.2,12,19,0.632,1,5,0.2,4,4,1.0,1,15,16,9,0,2,4,3,29,2.0
1,Jae Crowder,34.7,3,10,0.3,1,5,0.2,4,4,1.0,1,4,5,2,2,0,1,2,11,7.0
2,Derrick Rose,31.2,5,14,0.357,1,3,0.333,3,4,0.75,1,3,4,2,0,0,2,2,14,-7.0
3,Dwyane Wade,28.5,3,10,0.3,0,1,0.0,2,2,1.0,1,1,2,3,0,2,4,1,8,0.0
4,Kevin Love,28.4,4,9,0.444,1,4,0.25,6,7,0.857,3,8,11,0,0,0,2,2,15,1.0
5,J.R. Smith,21.9,4,7,0.571,1,3,0.333,1,1,1.0,0,4,4,1,0,0,0,4,10,7.0
6,Tristan Thompson,19.6,2,3,0.667,0,0,,1,3,0.333,1,5,6,2,0,0,2,3,5,2.0
7,Jeff Green,14.2,3,8,0.375,0,1,0.0,0,0,,0,0,0,0,0,0,1,3,6,-2.0
8,Iman Shumpert,12.8,2,3,0.667,0,0,,0,0,,1,1,2,0,1,0,1,3,4,6.0
9,Kyle Korver,7.3,0,0,,0,0,,0,0,,0,0,0,0,0,0,0,2,0,-1.0
