In [106]:
import requests
from lxml import html
from dateutil import parser
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

In [71]:
team_name_abbrev = {
    'Atlanta Hawks':           'ATL',
    'Boston Celtics':          'BOS',
    'Brooklyn Nets':           'BRK',
    'Charlotte Hornets':       'CHO',
    'Cleveland Cavaliers':     'CLE',
    'Chicago Bulls':           'CHI',
    'Dallas Mavericks':        'DAL',
    'Denver Nuggets':          'DEN',
    'Detroit Pistons':         'DET',
    'Golden State Warriors':   'GSW',
    'Houston Rockets':         'HOU',
    'Indiana Pacers':          'IND',
    'Los Angeles Clippers':    'LAC',
    'Los Angeles Lakers':      'LAL',
    'Memphis Grizzlies':       'MEM',
    'Miami Heat':              'MIA',
    'Milwaukee Bucks':         'MIL',
    'Minnesota Timberwolves':  'MIN',
    'New Orleans Pelicans':    'NOP',
    'New York Knicks':         'NYK',
    'Oklahoma City Thunder':   'OKC',
    'Orlando Magic':           'ORL',
    'Philadelphia 76ers':      'PHI',
    'Phoenix Suns':            'PHO',
    'Portland Trailblazers':   'POR',
    'Sacramento Kings':        'SAC',
    'San Antonio Spurs':       'SAS',
    'Toronto Raptors':         'TOR',
    'Utah Jazz':               'UTA',
    'Washington Wizards':      'WAS',
}

In [222]:
def get_monthly_schedule(year, month):
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
    page = requests.get(url)
    tree = html.fromstring(page.content)
    
    game_date = tree.xpath('//*[@data-stat="date_game"]/a/text()')

    road_team = tree.xpath('//*[@data-stat="visitor_team_name"]/a/text()')
    road_pts = tree.xpath('//*[@data-stat="visitor_pts"]/text()')
    road_pts.pop(0)  # Remove col name

    home_team = tree.xpath('//*[@data-stat="home_team_name"]/a/text()')
    home_pts = tree.xpath('//*[@data-stat="home_pts"]/text()')
    home_pts.pop(0)  # Remove col name

    box_score = tree.xpath('//*[@data-stat="box_score_text"]/a/@href')
    
    sched = {
        'DATE':       game_date,
        'ROAD_TEAM':  road_team,
        'ROAD_PTS':   road_pts,
        'HOME_TEAM':  home_team,
        'HOME_PTS':   home_pts,
        'BOX_SCORE':  box_score,
    }
    
    sched = pd.DataFrame(sched)
    sched['ROAD_TM'] = sched['ROAD_TEAM'].map(team_name_abbrev)
    sched['HOME_TM'] = sched['HOME_TEAM'].map(team_name_abbrev)
    sched = sched[['DATE', 'ROAD_TEAM', 'ROAD_TM', 'ROAD_PTS',
                           'HOME_TEAM', 'HOME_TM', 'HOME_PTS', 'BOX_SCORE']]
    
    BBALLREF = 'https://www.basketball-reference.com'
    sched['BOX_SCORE'] = sched['BOX_SCORE'].apply(lambda x: BBALLREF + x)
    
    def format_date(date):
        date = parser.parse(date)
        return date.strftime('%Y-%m-%d')
    
    sched['DATE'] = sched['DATE'].apply(format_date)
    
    return sched

In [223]:
def get_daily_schedule(date):
    """
    date: a string with format 'YYYY-MM-DD'
    """
    
    # Get month and day from date
    parsed_date = parser.parse(date)
    month = parsed_date.strftime('%B').lower()
    year = parsed_date.strftime('%Y')
    
    if month in ['october, november, december']:
        year = str(int(year) + 1)  # Increment year
    
    sched = get_monthly_schedule(year, month)
    
    return sched.query('DATE == @date').reset_index(drop=True)
    

In [224]:
sched = get_daily_schedule('2018-01-13')

In [225]:
sched

Unnamed: 0,DATE,ROAD_TEAM,ROAD_TM,ROAD_PTS,HOME_TEAM,HOME_TM,HOME_PTS,BOX_SCORE
0,2018-01-13,Detroit Pistons,DET,105,Chicago Bulls,CHI,107,https://www.basketball-reference.com/boxscores...
1,2018-01-13,Oklahoma City Thunder,OKC,101,Charlotte Hornets,CHO,91,https://www.basketball-reference.com/boxscores...
2,2018-01-13,Los Angeles Lakers,LAL,107,Dallas Mavericks,DAL,101,https://www.basketball-reference.com/boxscores...
3,2018-01-13,Sacramento Kings,SAC,105,Los Angeles Clippers,LAC,126,https://www.basketball-reference.com/boxscores...
4,2018-01-13,Denver Nuggets,DEN,80,San Antonio Spurs,SAS,112,https://www.basketball-reference.com/boxscores...
5,2018-01-13,Golden State Warriors,GSW,127,Toronto Raptors,TOR,125,https://www.basketball-reference.com/boxscores...
6,2018-01-13,Brooklyn Nets,BRK,113,Washington Wizards,WAS,119,https://www.basketball-reference.com/boxscores...


In [None]:
def get_basic_box_score(url, road_team, home_team):
    """
    """
    
    pass

In [168]:
url = 'https://www.basketball-reference.com/boxscores/201710170CLE.html'
page = requests.get(url)
tree = html.fromstring(page.content)

In [206]:
#player_name = tree.xpath('//*[@id="box_bos_basic"]//th/text()')

In [269]:
class BasicBoxScore:
    def __init__(self, team_name, tree):
        self.team_name = team_name.lower()
        self.tree = tree

    def get_col(self, col_name):
        return self.tree.xpath(f'//*[@id="box_{self.team_name}_basic"]//*[@data-stat="{col_name}"]/text()')

In [270]:
box = BasicBoxScore('BOS', tree)

In [275]:
MIN = box.get_col('mp')
FG = box.get_col('fg')
FGA = box.get_col('fga')
FG_PCT = box.get_col('fg_pct')
FG3 = box.get_col('fg3')
FG3A = box.get_col('fg3a')
FG3_PCT = box.get_col('fg3_pct')
FT = box.get_col('ft')
FTA = box.get_col('fta')
FT_PCT = box.get_col('ft_pct')
ORB = box.get_col('orb')
DRB = box.get_col('drb')
TRB = box.get_col('trb')
AST = box.get_col('ast')
STL = box.get_col('stl')
BLK = box.get_col('blk')
TOV = box.get_col('tov')
PF = box.get_col('pf')
PTS = box.get_col('pts')
PM = box.get_col('plus_minus')

In [277]:
box_score = {
    'MIN':      MIN,
    'FG':       FG,
    'FGA':      FGA,
    'FG_PCT':   FG_PCT,
    'FG3':      FG3,
    'FG3A':     FG3A,
    'FG3_PCT':  FG3_PCT,
    'FT':       FT,
    'FTA':      FTA,
    'FT_PCT':   FT_PCT,
    'ORB':      ORB,
    'DRB':      DRB,
    'TRB':      TRB,
    'AST':      AST,
    'STL':      STL,
    'BLK':      BLK,
    'TOV':      TOV,
    'PF':       PF,
    'PTS':      PTS,
    'PM':       PM,
}

In [278]:
box_score = pd.DataFrame(box_score)

ValueError: arrays must all be same length

In [263]:
MP = tree.xpath('//*[@id="box_bos_basic"]//*[@data-stat="mp"]/text()')

def get_stats_col(team_name, col_name, tree):
    return tree.xpath(f'//*[@id="box_{team_name}_basic"]//*[@data-stat="{col_name}"]/text()')

In [264]:
get_stats_col('bos', 'mp', tree)

['MP',
 '39:36',
 '39:21',
 '36:32',
 '32:07',
 '5:15',
 'MP',
 '35:03',
 '19:32',
 '19:06',
 '8:39',
 '4:49',
 '240']

In [250]:
stats_table = tree.xpath('//*[@id="box_bos_basic"]/tbody/tr')

In [253]:
for row in stats_table:
    player_name = row.xpath('th/a/text()')
    player_stats = row.xpath('td/text()')
    print(player_name + player_stats)

['Jaylen Brown', '39:36', '11', '23', '.478', '2', '9', '.222', '1', '2', '.500', '1', '5', '6', '0', '2', '0', '3', '5', '25', '-5']
['Kyrie Irving', '39:21', '8', '17', '.471', '4', '9', '.444', '2', '2', '1.000', '2', '2', '4', '10', '3', '0', '2', '4', '22', '-1']
['Jayson Tatum', '36:32', '5', '12', '.417', '1', '2', '.500', '3', '3', '1.000', '4', '6', '10', '3', '0', '0', '1', '4', '14', '+6']
['Al Horford', '32:07', '2', '7', '.286', '0', '2', '.000', '5', '7', '.714', '0', '7', '7', '5', '0', '1', '0', '2', '9', '+8']
['Gordon Hayward', '5:15', '1', '2', '.500', '0', '1', '.000', '0', '0', '0', '1', '1', '0', '0', '0', '0', '1', '2', '+3']
[]
['Marcus Smart', '35:03', '5', '16', '.313', '0', '4', '.000', '2', '3', '.667', '0', '9', '9', '3', '2', '2', '2', '2', '12', '-8']
['Terry Rozier', '19:32', '2', '6', '.333', '1', '3', '.333', '4', '4', '1.000', '0', '3', '3', '2', '4', '0', '0', '0', '9', '+4']
['Aron Baynes', '19:06', '2', '2', '1.000', '0', '0', '2', '4', '.500', '2'

In [None]:
//*[@id="box_bos_basic"]/tbody/tr[1]/td[1]