In [76]:
import yaml

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

ModuleNotFoundError: No module named 'yaml'

In [2]:
def get_requested_schedule(date):
    """
    Return the schedule for a single day or for a whole month.

    :param date: a string with format 'YYYY-MM-DD' or 'YYYY-MM'

    :return schedule: a pd.DataFrame containing game info for games
                      played on the requested date, either a day or month

    :raises ValueError: if date does not match either supported format
    """
    # Local import: the notebook's import cell does not provide a date parser.
    from datetime import datetime

    # Three dash-separated parts means a specific day was requested.
    is_daily = len(date.split('-')) == 3
    parsed = datetime.strptime(date, '%Y-%m-%d' if is_daily else '%Y-%m')

    month = parsed.strftime('%B').lower()  # e.g. january

    # BBallRef takes the season year as the calendar year when the Playoffs
    # are played; therefore, the 2017-2018 season is the 2018 season, and
    # games in October-December belong to the following year's season.
    season_year = parsed.year + 1 if parsed.month >= 10 else parsed.year
    year = str(season_year)                # e.g. 2018

    monthly_schedule = get_monthly_schedule(year, month)

    # If year, month, and day are given in date, return the daily schedule
    if is_daily:
        # Zero-padded form, matching the 'YYYY-MM-DD' values in DATE
        day = parsed.strftime('%Y-%m-%d')
        return monthly_schedule.query('DATE == @day').reset_index(drop=True)
    return monthly_schedule

In [3]:
# Scratch cell: download one monthly schedule page and parse it so the
# markup can be inspected interactively.
year, month = '2019', 'january'
url = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
page = requests.get(url).text
soup = BeautifulSoup(markup=page, features='lxml')

In [71]:
class MonthlySchedule:
    """
    Scrapes one month of a Basketball-Reference season schedule page.

    After construction, ``self.schedule`` holds a pd.DataFrame with the
    columns DATE, ROAD_TEAM, ROAD_TEAM_ABBR, ROAD_PTS, HOME_TEAM,
    HOME_TEAM_ABBR, HOME_PTS and BOX_SCORE_URL. Rows in which any scraped
    value is empty are dropped.
    """

    def __init__(self, year, month):
        """
        Download the schedule page and build ``self.schedule``.

        :param year: a string, e.g. 2019 (interpolated into the page URL)
        :param month: a string, e.g. january

        :raises requests.HTTPError: if the page request returns an error
                                    status
        """
        self.year = year
        self.month = month
        self.url = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
        response = requests.get(self.url)
        # Fail loudly on an error response instead of parsing an error page
        # as though it were a schedule.
        response.raise_for_status()
        self.soup = BeautifulSoup(response.text, 'lxml')

        self._get_schedule()


    def _get_col(self, tag, data_stat, pop_title=False):
        """
        Collect the text of every ``tag`` element with the given data-stat.

        :param tag: HTML tag name to search for, e.g. 'td' or 'th'
        :param data_stat: value of the element's data-stat attribute
        :param pop_title: if True, skip the first match (the column title)

        :return: a list of strings; the link text when the cell contains
                 a link, otherwise the cell's own text
        """
        result = self.soup.find_all(tag, {'data-stat': data_stat})
        if pop_title:
            # Slice rather than pop(0) so that a page with no matching
            # cells yields an empty column instead of an IndexError.
            result = result[1:]
        return [row.a.text if row.a else row.text for row in result]


    def _get_col_box_score_url(self):
        """
        Collect the box score link of every game row.

        :return: a list of href strings, with '' for cells without a link
        """
        result = self.soup.find_all('td', {'data-stat': 'box_score_text'})
        return [row.a.get('href') if row.a else '' for row in result]


    def _get_schedule(self):
        """
        Build ``self.schedule`` from the parsed page.

        Empty strings are treated as missing values and any row containing
        one is dropped before the derived columns are added.
        """
        col_game_date = self._get_col('th', 'date_game', pop_title=True)
        col_road_team = self._get_col('td', 'visitor_team_name')
        col_road_pts = self._get_col('td', 'visitor_pts')
        col_home_team = self._get_col('td', 'home_team_name')
        col_home_pts = self._get_col('td', 'home_pts')
        col_box_score_url = self._get_col_box_score_url()

        schedule = {
            'DATE':          col_game_date,
            'ROAD_TEAM':     col_road_team,
            'ROAD_PTS':      col_road_pts,
            'HOME_TEAM':     col_home_team,
            'HOME_PTS':      col_home_pts,
            'BOX_SCORE_URL': col_box_score_url,
        }

        self.schedule = pd.DataFrame(schedule)
        self.schedule = self.schedule.replace('', np.nan)
        # .copy() gives an independent frame, so the column assignments in
        # the helpers below never write to a view of the unfiltered data.
        self.schedule = self.schedule.dropna(how='any').copy()

        self._abbrev_team_names()
        self._reorder_cols()
        self._complete_box_score_url()


    def _abbrev_team_names(self):
        """
        Add ROAD_TEAM_ABBR and HOME_TEAM_ABBR columns to ``self.schedule``.
        """
        # NOTE(review): team_name_abbr is not defined in the visible part of
        # the notebook; presumably it maps a full team name to its
        # abbreviation -- confirm it is defined before this class is used.
        self.schedule['ROAD_TEAM_ABBR'] = \
                self.schedule['ROAD_TEAM'].apply(team_name_abbr)
        self.schedule['HOME_TEAM_ABBR'] = \
                self.schedule['HOME_TEAM'].apply(team_name_abbr)


    def _reorder_cols(self):
        """
        Put each abbreviation column next to its team name column.
        """
        reordered_cols = ['DATE',
            'ROAD_TEAM', 'ROAD_TEAM_ABBR', 'ROAD_PTS',
            'HOME_TEAM', 'HOME_TEAM_ABBR', 'HOME_PTS',
            'BOX_SCORE_URL'
        ]
        # .copy() so later column assignment acts on an independent frame.
        self.schedule = self.schedule[reordered_cols].copy()


    def _complete_box_score_url(self):
        """
        Prefix each BOX_SCORE_URL value with the site's base URL.
        """
        BBALLREF = 'https://www.basketball-reference.com'
        self.schedule['BOX_SCORE_URL'] = \
                self.schedule['BOX_SCORE_URL'].apply(lambda x: BBALLREF + x)

In [72]:
ms = MonthlySchedule('2019', 'april')
ms.schedule

DATE 79
ROAD_TEAM 79
ROAD_PTS 79
HOME_TEAM 79
HOME_PTS 79
BOX_SCORE_URL 79


Unnamed: 0,DATE,ROAD_TEAM,ROAD_PTS,HOME_TEAM,HOME_PTS,BOX_SCORE_URL
0,"Mon, Apr 1, 2019",Miami Heat,105,Boston Celtics,110,/boxscores/201904010BOS.html
1,"Mon, Apr 1, 2019",Milwaukee Bucks,131,Brooklyn Nets,121,/boxscores/201904010BRK.html
2,"Mon, Apr 1, 2019",Philadelphia 76ers,102,Dallas Mavericks,122,/boxscores/201904010DAL.html
3,"Mon, Apr 1, 2019",Detroit Pistons,102,Indiana Pacers,111,/boxscores/201904010IND.html
4,"Mon, Apr 1, 2019",Portland Trail Blazers,132,Minnesota Timberwolves,122,/boxscores/201904010MIN.html
5,"Mon, Apr 1, 2019",Chicago Bulls,105,New York Knicks,113,/boxscores/201904010NYK.html
6,"Mon, Apr 1, 2019",Cleveland Cavaliers,113,Phoenix Suns,122,/boxscores/201904010PHO.html
7,"Mon, Apr 1, 2019",Orlando Magic,109,Toronto Raptors,121,/boxscores/201904010TOR.html
8,"Mon, Apr 1, 2019",Charlotte Hornets,102,Utah Jazz,111,/boxscores/201904010UTA.html
9,"Tue, Apr 2, 2019",Denver Nuggets,102,Golden State Warriors,116,/boxscores/201904020GSW.html


In [75]:
ms.schedule

Unnamed: 0,DATE,ROAD_TEAM,ROAD_PTS,HOME_TEAM,HOME_PTS,BOX_SCORE_URL
0,"Mon, Apr 1, 2019",Miami Heat,105,Boston Celtics,110,/boxscores/201904010BOS.html
1,"Mon, Apr 1, 2019",Milwaukee Bucks,131,Brooklyn Nets,121,/boxscores/201904010BRK.html
2,"Mon, Apr 1, 2019",Philadelphia 76ers,102,Dallas Mavericks,122,/boxscores/201904010DAL.html
3,"Mon, Apr 1, 2019",Detroit Pistons,102,Indiana Pacers,111,/boxscores/201904010IND.html
4,"Mon, Apr 1, 2019",Portland Trail Blazers,132,Minnesota Timberwolves,122,/boxscores/201904010MIN.html
5,"Mon, Apr 1, 2019",Chicago Bulls,105,New York Knicks,113,/boxscores/201904010NYK.html
6,"Mon, Apr 1, 2019",Cleveland Cavaliers,113,Phoenix Suns,122,/boxscores/201904010PHO.html
7,"Mon, Apr 1, 2019",Orlando Magic,109,Toronto Raptors,121,/boxscores/201904010TOR.html
8,"Mon, Apr 1, 2019",Charlotte Hornets,102,Utah Jazz,111,/boxscores/201904010UTA.html
9,"Tue, Apr 2, 2019",Denver Nuggets,102,Golden State Warriors,116,/boxscores/201904020GSW.html


In [None]:
def get_monthly_schedule(year, month):
    """
    Placeholder for the monthly schedule scraper: this cell contains only
    the docstring, so calling this version returns None.

    NOTE(review): a later cell in this notebook defines
    get_monthly_schedule again with a full implementation; whichever cell
    is executed last determines which definition is in effect.

    :param year: a string, e.g. 2018
    :param month: a string, e.g. january

    :return schedule: intended to be a pd.DataFrame containing game info
                      for the month (not implemented in this cell)
    """
    

In [None]:
def get_monthly_schedule(year, month):
    """
    Scrape the Basketball-Reference schedule page for one month.

    :param year: a string, e.g. 2018 (interpolated into the page URL)
    :param month: a string, e.g. january

    :return schedule: a pd.DataFrame containing game info for the month,
                      with DATE formatted as 'YYYY-MM-DD' and
                      BOX_SCORE_URL as an absolute URL; rows with any
                      missing value are dropped

    :raises requests.HTTPError: if the page request returns an error status
    """
    # Local import: the notebook's import cell does not provide a date parser.
    from datetime import datetime

    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
    page = requests.get(url)
    # Fail loudly on an error response instead of scraping an error page.
    page.raise_for_status()
    # NOTE(review): `html` is not imported in the visible import cell;
    # presumably it is lxml.html -- confirm it is in scope.
    tree = html.fromstring(page.content)

    game_date = tree.xpath('//*[@data-stat="date_game"]/a/text()')

    road_team = tree.xpath('//*[@data-stat="visitor_team_name"]/a/text()')
    # The first text node is the column name; slicing (rather than pop(0))
    # keeps an empty result from raising IndexError.
    road_pts = tree.xpath('//*[@data-stat="visitor_pts"]/text()')[1:]

    home_team = tree.xpath('//*[@data-stat="home_team_name"]/a/text()')
    home_pts = tree.xpath('//*[@data-stat="home_pts"]/text()')[1:]

    box_score_url = tree.xpath('//*[@data-stat="box_score_text"]/a/@href')

    columns = {
        'DATE':           game_date,
        'ROAD_TEAM':      road_team,
        'ROAD_PTS':       road_pts,
        'HOME_TEAM':      home_team,
        'HOME_PTS':       home_pts,
        'BOX_SCORE_URL':  box_score_url,
    }

    # Wrapping each list in a pd.Series lets columns of different lengths
    # share one DataFrame; shorter columns are padded with NaN.
    # NOTE(review): values are aligned by position, so a value missing from
    # the middle of one column would shift the rest of that column -- verify
    # against the page layout.
    schedule = pd.DataFrame({k: pd.Series(v) for k, v in columns.items()})
    schedule = schedule.dropna(how='any')

    # NOTE(review): team_name_abbrev is not defined in the visible part of
    # the notebook; presumably a mapping from full team name to abbreviation.
    schedule = schedule.assign(
        ROAD_TM=schedule['ROAD_TEAM'].map(team_name_abbrev),
        HOME_TM=schedule['HOME_TEAM'].map(team_name_abbrev),
    )
    # .copy() so the column assignments below act on an independent frame.
    schedule = schedule[['DATE', 'ROAD_TEAM', 'ROAD_TM', 'ROAD_PTS',
                         'HOME_TEAM', 'HOME_TM', 'HOME_PTS',
                         'BOX_SCORE_URL']].copy()

    BBALLREF = 'https://www.basketball-reference.com'
    schedule['BOX_SCORE_URL'] = \
            schedule['BOX_SCORE_URL'].apply(lambda x: BBALLREF + x)

    def format_date(date):
        # e.g. 'Mon, Apr 1, 2019' -> '2019-04-01'
        return datetime.strptime(date, '%a, %b %d, %Y').strftime('%Y-%m-%d')

    schedule['DATE'] = schedule['DATE'].apply(format_date)

    return schedule