In [1]:
import os

import arrow
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import yaml

In [102]:
def get_schedule(year, month, day=None):
    """Fetch the game schedule for a single day or for a whole month.

    :param str year: calendar year, e.g. '2019'
    :param str month: month number, e.g. '04'
    :param str day: day of the month, e.g. '23'; when omitted (or otherwise
                    falsy) the schedule of the entire month is returned

    :return pd.DataFrame schedule: game info for the games played on the
                                   requested day or month
    """

    # Without a day the caller is asking for the full month.
    if not day:
        return MonthSchedule(year, month).schedule

    return DaySchedule(year, month, day).schedule

In [103]:
class MonthSchedule:
    """Schedule of NBA games for one calendar month.

    The schedule page is downloaded from basketball-reference.com when the
    object is created. The parsed result is exposed as ``self.schedule``, a
    DataFrame with the columns DATE, ROAD_TEAM, ROAD_TEAM_ABBR, ROAD_TEAM_PTS,
    HOME_TEAM, HOME_TEAM_ABBR, HOME_TEAM_PTS and BOX_SCORE_URL. Rows with any
    empty cell are dropped before the derived columns are added.
    """

    # Root that every URL built by this class starts with.
    BBALLREF = 'https://www.basketball-reference.com'

    # YAML file loaded to translate team names into abbreviations. The path
    # is relative to the current working directory.
    TEAMS_FILE = './grabstats/teams.yaml'

    # Seconds to wait for the server before giving up on the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, year, month):
        """
        :param str year: calendar year, e.g. '2019'
        :param str month: month number, e.g. '04'

        :raises requests.HTTPError: if the server answers with a 4xx/5xx status
        """

        date = '-'.join([year, month])
        month = arrow.get(date).datetime.strftime('%B').lower()  # e.g. 'january'

        # BBallRef takes the season year as the calendar year when the Playoffs
        # are played; hence, the 2017-2018 season is the 2018 season
        if month in ['october', 'november', 'december']:
            year = str(int(year) + 1)  # Increment year

        url = f'{self.BBALLREF}/leagues/NBA_{year}_games-{month}.html'
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of trying to parse an error page as a schedule.
        response.raise_for_status()
        self.soup = BeautifulSoup(response.text, 'lxml')
        self._get_schedule()

    def _get_schedule(self):
        """Build ``self.schedule`` from the parsed page in ``self.soup``."""
        schedule = {
            'DATE':          self._get_col('th', 'date_game', has_title=True),
            'ROAD_TEAM':     self._get_col('td', 'visitor_team_name'),
            'ROAD_TEAM_PTS': self._get_col('td', 'visitor_pts'),
            'HOME_TEAM':     self._get_col('td', 'home_team_name'),
            'HOME_TEAM_PTS': self._get_col('td', 'home_pts'),
            'BOX_SCORE_URL': self._get_col_box_score_url(),
        }

        # Empty strings mark missing cells; turn them into NaN so that every
        # row with a missing value can be dropped in one go.
        self.schedule = (
            pd.DataFrame(schedule)
            .replace('', np.nan)
            .dropna(how='any')
        )

        self._abbrev_team_names()
        self._reorder_cols()
        self._complete_box_score_url()
        self._format_date()

    def _get_col(self, tag, data_stat, has_title=False):
        """Collect the text of every cell belonging to one column.

        :param str tag: e.g. 'td'
        :param str data_stat: e.g. 'home_team_name'
        :param bool has_title: indicates whether the column has a title row
                               for its first row, which will be removed

        :return list: one string per cell; the text of the cell's link when
                      the cell contains one, otherwise the cell's own text
        """

        result = self.soup.find_all(tag, {'data-stat': data_stat})
        if has_title:
            # Slicing, unlike pop(0), is safe when no cell was found at all.
            result = result[1:]
        return [row.a.text if row.a else row.text for row in result]

    def _get_col_box_score_url(self):
        """Collect the box score link of every row ('' when a row has none)."""
        result = self.soup.find_all('td', {'data-stat': 'box_score_text'})
        return [row.a.get('href') if row.a else '' for row in result]

    def _abbrev_team_names(self):
        """Add ROAD_TEAM_ABBR / HOME_TEAM_ABBR by mapping the team names
        through the content of ``TEAMS_FILE``."""
        with open(self.TEAMS_FILE, 'r') as f:
            team_name_abbrev = yaml.safe_load(f)

        self.schedule['ROAD_TEAM_ABBR'] = \
                self.schedule['ROAD_TEAM'].map(team_name_abbrev)
        self.schedule['HOME_TEAM_ABBR'] = \
                self.schedule['HOME_TEAM'].map(team_name_abbrev)

    def _reorder_cols(self):
        """Put each team's abbreviation and points right after its name."""
        reordered_cols = ['DATE',
            'ROAD_TEAM', 'ROAD_TEAM_ABBR', 'ROAD_TEAM_PTS',
            'HOME_TEAM', 'HOME_TEAM_ABBR', 'HOME_TEAM_PTS',
            'BOX_SCORE_URL'
        ]
        self.schedule = self.schedule[reordered_cols]

    def _complete_box_score_url(self):
        """Prefix the scraped box score hrefs with the site root."""
        self.schedule['BOX_SCORE_URL'] = \
                self.schedule['BOX_SCORE_URL'].apply(lambda x: self.BBALLREF + x)

    def _format_date(self):
        """Rewrite DATE from the page's 'ddd, MMM D, YYYY' form to 'YYYY-MM-DD'."""
        def format_date(date):
            return arrow.get(date, 'ddd, MMM D, YYYY').datetime.strftime('%Y-%m-%d')
        self.schedule['DATE'] = self.schedule['DATE'].apply(format_date)

In [104]:
class DaySchedule(MonthSchedule):
    """Schedule of NBA games played on one single day.

    Loads the schedule of the whole month and keeps only the rows whose DATE
    equals the requested day.
    """

    def __init__(self, year, month, day):
        """
        :param str year: calendar year, e.g. '2019'
        :param str month: month number, e.g. '04' (or '4')
        :param str day: day of the month, e.g. '23' (or '3')
        """
        # MonthSchedule formats DATE as zero-padded 'YYYY-MM-DD'. Pad the
        # inputs the same way, otherwise e.g. day '3' never matches '03' and
        # the result is silently empty.
        month = month.zfill(2)
        day = day.zfill(2)

        super().__init__(year, month)
        date = '-'.join([year, month, day])
        self.schedule = self.schedule.query('DATE == @date').reset_index(drop=True)

In [109]:
def box_scores_get_one(team_name, url):
    """Get the basic and advanced box scores for one team and one game.

    :param str team_name: the capitalized abbreviated name, e.g. 'DEN'
    :param str url: the URL to the box score page on basketball-reference.com

    :return tuple: ``(basic, adv)`` as produced by ``BasicBoxScore.get`` and
                   ``AdvBoxScore.get``; ``basic`` additionally carries the
                   'usg_pct' column taken from ``adv``
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """

    response = requests.get(url, timeout=30)
    # Fail loudly instead of trying to parse an error page as a box score.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # The box score classes are handed the lower-case abbreviation.
    team = team_name.lower()
    basic = BasicBoxScore(soup).get(team)
    adv = AdvBoxScore(soup).get(team)

    basic['usg_pct'] = adv['usg_pct']

    return basic, adv

In [114]:
def _label_box_score(box_score, game_date, own_team, opp_team, venue):
    """Tag every row of one team's box score with the game it belongs to."""
    box_score['DATE'] = game_date
    box_score['OWN_TEAM'] = own_team
    box_score['OPP_TEAM'] = opp_team
    box_score['VENUE'] = venue
    return box_score


def box_scores_get_many(schedule):
    """Get the basic and advanced box scores for every game of a schedule.

    :param pd.DataFrame schedule: contains game info for the schedule of games;
                                  the columns DATE, ROAD_TEAM_ABBR,
                                  HOME_TEAM_ABBR and BOX_SCORE_URL are read

    :return tuple: ``(basic_box_scores, adv_box_scores)``, two lists holding
                   one DataFrame per game; each DataFrame has the road team's
                   rows (VENUE 'R') followed by the home team's rows
                   (VENUE 'H')
    """

    basic_box_scores = []
    adv_box_scores = []

    for _, game in schedule.iterrows():
        game_date = game['DATE']
        road_team_abbr = game['ROAD_TEAM_ABBR']
        home_team_abbr = game['HOME_TEAM_ABBR']
        box_score_url = game['BOX_SCORE_URL']

        # NOTE: box_scores_get_one downloads the page on every call, so each
        # game's page is currently fetched twice (once per team).
        road_basic, road_adv = box_scores_get_one(road_team_abbr, box_score_url)
        home_basic, home_adv = box_scores_get_one(home_team_abbr, box_score_url)

        # BASIC BOX SCORE
        basic_box_scores.append(pd.concat([
            _label_box_score(road_basic, game_date, road_team_abbr, home_team_abbr, 'R'),
            _label_box_score(home_basic, game_date, home_team_abbr, road_team_abbr, 'H'),
        ]))

        # ADVANCED BOX SCORE
        adv_box_scores.append(pd.concat([
            _label_box_score(road_adv, game_date, road_team_abbr, home_team_abbr, 'R'),
            _label_box_score(home_adv, game_date, home_team_abbr, road_team_abbr, 'H'),
        ]))

    return basic_box_scores, adv_box_scores

In [101]:
def _get_data_stat(row, data_stat, is_header=False):
    if is_header:
        return row.find('th', {'data-stat': data_stat}).text
    return row.find('td', {'data-stat': data_stat}).text


def format_time(mp):
    """Convert minutes played from 'minutes:seconds' to decimal minutes.

    :param str mp: minutes played, e.g '24:30'

    :return float: minutes rounded to one decimal place, e.g. 24.5
    """
    minutes, seconds = (int(part) for part in mp.split(':'))
    return round(minutes + seconds / 60, 1)


class BoxScore:
    """Base class for reading one team's box score table from a parsed page.

    Subclasses are expected to set two attributes before ``get`` is called:
    ``box_score_type``, which is part of the table id that is looked up, and
    ``data_stats``, the list of 'data-stat' names to extract (it has to
    contain 'mp', which ``get`` converts with ``format_time``).
    """

    def __init__(self, soup):
        """
        :param soup: the parsed box score page
        """
        self.soup = soup

    def get(self, team_name):
        """Extract the box score of the players who logged minutes.

        :param str team_name: team abbreviation as it appears in the table id

        :return pd.DataFrame: one row per player whose first data cell is
                              'mp', one column per entry of ``data_stats``
        :raises ValueError: if the page has no table for this team and type
        """
        table_id = f'box_{team_name}_{self.box_score_type}'
        table = self.soup.find('table', {'id': table_id})
        if table is None:
            raise ValueError(
                f'No box score table with id {table_id!r} found on the page'
            )
        rows = table.find('tbody').find_all('tr')

        # Rows without a data cell are skipped, and so are rows whose first
        # data cell is not 'mp' (e.g. a 'reason' cell).
        active_player_rows = [
            row for row in rows
            if row.td and row.td.get('data-stat') == 'mp'
        ]

        # The player name is read from a header cell ('th'), every other
        # stat from a regular data cell ('td').
        box_score = pd.DataFrame({
            data_stat: [
                _get_data_stat(row, data_stat, data_stat == 'player')
                for row in active_player_rows
            ]
            for data_stat in self.data_stats
        })

        box_score['mp'] = box_score['mp'].apply(format_time)
        return box_score
    
    
class BasicBoxScore(BoxScore):
    """Box score read from a team's 'basic' table."""

    def __init__(self, soup):
        """
        :param soup: the parsed box score page, passed on to ``BoxScore``
        """
        super().__init__(soup)

        # Part of the table id that BoxScore.get looks up.
        self.box_score_type = 'basic'

        # 'data-stat' names to extract. Currently disabled:
        #   fg3, fg3a, fg3_pct, ft, fta, ft_pct, orb, drb, trb,
        #   ast, stl, blk, tov, pf
        identity = ['player', 'mp']
        field_goals = ['fg', 'fga', 'fg_pct']
        totals = ['pts', 'plus_minus']
        self.data_stats = identity + field_goals + totals
        

class AdvBoxScore(BoxScore):
    """Box score read from a team's 'advanced' table."""

    def __init__(self, soup):
        """
        :param soup: the parsed box score page, passed on to ``BoxScore``
        """
        super().__init__(soup)

        # Part of the table id that BoxScore.get looks up.
        self.box_score_type = 'advanced'

        # 'data-stat' names to extract. Currently disabled:
        #   ts_pct, efg_pct, fg3a_per_fga_pct, fta_per_fga_pct,
        #   orb_pct, drb_pct, trb_pct, ast_pct, stl_pct, blk_pct
        identity = ['player', 'mp']
        rates = ['tov_pct', 'usg_pct']
        ratings = ['off_rtg', 'def_rtg']
        self.data_stats = identity + rates + ratings

In [None]:
# Example run: load the schedule of 2019-04-23, then the box scores of its
# games. Both steps download pages from basketball-reference.com.
schedule = get_schedule(year='2019', month='04', day='23')
box_scores_get_many(schedule)

In [116]:
def write(box_score, outfile):
    """Append a box score to a CSV file, writing the header row only once.

    :param pd.DataFrame box_score: the rows to append
    :param str outfile: path of the CSV file; created when it does not exist
    """
    # The header is only needed when the file is new, i.e. missing or still
    # empty; later appends must not repeat it.
    header = not os.path.isfile(outfile) or os.path.getsize(outfile) == 0

    # newline='' lets the CSV writer control the line endings itself.
    with open(outfile, 'a', newline='') as f:
        box_score.to_csv(f, header=header, index=False)

In [None]:
def grabstats(date, basic_box_score_file, adv_box_score_file):
    """Fetch the box scores of one day's games and append them to CSV files.

    :param date: the day to fetch; anything ``arrow.get`` can parse,
                 e.g. '2019-04-23'
    :param str basic_box_score_file: CSV file the basic box scores are
                                     appended to
    :param str adv_box_score_file: CSV file the advanced box scores are
                                   appended to
    """
    # get_schedule expects zero-padded string components.
    game_day = arrow.get(date).datetime
    year = game_day.strftime('%Y')
    month = game_day.strftime('%m')
    day = game_day.strftime('%d')

    schedule = get_schedule(year, month, day)
    basic_box_scores, adv_box_scores = box_scores_get_many(schedule)

    # Write to files
    for box_score in basic_box_scores:
        write(box_score, basic_box_score_file)

    for box_score in adv_box_scores:
        write(box_score, adv_box_score_file)