From 183d83107133944e536772606ab5cf32aac9febf Mon Sep 17 00:00:00 2001 From: Jae Bradley Date: Tue, 27 Nov 2018 20:22:02 -0800 Subject: [PATCH] Improve schedule parser (#61) (#66) Add logic so that the schedule scraper can get all the games for the current season without crashing, leaving scores off of the returned game hashes. Adds `None` as the default value if the away team score or home team score is not available. --- .../parsers/schedule.py | 4 ++-- tests/test_integration_parse_schedule.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/basketball_reference_web_scraper/parsers/schedule.py b/basketball_reference_web_scraper/parsers/schedule.py index 01c7c69e..095918b2 100644 --- a/basketball_reference_web_scraper/parsers/schedule.py +++ b/basketball_reference_web_scraper/parsers/schedule.py @@ -46,9 +46,9 @@ def parse_game(row): return { "start_time": start_time, "away_team": TEAM_NAME_TO_TEAM[row[2].text_content().upper()], - "away_team_score": int(row[3].text_content()), "home_team": TEAM_NAME_TO_TEAM[row[4].text_content().upper()], - "home_team_score": int(row[5].text_content()), + "away_team_score": int(row[3].text_content()) if row[3].text_content() else None, + "home_team_score": int(row[5].text_content()) if row[5].text_content() else None, } diff --git a/tests/test_integration_parse_schedule.py b/tests/test_integration_parse_schedule.py index cd9937b0..d7a7cd63 100644 --- a/tests/test_integration_parse_schedule.py +++ b/tests/test_integration_parse_schedule.py @@ -9,12 +9,14 @@ october_2001_schedule_html = os.path.join(os.path.dirname(__file__), './NBA_2001_games-october.html') october_2018_schedule_html = os.path.join(os.path.dirname(__file__), './NBA_2018_games-october.html') +april_2019_schedule_html = os.path.join(os.path.dirname(__file__), './NBA_2019_games-april.html') class TestSchedule(TestCase): def setUp(self): self.october_2001_html = open(october_2001_schedule_html).read() self.october_2018_html = 
open(october_2018_schedule_html).read() + self.april_2019_html = open(april_2019_schedule_html).read() def test_parse_october_2001_schedule_for_month_url_paths_(self): urls = schedule.parse_schedule_for_month_url_paths(self.october_2001_html) @@ -48,3 +50,18 @@ def test_parse_october_2001_schedule(self): def test_parse_october_2018_schedule(self): parsed_schedule = schedule.parse_schedule(self.october_2018_html) self.assertEqual(len(parsed_schedule), 104) + + def test_parse_future_game(self): + parsed_schedule = schedule.parse_schedule(self.april_2019_html) + first_game = parsed_schedule[0] + expected_first_game_start_time = pytz.timezone("US/Eastern") \ + .localize(datetime(year=2019, month=4, day=1, hour=19, minute=30)) \ + .astimezone(pytz.utc) + + self.assertIsNotNone(parsed_schedule) + self.assertEqual(len(parsed_schedule), 79) + self.assertEqual(first_game["start_time"], expected_first_game_start_time) + self.assertEqual(first_game["away_team"], Team.MIAMI_HEAT) + self.assertEqual(first_game["home_team"], Team.BOSTON_CELTICS) + self.assertIsNone(first_game["away_team_score"]) + self.assertIsNone(first_game["home_team_score"])