In [1]:
import os
import glob
import time

import requests
import pandas as pd
pd.set_option('display.max_columns', 100)

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
import templates
import convert

from _info import DATA_STATS

In [3]:
class Filing:
    """
    Takes care of filing each dataframe on disk (+ additional functionality later on).

    On construction the following directories are created if they are missing:
        <data_dir>/<season>/boxscores

    Attributes:
        season: season label used as the directory name (whatever string the caller passes).
        data_dir: root data directory, derived from the current working directory
            (see ``_resolve_data_dir``).
        season_dir: ``<data_dir>/<season>``.
        boxscores_dir: ``<season_dir>/boxscores``; boxscore csv files are written here.
    """

    def __init__(self, season: str):

        self.season = season
        self.data_dir = self._resolve_data_dir(os.getcwd())
        self.season_dir = os.path.join(self.data_dir, season)
        self.boxscores_dir = os.path.join(self.season_dir, 'boxscores')

        # boxscores_dir is the deepest directory, so this also creates data_dir and
        # season_dir; exist_ok makes repeated construction a no-op.
        os.makedirs(self.boxscores_dir, exist_ok=True)

    @staticmethod
    def _resolve_data_dir(cwd: str) -> str:
        """
        Maps a working directory to its sibling data directory.

        Only the LAST path component that is exactly 'src' is swapped for 'data'.
        A plain str.replace would also rewrite 'src' inside unrelated names
        (e.g. '/home/srcuser/proj/src' -> '/home/datauser/proj/data').
        If no component is named 'src' the path is returned unchanged.
        """
        parts = cwd.split(os.sep)
        for idx in range(len(parts) - 1, -1, -1):
            if parts[idx] == 'src':
                parts[idx] = 'data'
                break
        return os.sep.join(parts)

    def save_boxscore(self, df: pd.DataFrame, weeknum: int) -> None:
        """
        Saves boxscore as csv (later on can configure different formats)

        The file is written to ``boxscores_dir`` without the index column and is named
        <team>-<team>-week<weeknum>.csv, where the team labels are the distinct values
        of the 'team' column sorted alphabetically (NOT away-home order).

        Args:
            df: boxscore rows; must contain a 'team' column.
            weeknum: week number used in the file name.
        """
        teams = sorted(df['team'].drop_duplicates())
        matchup = '-'.join(teams)
        filename = f'{matchup}-week{weeknum}.csv'

        fpath = os.path.join(self.boxscores_dir, filename)
        df.to_csv(fpath, index=False)


        

In [4]:
class BoxscoreScraper:
    """
    Scrapes the 'player_offense' table of every game linked from a season's
    weekly pages on pro-football-reference.com and saves one csv per game
    through a ``Filing`` object.

    Keyword Args:
        year: first year of the season (anything ``int()`` accepts), default 2022.

    Attributes:
        year: season start year as int.
        season: '<year>-<year+1>' label handed to ``Filing``.
        filing: ``Filing`` instance that writes the csv files.
        week_pages: {week number: week page url} for weeks 1-18 (hard-coded range).
        data_stats: ``DATA_STATS`` from ``_info``; the data-stat names to extract.
    """

    # Site root; game links found on the week pages are joined onto this
    ROOT_URL = 'https://www.pro-football-reference.com/'
    # Seconds to wait on a single HTTP request before giving up
    REQUEST_TIMEOUT = 30

    def __init__(self, **kwargs):

        self.year: int = int(kwargs.get('year', 2022))
        self.season: str = f'{self.year}-{self.year+1}'

        # Initialize filing object
        self.filing = Filing(self.season)

        # Going to start with just regular season
        self.week_pages = {
            week: templates.week_url(self.year, week)
            for week in range(1,19)
        }

        self.data_stats = DATA_STATS

    @staticmethod
    def _fix_rating(rating_str: str) -> float:
        """Converts a passer-rating cell to float; an empty cell becomes 0.0."""
        return float(rating_str) if len(rating_str) else 0.0

    @classmethod
    def _absolute_url(cls, href: str) -> str:
        """
        Joins a link taken from a week page onto ROOT_URL with exactly one slash
        in between (plain concatenation gave '...com//boxscores/...' for
        root-relative hrefs). Already-absolute links are returned untouched.
        """
        if href.startswith(('http://', 'https://')):
            return href
        return f"{cls.ROOT_URL.rstrip('/')}/{href.lstrip('/')}"

    def _get_soup(self, url: str):
        """
        Downloads ``url`` and returns it parsed with 'html.parser'.

        Raises:
            requests.HTTPError: on a 4xx/5xx response, so a blocked or failed
                request is reported as such instead of as a parsing error later.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def get_week_boxscores(self, week: int, url: str) -> None:
        """
        Scrapes every boxscore for given week and saves it to directory

        Args:
            week: week number, used only for the saved file name.
            url: url of the page listing that week's games.

        Raises:
            requests.HTTPError: if the week page or a game page answers 4xx/5xx.
            ValueError: if a game page has no 'player_offense' table.
        """
        week_games_soup = self._get_soup(url)

        for game in week_games_soup.find_all('div', class_='game_summary expanded nohover'):

            href = game.find_all('td', class_='right gamelink')[0].find('a')['href']
            game_url: str = self._absolute_url(href)
            game_soup = self._get_soup(game_url)

            stat_table = game_soup.find('table', id='player_offense')
            if stat_table is None:
                raise ValueError(f"No 'player_offense' table found at {game_url}")

            # Different for names because th not td; the 'Player' text belongs to
            # header cells, not to a player row
            names = [
                tag.get_text() for tag in stat_table.find_all('th', attrs={'data-stat': 'player'})
                if tag.get_text() != 'Player'
            ]

            # First entry of data_stats is skipped - presumably it is the player
            # column already collected above (TODO confirm against _info.DATA_STATS)
            table_data = {
                stat: [td.get_text() for td in stat_table.find_all('td', attrs={'data-stat': stat})]
                for stat in self.data_stats[1:]
            }

            # Will do rest of cleaning later on, just wanted to not have any NA values
            # in saved files and have standardized team names
            table_data['pass_rating'] = [self._fix_rating(rating) for rating in table_data['pass_rating']]
            table_data['team'] = [convert.initials(team) for team in table_data['team']]

            df = pd.DataFrame(data={'name': names, **table_data})
            self.filing.save_boxscore(df, week)

    def get_season_boxscores(self, pause_seconds: float = 60) -> None:
        """
        Iterates through every boxscore for every game of every week
        Saves to data directory

        Args:
            pause_seconds: how long to sleep between two consecutive weeks so
                requests do not get blocked. No sleep follows the final week.
        """
        total_weeks = len(self.week_pages)

        for position, (weeknum, url) in enumerate(tqdm(self.week_pages.items()), start=1):
            print(f'Scraping boxscores for Week {weeknum}')
            self.get_week_boxscores(weeknum, url)
            print(f'Succesfully scraped boxscores for Week {weeknum}\n')

            # Need to sleep so requests do not get blocked; pointless after the last week
            if position < total_weeks:
                time.sleep(pause_seconds)
        



In [5]:
# Build a scraper with its defaults (no `year` passed, so it falls back to 2022)
# and scrape the whole season in one go.
# This issues live HTTP requests and pauses a minute between weeks, so a full
# run takes a long time - avoid re-running this cell casually.
# NOTE(review): the output recorded below shows a NameError for `week`; nothing in the
# class cells above uses `week` unbound, so it may come from an imported helper or a
# stale kernel - TODO confirm with Restart Kernel -> Run All.
scraper = BoxscoreScraper()
scraper.get_season_boxscores()

  0%|          | 0/18 [00:00<?, ?it/s]

Scraping boxscores for Week 1


NameError: name 'week' is not defined

In [None]:
# week_games_soup = BeautifulSoup(
#     requests.get('https://www.pro-football-reference.com/years/2022/week_1.htm').text,
#     'html.parser'
# )

# root_url: str = 'https://www.pro-football-reference.com/'

# for game in week_games_soup.find_all('div', class_='game_summary expanded nohover')[:1]:
#     game_url: str = f"{root_url}{game.find_all('td', class_='right gamelink')[0].find('a')['href']}"
#     game_soup = BeautifulSoup(
#         requests.get(game_url).text,
#         'html.parser'
#     )

#     stat_table = game_soup.find_all('table', id='player_offense')[0]

#     # Different for names because th not td
#     names = [
#         tag.get_text() for tag in stat_table.find_all('th', attrs={'data-stat': 'player'})
#         if tag.get_text() != 'Player'
#     ]


In [None]:
# ## Rate limit: the site blocks ("jails") the client for an hour at >= 20 requests / minute

# single_url = 'https://www.pro-football-reference.com/boxscores/202209080ram.htm'

# game_soup = BeautifulSoup(
#     # requests.get(game_url).text,
#     requests.get(single_url).text,
#     'html.parser'
# )

# stat_table = game_soup.find_all('table', id='player_offense')[0]

# # Different for names because th not td
# names = [
#     tag.get_text() for tag in stat_table.find_all('th', attrs={'data-stat': 'player'})
#     if tag.get_text() != 'Player'
# ]

# table_data = {
#     stat: [td.get_text() for td in stat_table.find_all('td', attrs={'data-stat': stat})]
#     for stat in DATA_STATS[1:]
# }


# # Will do rest of cleaning later on, just wanted to not have any NA values
# fix_rating = lambda rating_str: float(rating_str) if len(rating_str) else 0.0
# table_data['pass_rating'] = [fix_rating(rating) for rating in table_data['pass_rating']]

# df = pd.DataFrame(data={**{'name': names}, **table_data})
# teams = tuple(sorted(list(df['team'].drop_duplicates())))

In [None]:
# stat_table.find_all('td', attrs={'data-stat': 'team'})

In [None]:


# table_data_with_names = {
#     **{'name': names},
#     **table_data
# }

# pd.DataFrame(data = table_data_with_names)