In [1]:
import os
import glob
import time

import requests
import pandas as pd
pd.set_option('display.max_columns', 100)

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
import templates
import convert

from _info import DATA_COLUMNS

In [3]:
class Filing:
    # Class to take care of filing each dataframe + additional functionality later on
    def __init__(self, season: str):

        self.season = season
        self.data_dir = os.getcwd().replace('src', 'data')
        self.season_dir = os.path.join(self.data_dir, season)
        self.boxscores_dir = os.path.join(self.season_dir, 'boxscores')

        # Check to make sure if directories exist, if not create them
        for directory in (self.data_dir, self.season_dir, self.boxscores_dir):
            if not os.path.exists(directory):
                os.mkdir(directory)


    def save_boxscore(self, df: pd.DataFrame, weeknum: int) -> None:
        """
        Saves boxscore as csv (later on can configure different formats)
        Saves in form of away-home-week#.csv
        """
        teams = sorted(list(df['team'].drop_duplicates()))
        filename = f'{"-".join(teams)}-week{weeknum}.csv'
        
        fpath = os.path.join(self.boxscores_dir, filename)
        df.to_csv(fpath, index=False)

        return

    
    def combined(self, **kwargs) -> pd.DataFrame:
        """
        Will create massive dataset if not created yet and save it
        If master dataset already created, will return csv on file
        This will not be cleaned by default
        """

        self.combined_fpath: str = os.path.join(self.season_dir, f'{self.season}-raw.csv')

        # If exists return and exit
        if os.path.exists(self.combined_fpath):
            return pd.read_csv(self.combined_fpath)

        combined =  (pd
                     .concat([ pd.read_csv(file) for file in glob.glob(self.boxscores_dir + '/*.csv') ])
                     .reset_index(drop=True)
                    )

        combined.to_csv(self.combined_fpath, index=False)

        return combined

        

In [4]:
class BoxscoreScraper:
    """
    Scrapes the `player_offense` table of every regular-season game of one season
    and hands each game's table to a Filing object to be saved as csv
    """

    def __init__(self, **kwargs):
        """
        Keyword args:
            year: starting year of the season, anything int() accepts (default 2022)
        """

        self.year: int = int(kwargs.get('year', 2022))
        self.season: str = f'{self.year}-{self.year+1}'

        # Initialize filing object
        self.filing = Filing(self.season)

        num_weeks = self._regular_season_weeks(self.year)

        # Going to start with just regular season
        self.week_pages = {
            week: templates.week_url(self.year, week)
            for week in range(1, num_weeks + 1)
        }

        self.data_columns = DATA_COLUMNS

    @staticmethod
    def _regular_season_weeks(year: int) -> int:
        """
        Number of regular-season weeks for the season starting in `year`
        The NFL schedule grew to 18 weeks with the 2021 season; earlier seasons are
        treated as 17 weeks
        """
        return 18 if year >= 2021 else 17

    @staticmethod
    def _fetch(url: str) -> str:
        """
        Returns the body of `url` as text
        Raises requests.HTTPError on a 4xx/5xx answer so a blocked or missing page is
        not parsed as if it were real data
        """
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text

    def get_week_boxscores(self, week: int, url: str):
        """
        Returns every boxscore for given week and saves it to directory

        week: week number, only used to name the saved files
        url: page listing the games of that week

        Raises:
            ValueError: if a game's table does not contain exactly two distinct teams
        """
        # Local import: urljoin is only needed here
        from urllib.parse import urljoin

        root_url: str = 'https://www.pro-football-reference.com/'

        # Seems a little redundant having self.week_pages rn
        week_games_soup = BeautifulSoup(self._fetch(url), 'html.parser')

        for game in week_games_soup.find_all('div', class_='game_summary expanded nohover'):

            href = game.find_all('td', class_='right gamelink')[0].find('a')['href']
            # urljoin copes with the href starting with or without a slash (no `//` in the path)
            game_url: str = urljoin(root_url, href)
            game_soup = BeautifulSoup(self._fetch(game_url), 'html.parser')

            stat_table = game_soup.find_all('table', id='player_offense')[0]

            # Different for names because th not td
            names = [
                tag.get_text() for tag in stat_table.find_all('th', attrs={'data-stat': 'player'})
                if tag.get_text() != 'Player'
            ]

            # First entry of data_columns is skipped: names are collected separately above
            table_data = {
                stat: [td.get_text() for td in stat_table.find_all('td', attrs={'data-stat': stat})]
                for stat in self.data_columns[1:]
            }

            # Will do rest of cleaning later on, just wanted to not have any NA values in saved files and have standardized team names
            table_data['pass_rating'] = [
                float(rating) if len(rating) else 0.0
                for rating in table_data['pass_rating']
            ]
            table_data['team'] = [convert.initials(team) for team in table_data['team']]

            # Each row's opponent is simply the other team appearing in the table
            teams = sorted(set(table_data['team']))
            if len(teams) != 2:
                raise ValueError(
                    f'Expected exactly 2 teams in {game_url}, found {len(teams)}: {teams}'
                )
            opponent_of = {teams[0]: teams[1], teams[1]: teams[0]}
            table_data['opp'] = [opponent_of[team] for team in table_data['team']]

            df = pd.DataFrame(data={'name': names, **table_data})
            self.filing.save_boxscore(df, week)

        return

    def get_season_boxscores(self) -> None:
        """
        Iterates through every boxscore for every game of every week
        Saves to data directory
        """
        # Add check for if already done
        for weeknum, url in tqdm(self.week_pages.items()):
            print(f'Scraping boxscores for Week {weeknum}')
            self.get_week_boxscores(weeknum, url)
            # Need to sleep for 60 seconds so requests do not get blocked
            time.sleep(60)
            print(f'Succesfully scraped boxscores for Week {weeknum}\n')

        return
        



In [None]:
# Scrape the default season (year=2022) and save one csv per game.
# Slow: get_season_boxscores sleeps 60 seconds after every week it scrapes.
scraper = BoxscoreScraper()
scraper.get_season_boxscores()

In [6]:
class Cleaning:

    def __init__(self, **kwargs):
        """
        Holds the functionality to combine, clean, wrangle, and partition all the boxscores

        Keyword args:
            year: starting year of the season, anything int() accepts (default 2022)
        """

        start_year = int(kwargs.get('year', 2022))
        self.year: int = start_year
        self.season: str = '{}-{}'.format(start_year, start_year + 1)

        # The filing object knows where the season lives on disk;
        # combined() gives back the raw (uncleaned) dataset for the whole season
        self.filing = Filing(self.season)
        self.raw: pd.DataFrame = self.filing.combined()



In [7]:
# Default season (year=2022); constructing it loads the combined raw csv, building it first if missing
cleaning = Cleaning()

In [8]:
# Shorthand for the raw combined frame (same object as cleaning.raw, not a copy)
df = cleaning.raw

In [9]:
# Check column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5685 entries, 0 to 5684
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             5685 non-null   object 
 1   team             5685 non-null   object 
 2   pass_cmp         5685 non-null   int64  
 3   pass_att         5685 non-null   int64  
 4   pass_yds         5685 non-null   int64  
 5   pass_td          5685 non-null   int64  
 6   pass_int         5685 non-null   int64  
 7   pass_sacked      5685 non-null   int64  
 8   pass_sacked_yds  5685 non-null   int64  
 9   pass_long        5685 non-null   int64  
 10  pass_rating      5685 non-null   float64
 11  rush_att         5685 non-null   int64  
 12  rush_yds         5685 non-null   int64  
 13  rush_td          5685 non-null   int64  
 14  rush_long        5685 non-null   int64  
 15  targets          5685 non-null   int64  
 16  rec              5685 non-null   int64  
 17  rec_yds       

In [29]:
# Team initials kept when the aggregated frame is filtered on its 'opp' column
late_slate = ['GB', 'CHI', 'LV', 'DEN', 'MIA', 'LAC', 'PHI', 'NE', 'LAR', 'SEA']

# Stat columns that get summed per opponent
agg_stats = [
    'rush_td',
    'rush_yds',
    'rec_td',
    'rec_yds',
    'rec',
    'pass_sacked',
]

In [30]:
# Sum every stat in agg_stats per opponent, keep only the late-slate teams,
# then add combined rushing + receiving totals
agg_df = (
    df
    .groupby('opp')[agg_stats]
    .sum()
    .loc[lambda totals: totals.index.isin(late_slate)]
    .assign(
        total_yds=lambda totals: totals['rush_yds'] + totals['rec_yds'],
        total_td=lambda totals: totals['rush_td'] + totals['rec_td'],
    )
)

In [39]:
# Order by rec, ties broken by rush_yds, both descending
# (other keys tried here: total_td, total_yds, rush_td)
agg_df.sort_values(by=['rec', 'rush_yds'], ascending=False)

Unnamed: 0_level_0,rush_td,rush_yds,rec_td,rec_yds,rec,pass_sacked,total_yds,total_td
opp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MIA,15,1751,27,4282,416,40,6033,42
LV,20,2087,25,4321,399,27,6408,45
DEN,12,1866,20,3819,392,36,5685,32
LAR,12,1956,23,4092,375,38,6048,35
NE,7,1793,28,4029,364,54,5822,35
PHI,15,2068,22,3548,350,70,5616,37
SEA,21,2554,23,3891,343,45,6445,44
CHI,31,2674,22,3840,323,20,6514,53
GB,18,2372,22,3553,314,34,5925,40
LAC,17,2478,24,3693,310,40,6171,41
