In [1]:
from scraper import Scraper as BoxscoreScraper

In [2]:
scraper = BoxscoreScraper(year=2023)

In [3]:
scraper.get_season_boxscores()

Beginning scraping for 2023-2024 season



  0%|          | 0/1 [00:00<?, ?it/s]

Scraping boxscores for Week 6
Succesfully scraped boxscores for Week 6



In [1]:
import pandas as pd
import perspective
from filing import Filing
pd.options.display.max_rows = 100

In [2]:
class Cleaning:
    """
    Combine, clean, wrangle, and partition all the boxscores for one season.

    The combined boxscore and snap count frames are loaded once in __init__ through
    the Filing object. merge_fpts_snaps, get_pos_depths and add_depths build on top of
    them and cache their results on the instance (fpts_snaps, pos_depths).
    """

    def __init__(self, **kwargs):
        """
        Keyword Args:
            year: first calendar year of the season, coerced with int() (default 2023).
            site: scoring site, lower-cased (default 'draftkings'). For 'fanduel' the
                `fpts` column is recomputed and the `bonus` column is dropped.
        """

        self.year: int = int(kwargs.get('year', 2023))
        self.season: str = f'{self.year}-{self.year+1}'
        self.site: str = kwargs.get('site', 'draftkings').lower()

        # Initialize filing object
        self.filing = Filing(self.season)

        self.fpts_df = self.filing.combined_boxscores()
        self.snapcounts_df = self.filing.combined_snapcounts()

        # Need to convert if necessary
        if self.site == 'fanduel':
            # HPPR and -2.0 for fumble lost
            self.fpts_df = (self.fpts_df
                            .assign(fpts=lambda df: 0.04*df.pass_yds + 4.0*df.pass_td - 1.0*df.pass_int + 0.1*df.rush_yds + 6.0*df.rush_td + 0.5*df.rec + 0.1*df.rec_yds + 6.0*df.rec_td - 2.0*df.fumbles_lost)
                            .drop('bonus', axis=1)
                           )

            # The bonus is not part of the recalculated fpts, so nothing else to adjust

    def load_team_boxscores(self, team: str) -> pd.DataFrame:
        """Return the entry stored under `team` in filing.load_boxscores()."""
        return self.filing.load_boxscores()[team]

    def load_team_snapcounts(self, team: str) -> pd.DataFrame:
        """Return the entry stored under `team` in filing.load_snapcounts()."""
        return self.filing.load_snapcounts()[team]

    def load_team_adv_stats(self, team: str, category: str) -> pd.DataFrame:
        """Return the entry stored under `category` then `team` in filing.load_advanced_stats()."""
        return self.filing.load_advanced_stats()[category][team]

    @staticmethod
    def _player_week_key(frame: pd.DataFrame) -> pd.Series:
        """Build the '<name>-<week>' key for every row of `frame` without modifying it."""
        return frame['name'].astype(str) + '-' + frame['week'].astype(str)

    # Most useful info in here
    def merge_fpts_snaps(self):
        """
        Inner-join fantasy points with snap counts, one row per shared player-week.

        The result is cached on self.fpts_snaps and returned as-is on later calls.
        self.fpts_df and self.snapcounts_df are left untouched.

        Returns:
            DataFrame with the join columns (name, team, opp, pos, week), the boxscore
            stat columns, `fpts`, `snap_total` and `snap_percent`, on a fresh integer
            index and ordered by the '<name>-<week>' key.
        """

        if hasattr(self, 'fpts_snaps'):
            return self.fpts_snaps

        join_keys = ['name', 'team', 'opp', 'pos', 'week']
        fpts_columns_to_keep = ['targets', 'rec', 'rec_yds', 'rec_td', 'rush_yds', 'rush_td', 'spread', 'total', 'winner']
        snap_columns_to_keep = ['snap_total', 'snap_percent']

        # Dont want to change class dfs --> .assign returns new frames, so the helper
        # key column never leaks into self.fpts_df / self.snapcounts_df
        fpts_keyed = self.fpts_df.assign(index_=self._player_week_key(self.fpts_df))
        snaps_keyed = self.snapcounts_df.assign(index_=self._player_week_key(self.snapcounts_df))

        # Not all people in boxscores in snapcounts, but all people in snapcounts in boxscores
        shared = snaps_keyed['index_'].drop_duplicates()

        fpts = (fpts_keyed
                .loc[fpts_keyed['index_'].isin(shared)]
                .sort_values('index_')
                [join_keys + fpts_columns_to_keep + ['fpts']]
               )

        snaps = (snaps_keyed
                 .sort_values('index_')
                 [join_keys + snap_columns_to_keep]
                )

        # Explicit keys: these are exactly the columns both frames have in common
        self.fpts_snaps = fpts.merge(snaps, on=join_keys)

        return self.fpts_snaps

    def get_pos_depths(self):
        """
        Rank the players of every team at QB, WR, RB and TE.

        Players are ranked per team and position by the aggregated `by` column
        (descending). Ranks inside the configured `depth` range map to a single name;
        every remaining player is appended to a list stored under the first rank past
        that range. The result is cached on self.pos_depths.

        Returns:
            {team: {pos: {rank: name, ..., overflow_rank: [name, ...]}}}. Positions
            that are in the data but not configured below keep an empty dict.
        """

        if hasattr(self, 'pos_depths'):
            return self.pos_depths

        # Reference to self.fpts_snaps, need to figure out injury stuff --> AJ Dillon comes out as RB1 everyway except fpts rn
        df = self.merge_fpts_snaps()

        pos_depths = {
            team: {
                pos: dict()
                for pos in df['pos'].drop_duplicates()
            }
            for team in df['team'].drop_duplicates()
        }

        # Info to get for each position
        # by --> What to determine depth by, becomes more accurate with regression / increased sample size
        # depth --> amount of players to care about
        # agg --> method to determine by
        pos_depth_info = {
            'QB': {
                'by': 'snap_total',
                'depth': range(1,2)
            },
            'WR': {
                'by': 'targets',
                'depth': range(1,4)
            },
            'RB': {
                'by': 'snap_total',
                'depth': range(1,3)
            },
            'TE': {
                'by': 'targets',
                'depth': range(1,3)
            }
        }

        for team in pos_depths:
            # Team dataframe
            tdf = df.loc[df['team'] == team]

            for pos, info in pos_depth_info.items():

                determine_by = info.get('by', 'snap_total')
                # Agg default is sum
                agg_by = info.get('agg', 'sum')

                # Team position dataframe
                tpdf = tdf.loc[tdf['pos'] == pos]
                if tpdf.empty:
                    # Nobody to rank for this team at this position
                    continue

                aggdf = tpdf.groupby('name')[determine_by].agg([agg_by]).sort_values(agg_by, ascending=False)

                # Everyone ranked past the configured range shares this depth
                overflow_depth = list(info['depth'])[-1] + 1

                for i, name in enumerate(aggdf.index):
                    depth_ = i + 1
                    if depth_ in info['depth']:
                        pos_depths[team][pos][depth_] = name
                    else:
                        pos_depths[team][pos].setdefault(overflow_depth, []).append(name)

        # Cache only once fully built, so a failure above cannot leave a partial result behind
        self.pos_depths = pos_depths

        return self.pos_depths

    def add_depths(self):
        """
        Add a `depth` label such as 'WR1' or 'RB3' to the merged fpts/snaps frame.

        Labels come from get_pos_depths(): single-name entries and overflow lists are
        both labelled '<pos><rank>'. Rows with no entry (for example a position that is
        not configured in get_pos_depths) get NaN. The labelled frame replaces
        self.fpts_snaps and is returned.
        """

        pos_depths = self.get_pos_depths()

        # Returns itself if already defined, creates itself if not
        df = self.merge_fpts_snaps().copy(deep=True)

        # (team, pos, name) --> label
        labels = {}
        for team, pos_info in pos_depths.items():
            # Team: {pos: {n: name, ...}}
            for pos, depths in pos_info.items():
                for depth_, entry in depths.items():
                    # A str is a single ranked player, a list holds the lower depth players
                    names = [entry] if isinstance(entry, str) else entry
                    for name_ in names:
                        labels[(team, pos, name_)] = f'{pos}{depth_}'

        missing = float('nan')
        df['depth'] = [
            labels.get(key, missing)
            for key in zip(df['team'], df['pos'], df['name'])
        ]

        self.fpts_snaps = df

        return self.fpts_snaps



In [3]:
cleaning = Cleaning(
    year=2023,
    # site='fanduel'
)

In [4]:
df = cleaning.add_depths()

In [5]:
df['abs-spread'] = abs(df['spread'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 936 entries, 0 to 935
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          936 non-null    object 
 1   team          936 non-null    object 
 2   opp           936 non-null    object 
 3   pos           936 non-null    object 
 4   week          936 non-null    int64  
 5   targets       936 non-null    float64
 6   rec           936 non-null    float64
 7   rec_yds       936 non-null    float64
 8   rec_td        936 non-null    float64
 9   rush_yds      936 non-null    float64
 10  rush_td       936 non-null    float64
 11  spread        936 non-null    int64  
 12  total         936 non-null    int64  
 13  winner        936 non-null    int64  
 14  fpts          936 non-null    float64
 15  snap_total    936 non-null    int64  
 16  snap_percent  936 non-null    float64
 17  depth         936 non-null    object 
 18  abs-spread    936 non-null    

In [None]:
# df_blog = df.sort_values(['team', 'week', 'fpts'], ascending=[True, True, False])
# df_blog.to_csv('../data/perespective-blog-data-2023.csv', index=False)

In [None]:
perspective.PerspectiveWidget(df)

In [None]:
# table = perspective.Table(both)
perspective.PerspectiveWidget(df)

In [None]:
pd.options.display.max_rows = 999

In [8]:
# stats = ['targets', 'rec', 'fpts']
stats = ['rush_td', 'snap_total', 'fpts']
agg_by = ['sum']
rename_columns = [f'{agg}-{stat}' for stat in stats for agg in agg_by]

In [9]:
rename_columns

['sum-rush_td', 'sum-snap_total', 'sum-fpts']

In [10]:
(df
 .loc[df['depth'] == 'RB1']
 .groupby('opp')
 [stats]
 .agg(agg_by)
 .set_axis(rename_columns, axis=1)
 .sort_values('sum-fpts', ascending=False)
 # .sort_values('fpts', ascending=False)
)

Unnamed: 0_level_0,sum-rush_td,sum-snap_total,sum-fpts
opp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DEN,5.0,122,81.2
GB,3.0,169,79.9
CAR,4.0,111,66.3
PIT,1.0,160,65.0
NYG,4.0,135,64.7
ARI,1.0,179,58.9
LAR,2.0,147,50.2
LAC,2.0,161,49.0
SEA,4.0,134,48.0
BAL,0.0,147,45.7


In [None]:
# `both` is never defined in this notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumes the merged fpts/snaps frame `df` is what was meant -- confirm.
df.info()

In [None]:
# One id per game, identical from either team's point of view: the abbreviation that
# sorts last alphabetically always comes first, followed by the week.
# Built with column operations instead of a row-wise apply with positional iloc.
team_sorts_last = df['team'] > df['opp']
first_team = df['team'].where(team_sorts_last, df['opp'])
second_team = df['opp'].where(team_sorts_last, df['team'])
df['game-id'] = first_team.astype(str) + '-' + second_team.astype(str) + '-week' + df['week'].astype(str)

In [None]:
df.info()

In [11]:
def receiving_stats(team: str, **kwargs):
    """
    Receiving numbers for one team, taken from the 'receiving' advanced stats.

    Keyword Args:
        stats: extra column names appended to targets, rec, rec_yds and rec_adot.
        raw: when truthy, return the individual rows (name, pos and the stat columns)
            sorted by the stat columns, descending, on a fresh index. Otherwise return
            the per-name mean of the stat columns, sorted by mean targets and then mean
            rec_adot, descending.
    """
    receiving = cleaning.load_team_adv_stats(team, 'receiving')
    stat_columns = ['targets', 'rec', 'rec_yds', 'rec_adot'] + kwargs.get('stats', [])

    if not kwargs.get('raw', False):
        per_player_means = receiving.groupby('name')[stat_columns].agg(['mean'])
        return per_player_means.sort_values(
            [('targets', 'mean'), ('rec_adot', 'mean')],
            ascending=False,
        )

    stat_rows = receiving[['name', 'pos'] + stat_columns]
    return stat_rows.sort_values(stat_columns, ascending=False).reset_index(drop=True)

In [14]:
receiving_stats('BUF', raw=True)

Unnamed: 0,name,pos,targets,rec,rec_yds,rec_adot
0,Stefon Diggs,WR,13,10,102,10.4
1,Stefon Diggs,WR,12,8,111,10.2
2,Stefon Diggs,WR,7,7,66,4.9
3,Gabe Davis,WR,7,6,92,10.9
4,Dalton Kincaid,TE,6,5,43,6.2
5,James Cook,RB,6,4,17,4.2
6,Dawson Knox,TE,5,3,10,0.8
7,James Cook,RB,4,4,36,-1.0
8,Dalton Kincaid,TE,4,4,26,1.3
9,Dawson Knox,TE,4,3,25,7.0


In [None]:
def get_medians(df: pd.DataFrame):
    """
    Summarise `snap_percent` per `name`.

    Returns a frame indexed by name with `mean` and `median` columns,
    ordered by median from highest to lowest.
    """
    snap_percent_by_name = df.groupby('name')['snap_percent']
    summary = snap_percent_by_name.agg(['mean', 'median'])
    return summary.sort_values('median', ascending=False)

In [None]:
# `nyg_snapcounts` was never defined in this notebook, so the cell failed on a fresh kernel.
# Load the frame explicitly; swap the team key to inspect another team.
# NOTE(review): assumes the snap count frames are keyed by abbreviations such as 'NYG' -- confirm.
nyg_snapcounts = cleaning.load_team_snapcounts('NYG')
get_medians(nyg_snapcounts)

In [None]:
# One row per game (first row seen for each game-id).
wanted_game_columns = ['week', 'team', 'opp', 'home', 'score', 'opp_score', 'spread', 'total']
# `df` only carries the columns kept by Cleaning.merge_fpts_snaps; asking .loc for a
# missing label raises KeyError, so select the wanted columns that are actually there.
game_columns = [column for column in wanted_game_columns if column in df.columns]
games_df: pd.DataFrame = (df
                          .drop_duplicates('game-id')
                          .loc[:, game_columns]
                          .reset_index(drop=True)
                         )
print(f'Total games: {games_df.shape[0]}')

In [None]:
n_bins = 27

In [None]:
(games_df
 ['total']
 .hist(figsize=(15,5), bins=n_bins)
);

In [None]:
(games_df
 ['spread']
 .hist(figsize=(15,5), bins=n_bins)
);

In [None]:
# Need to figure out how to classify position depth --> May run into issues with injuries

In [None]:
[name for name in df['name'].drop_duplicates() if 'Amon' in name]

In [None]:
import numpy as np
import scipy.stats as stats

import itertools

In [None]:

# Core players per team whose fantasy points get correlated in the cells below.
# NOTE(review): 'Amon-Ra St' looks truncated; presumably it matches the spelling stored
# in df['name'] -- confirm before changing it.
team_cores = {
    'DET': [
        'Jared Goff',
        'Amon-Ra St',
        # 'Josh Reynolds',
        # 'Kalif Raymond',
        'Sam LaPorta'
    ],

    'GB': [
        'Jordan Love',
        'Romeo Doubs',
        'Luke Musgrave'

    ]
}
# Per team: the rows of df for its core players, keeping only rows with fpts above 0.0
team_dfs = {
    team: df.loc[(df['team'] == team) & (df['name'].isin(core)) & (df['fpts'] > 0.0)] 
    for team, core in team_cores.items()
}

In [None]:
# Weeks in which every core player of a team has a row, i.e. the games they played together.
# Why it matters: with AJ Brown out, DeVonta Smith could have a much better game than usual
# (skewing results), or a much worse one since a better corner is most likely guarding him.
team_core_together_weeks = {}
for team, team_df in team_dfs.items():
    rows_per_week = team_df.groupby('week')['week'].agg(['count'])
    full_core_weeks = rows_per_week.loc[rows_per_week['count'] == len(team_cores[team])]
    team_core_together_weeks[team] = tuple(full_core_weeks.index)

In [None]:
# {team: {name: [fpts for each week the whole core played together]}}
team_core_fpts = {}
for team, team_df in team_dfs.items():
    shared_weeks = team_core_together_weeks[team]
    fpts_by_player = {}
    for name in team_cores[team]:
        is_player = team_df['name'] == name
        # .item() requires exactly one matching row per player-week
        fpts_by_player[name] = [
            team_df.loc[is_player & (team_df['week'] == week), 'fpts'].item()
            for week in shared_weeks
        ]
    team_core_fpts[team] = fpts_by_player

In [None]:
agg_stats = ['mean']

In [None]:
# {team: {(player_a, player_b): Pearson r of their fpts, rounded to 3 places}},
# with each team's pairs ordered from the highest correlation to the lowest.
team_combo_corrs = {}
for team, core_fpts in team_core_fpts.items():
    pair_corrs = {}
    for combo in itertools.combinations(team_cores[team], 2):
        first_player, second_player = combo
        pearson_r = stats.pearsonr(core_fpts[first_player], core_fpts[second_player])[0]
        pair_corrs[combo] = round(pearson_r, 3)
    ranked_pairs = sorted(pair_corrs.items(), key=lambda item: item[1], reverse=True)
    team_combo_corrs[team] = dict(ranked_pairs)

In [None]:
team_combo_corrs['GB']

In [None]:
team_combo_corrs['DET']

In [None]:
agg_stats = ['rush_yds', 'rush_td', 'rec_yds', 'rec_td']
teams = ['PIT', 'CLE']

In [None]:
# Totals of each stat in `agg_stats` per `opp`, restricted to the opponents in `teams`.
# NOTE(review): this filter used a hard-coded ['NYG', 'SF'] while the `teams` list defined
# in the previous cell went unused; it now follows `teams` -- confirm that was the intent.
agg_df = (df
          .groupby('opp')
          [agg_stats]
          .sum()
          .pipe(lambda df_: df_.loc[df_.index.isin(teams)])
          .assign(
              total_yds=lambda df_: df_.rush_yds + df_.rec_yds,
              total_td=lambda df_: df_.rush_td + df_.rec_td
          )
         )
agg_df

In [None]:
# agg_df.sort_values([
#     'fpts',
#     # 'total_td',
#     # 'total_yds',
#     'rec',
#     # 'rush_td',
#     'rush_yds',
# ], ascending=False)