In [None]:
from scraper import Scraper as BoxscoreScraper

In [None]:
# Scraper instance configured with year=2022 (class comes from the local `scraper` module).
scraper = BoxscoreScraper(year=2022)

In [None]:
# NOTE(review): presumably retrieves every boxscore for the configured year — confirm in scraper.py.
scraper.get_season_boxscores()

In [None]:
from filing import Filing

import pandas as pd
class Cleaning:
    """Load one season's combined boxscores and attach an ``fpts`` column.

    Keyword Args:
        year: First calendar year of the season; coerced with ``int`` and
            defaulting to 2023. The season label is ``"<year>-<year+1>"``.
        hppr: When truthy, each reception is weighted 0.5 instead of 1.0.
            Defaults to False.

    Attributes set by ``__init__``:
        year, season: The parsed year and the derived season label.
        filing: ``Filing`` instance created from the season label.
        raw: ``filing.combined()`` with the computed ``fpts`` column added.
        positions: Whatever ``filing.positions()`` returns.
    """

    def __init__(self, **kwargs):
        """
        Resolve the season, create the filing object, and load the scored frames.
        """
        self.year: int = int(kwargs.get('year', 2023))
        self.season: str = f'{self.year}-{self.year+1}'

        # Filing object for this season
        self.filing = Filing(self.season)

        # Weight applied to each reception: halved when `hppr` is truthy.
        if kwargs.get('hppr', False):
            ppr_coeff = 0.5
        else:
            ppr_coeff = 1.0

        def _score(frame):
            """Weighted sum of the box-score columns (terms kept in their original order)."""
            return (
                0.04*frame.pass_yds
                + 4.0*frame.pass_td
                - 1.0*frame.pass_int
                + 0.1*frame.rush_yds
                + 6.0*frame.rush_td
                + ppr_coeff*frame.rec
                + 0.1*frame.rec_yds
                + 6.0*frame.rec_td
                - 2.0*frame.fumbles_lost
            )

        self.raw: pd.DataFrame = self.filing.combined().assign(fpts=_score)
        self.positions: pd.DataFrame = self.filing.positions()



In [None]:
# year=2022 makes Cleaning derive the season label '2022-2023'; hppr is left at its default (False).
cleaning = Cleaning(year=2022)

In [None]:
# Combined boxscore frame including the computed `fpts` column.
df = cleaning.raw

In [58]:
# Print column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5685 entries, 0 to 5684
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             5685 non-null   object 
 1   team             5685 non-null   object 
 2   pass_cmp         5685 non-null   int64  
 3   pass_att         5685 non-null   int64  
 4   pass_yds         5685 non-null   int64  
 5   pass_td          5685 non-null   int64  
 6   pass_int         5685 non-null   int64  
 7   pass_sacked      5685 non-null   int64  
 8   pass_sacked_yds  5685 non-null   int64  
 9   pass_long        5685 non-null   int64  
 10  pass_rating      5685 non-null   float64
 11  rush_att         5685 non-null   int64  
 12  rush_yds         5685 non-null   int64  
 13  rush_td          5685 non-null   int64  
 14  rush_long        5685 non-null   int64  
 15  targets          5685 non-null   int64  
 16  rec              5685 non-null   int64  
 17  rec_yds       

In [118]:
# Rows with pos == 'RB' and opp == 'NYJ' that reached at least 5.0 fpts, highest fpts first.
rb_against_nyj = (
    df.loc[
        lambda d: d['pos'].eq('RB') & d['opp'].eq('NYJ') & d['fpts'].ge(5.0),
        ['name', 'team', 'rush_att', 'rush_yds', 'rush_td', 'targets', 'rec', 'rec_yds', 'rec_td', 'fpts'],
    ]
    .sort_values('fpts', ascending=False)
)

# rb_against_nyj

In [56]:
# Rows with pos == 'WR' and opp == 'NYJ' that reached at least 5.0 fpts, highest fpts first.
wr_against_nyj = (
    df.loc[
        lambda d: d['pos'].eq('WR') & d['opp'].eq('NYJ') & d['fpts'].ge(5.0),
        ['name', 'team', 'targets', 'rec', 'rec_yds', 'rec_td', 'fpts'],
    ]
    .sort_values('fpts', ascending=False)
)

# wr_against_nyj

Unnamed: 0,name,team,targets,rec,rec_yds,rec_td,fpts
2466,Amari Cooper,CLE,10,9,101,1,25.1
2697,Devin Duvernay,BAL,4,4,54,2,21.4
4737,Jakobi Meyers,NE,13,9,60,1,21.0
3464,Tyler Boyd,CIN,5,4,105,1,20.5
1936,Justin Jefferson,MIN,11,7,45,1,18.6
1061,Allen Lazard,GB,9,4,76,1,17.6
3271,Jerry Jeudy,DEN,11,7,96,0,16.6
4467,George Pickens,PIT,8,6,102,0,16.2
3831,Amon-Ra St,DET,10,7,76,0,15.2
410,Stefon Diggs,BUF,10,5,93,0,14.3


In [96]:
# One row-slice of `df` per team code.
# NOTE: the name `min` shadows Python's builtin min() from this cell onward.
buf, nyj, min, gb = (
    df.loc[df['team'] == code] for code in ('BUF', 'NYJ', 'MIN', 'GB')
)

In [139]:
# Distinct player names among the rows with team == 'NYJ' (first occurrence of each is kept).
nyj['name'].drop_duplicates()

39           Mike White
40           Joe Flacco
41       Zonovan Knight
42       Michael Carter
43       Garrett Wilson
44         Elijah Moore
45          Denzel Mims
46        Tyler Conklin
47      Braxton Berrios
48          Corey Davis
49            CJ Uzomah
185          Ty Johnson
415         Zach Wilson
417      James Robinson
420        Ashtyn Davis
424          Jeff Smith
1049        Breece Hall
1509        Braden Mann
2460     Jeremy Ruckert
2711     Lawrence Cager
4842    Chris Streveler
Name: name, dtype: object

In [None]:
import numpy as np
import scipy.stats as stats

import itertools

In [113]:
# GB players whose weekly fpts are compared pairwise in the following cells.
gb_core = ['Aaron Rodgers', 'Aaron Jones', 'Allen Lazard', 'Randall Cobb']  # 'Robert Tonyan' currently excluded

# Every row of the GB slice that belongs to one of those players.
gb_core_df = gb[gb['name'].isin(gb_core)]

In [114]:
# Weeks whose row count in gb_core_df equals the number of core players,
# i.e. (assuming one row per player per week) weeks in which all of them appear.
gb_together_weeks = tuple(gb_core_df
                           .groupby('week')
                           ['week']
                           .agg(['count'])
                           # Threshold is the size of gb_core; the previous len(nyj_core) referred to
                           # another team's list that is only defined in a later cell.
                           .pipe(lambda df_: df_.loc[df_['count'] == len(gb_core)])
                           .index
                          )
gb_together_weeks

(2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 12.0, 13.0, 15.0, 16.0, 17.0, 18.0)

In [115]:
# player -> list of that player's fpts, one entry per week in gb_together_weeks (same order).
# .item() raises unless exactly one row matches the (player, week) pair.
gb_core_values = {
    player: [
        gb_core_df.loc[
            (gb_core_df['name'] == player) & (gb_core_df['week'] == wk),
            'fpts',
        ].item()
        for wk in gb_together_weeks
    ]
    for player in gb_core
}

In [117]:
# Pearson r of weekly fpts for every unordered pair of gb_core players.
gb_combo_corrs = {
    (first, second): stats.pearsonr(gb_core_values[first], gb_core_values[second])[0]
    for first, second in itertools.combinations(gb_core, 2)
}

# Display the pairs from most to least positively correlated.
dict(sorted(gb_combo_corrs.items(), key=lambda pair: pair[1], reverse=True))

{('Aaron Rodgers', 'Randall Cobb'): 0.5230416863584136,
 ('Aaron Rodgers', 'Allen Lazard'): 0.34835103454734073,
 ('Aaron Jones', 'Randall Cobb'): 0.18189818131186036,
 ('Aaron Rodgers', 'Aaron Jones'): 0.04739933024705792,
 ('Allen Lazard', 'Randall Cobb'): -0.12936128506175001,
 ('Aaron Jones', 'Allen Lazard'): -0.5555532940787442}

In [154]:
# NYJ players whose weekly fpts are summarized here and compared pairwise below.
nyj_core = [
    'Zach Wilson',
    'Garrett Wilson',
    'Michael Carter',
    'Elijah Moore',
    'Tyler Conklin'
]

# Every row of the NYJ slice that belongs to one of those players.
nyj_core_df = nyj.loc[(nyj['name'].isin(nyj_core))]

# Per-player mean / median / sample std of fpts, ordered by median.
# String aliases are used instead of np.mean/np.median/np.std: pandas currently
# substitutes its own groupby methods for those NumPy callables (emitting a
# FutureWarning) and will call them directly in a future version, which would
# change `std` from ddof=1 to NumPy's default ddof=0.
(nyj_core_df
 .groupby('name')
 ['fpts']
 .agg(['mean', 'median', 'std'])
 .sort_values('median', ascending=False)
)

Unnamed: 0_level_0,mean,median,std
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Garrett Wilson,12.688235,12.0,8.567809
Zach Wilson,11.768889,10.56,7.196215
Tyler Conklin,7.735294,7.4,6.537196
Michael Carter,7.875,5.65,5.128938
Elijah Moore,6.292857,5.15,4.19312


In [155]:
# Weeks whose row count in nyj_core_df equals the number of core players.
nyj_together_weeks = tuple(
    nyj_core_df
    .groupby('week')['week']
    .count()
    .loc[lambda counts: counts == len(nyj_core)]
    .index
)
nyj_together_weeks

(4.0, 5.0, 8.0, 11.0, 15.0, 16.0)

In [156]:
# player -> list of that player's fpts, one entry per week in nyj_together_weeks (same order).
# .item() raises unless exactly one row matches the (player, week) pair.
nyj_core_values = {
    player: [
        nyj_core_df.loc[
            (nyj_core_df['name'] == player) & (nyj_core_df['week'] == wk),
            'fpts',
        ].item()
        for wk in nyj_together_weeks
    ]
    for player in nyj_core
}

In [157]:
# Pearson r of weekly fpts for every unordered pair of nyj_core players.
nyj_combo_corrs = {
    (first, second): stats.pearsonr(nyj_core_values[first], nyj_core_values[second])[0]
    for first, second in itertools.combinations(nyj_core, 2)
}

# Display the pairs from most to least positively correlated.
dict(sorted(nyj_combo_corrs.items(), key=lambda pair: pair[1], reverse=True))

{('Garrett Wilson', 'Tyler Conklin'): 0.6582119655786053,
 ('Zach Wilson', 'Garrett Wilson'): 0.6563318034008577,
 ('Zach Wilson', 'Elijah Moore'): 0.34820511380417646,
 ('Zach Wilson', 'Tyler Conklin'): 0.2652629426894818,
 ('Michael Carter', 'Tyler Conklin'): 0.036061025484311254,
 ('Garrett Wilson', 'Michael Carter'): 0.004286989530080158,
 ('Zach Wilson', 'Michael Carter'): -0.033842838829028304,
 ('Garrett Wilson', 'Elijah Moore'): -0.09217581227573082,
 ('Elijah Moore', 'Tyler Conklin'): -0.4984651001081421,
 ('Michael Carter', 'Elijah Moore'): -0.6111886615549306}

In [None]:
# Pearson r of weekly fpts for every unordered pair of gb_core players.
# Pairs are drawn from gb_core so that each name is a key of gb_core_values;
# drawing them from nyj_core made the first lookup raise KeyError.
gb_combo_corrs = {
    combo: stats.pearsonr(gb_core_values[combo[0]], gb_core_values[combo[1]])[0]
    for combo in itertools.combinations(gb_core, 2)
}

# Display the pairs from most to least positively correlated.
dict(sorted(gb_combo_corrs.items(), key=lambda item: item[1], reverse=True))

In [62]:
# MIN players whose weekly fpts are compared pairwise in the following cells.
min_core = ['Justin Jefferson', 'Dalvin Cook', 'TJ Hockenson', 'Kirk Cousins', 'Adam Thielen']

# Every row of the MIN slice for those players (`min` is the team frame here, not the builtin).
min_core_df = min[min['name'].isin(min_core)]

In [63]:
# Weeks whose row count in min_core_df equals the number of core players.
min_together_weeks = tuple(
    min_core_df
    .groupby('week')['week']
    .count()
    .loc[lambda counts: counts == len(min_core)]
    .index
)

In [64]:
# player -> list of that player's fpts, one entry per week in min_together_weeks (same order).
# .item() raises unless exactly one row matches the (player, week) pair.
min_core_values = {
    player: [
        min_core_df.loc[
            (min_core_df['name'] == player) & (min_core_df['week'] == wk),
            'fpts',
        ].item()
        for wk in min_together_weeks
    ]
    for player in min_core
}

In [153]:
# Pearson r of weekly fpts for every unordered pair of min_core players.
min_combo_corrs = {
    (first, second): stats.pearsonr(min_core_values[first], min_core_values[second])[0]
    for first, second in itertools.combinations(min_core, 2)
}

# Display the pairs from most to least positively correlated.
dict(sorted(min_combo_corrs.items(), key=lambda pair: pair[1], reverse=True))

{('Justin Jefferson', 'Kirk Cousins'): 0.7891543411540461,
 ('Justin Jefferson', 'Dalvin Cook'): 0.6047931286297824,
 ('Kirk Cousins', 'Adam Thielen'): 0.5490438704384552,
 ('Justin Jefferson', 'Adam Thielen'): 0.5294422443085328,
 ('Dalvin Cook', 'Kirk Cousins'): 0.5003357817949433,
 ('Justin Jefferson', 'TJ Hockenson'): 0.3869530406397159,
 ('TJ Hockenson', 'Kirk Cousins'): 0.28799418139025634,
 ('Dalvin Cook', 'Adam Thielen'): 0.1106548495986295,
 ('Dalvin Cook', 'TJ Hockenson'): -0.07555040805200897,
 ('TJ Hockenson', 'Adam Thielen'): -0.24850056241652038}

In [None]:
# Spot check of a single pair: first element of pearsonr's result (the correlation coefficient).
stats.pearsonr(min_core_values['Justin Jefferson'], min_core_values['Dalvin Cook'])[0]

In [136]:
# BUF players whose weekly fpts are summarized here and compared pairwise below.
buf_core = [
    'Josh Allen',
    'Stefon Diggs',
    'Dawson Knox',
    'Gabriel Davis',
    'James Cook',
    'Isaiah McKenzie',
    'Devin Singletary',
    'Khalil Shakir'
]

# Every row of the BUF slice that belongs to one of those players.
buf_core_df = buf.loc[(buf['name'].isin(buf_core))]

# Per-player mean / median / sample std of fpts, ordered by median.
# String aliases are used instead of np.mean/np.median/np.std: pandas currently
# substitutes its own groupby methods for those NumPy callables (emitting a
# FutureWarning) and will call them directly in a future version, which would
# change `std` from ddof=1 to NumPy's default ddof=0.
(buf_core_df
 .groupby('name')
 ['fpts']
 .agg(['mean', 'median', 'std'])
 .sort_values('median', ascending=False)
)

Unnamed: 0_level_0,mean,median,std
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Josh Allen,25.345,25.44,6.464733
Stefon Diggs,19.7875,21.95,10.081923
Devin Singletary,11.11875,9.65,6.054994
Gabriel Davis,11.44,9.5,7.7987
Dawson Knox,9.046667,9.0,5.556061
James Cook,6.60625,6.2,5.556674
Isaiah McKenzie,7.986667,5.8,6.564174
Khalil Shakir,3.21,1.8,4.907013


In [130]:
# Weeks whose row count in buf_core_df equals the number of core players.
buf_together_weeks = tuple(
    buf_core_df
    .groupby('week')['week']
    .count()
    .loc[lambda counts: counts == len(buf_core)]
    .index
)

# player -> list of that player's fpts, one entry per week in buf_together_weeks (same order).
# .item() raises unless exactly one row matches the (player, week) pair.
buf_core_values = {
    player: [
        buf_core_df.loc[
            (buf_core_df['name'] == player) & (buf_core_df['week'] == wk),
            'fpts',
        ].item()
        for wk in buf_together_weeks
    ]
    for player in buf_core
}

In [152]:
# Pearson r of weekly fpts for every unordered pair of buf_core players.
# (To narrow the view to one player, add e.g. `if 'Josh Allen' in (first, second)`.)
buf_combo_corrs = {
    (first, second): stats.pearsonr(buf_core_values[first], buf_core_values[second])[0]
    for first, second in itertools.combinations(buf_core, 2)
}

# Display the pairs from most to least positively correlated.
dict(sorted(buf_combo_corrs.items(), key=lambda pair: pair[1], reverse=True))

{('Josh Allen', 'Dawson Knox'): 0.5832355663541106,
 ('Gabriel Davis', 'Devin Singletary'): 0.5605418929396544,
 ('Josh Allen', 'Khalil Shakir'): 0.5022843324341338,
 ('Stefon Diggs', 'Gabriel Davis'): 0.4792517800111933,
 ('James Cook', 'Devin Singletary'): 0.2670018086428258,
 ('Josh Allen', 'Devin Singletary'): 0.2515638864207145,
 ('Dawson Knox', 'Gabriel Davis'): 0.1867769611368109,
 ('Josh Allen', 'Gabriel Davis'): 0.16919168331338916,
 ('Isaiah McKenzie', 'Khalil Shakir'): 0.16260636480438118,
 ('Stefon Diggs', 'Khalil Shakir'): 0.1326750685342753,
 ('Dawson Knox', 'Devin Singletary'): 0.10159695307110637,
 ('Stefon Diggs', 'Isaiah McKenzie'): 0.08742814635421764,
 ('Josh Allen', 'Stefon Diggs'): 0.03947466383993915,
 ('Gabriel Davis', 'James Cook'): -0.0606348552239777,
 ('Devin Singletary', 'Khalil Shakir'): -0.07348263443374012,
 ('Stefon Diggs', 'Devin Singletary'): -0.11184508529576606,
 ('Gabriel Davis', 'Khalil Shakir'): -0.13448640665369999,
 ('Isaiah McKenzie', 'Devin S

In [None]:
# Spot check of a single pair: first element of pearsonr's result (the correlation coefficient).
stats.pearsonr(buf_core_values['Josh Allen'], buf_core_values['Stefon Diggs'])[0]

In [None]:
# Re-print the frame's columns and dtypes before building the per-opponent aggregates.
df.info()

In [None]:
# Opponent codes to keep in the aggregate, and the columns that get summed per opponent.
teams = ['NYJ', 'BUF']
agg_stats = ['rush_td', 'rush_yds', 'rec_td', 'rec_yds', 'rec', 'fpts']

In [None]:
# Per-opponent sums of the agg_stats columns, restricted to the opponents in `teams`
# and indexed by 'opp', plus combined rushing + receiving yards and touchdowns.
agg_df = (
    df
    .groupby('opp')[agg_stats]
    .sum()
    .loc[lambda totals: totals.index.isin(teams)]
    .assign(
        total_yds=lambda t: t['rush_yds'] + t['rec_yds'],
        total_td=lambda t: t['rush_td'] + t['rec_td'],
    )
)

In [None]:
# Order by fpts, then rec, then rush_yds, all descending.
# Alternative keys that can be swapped in: 'total_td', 'total_yds', 'rush_td'.
agg_df.sort_values(by=['fpts', 'rec', 'rush_yds'], ascending=False)