# To do

Requires new scraping:
* Add actual injury info, dummy variables for different injuries in the previous year(s)
* Need some sort of analysis for rookies
    * Does the team they were drafted by have a star at the same position?
    * Where were they picked in the draft?
    * Winning pct of team picked?
    * Typical college metrics including injury history
* Need some sort of comparison with ADP eventually
    * Look at ADP this year, ADP following year type of thing. Delta ADP for risers, sinkers
    * Look at second half performance and how it affects ADP??
* New coach flag

Do-able with current data:
* Can get a flag for backup RB
* "On a good team flag," maybe it's the winning percentage of the team, or a flag on > .500 winning pct


# Import packages and define functions

In [1]:
###################### Import Packages #############################################
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import statsmodels.formula.api as smf

####################################### Functions ######################################

#Pull data from Pro Football Reference
def data_assembly(start_year, current_year):
    database = []
    for x in range(start_year, current_year):
        page = requests.get("https://www.pro-football-reference.com/years/%d/fantasy.htm" % x)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find_all('table')[0]
        df = pd.read_html(str(table), header = 1)
        dfyear = df[0]
        dfyear['Year'] = x
        if x == start_year:
            database = dfyear
        else: database = database.append(dfyear, ignore_index = True)

    #Rename columns, eliminate duplicate column titles as rows
    database = database.rename(columns = {'Player':'Name', 'Att':'PaAtt', 'Yds':'PaYds', 'TD':'PaTD','Att.1':'RuAtt', 'Yds.1':'RuYds', 'TD.1':'RuTD', 'Y/A':'RuY/A', 'Y/R':'ReYds/R', 'Att.2':'ReAtt', 'Yds.2':'ReYds', 'TD.2':'ReTD'})
    database = database[database.Rk != 'Rk']
    
    # clean up artifacts at the end of names
    database['Name'] = database['Name'].apply(lambda x: x[0:len(x)-1] if x[len(x)-1] == '+' else x)
    database['Name'] = database['Name'].apply(lambda x: x[0:len(x)-1] if x[len(x)-1] == '*' else x)
    return database

# For shifting columns to prepare across years
def shift_col(frame, new_name, col_to_shift, magnitude):
    frame1 = frame.sort_values(['Name', 'Year']).reset_index(drop = True)
    frame1[new_name] = frame1[col_to_shift].shift(magnitude)
    for i in range(len(frame1) - 1):
        if frame1.loc[i, 'Name'] != frame1.loc[i+1, 'Name']:
            if magnitude == -1:
                frame1.loc[i, new_name] = np.nan
            elif magnitude == 1:
                frame1.loc[i+1, new_name] = np.nan
    return frame1

# To make a flag for switching teams
def new_team(tm_prev, tm_curr):
    if pd.notna(tm_prev) and pd.notna(tm_curr):
        if (tm_prev != tm_curr) or (tm_curr == '2TM'):
            val = 1
        else:
            val = 0
    else:
        val = np.nan
    return val


# Assemble the database and create metrics

In [None]:
############## Assemble the database #########################################
database = data_assembly(2012, 2019)   # pull based on years
database.to_csv('ffb_db.csv')          # save as csv just in case

In [136]:
# change columns to floats
cols_to_change = ['VBD'
                  , 'PosRank'
                  , 'Age'
                  , 'PPR'
                  , 'GS'
                  , 'G'
                  , 'PaTD'
                  , 'Cmp'
                  , 'Int'
                  , 'RuTD'
                  , 'ReTD'
                  , 'PaYds'
                  , 'RuYds'
                  , 'ReYds'
                 , 'Rec'
                 , 'RuY/A'
                , 'ReYds/R'
                 , 'PaAtt'
                 , 'RuAtt'
                 , 'Tgt']
for col in cols_to_change:
    database[col] = database[col].astype(float)
    
# create new metrics
shifted = shift_col(database, 'pts_next_year', 'PPR', -1)    # create pts_next_year column for regressions
shifted = shift_col(shifted, 'g_next_year', 'G', -1)        # create G_next_year col for regressions
shifted = shift_col(shifted, 'G_prev_year', 'G', 1)          # create games prev year for injury history
shifted = shift_col(shifted, 'G_prev_year_prev_year', 'G_prev_year', 1)  # create games prev year for injury history
shifted['ppg_next_year'] = shifted['pts_next_year'] / shifted['g_next_year']    # create ppg next year for y var
shifted['ppg_this_year'] = shifted['PPR'] / shifted['G']                              # current year ppg
shifted['delta_ppg'] = shifted['ppg_next_year'] - shifted['ppg_this_year']

shifted['season_frac_1'] = shifted['G'] / 16                                          # 1 year injury history
shifted['season_frac_2'] = (shifted['G'] + shifted['G_prev_year']) / 32               # 2 year injury history
shifted['season_frac_3'] = (shifted['G'] + shifted['G_prev_year_prev_year']) / 48     # 3 year injury history
shifted['ReYds_per_R'] = shifted['ReYds/R']                                           # ReYds per reception
shifted['RuYds_per_A'] = shifted['RuY/A']                                             # RuYds per attempt
shifted['RuTD_per_Att'] = shifted['RuTD'] / shifted['RuAtt']                          # RuTds per attempt
shifted['PaYds_per_PaAtt'] = shifted['PaYds'] / shifted['PaAtt']                      # PaYds per attempt
shifted['PaTD_per_PaAtt'] = shifted['PaTD'] / shifted['PaAtt']                        # PaTds per attempt
shifted['Cmp_per_PaAtt'] = shifted['Cmp'] / shifted['PaAtt']                        # Completions per attempt
shifted['Int_per_PaAtt'] = shifted['Int'] / shifted['PaAtt']                        # Interceptions per attempt
shifted['Rec_per_tgt'] = shifted['Rec'] / shifted['Tgt']                # Receptions per target
shifted['ReTD_per_rec'] = shifted['ReTD'] / shifted['Rec']              # ReTD's per reception
shifted['start_frac'] = shifted['GS'] / shifted['G']

# Create some per game metrics
for metric in ['Tgt', 'PaAtt', 'RuAtt']:
    new_str = metric + '_per_game'
    shifted[new_str] = shifted[metric] / shifted['G']

# Create new team flag
shifted = shift_col(shifted, 'Tm_prev_year', 'Tm', 1)
shifted['Tm_change_flag'] = shifted.apply(\
                                             lambda x: new_team(x['Tm_prev_year'], x['Tm']), axis = 1)

# Create share of team's fantasy points
gb = shifted.groupby(['Tm', 'Year']).sum().reset_index()[['Tm', 'Year', 'PPR']]
gb = gb[(gb.Tm != '2TM') & (gb.Tm != '3TM') & (gb.Tm != '4TM')]

shifted['Fant_Share'] = 0
for i in range(len(shifted)):
    if 'TM' not in shifted.Tm[i]:
        temp_tm = shifted.loc[i, 'Tm']
        temp_yr = shifted.loc[i, 'Year']
        gb_val = gb.loc[(gb.Tm == temp_tm) & (gb.Year == temp_yr), 'PPR']
        shifted.loc[i, 'Fant_Share'] = shifted.loc[i, 'PPR'] / gb_val.iloc[0]
    else:
        shifted.loc[i, 'Fant_Share'] = np.nan

In [138]:
################ Seperate database by position ####################################################
pos_dict = {}
for pos in shifted.FantPos.unique():
    pos_dict[pos] = shifted.loc[shifted.FantPos == pos].reset_index(drop = True)
    
    # center age to prepare for power relationship to reduce multicollinearity
    pos_dict[pos]['Age'] = pos_dict[pos]['Age'] - np.mean(pos_dict[pos]['Age']) 
    pos_dict[pos]['agesq'] = pos_dict[pos]['Age'] * pos_dict[pos]['Age']

# PPG Regressions

In [134]:
# Toggles
pos_dict_reg = pos_dict

#pos_dict_reg = {}
#for pos in pos_dict:
 #   pos_dict_reg[pos] = pos_dict[pos].loc[pd.notna(pos_dict[pos].VBD)].reset_index(drop = True)

# Non-position specific variables
all_cols = ['Age'
            , 'agesq'
            , 'season_frac_2'
            , 'Tm_change_flag'
            , 'PosRank'
            #, 'start_frac'
            #, 'Fant_Share'
            #, 'VBD'
           ]

# Position specific variables
xcols_dict = {
    'TE': ['ReTD_per_rec',
           'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'WR': ['ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'QB': ['RuAtt_per_game', 'RuYds_per_A', 'RuTD_per_Att',\
           'PaYds_per_PaAtt', 'PaTD_per_PaAtt', 'PaAtt_per_game', 'Cmp_per_PaAtt', 'Int_per_PaAtt'],
    'RB': ['RuTD_per_Att','RuAtt_per_game', 'RuYds_per_A',\
           'ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
}

for pos in xcols_dict:
    print(pos)
    ycol = 'ppg_next_year'
    xcols = xcols_dict[pos] + all_cols
    frame1 = pos_dict_reg[pos]
    frame = frame1[['Name', ycol] + xcols].dropna()
    xform = ' + '.join(xcols)
    formula = ycol + " ~ " + xform
    model = smf.ols(formula, frame)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())
    print('\n')

TE
                            OLS Regression Results                            
Dep. Variable:          ppg_next_year   R-squared:                       0.473
Model:                            OLS   Adj. R-squared:                  0.457
Method:                 Least Squares   F-statistic:                     39.94
Date:                Sun, 04 Aug 2019   Prob (F-statistic):           6.61e-32
Time:                        16:29:53   Log-Likelihood:                -799.42
No. Observations:                 307   AIC:                             1619.
Df Residuals:                     297   BIC:                             1656.
Df Model:                           9                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -3.9052      3.085     

In [117]:
#Season Frac 2
#TE: .366
#WR: .469
#QB: .575
#RB: .371

#Season Frac 1
#TE: .360
#WR: .471
#QB: .584
#RB: .363

#Season Frac 3


list(shifted.columns)

['Rk',
 'Name',
 'Tm',
 'FantPos',
 'Age',
 'G',
 'GS',
 'Cmp',
 'PaAtt',
 'PaYds',
 'PaTD',
 'Int',
 'RuAtt',
 'RuYds',
 'RuY/A',
 'RuTD',
 'Tgt',
 'Rec',
 'ReYds',
 'ReYds/R',
 'ReTD',
 'Fmb',
 'FL',
 'TD.3',
 '2PM',
 '2PP',
 'FantPt',
 'PPR',
 'DKPt',
 'FDPt',
 'VBD',
 'PosRank',
 'OvRank',
 'Year',
 'g_next_year',
 'G_prev_year',
 'G_prev_year_prev_year']

# Games next year regressions

In [128]:
# Toggles
pos_dict_reg = pos_dict

#pos_dict_reg = {}
#for pos in pos_dict:
 #   pos_dict_reg[pos] = pos_dict[pos].loc[pd.notna(pos_dict[pos].VBD)].reset_index(drop = True)

# Non-position specific variables
all_cols = ['Age'
            , 'agesq'
            , 'start_frac'
            , 'season_frac_1'
            , 'season_frac_2'
            , 'season_frac_3'
            , 'Tm_change_flag'
            #, 'Fant_Share'
            #, 'VBD'
           ]

# Position specific variables
xcols_dict = {
    'TE': ['ReTD_per_rec',
           'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'WR': ['ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'QB': ['RuTD_per_Att','RuAtt_per_game', 'RuYds_per_A',\
           'PaYds_per_PaAtt', 'PaTD_per_PaAtt', 'PaAtt_per_game', 'Cmp_per_PaAtt', 'Int_per_PaAtt'],
    'RB': ['RuTD_per_Att','RuAtt_per_game', 'RuYds_per_A',\
           'ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
}

for pos in xcols_dict:
    print(pos)
    ycol = 'g_next_year'
    xcols = xcols_dict[pos] + all_cols
    frame1 = pos_dict_reg[pos]
    frame = frame1[['Name', ycol] + xcols].dropna()
    xform = ' + '.join(xcols)
    formula = ycol + " ~ " + xform
    model = smf.ols(formula, frame)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())
    print('\n')

TE
                            OLS Regression Results                            
Dep. Variable:            g_next_year   R-squared:                       0.075
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     1.276
Date:                Sun, 04 Aug 2019   Prob (F-statistic):              0.250
Time:                        16:20:21   Log-Likelihood:                -594.49
No. Observations:                 214   AIC:                             1213.
Df Residuals:                     202   BIC:                             1253.
Df Model:                          11                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          8.2393      3.679     

# Regress on change in PPG

In [141]:
# Toggles
pos_dict_reg = pos_dict

#pos_dict_reg = {}
#for pos in pos_dict:
 #   pos_dict_reg[pos] = pos_dict[pos].loc[pd.notna(pos_dict[pos].VBD)].reset_index(drop = True)

# Non-position specific variables
all_cols = ['Age'
            , 'agesq'
            , 'season_frac_2'
            , 'Tm_change_flag'
            , 'PosRank'
            , 'start_frac'
            #, 'Fant_Share'
            #, 'VBD'
           ]

# Position specific variables
xcols_dict = {
    'TE': ['ReTD_per_rec',
           'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'WR': ['ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
    'QB': ['RuAtt_per_game', 'RuYds_per_A', 'RuTD_per_Att',\
           'PaYds_per_PaAtt', 'PaTD_per_PaAtt', 'PaAtt_per_game', 'Cmp_per_PaAtt', 'Int_per_PaAtt'],
    'RB': ['RuTD_per_Att','RuAtt_per_game', 'RuYds_per_A',\
           'ReTD_per_rec', 'Rec_per_tgt', 'ReYds_per_R', 'Tgt_per_game'],
}

for pos in xcols_dict:
    print(pos)
    ycol = 'delta_ppg'
    xcols = xcols_dict[pos] + all_cols
    frame1 = pos_dict_reg[pos]
    frame = frame1[['Name', ycol] + xcols].dropna()
    xform = ' + '.join(xcols)
    formula = ycol + " ~ " + xform
    model = smf.ols(formula, frame)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())
    print('\n')

TE
                            OLS Regression Results                            
Dep. Variable:              delta_ppg   R-squared:                       0.192
Model:                            OLS   Adj. R-squared:                  0.165
Method:                 Least Squares   F-statistic:                     4.985
Date:                Sun, 04 Aug 2019   Prob (F-statistic):           5.01e-06
Time:                        16:35:10   Log-Likelihood:                -785.16
No. Observations:                 307   AIC:                             1592.
Df Residuals:                     296   BIC:                             1633.
Df Model:                          10                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1747      3.250     