# To do
* Fix Adrian Peterson
* Fix Le'Veon Bell
* Fix N'Keal Harry

Can do with current data:
* Need to integrate the rookie analysis into the targets joining vs. not joining calculation
    * Does the team they were drafted by have a star at the same position?
    * Where were they picked in the draft?
    * Winning pct of the team that picked them?
    * Typical college metrics including injury history

Requires new scraping:
* Add type of injury info, dummy variables for different injuries in the previous year(s)
* Get game level data: look at second half performance and how it affects ADP, a momentum metric
* New coach flag, maybe a coach name dummy variable or a quarterback name dummy variable

# Import packages and define functions

In [1]:
###################### Import Packages #############################################
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import statsmodels.formula.api as smf
from ffb_functions import *
# Notebook display limits: render up to 500 columns and 999 rows of a DataFrame.
pd.options.display.max_columns = 500
pd.set_option('display.max_rows', 999)

# Assemble pro football focus database and create metrics

In [2]:
############## Assemble the database #########################################
# Read the cached table if it exists; only a missing cache file triggers a rebuild.
# A bare `except:` would also trap KeyboardInterrupt and would hide unrelated
# failures (e.g. an unreadable CSV) behind a silent call to data_assembly.
# NOTE(review): data_assembly is not defined in this file (presumably it comes
# from ffb_functions) -- confirm what it downloads.
try:
    database = pd.read_csv('profootballfocus.csv')
except FileNotFoundError:
    database = data_assembly(2004, 2019)                            # can go back to at least 2000, limited by adp data
    database.to_csv('profootballfocus.csv', index = False)          # save original db

## change columns to floats for metric creation
cols_to_change = [
    'VBD', 'PosRank', 'OvRank', 'Rk', 'Age',
    'PPR', 'GS', 'G',
    'PaTD', 'Cmp', 'Int', 'RuTD', 'ReTD',
    'PaYds', 'RuYds', 'ReYds', 'Rec',
    'RuY/A', 'ReYds/R', 'PaAtt', 'RuAtt', 'Tgt',
    '2PM', '2PP', 'Fmb', 'FL', 'TD.3',
]
# a single astype call with a column -> dtype mapping casts every listed column
database = database.astype(dict.fromkeys(cols_to_change, float))
    
## create new metrics #############################################
# NOTE(review): shift_col is a helper that is not defined in this file; from the
# calls below it presumably adds the column named by its 2nd argument, holding
# the 3rd argument's column shifted by the given number of seasons -- confirm.

# points next year for regressions
shifted = shift_col(database, 'pts_next_year', 'PPR', -1)
shifted['pts_next_year'] = shifted['pts_next_year'].fillna(0)                         # impute 0 if they arent in top 500 next year

## impute stuff early #####
# missing values in these two columns become the sentinel 999
for sentinel_col in ('VBD', 'OvRank'):
    shifted[sentinel_col] = shifted[sentinel_col].fillna(999)

# games next year for regressions
shifted = shift_col(shifted, 'g_next_year', 'G', -1)                                  # create G_next_year col for regressions
shifted['g_next_year'] = shifted['g_next_year'].fillna(0)                             # impute 0 if they arent in top 500 next year

# games previous years for injury history
shifted = shift_col(shifted, 'G_prev_year', 'G', 1)
shifted = shift_col(shifted, 'G_prev_year_prev_year', 'G_prev_year', 1)

# create ppg next year for y var
next_year_ppg = shifted['pts_next_year'].div(shifted['g_next_year'])
shifted['ppg_next_year'] = next_year_ppg.fillna(0)                                    # 0 points / 0 games gives NaN -> 0
shifted['ppg_this_year'] = shifted['PPR'].div(shifted['G'])                           # current year ppg
shifted['delta_ppg'] = shifted['ppg_next_year'].sub(shifted['ppg_this_year'])         # change in ppg

# create injury history metrics: games played as a fraction of 16 games per
# season, accumulated over the last one, two and three seasons
shifted['season_frac_1'] = shifted['G'] / 16                                          # 1 year injury history
shifted['season_frac_2'] = (shifted['G'] + shifted['G_prev_year']) / 32               # 2 year injury history
# the 3 year window has to include G_prev_year as well; it used to be left out
# while still dividing by 48, so the value could never exceed 2/3
shifted['season_frac_3'] = (shifted['G'] + shifted['G_prev_year']
                            + shifted['G_prev_year_prev_year']) / 48                  # 3 year injury history

# create per attempt metrics to reduce multicollinearity
# two rates already exist in the table and are only copied under names without '/'
shifted['ReYds_per_R'] = shifted['ReYds/R']                                           # ReYds per reception
shifted['RuYds_per_A'] = shifted['RuY/A']                                             # RuYds per attempt

# (new column, numerator, denominator), listed in the order the columns are appended
ratio_specs = [
    ('RuTD_per_Att', 'RuTD', 'RuAtt'),          # RuTds per attempt
    ('PaYds_per_PaAtt', 'PaYds', 'PaAtt'),      # PaYds per attempt
    ('PaTD_per_PaAtt', 'PaTD', 'PaAtt'),        # PaTds per attempt
    ('Cmp_per_PaAtt', 'Cmp', 'PaAtt'),          # Completions per attempt
    ('Int_per_PaAtt', 'Int', 'PaAtt'),          # Interceptions per attempt
    ('Rec_per_tgt', 'Rec', 'Tgt'),              # Receptions per target
    ('ReTD_per_rec', 'ReTD', 'Rec'),            # ReTD's per reception
    ('start_frac', 'GS', 'G'),                  # fraction of games played that they started
    ('Tgt_per_game', 'Tgt', 'G'),               # per game metrics
    ('PaAtt_per_game', 'PaAtt', 'G'),
    ('RuAtt_per_game', 'RuAtt', 'G'),
]
for ratio_name, numerator, denominator in ratio_specs:
    shifted[ratio_name] = shifted[numerator] / shifted[denominator]
    
## share of team's fantasy points
# get fantasy points of team: total PPR per (team, season). Only PPR is summed,
# so the other columns (including text ones) are never aggregated and discarded.
gb = shifted.groupby(['Tm', 'Year'], as_index = False)['PPR'].sum()
gb = gb[~gb.Tm.isin(['2TM', '3TM', '4TM'])]

# compare fantasy points of player to fantasy points of team, for all rows at
# once instead of one .loc lookup per row
team_ppr = shifted.groupby(['Tm', 'Year'])['PPR'].transform('sum')
# rows whose team label contains 'TM' (2TM, 3TM, ...) are combined multi-team
# seasons; a missing team label is treated as not multi-team rather than raising
multi_team = shifted['Tm'].str.contains('TM', regex = False, na = False)
# multi-team rows get 0 -- this isnt technically true, may need to change this
shifted['Fant_Share'] = (shifted['PPR'] / team_ppr).mask(multi_team, 0.0)
        
# some players for IR reasons dont have a fantasy position for one year
# need to fill in with their other fantasy positions from other years.
# This runs BEFORE the positional imputation so that these rows receive their
# position's mean; rows without a position belong to no group, stay NaN and
# would otherwise be zero-filled further down.
known_positions = {
    'Travis Kelce': 'TE',       # on the IR his first year, positionless in the stats
    'Sam Bradford': 'QB',       # on the IR one year
    'Chad Johnson': 'WR',       # on the IR one year
    'Marvin Jones': 'WR',       # on the IR one year
    'Brandon Coleman': 'WR',    # on the IR one year
    'Chris Givens': 'WR',       # on the IR one year
    'Vincent Brown': 'WR',      # on the IR one year
}
for player_name, position in known_positions.items():
    shifted.loc[shifted.Name == player_name, 'FantPos'] = position
# TODO: figure out a clever way to impute each player's modal position instead
# of hard-coding names. for now, fix this thing

# impute season frac by position, impute games prev by position
cols = ['season_frac_2', 'season_frac_3', 'G_prev_year', 'G_prev_year_prev_year']
for col_name in cols:
    shifted[col_name] = shifted[col_name].astype(float)
    shifted[col_name] = shifted[col_name].fillna(shifted.groupby('FantPos')[col_name].transform('mean'))

# impute all other stats with 0, because remaining nans are from 0 passes etc
shifted = shifted.fillna(0)
shifted = shifted.loc[shifted.Rk < 301].reset_index(drop = True) # keep relevant players

# save output so far
shifted.to_csv('profootballfocus_withmetrics.csv', index = False)
shifted.groupby('Year').count()

Unnamed: 0_level_0,Rk,Name,Tm,FantPos,Age,G,GS,Cmp,PaAtt,PaYds,PaTD,Int,RuAtt,RuYds,RuY/A,RuTD,Tgt,Rec,ReYds,ReYds/R,ReTD,Fmb,FL,TD.3,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,pts_next_year,g_next_year,G_prev_year,G_prev_year_prev_year,ppg_next_year,ppg_this_year,delta_ppg,season_frac_1,season_frac_2,season_frac_3,ReYds_per_R,RuYds_per_A,RuTD_per_Att,PaYds_per_PaAtt,PaTD_per_PaAtt,Cmp_per_PaAtt,Int_per_PaAtt,Rec_per_tgt,ReTD_per_rec,start_frac,Tgt_per_game,PaAtt_per_game,RuAtt_per_game,Fant_Share
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
2004,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2005,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2006,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2007,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2008,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2009,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2010,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2011,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2012,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2013,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300


# Assemble and merge ADP database (ppr)

In [5]:
## new assembly gets us many more observations per year and more years, in return lose std dev variable
# Only a missing cache file triggers the rebuild; a bare `except:` would also
# trap KeyboardInterrupt and hide any other read error.
try:
    adp_frame_0 = pd.read_csv('adp_2.csv')
except FileNotFoundError:
    adp_frame_0 = new_assembly(2004, 2020, 'not_rookie')         # sometimes this link is a problem. to fix it, open up the link in a browser and run again
    adp_frame_0.to_csv('adp_2.csv', index = False)

# Dont include pos, too many differences of wr's at te's
# merge the databases, inner bc only want fantasy relevant players
frame_w_adp = shifted.merge(adp_frame_0, on = ['Name', 'Year'], how = 'inner')
# both inputs carry a Tm column; the stats side becomes Old_Team, the ADP side New_Team
frame_w_adp = frame_w_adp.rename(columns = {'Tm_x': 'Old_Team', 'Tm_y':'New_Team'})

# flag if team in adp database is different from pff database, means switched teams
# NOTE(review): new_team is not defined in this file -- confirm its return values
frame_w_adp['Tm_change_flag'] = frame_w_adp.apply(
    lambda row: new_team(row['Old_Team'], row['New_Team']), axis = 1)

# dont include unranked because that includes rookies. Separate analysis for them
frame_w_adp = frame_w_adp.loc[pd.notna(frame_w_adp.Rk)].reset_index(drop = True)

# hard code in Mike Williams problem eventually
frame_w_adp = frame_w_adp.loc[frame_w_adp.Name != 'Mike Williams'].reset_index(drop = True)

########### keep this bit because STD DEV is an important variable ###################
# Only a missing cache file triggers the rebuild; a bare `except:` would also
# trap KeyboardInterrupt and hide any other read error.
try:
    adp_frame_alt = pd.read_csv('adp_8.csv')
except FileNotFoundError:
    adp_frame_alt = ADP_assembly(2010, 2020, 'not_rookie')         # sometimes this link is a problem. to fix it, open up the link in a browser and run again
    adp_frame_alt.to_csv('adp_8.csv', index = False)

# maybe merge on pos too? cant merge on team because players switch from profootballfocus to adp
# left merge: keep every row already in frame_w_adp, attach the alternate ADP columns where available
frame_w_adp_2 = frame_w_adp.merge(adp_frame_alt, on = ['Name', 'Year'], how = 'left')

# rows without a match in the alternate ADP table get the overall mean Std.Dev
frame_w_adp_2['Std.Dev'] = frame_w_adp_2['Std.Dev'].fillna(frame_w_adp_2['Std.Dev'].mean())
frame_w_adp = frame_w_adp_2

## see the number of targets leaving or entering a team
# Each aggregate sums only the column it needs. Summing the whole frame and then
# discarding everything else is wasteful and breaks on text columns with mixed
# values under pandas versions that no longer drop non-numeric columns silently.
# Prev_Tgt: targets of the players grouped by the team in the stats table (Old_Team)
tgt_avail = frame_w_adp.groupby(['Year', 'Old_Team'], as_index = False)['Tgt'].sum()
tgt_avail = tgt_avail.rename(columns = {'Old_Team': 'New_Team', 'Tgt': 'Prev_Tgt'})
# Tgt: targets of the players grouped by the team in the ADP table (New_Team)
opportunity = frame_w_adp.groupby(['Year', 'New_Team'], as_index = False)['Tgt'].sum()
opp_frame = tgt_avail.merge(opportunity, on = ['Year', 'New_Team'], how = 'outer')
opp_frame['opp_difference'] = opp_frame['Prev_Tgt'] - opp_frame['Tgt']
frame_w_adp = frame_w_adp.merge(opp_frame[['Year', 'New_Team', 'opp_difference']]
                                , on = ['Year', 'New_Team'], how = 'outer')

## see the number of rushing attempts leaving or entering a team
ru_avail = frame_w_adp.groupby(['Year', 'Old_Team'], as_index = False)['RuAtt'].sum()
ru_avail = ru_avail.rename(columns = {'Old_Team': 'New_Team', 'RuAtt':'Prev_Ru'})
new_ru = frame_w_adp.groupby(['Year', 'New_Team'], as_index = False)['RuAtt'].sum()
ru_frame = new_ru.merge(ru_avail, on = ['Year', 'New_Team'], how = 'outer')
ru_frame['ru_opp'] = ru_frame['Prev_Ru'] - ru_frame['RuAtt']

frame_w_adp = frame_w_adp.merge(ru_frame[['Year', 'New_Team', 'ru_opp']], on = ['Year', 'New_Team'], how = 'outer')
# the outer merges can add team-season rows that have no player; drop them again
frame_w_adp = frame_w_adp[pd.notna(frame_w_adp.Rk)].reset_index(drop = True)

In [None]:
# display the number of non-null values per column for each Year of the merged frame
frame_w_adp.groupby('Year').count()
#frame_w_adp

# Assemble and merge college database and draft/combine database

In [20]:
# college data pull 1: draft pick: college, age, some stats
# Only a missing cache file triggers the rebuild; a bare `except:` would also
# trap KeyboardInterrupt and hide any other read error.
try:
    rk_0 = pd.read_csv('draft_pick.csv')
except FileNotFoundError:
    rk_0 = rookie_assembly(2000, 2020)
    rk_0.to_csv('draft_pick.csv', index = False)

# hand fixes so this table's name / position agree with the other tables for the later merges
rk_0['Name'] = rk_0['Name'].str.replace('DJ Chark', 'D.J. Chark')
rk_0.loc[rk_0.Name == 'Devin Funchess', 'FantPos'] = 'WR'
# college data pull 2: combine numbers: school, height, weight, combine stats, position, name
# The reader used to be misspelled (`pd.adfread_csv`); the resulting
# AttributeError was swallowed by a bare `except:`, so the cached file was never
# used and combine_assembly ran on every execution. Only a missing cache file
# triggers the rebuild now.
try:
    comb_0 = pd.read_csv('combine.csv')
except FileNotFoundError:
    comb_0 = combine_assembly(2000, 2020)
    comb_0.to_csv('combine.csv', index = False)
    
# merge rookie names, draft capital with combine statistics
draft_and_combine = rk_0.merge(comb_0, how = 'outer', on = ['Name', 'FantPos', 'Year'])
# every 'Mike Williams' row is dropped from this table
draft_and_combine = draft_and_combine[draft_and_combine['Name'].ne('Mike Williams')].reset_index(drop = True)


## merge to big frame with adp
# draft slot, college and combine measurements only; Year is left out, so the
# merge below matches on name and position alone
for_redraft = draft_and_combine[[
    'Name', 'Pick', 'FantPos', 'College', 'height', 'Wt',
    'Dash', 'Vertical', 'Bench', 'Broad_Jump', 'Three_Cone', 'Shuttle',
]]

final_frame_0 = frame_w_adp.merge(for_redraft, how = 'outer', on = ['Name', 'FantPos'])
# keep only rows that came from frame_w_adp (they have an Old_Team)
final_frame = final_frame_0[final_frame_0['Old_Team'].notna()].reset_index(drop = True)
## positionally impute mean values for combine
cols = ['Wt', 'Dash', 'Vertical', 'Bench', 'Broad_Jump', 'Three_Cone', 'Shuttle']
for combine_col in cols:
    measurements = final_frame[combine_col].astype(float)
    # mean of this measurement within each FantPos, broadcast back to every row
    position_mean = measurements.groupby(final_frame['FantPos']).transform('mean')
    final_frame[combine_col] = measurements.fillna(position_mean)

## impute 0's for everything else
final_frame = final_frame.fillna(0)
# one mapping replaces the five separate renames; none of them targets a name
# produced by another, so the result is the same
final_frame = final_frame.rename(columns = {
    'Year_x': 'Year',
    'Tm': 'Other_Tm',
    'Tm_x': 'Tm',
    'Tm_y': 'New_Tm',
    'New_Team': 'Tm',
})
final_frame.to_csv('big_redraft_frame.csv', index = False)

In [None]:
# display the rows whose Shuttle value is exactly 0 after the imputation steps
final_frame.loc[final_frame.Shuttle == 0]
# leftover ad-hoc lookups, kept commented out
#for_redraft.loc[for_redraft.Name == 'Amari Cooper']
#frame_w_adp.loc[frame_w_adp.Name == 'Amari Cooper']
#adp_frame.loc[adp_frame.Name == 'Antonio Brown']
#test2.loc[test2.Name == 'Antonio Brown']

# Assemble and merge data from teams to get winning pct

In [7]:
# Load the cached team table; only a missing cache file triggers the rebuild.
# A bare `except:` would also trap KeyboardInterrupt and hide any other read error.
try:
    team_temp = pd.read_csv('team_frame.csv')
except FileNotFoundError:
    team_temp = team_assembly(2004, 2019)
    team_temp.to_csv('team_frame.csv', index = False)

real_final = final_frame.merge(team_temp, on = ['Year', 'Tm'], how = 'outer')
# the outer merge can add team-season rows with no player; drop them
real_final = real_final.loc[pd.notna(real_final.Rk)].reset_index(drop = True)
# players whose (Year, Tm) found no team row get a neutral .500 winning pct
real_final['Win_PCT'] = real_final['Win_PCT'].fillna(0.500)
# keep the four offensive fantasy positions only
real_final = real_final.loc[real_final.FantPos.isin(['QB', 'WR', 'RB', 'TE'])].reset_index(drop = True)
real_final.to_csv('final_frame_teams.csv', index = False)