In [2]:
import os
import pickle
import re
from datetime import datetime
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Lookup from fantasydata.com team abbreviations to the abbreviations used by
# pro-football-reference. Most codes are identical on both sites; only the
# handful listed in the inner dict differ.
adp_to_pfr = {
    team: {
        'GB': 'GNB',
        'KC': 'KAN',
        'LV': 'LVR',
        'NO': 'NOR',
        'NE': 'NWE',
        'SF': 'SFO',
        'TB': 'TAM',
    }.get(team, team)
    for team in (
        'ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE',
        'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC',
        'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NO', 'NE', 'NYG',
        'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS',
    )
}

In [4]:
# Source: https://fantasydata.com/nfl/adp - PPR
def import_adp_data(files_loc = '../data/historical_adp/', years = range(2014, 2023)):
    """Read the per-season PPR ADP CSV exports into a dict of DataFrames.

    Parameters
    ----------
    files_loc : str
        Directory holding files named ``ppr-adp-<year>.csv``. A trailing
        slash is optional.
    years : iterable of int
        Seasons to load. Defaults to 2014 through 2022 inclusive.

    Returns
    -------
    dict
        Maps each year to a DataFrame with the columns ``Name``, ``Team``,
        ``Position``, ``PositionRank`` and ``AverageDraftPositionPPR`` read
        from the file, plus a ``Year`` column set to that year.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when a season's file is missing.
    """
    cols = ["Name", "Team", "Position", "PositionRank", "AverageDraftPositionPPR"]
    df_dict = {}
    for year in years:
        # os.path.join keeps the path valid whether or not files_loc ends
        # with a separator.
        csv_path = os.path.join(files_loc, f'ppr-adp-{year}.csv')
        season_adp = pd.read_csv(csv_path, usecols = cols)
        season_adp['Year'] = year
        df_dict[year] = season_adp
    return df_dict

def prep_adp_df(adp_data, adp_to_pfr):
    """Combine the per-year ADP frames into one cleaned DataFrame.

    Parameters
    ----------
    adp_data : dict
        Maps year to a DataFrame holding the columns ``Name``, ``Year``,
        ``Team``, ``Position``, ``PositionRank`` and
        ``AverageDraftPositionPPR``.
    adp_to_pfr : dict
        Team-abbreviation replacements applied to the ``Team`` column.

    Returns
    -------
    pandas.DataFrame
        Only QB/RB/WR/TE rows, with normalised player names and team codes.
        ``PositionRank`` holds the digits of the original rank as strings.
        The index is the 0..n-1 index assigned before the position filter,
        so it can contain gaps.
    """
    # 1-2. Concatenate the yearly frames and fix the column order.
    ordered_cols = ['Name', 'Year', 'Team', 'Position', 'PositionRank', 'AverageDraftPositionPPR']
    adp_df = pd.concat(adp_data.values())[ordered_cols]

    # 4. Give the combined frame a fresh 0..n-1 index.
    adp_df = adp_df.reset_index(drop=True)

    # 3. Keep only the digits of the position rank (still stored as strings).
    adp_df['PositionRank'] = adp_df['PositionRank'].str.extract(r'(\d+)')[0]

    # 5. Normalise names. First strip trailing runs of "I" and spaces
    #    (suffixes such as " II" / " III").
    names = adp_df['Name'].str.replace(r'[I ]+$', '', regex=True)
    # Literal replacements, applied in order. regex is passed explicitly
    # because the pandas default for it changed between major versions.
    for old, new in (('CJ ', 'C.J. '), ('DJ ', 'D.J. '), ('DK ', 'D.K. ')):
        names = names.str.replace(old, new, regex=False)
    # The negative lookahead keeps an existing "Steve Smith Sr." from
    # turning into "Steve Smith Sr. Sr.".
    names = names.str.replace(r'Steve Smith(?! Sr\.)', 'Steve Smith Sr.', regex=True)
    for old, new in (('Marvin Jones Jr.', 'Marvin Jones'),
                     ('Darrell Henderson Jr.', 'Darrell Henderson'),
                     ('Gabe Davis', 'Gabriel Davis')):
        names = names.str.replace(old, new, regex=False)
    adp_df['Name'] = names

    # 6. Limit to standard, relevant fantasy positions. The .copy() makes
    #    the filtered frame independent so the assignments below do not
    #    write into a slice of the unfiltered frame.
    adp_df = adp_df[adp_df['Position'].isin(['RB', 'WR', 'QB', 'TE'])].copy()

    # 7. Translate team codes, then restore the old code for seasons up to
    #    and including each listed cut-off year.
    adp_df['Team'] = adp_df['Team'].replace(adp_to_pfr)
    for current_code, last_old_year, old_code in (('LVR', 2019, 'OAK'),
                                                  ('LAC', 2016, 'SDG'),
                                                  ('LAR', 2015, 'STL')):
        moved = (adp_df['Team'] == current_code) & (adp_df['Year'] <= last_old_year)
        adp_df.loc[moved, 'Team'] = old_code

    # Per-player position overrides; na=False keeps missing names out of the mask.
    for fragment in ('Jordan Matthews', 'Funchess'):
        matches = adp_df['Name'].str.contains(fragment, regex=False, na=False)
        adp_df.loc[matches, 'Position'] = "WR"

    # Per-player name overrides, applied in order: any name containing the
    # fragment is replaced by the full name on the right.
    name_overrides = (
        ('Trubisky', 'Mitchell Trubisky'),
        ('Minshew', 'Gardner Minshew II'),
        ('Chark', 'DJ Chark'),
        ('Robert Griffin', 'Robert Griffin III'),
        ('Willie Snead', 'Willie Snead'),
        ('William Fuller', 'Will Fuller'),
        ('Ronald Jones', 'Ronald Jones II'),
        ('Benjamin Watson', 'Ben Watson'),
        ('Rob Kelley', 'Robert Kelley'),
        ('Henry Ruggs', 'Henry Ruggs III'),
        ('Kenneth Walker', 'Kenneth Walker III'),
    )
    for fragment, full_name in name_overrides:
        matches = adp_df['Name'].str.contains(fragment, regex=False, na=False)
        adp_df.loc[matches, 'Name'] = full_name

    return adp_df
    

In [44]:
# Load every season's ADP export, clean the combined table, and cache the
# result on disk as a pickle.
df_dict = import_adp_data()
adp_df = prep_adp_df(adp_data=df_dict, adp_to_pfr=adp_to_pfr)

with open('../data/created/adp_info.p', mode='wb') as handle:
    pickle.dump(adp_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# Check col names same across years
def checkColNames(df_dict):
    """Print a warning for every frame whose columns differ from the first frame's.

    ``df_dict`` maps keys to DataFrames. The column list of the first value
    is the reference; each later value with a different list (names or
    order) triggers one printed message. Nothing is returned.
    """
    column_lists = [list(frame.columns) for frame in df_dict.values()]
    # The first entry always matches itself, so only the remainder is compared.
    for columns in column_lists[1:]:
        if columns != column_lists[0]:
            print('There are some years where df columns have different names')

# Check Teams are correct
def checkTeamNames(df, min_span=None):
    """Print the teams whose first-to-last season span is shorter than expected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Team`` and ``Year`` columns.
    min_span : int, optional
        Teams with ``last year - first year < min_span`` are printed. When
        omitted, the span of the whole frame (latest year minus earliest
        year) is used, so only teams missing from either end of the window
        are listed. A fixed threshold of 9 exceeds the largest possible span
        of a nine-season window (8) and therefore lists every team; pass
        ``min_span=9`` to get that output.

    Returns
    -------
    None
        The result is printed with columns ``Team``, ``Year_x`` (first
        season) and ``Year_y`` (last season).
    """
    team_years = df[['Team', 'Year']].drop_duplicates().sort_values(['Year', 'Team'])
    if min_span is None:
        min_span = team_years['Year'].max() - team_years['Year'].min()

    first_seen = team_years.groupby(['Team'], as_index = False).min()[['Team', 'Year']]
    last_seen = team_years.groupby(['Team'], as_index = False).max()[['Team', 'Year']]
    # Merging on Team suffixes the two Year columns as Year_x / Year_y.
    spans = first_seen.merge(last_seen, on = 'Team')
    print(spans[(spans['Year_y'] - spans['Year_x']) < min_span])

# Sanity-check the ADP inputs. Both names are created by the cell that calls
# import_adp_data / prep_adp_df, so that cell has to run first.
checkColNames(df_dict=df_dict)
checkTeamNames(df=adp_df)

NameError: name 'df_dict' is not defined

In [25]:
# URL template for the pro-football-reference yearly fantasy page; the `{yr}`
# placeholder is filled with a season year via str.format by the scrapers below.
PFR_LINK = 'https://www.pro-football-reference.com/years/{yr}/fantasy.htm'

def player_info_from_link(link):
    """Scrape birth date, draft slot and position from a player page.

    Parameters
    ----------
    link : str
        URL of the player page to download.

    Returns
    -------
    dict
        Always has the keys ``'born'`` (``datetime`` or None), ``'draft'``
        (int pick number or None) and ``'pos'`` (first two characters of the
        position text, or None). When a draft year is found, it is stored
        under ``'draft_year'`` and also under the historical misspelling
        ``'drat_year'``, which is kept so existing consumers of that key
        continue to work.

    Raises
    ------
    RuntimeError
        If the page does not come back with HTTP status 200.
    """
    info_dict = {'born' : None,
                 'draft': None,
                 'pos' : None}

    page = requests.get(link)
    if page.status_code != 200:
        # Fail with an explicit message rather than an unbound `soup` below.
        raise RuntimeError(f"Request for {link} failed with HTTP status {page.status_code}")
    soup = BeautifulSoup(page.content, 'html.parser')

    # Each fact sits in a <p> whose <strong> label identifies it.
    for info in soup.find_all('p'):
        if info.find_all('strong', string = 'Position'):
            words = info.text.split(' ')
            if len(words) > 1:
                info_dict['pos'] = words[1].strip()[:2]

        if info.find_all('strong', string = 'Born:'):
            dob_text = info.text.replace('\n', ' ').replace('\xa0', ' ')
            dob_match = re.search(r'\w+\s\d{1,2},\s\d{4}', dob_text)
            if dob_match is not None:
                info_dict['born'] = datetime.strptime(dob_match.group(0), '%B %d, %Y')

        if info.find_all('strong', string = 'Draft'):
            draft_info = info.text
            # Digits right after the first "(" are taken as the pick number.
            draft_pos = re.search(r'\([0-9]*', draft_info)
            draft_year = re.search(r'(\d{4}) NFL Draft', draft_info)
            try:
                info_dict['draft'] = int(draft_pos.group(0)[1:])
                year = int(draft_year.group(1))
                info_dict['drat_year'] = year   # legacy key, kept for compatibility
                info_dict['draft_year'] = year
            except (AttributeError, ValueError):
                # A pattern did not match (None.group) or matched no digits;
                # leave whatever was not parsed unset.
                pass

    return info_dict
        
# Identify players by their unique (name, position) pair and
# add each one to a dictionary containing their:
    # 1. Birthdate
    # 2. NFL draft position
    # 3. Position
                    
def get_player_info(link = PFR_LINK, years = range(2013, 2022)):
    """Scrape player details for everyone listed on the yearly fantasy pages.

    Parameters
    ----------
    link : str
        URL template with a ``{yr}`` placeholder for the season.
    years : iterable of int
        Seasons to visit. Defaults to 2013 through 2021 inclusive.

    Returns
    -------
    dict
        Maps ``(name, position)`` to the dict returned by
        ``player_info_from_link`` for that player's page. A player is
        fetched only the first time the pair is seen.

    Raises
    ------
    RuntimeError
        If a season page does not return HTTP 200 or has no table with id
        ``fantasy``. Raising here prevents a failed request from silently
        re-parsing the previous season's page.
    """
    player_dict = {}
    for yr in years:
        use_link = link.format(yr = yr)
        print(use_link)

        page = requests.get(use_link)
        if page.status_code != 200:
            raise RuntimeError(f"Request for {use_link} failed with HTTP status {page.status_code}")
        soup = BeautifulSoup(page.content, 'html.parser')

        tables = soup.find_all('table', id = 'fantasy')
        if not tables:
            raise RuntimeError(f"No table with id 'fantasy' found at {use_link}")

        for row in tables[0].find_all('tr'):
            atag = row.find_all('a')
            table_cols = row.find_all('td', class_="right")
            # Rows without a link or without a right-aligned cell carry no
            # (name, position) pair to record.
            if not atag or not table_cols:
                continue

            name = atag[0].text
            position = table_cols[0].text
            if (name, position) not in player_dict:
                print(f"Added {(name, position)} to dictionary")
                player_link = 'https://www.pro-football-reference.com' + atag[0]['href']
                info = player_info_from_link(player_link)
                print(info)
                player_dict[name, position] = info
    return player_dict
            
# Scrape every season's fantasy page (one HTTP request per season plus one
# per newly seen player).
player_dict = get_player_info(link=PFR_LINK)

In [41]:
def getDrafteeInfo(player_dict, year = 2022):
    """Add the players from one draft class to ``player_dict``.

    Parameters
    ----------
    player_dict : dict
        Existing ``(name, position) -> info`` mapping. It is updated in
        place and also returned.
    year : int
        Draft year whose page is scraped.

    Returns
    -------
    dict
        The same ``player_dict`` object, with an entry from
        ``player_info_from_link`` added for each ``(name, position)`` pair
        that was not already present.

    Raises
    ------
    RuntimeError
        If the draft page does not return HTTP 200 or has no table with id
        ``drafts``.
    """
    link = f"https://www.pro-football-reference.com/years/{year}/draft.htm"
    page = requests.get(link)
    if page.status_code != 200:
        raise RuntimeError(f"Request for {link} failed with HTTP status {page.status_code}")
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', id = 'drafts')
    if not tables:
        raise RuntimeError(f"No table with id 'drafts' found at {link}")

    for row in tables[0].find_all('tr'):
        links = row.find_all('a')
        cells = row.find_all('td')
        # The player is read from the second link and the position from the
        # fourth cell; shorter rows cannot supply both, so skip them.
        if len(links) < 2 or len(cells) < 4:
            continue

        name = links[1].text
        position = cells[3].text
        # Presumably a row whose second link is a college rather than a
        # player page - TODO confirm against the draft table.
        if name == 'North Dakota St.':
            continue
        print(name, position)
        if (name, position) not in player_dict:
            print(f"Added {(name, position)} to dictionary")
            player_link = 'https://www.pro-football-reference.com' + links[1]['href']
            info = player_info_from_link(player_link)

            player_dict[(name, position)] = info
    return player_dict

# with open('../data/pfr_player_info.p', 'rb') as handle:
#     player_dict = pickle.load(handle)
# player_dict = getDrafteeInfo(player_dict)
# player_dict = getDrafteeInfo(player_dict, 2021)

# with open('../data/created/pfr_player_info.p', 'wb') as handle:
#     pickle.dump(player_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Trevor Lawrence QB
Zach Wilson QB
Trey Lance QB
Kyle Pitts TE
Ja'Marr Chase WR
Jaylen Waddle WR
Penei Sewell OL
Added ('Penei Sewell', 'OL') to dictionary
Jaycee Horn DB
Added ('Jaycee Horn', 'DB') to dictionary
Patrick Surtain II DB
Added ('Patrick Surtain II', 'DB') to dictionary
DeVonta Smith WR
Justin Fields QB
Micah Parsons LB
Added ('Micah Parsons', 'LB') to dictionary
Rashawn Slater OL
Added ('Rashawn Slater', 'OL') to dictionary
Alijah Vera-Tucker OL
Added ('Alijah Vera-Tucker', 'OL') to dictionary
Mac Jones QB
Zaven Collins LB
Added ('Zaven Collins', 'LB') to dictionary
Alex Leatherwood OL
Added ('Alex Leatherwood', 'OL') to dictionary
Jaelan Phillips DL
Added ('Jaelan Phillips', 'DL') to dictionary
Jamin Davis LB
Added ('Jamin Davis', 'LB') to dictionary
Kadarius Toney WR
Kwity Paye DL
Added ('Kwity Paye', 'DL') to dictionary
Caleb Farley DB
Added ('Caleb Farley', 'DB') to dictionary
Christian Darrisaw OL
Added ('Christian Darrisaw', 'OL') to dictionary
Najee Harris RB
Travis

In [17]:
def import_pts_data(link = PFR_LINK, years = range(2013, 2022)):
    """Download the yearly fantasy table for each season as a DataFrame.

    Parameters
    ----------
    link : str
        URL template with a ``{yr}`` placeholder for the season.
    years : iterable of int
        Seasons to download. Defaults to 2013 through 2021 inclusive.

    Returns
    -------
    dict
        Maps each year to the DataFrame parsed from that page's table with
        id ``fantasy``. Column labels are taken from the second level of the
        parsed header, and a ``Year`` column is added.

    Raises
    ------
    RuntimeError
        If a season page does not return HTTP 200 or has no table with id
        ``fantasy``. Without this check a failed request would store the
        previous season's table under the current year.
    """
    fpts_dict = {}
    for yr in years:
        use_link = link.format(yr = yr)
        print(use_link)
        page = requests.get(use_link)
        if page.status_code != 200:
            raise RuntimeError(f"Request for {use_link} failed with HTTP status {page.status_code}")
        soup = BeautifulSoup(page.content, 'html.parser')

        tables = soup.find_all('table', id = 'fantasy')
        if not tables:
            raise RuntimeError(f"No table with id 'fantasy' found at {use_link}")

        # read_html expects a file-like object; passing raw HTML text
        # directly is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(tables[0])), flavor = 'html5lib')[0]
        # The parsed header has two levels; keep the second one.
        df.columns = df.columns.get_level_values(1)
        df['Year'] = yr
        fpts_dict[yr] = df
    return fpts_dict

# Download every season's fantasy-points table and cache the dict of frames
# on disk as a pickle.
fpts_dict = import_pts_data(link=PFR_LINK)

with open('../data/created/pfr_player_pts.p', mode='wb') as handle:
    pickle.dump(fpts_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

https://www.pro-football-reference.com/years/2013/fantasy.htm
https://www.pro-football-reference.com/years/2014/fantasy.htm
https://www.pro-football-reference.com/years/2015/fantasy.htm
https://www.pro-football-reference.com/years/2016/fantasy.htm
https://www.pro-football-reference.com/years/2017/fantasy.htm
https://www.pro-football-reference.com/years/2018/fantasy.htm
https://www.pro-football-reference.com/years/2019/fantasy.htm
https://www.pro-football-reference.com/years/2020/fantasy.htm
https://www.pro-football-reference.com/years/2021/fantasy.htm


In [14]:
# Confirm every season's points table has the same column layout.
checkColNames(df_dict=fpts_dict)