In [10]:
import os
from functools import partial
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime
import pickle

import pandas as pd
import numpy as np

In [28]:
with open('../data/created/adp_info.p', 'rb') as handle:
    adp_df = pickle.load(handle)

with open('../data/created/pfr_player_info.p', 'rb') as handle:
    player_dict = pickle.load(handle)

with open('../data/created/pfr_player_pts.p', 'rb') as handle:
    fpts_dict = pickle.load(handle)


scoring_type = 'PPR' # or HPPR or NPPR
SCORING = 'Pts_' + scoring_type


In [None]:
print(player_dict)

In [12]:
fpts_dict[2021]

Unnamed: 0,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Att,Yds,...,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year
0,1,Jonathan Taylor*+,IND,RB,22,17,17,0,0,0,...,,,333,373.1,381.1,353.1,187,1,1,2021
1,2,Cooper Kupp*+,LAR,WR,28,17,17,0,1,0,...,1,,295,439.5,442.5,367.0,173,1,2,2021
2,3,Deebo Samuel*+,SFO,WR,25,16,15,1,2,24,...,,,262,339.0,347.0,300.5,140,2,3,2021
3,4,Josh Allen,BUF,QB,25,17,17,409,646,4407,...,2,1,403,402.6,426.6,417.6,134,1,4,2021
4,5,Austin Ekeler,LAC,RB,26,16,16,0,0,0,...,2,,274,343.8,352.8,308.8,128,2,5,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,666,Travis Benjamin,SFO,,32,10,0,0,0,0,...,,,-2,-2.0,-1.0,-2.0,,260,,2021
688,667,Trenton Cannon,2TM,RB,27,12,0,0,0,0,...,,,-2,-1.6,-0.6,-1.6,,179,,2021
689,668,John Wolford,LAR,QB,26,3,0,1,4,5,...,,,-2,-1.9,-0.9,-0.9,,84,,2021
690,669,Josh Rosen,ATL,QB,24,4,0,2,11,19,...,,,-3,-3.2,-1.2,-1.2,,85,,2021


### 1. Prep points scored portion of dataframe

#### 1a. Calculate points scored

In [13]:
pfref_colNames = ['Rk', 'Player', 'Tm', 'FantPos', 'Age', 'G', 'GS', 'PassCmp', 'PassAtt', 'PassYds',
                    'PassTD', 'PassInt', 'RushAtt', 'RushYds', 'RushY/A', 'RushTD', 'RecTgt', 'Rec', 'RecYds', 'RecY/R',
                    'RecTD', 'Fmb', 'FL', 'TD', '2PM', '2PP', 'FantPt', 'PPR', 'DKPt', 'FDPt',
                    'VBD', 'PosRank', 'OvRank', 'Year']

ppr = {'Pts_PPR' : 1, 
        'Pts_HPPR' : 0.5, 
        'Pts_NPPR' : 0}

score_dict = {'PassYds' : 0.04,
                'PassTD' : 4,
                'PassInt' : -2,
                'RushYds' : 0.1,
                'RushTD' : 6,
                'Rec': ppr[SCORING],
                'RecYds' : 0.1,
                'RecTD' : 6,
                'FL' : -2,
                '2PM' : 2,
                '2PP' : 2
                }

def score_row(row):
    sum = 0.0
    for cat, score in score_dict.items():
        addval = float(row[cat]) * score
        sum += addval
    return sum

def prep_pts_df(fpts_dict, pfref_colNames = pfref_colNames, score_dict = score_dict):
    # 1. Concatenate, rename cols, drop filler rows, reset index
    df = pd.concat(fpts_dict.values())
    df.columns = pfref_colNames    
    df = df.drop(df[df['Player'] == 'Player'].index) 
    df = df.reset_index().drop(['index','Rk'], axis = 1)    
    
    # 2. Convert numerics, fill nas with 0, then score
    score_cols = list(score_dict.keys()) + ['FantPt', 'PPR']
    df[score_cols] = df[score_cols].apply(pd.to_numeric)
    score_dict2 = {k : 0 for (k, v) in score_dict.items()}
    df.fillna(score_dict2, inplace=True)
    
    # 3. Score
    df[SCORING] = df.apply(score_row, axis = 1)
    if SCORING == 'Pts_PPR':
        assert len(df[(df['Pts_PPR'] - df['PPR']) > 0.1]) == 0

    # 4. Clean player names of '*' and '+'
    df['Player'] = df['Player'].str.replace('[\*\+]', '', regex=True).str.strip()

    # 5. Limit to guys with positions, everyone without position has 0 or less pts scored
    df = df[df['FantPos'].notnull()].copy()
    return df

pts_df = prep_pts_df(fpts_dict, pfref_colNames, score_dict)
pts_df.head()

Unnamed: 0,Player,Tm,FantPos,Age,G,GS,PassCmp,PassAtt,PassYds,PassTD,...,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year,Pts_PPR
0,Jamaal Charles,KAN,RB,27,15,15,0,0,0.0,0.0,...,0.0,308.0,378.0,386.0,343.0,182,1,1,2013,378.0
1,LeSean McCoy,PHI,RB,25,16,16,0,0,0.0,0.0,...,0.0,279.0,330.6,337.6,304.6,152,2,2,2013,330.6
2,Peyton Manning,DEN,QB,37,16,16,450,659,5477.0,55.0,...,0.0,410.0,410.0,429.0,420.0,151,1,3,2013,409.98
3,Matt Forte,CHI,RB,28,16,16,0,0,0.0,0.0,...,0.0,263.0,337.3,345.3,300.3,137,3,4,2013,337.3
4,Jimmy Graham,NOR,TE,27,16,12,0,0,0.0,0.0,...,0.0,218.0,303.5,306.5,260.5,124,1,5,2013,303.5


In [19]:
pts_df

Unnamed: 0,Player,Tm,FantPos,Age,G,GS,PassCmp,PassAtt,PassYds,PassTD,...,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year,Pts_PPR
0,Jamaal Charles,KAN,RB,27,15,15,0,0,0.0,0.0,...,0.0,308.0,378.0,386.0,343.0,182,1,1,2013,378.00
1,LeSean McCoy,PHI,RB,25,16,16,0,0,0.0,0.0,...,0.0,279.0,330.6,337.6,304.6,152,2,2,2013,330.60
2,Peyton Manning,DEN,QB,37,16,16,450,659,5477.0,55.0,...,0.0,410.0,410.0,429.0,420.0,151,1,3,2013,409.98
3,Matt Forte,CHI,RB,28,16,16,0,0,0.0,0.0,...,0.0,263.0,337.3,345.3,300.3,137,3,4,2013,337.30
4,Jimmy Graham,NOR,TE,27,16,12,0,0,0.0,0.0,...,0.0,218.0,303.5,306.5,260.5,124,1,5,2013,303.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5501,Logan Woodside,TEN,QB,26,5,0,0,0,0.0,0.0,...,0.0,-1.0,-0.6,-0.6,-0.6,,81,,2021,-0.60
5503,Trenton Cannon,2TM,RB,27,12,0,0,0,0.0,0.0,...,0.0,-2.0,-1.6,-0.6,-1.6,,179,,2021,-1.60
5504,John Wolford,LAR,QB,26,3,0,1,4,5.0,0.0,...,0.0,-2.0,-1.9,-0.9,-0.9,,84,,2021,-1.90
5505,Josh Rosen,ATL,QB,24,4,0,2,11,19.0,0.0,...,0.0,-3.0,-3.2,-1.2,-1.2,,85,,2021,-3.24


#### 1b. Calculate starter baselines

In [14]:
TEAMS = 10 # Teams in league
STARTING_QBS = 1
STARTING_TES = 1
STARTING_WRS = 2
STARTING_RBS = 2
FLEX_SPOTS = 1 # Assume normal flex, no super flex

qbs = STARTING_QBS * TEAMS
rbs = STARTING_RBS * TEAMS
wrs = STARTING_WRS * TEAMS
tes = STARTING_TES * TEAMS
flexes = rbs + wrs + TEAMS * FLEX_SPOTS

In [15]:
baseline_template = {'QB': qbs,
    'TE' : tes,
    'RB' : None,
    'WR' : None}
baselines = {k : baseline_template.copy() for k in range(2013, 2022)}
# print(baselines)

def find_baseline(row):
    pos = row['FantPos']
    yr = row['Year']
    if yr not in baselines.keys() or pos not in baselines[yr].keys():
        return 0
    base = baselines[yr][pos]
    if row['MyRk'] == base:
        return 1
    else:
        return 0

def getFlexBases(pts_df, pts):
    tmp = pts_df[pts_df['FantPos'].isin(['RB', 'WR'])].copy()
    tmp = tmp.sort_values(['Year', pts], ascending = [True, False]).reset_index().drop('index', axis = 1)
    tmp['Rk'] = tmp.groupby(['Year'])[pts].rank('first', ascending = False)
    tmp = tmp[tmp['Rk']<= flexes].copy()

    flex_by_yr = tmp.groupby(['Year', 'FantPos'], as_index = False).max(['MyRk'])[['Year','FantPos','MyRk']]
    flex_by_yr = flex_by_yr.pivot(index = 'Year', columns = 'FantPos', values = 'MyRk').reset_index()
    flex_by_yr.set_index('Year', inplace= True)
    newDict = flex_by_yr.to_dict('index')
    for k, v in newDict.items():
        if v['WR'] > 30:
            newDict[k]['WR'] = 30.0
            newDict[k]['RB'] = 20.0
    return newDict
    

def set_baselines(pts_df, baselines = baselines, pts = SCORING):
    # Rank by position, year
    pts_df = pts_df.sort_values(['Year','FantPos',pts], ascending = [True, True, False])
    pts_df['MyRk'] = pts_df.groupby(['Year','FantPos'])[pts].rank('first', ascending = False)
    
    flexDict = getFlexBases(pts_df, pts)
    [baselines[k].update(flexDict[k]) for k in baselines.keys()]
    
    # Get baselines, create VBD
    pts_df['Baseline'] = pts_df.apply(find_baseline, axis = 1)
    bases = pts_df.loc[pts_df['Baseline'] == 1, ['FantPos', 'Year', pts]]
    bases.columns = ['FantPos', 'Year', 'Base']

    pts_df = pts_df.merge(bases, on = ['FantPos', 'Year'], how = 'left')
    pts_df['aboveBase'] = np.where(pts_df[pts] >= pts_df['Base'], 1,0)
    return pts_df, bases

pts_df_base, bases = set_baselines(pts_df)
# print(bases)
# pts_df_base[(pts_df_base['FantPos'] == 'WR') & (pts_df_base['Year'] == 2015)].head(35)

#### 1c. Merge PFR data with itself to get previous years' information

In [25]:
pts_df_base[pts_df_base['Year'] == 2021]

Unnamed: 0,Player,Tm,FantPos,Age,G,GS,PassCmp,PassAtt,PassYds,PassTD,...,FDPt,VBD,PosRank,OvRank,Year,Pts_PPR,MyRk,Baseline,Base,aboveBase
4355,Josh Allen,BUF,QB,25,17,17,409,646,4407.0,36.0,...,417.6,134,1,4,2021,402.58,1.0,0,300.48,1
4356,Justin Herbert,LAC,QB,23,17,17,443,672,5014.0,38.0,...,395.8,112,2,6,2021,380.76,2.0,0,300.48,1
4357,Tom Brady,TAM,QB,44,17,17,485,719,5316.0,43.0,...,386.7,106,3,7,2021,374.74,3.0,0,300.48,1
4358,Patrick Mahomes,KAN,QB,26,17,17,436,658,4839.0,37.0,...,374.7,93,4,13,2021,361.66,4.0,0,300.48,1
4359,Aaron Rodgers,GNB,QB,38,16,16,366,531,4115.0,37.0,...,336.8,64,5,18,2021,333.30,5.0,0,300.48,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4946,Malik Taylor,GNB,WR,26,10,0,0,0,0.0,0.0,...,0.4,,258,,2021,1.40,223.0,0,199.80,0
4947,Racey McMath,TEN,WR,22,9,0,0,0,0.0,0.0,...,-0.2,,259,,2021,0.80,224.0,0,199.80,0
4948,Maurice Ffrench,LAC,WR,23,3,0,0,0,0.0,0.0,...,0.3,,224,,2021,0.30,225.0,0,199.80,0
4949,Alex Bachman,NYG,WR,25,3,0,0,0,0.0,0.0,...,-0.3,,257,,2021,-0.30,226.0,0,199.80,0


In [22]:
import numpy as np

def createPtsForReg(pts_df_base):
    # 1. Create template
    predTemplate = pts_df_base[['Player', 'Tm', 'FantPos', 'Year', SCORING, '`aboveBase`']]
    
    print("1. This is the shape of og dataset...")
    print(predTemplate.shape)
    print()
    
    # 2. Merge on last year's results
    prvYr = predTemplate[['Player', 'FantPos', 'Year', SCORING]].copy()
    prvYr.rename(columns = {'Year' : 'PrvYear', SCORING : 'Prv' + SCORING}, inplace = True)
    prvYr['Year'] = prvYr['PrvYear'] + 1
    merged = predTemplate.merge(prvYr, on = ['Player','FantPos', 'Year'], how = 'outer', indicator= 'foundLastYearStats')
    
    print("2. This is the shape after doing outer join with previous years' data")
    print(merged.shape)
    print(merged['foundLastYearStats'].value_counts())
    

    # 3. Remove right_only obs that aren't from 2022
    print(f"3. In total, {len(merged[merged['foundLastYearStats'] == 'right_only'])} observations are players with some data, but no previous year statistics")
    print(f"-Of these, {len(merged[merged['PrvYear'] == 2021])} observations are associated with year 2022")
    print("\t-These observations don't have a 'y-value' for regression, but this is the year we are trying to predict, so OK")
    print(f"-Remove the remaining {len(merged[(merged['PrvYear'] != 2021) & (merged['foundLastYearStats'] == 'right_only')])} observations")
    print("\t-These observations don't have a 'y-value' for regression, only 'x-values', so ok to delete")
    merged = merged[(merged['Year'] == 2022) | (merged['foundLastYearStats'] != 'right_only')]
    print(merged.shape)
    print()
    
    # 4. Create found last year flag
    # This will help distinguish rookies and other players not in data
    merged['PrvYear'] = merged['Year'] - 1

    # 5. Examine composition of remaining observations  
    # Left_only and both are needed for regression - excludes 2013 observations
    # Right_only needed for prediction - excludes non-2022 observations (right-only's in OG data)
    print(merged['foundLastYearStats'].value_counts())
    print()
    
    return merged.drop('foundLastYearStats', axis = 1)

pts_df_reg = createPtsForReg(pts_df)
pts_df_reg

1. This is the shape of og dataset...
(4951, 5)

2. This is the shape after doing outer join with previous years' data
(6755, 8)
both          3147
left_only     1804
right_only    1804
Name: foundLastYearStats, dtype: int64
3. In total, 1804 observations are players with some data, but no previous year statistics
-Of these, 596 observations are associated with year 2022
	-These observations don't have a 'y-value' for regression, but this is the year we are trying to predict, so OK
-Remove the remaining 1208 observations
	-These observations don't have a 'y-value' for regression, only 'x-values', so ok to delete
(5547, 8)

both          3147
left_only     1804
right_only     596
Name: foundLastYearStats, dtype: int64



Unnamed: 0,Player,Tm,FantPos,Year,Pts_PPR,PrvYear,PrvPts_PPR
0,Jamaal Charles,KAN,RB,2013,378.00,2012,
1,LeSean McCoy,PHI,RB,2013,330.60,2012,
2,Peyton Manning,DEN,QB,2013,409.98,2012,
3,Matt Forte,CHI,RB,2013,337.30,2012,
4,Jimmy Graham,NOR,TE,2013,303.50,2012,
...,...,...,...,...,...,...,...
6750,Logan Woodside,,QB,2022,,2021,-0.60
6751,Trenton Cannon,,RB,2022,,2021,-1.60
6752,John Wolford,,QB,2022,,2021,-1.90
6753,Josh Rosen,,QB,2022,,2021,-3.24


In [23]:
pts_df_reg['Year'].value_counts()

2021    596
2022    596
2020    578
2019    558
2018    550
2016    542
2015    538
2017    536
2013    527
2014    526
Name: Year, dtype: int64

#### 1d. Merge ADP data and do other calculations

##### 1d.i. Get change QB-produced FantPts (proxy for QB's changing teams)

In [26]:
def create_qb_chg(df):
    # 1. Get a team's max QB FantPts in year i - 1
    qb_then = df.loc[df['FantPos'] == 'QB'].groupby(['Tm', 'Year'], as_index = False).max()[['Tm', 'Year', SCORING]]
    qb_then.rename(columns = {'Pts_PPR' : 'OldQBs'}, inplace= True)
    
    qb_then.loc[(qb_then['Tm'] == 'OAK') & (qb_then['Year'] == 2019), 'Tm'] = 'LVR'
    qb_then.loc[(qb_then['Tm'] == 'SDG') & (qb_then['Year'] == 2016), 'Tm'] = 'LAC'
    qb_then.loc[(qb_then['Tm'] == 'STL') & (qb_then['Year'] == 2015), 'Tm'] = 'LAR'
    qb_then['Year'] = qb_then['Year'] + 1
    
    # 2. Of QBs on roster in year i, get max QB Fant Pts in year i - 1
    qb_now = df.loc[df['FantPos'] == 'QB'].groupby(['Tm', 'Year'], as_index = False).max()[['Tm', 'Year', 'Prv' + SCORING]]
    qb_now.rename(columns = {'Prv' + SCORING : 'NewQBs'}, inplace= True)
    final = qb_then.merge(qb_now)

    # 3. Create average across all teams in year i to interpolate for players in > 1 teams
    filler = final[final['Tm'] != '2TM'].groupby(['Year']).mean().reset_index()
    filler.rename(columns = {'OldQBs' : 'OldQBs_y', 'NewQBs' : 'NewQBs_y'}, inplace = True)
    test = df.merge(final, on = ['Tm','Year'], how='left', indicator = True)
    # print(filler)
    # print()
    # print(test['_merge'].value_counts())
    # print(test[((test['_merge'] == 'left_only') & (test['Year'] > 2013) & (test['Tm'].notnull()))].shape)
    # print(test[(test['_merge'] == 'left_only') & (test['Year'] == 2013)].shape)
    # print(test[(test['_merge'] == 'left_only') & (test['Tm'].isnull())].shape)
    
    # 4. 
    test = test[(test['_merge'] != 'left_only') | ((test['_merge'] == 'left_only') & (test['Year'] > 2013) & (test['Tm'].notnull()))]
    test = test.merge(filler, how = 'left', on = 'Year')
    test['OldQBs'].fillna(test['OldQBs_y'], inplace = True)
    test['NewQBs'].fillna(test['NewQBs_y'], inplace = True)
    test['qbDiff'] = test['NewQBs'] - test['OldQBs']
    test.drop(['_merge','OldQBs_y','NewQBs_y'], axis = 1, inplace = True)
    return test
a = create_qb_chg(pts_df_reg)
# print(pts_df_reg[(pts_df_reg['FantPos'] == 'QB') & (pts_df_reg['Year'].isin([2017, 2018, 2019, 2020])) & (pts_df_reg['Tm'] == 'ARI')])
# a.sort_values('qbDiff', ascending = False).head(50)
# a[a['Year'] == 2022]


  filler = final[final['Tm'] != '2TM'].groupby(['Year']).mean().reset_index()


In [27]:
print(a)

               Player   Tm FantPos  Year  Pts_PPR  PrvYear  PrvPts_PPR  \
0      DeMarco Murray  DAL      RB  2014   351.10     2013      258.10   
1        Le'Veon Bell  PIT      RB  2014   370.50     2013      216.90   
2      Marshawn Lynch  SEA      RB  2014   302.30     2013      275.30   
3       Antonio Brown  PIT      WR  2014   380.90     2013      308.90   
4          Matt Forte  CHI      RB  2014   346.60     2013      337.30   
...               ...  ...     ...   ...      ...      ...         ...   
4419   Logan Woodside  TEN      QB  2021    -0.60     2020        1.28   
4420   Trenton Cannon  2TM      RB  2021    -1.60     2020        7.90   
4421     John Wolford  LAR      QB  2021    -1.90     2020       12.84   
4422       Josh Rosen  ATL      QB  2021    -3.24     2020         NaN   
4423  Diontae Spencer  DEN      WR  2021    -3.30     2020        7.50   

          OldQBs    NewQBs     qbDiff  
0     260.920000  260.9200   0.000000  
1     258.840000  258.8400   0.

In [136]:
def merge_adp_dataset(pts_df_reg, adp_df):
    # 1. Merge with adp info
    test = pts_df_reg.merge(adp_df[['Name', 'Year', 'Team', 'Position', 'AverageDraftPositionPPR']],
                        left_on = ['Player','Year','FantPos'], 
                        right_on = ['Name', 'Year', 'Position'],
                        how = 'outer',
                        indicator= 'foundAdp')
    # print(test.shape)
    
    # 2. Create previous year, fill out player name, position, team
    test['PrvYear'] = test['Year'] - 1
    test['Player'].fillna(test['Name'], inplace=True)
    test.drop('Name',axis = 1, inplace= True)
    test['FantPos'].fillna(test['Position'], inplace=True)
    test.drop('Position',axis = 1, inplace= True)
    test['Tm'].fillna(test['Team'], inplace = True)
    test.drop('Team', axis = 1, inplace= True)

    # 3. Create positional dummies
    test[['QB','RB','TE','WR']] = pd.get_dummies(test['FantPos'])
    return test

def prepFinalReg(pts_df_reg, adp_df):
    # 1. Merge adp dataset
    test = merge_adp_dataset(pts_df_reg, adp_df)

    # 2. Create flag to see if we found last year's stats, if not, make last year's scores 0
    test['foundLastYearStats'] = np.where(test['Prv' + SCORING].isnull(), 0, 1)
    test.loc[test['Prv' + SCORING].isnull(), 'Prv' + SCORING] = 0
    
    # 3. Create flag to see if ADP is outside of 173, if so, set to 173 and flag
    test.loc[test['AverageDraftPositionPPR'] > 173, 'AverageDraftPositionPPR'] = 173
    test.loc[test['foundAdp'] == 'left_only', 'AverageDraftPositionPPR'] = 173
    test['foundAdp'] = np.where(test['AverageDraftPositionPPR'] < 173 , 1, 0)
    
    # 4. Subset columns
    reg_set = test[['Player', 'Tm', 'FantPos','Year', 'Prv' + SCORING, 'foundLastYearStats','AverageDraftPositionPPR', 
                    'foundAdp', 'QB','RB','TE','WR', SCORING, 'aboveBase']].copy()
    
    # 5. Create additional features
    reg_set['AverageDraftPositionPPRSq'] = reg_set['AverageDraftPositionPPR'] * reg_set['AverageDraftPositionPPR']
    #reg_set = create_qb_chg(reg_set)
    
    # 6. Split dataset into pre-2022 and 2022
    # pred_set = reg_set[reg_set['Year'] == 2022].copy()
    # reg_set = reg_set[(reg_set['Year'] != 2022)].copy()
    # print(reg_set.shape)
    # print(pred_set.shape)
    reg_set.loc[reg_set[SCORING].isnull(), SCORING] = 0
    return reg_set# , pred_set
    
reg_set = prepFinalReg(pts_df_reg, adp_df)

In [137]:
def load_player_info(info = player_dict):
    pfref_info = pd.DataFrame.from_dict(info, orient= 'index').reset_index()
    pfref_info.columns = ['Player','FantPos','DOB','Draft', 'Pos']
    pfref_info['FantPos'] = pfref_info['FantPos'].str.strip().replace('', np.nan)
    pfref_info['FantPos'] = pfref_info['FantPos'].fillna(pfref_info['Pos'])
    final = pfref_info.drop('Pos', axis = 1)
    final['Player'] = final['Player'].str.strip()
    final = final.drop_duplicates()
    return final

In [168]:
def add_info(df):
    pfref_info = load_player_info()

    pfref_info.loc[pfref_info['Player'].str.contains('Robby Anderson'), 'Player'] = 'Robbie Anderson'
    pfref_info.loc[pfref_info['Player'].str.contains('Tony Jones'), 'Player'] = 'Tony Jones Jr.'
    pfref_info.loc[pfref_info['Player'].str.contains('John Kelly'), 'Player'] = 'John Kelly Jr.'
    pfref_info.loc[pfref_info['Player'].str.contains('Travis Etienne'), 'Player'] = 'Travis Etienne Jr.'

    df = df.merge(pfref_info, on = ['Player','FantPos'], how= 'inner')
    df['Age'] = (pd.to_datetime('September 1, ' + df['Year'].astype(str)) - df['DOB']).dt.days / 365
    
    df['Draft'] = df['Draft'].fillna(270)
    # 1. Check to make sure players getting excluded aren't drafted in top 173
    # df = df.merge(pfref_info, on = ['Player','FantPos'], how= 'outer', indicator = 'hasInfo')
    # print(df['hasInfo'].value_counts())
    # print(df[df['hasInfo']== 'left_only'].sort_values(['AverageDraftPositionPPR','PrvPts_PPR', 'Pts_PPR'], ascending = [True, False, False]).head(30))
    # print(df[(df['Player'].str.contains('Davis')) & (df['Tm'] == 'BUF')])
    return df, pfref_info

In [172]:
reg_set['aboveBase']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
7928    NaN
7929    NaN
7930    NaN
7931    NaN
7932    NaN
Name: aboveBase, Length: 7933, dtype: float64

In [170]:
def getPtShare(df):
    df['ones'] = 1
    df_tm = df.groupby(['Tm','FantPos','Year'], as_index = False).sum()[['Tm','FantPos','Year','PrvPts_PPR', 'ones']]
    df_tm = df_tm.rename(columns = {'PrvPts_PPR' : 'PrvYrTmPts', 'ones' : 'PlayersAtPosition'})
    
    df = df.merge(df_tm, on = ['Tm','FantPos','Year'], how = 'inner')
    df['PrvYrPtsShare'] = df['PrvPts_PPR'] / df['PrvYrTmPts']
    df.loc[df['PrvYrPtsShare'].isnull(), 'PrvYrPtsShare'] = 1 / df.loc[df['PrvYrPtsShare'].isnull(), 'PlayersAtPosition']

    df.loc[df['aboveBase'].isnull(),'aboveBase'] = 0
    return df

reg_set1 = create_qb_chg(reg_set)
reg_set1, pfref_info = add_info(reg_set1)
reg_set1 = getPtShare(reg_set1)
reg_set1

with open('../data/created/reg_set.p', 'wb') as handle:
    pickle.dump(reg_set1, handle, protocol=pickle.HIGHEST_PROTOCOL)

# final_reg = reg_set1[reg_set1['Year'] != 2022]
# final_pred = reg_set1[reg_set1['Year'] == 2022]

# final_reg.loc[final_reg[SCORING] == 0, 'aboveBase'] = 0
# final_pred.isnull().sum()
# final_reg[final_reg['aboveBase'].isnull()].sort_values(SCORING, ascending = False)[['Player','Tm','FantPos','Pts_PPR']]
# final_reg = getPtShare(reg_set1)
# final_pred = getPtShare(pred_set1)
# final_reg.head()
# final_reg.columns
# final_reg['Year'].value_counts()
# Revisit this, could make more sense to exclude 2014 and 2015 because not enough included players
        # Therefore share of team points is different
    # df = df[df['Year'] >= 2015].copy()
    