### Data collection

In [5]:
%load_ext autoreload
%autoreload 2

In [67]:
import pandas as pd
pd.options.display.max_columns = None # See all columns
import warnings
warnings.filterwarnings('ignore')
from fantasy_football import Fantasy

# Import fantasy class which will be used for building datasets
football = Fantasy()

##### First, let's import the data

In [68]:
df, y = Fantasy.read_data('2021')

##### Lets examine both the df and y

In [69]:
df.head()

Unnamed: 0,Player,Pos,Tm,GP,Comp,Inc,Passing_Yds,Passing_Td,Int,Pic6,Sks,Passing_1st,Att,Rushing_Yds,Rushing_Td,Rushing_1st,Tgt,Rec,Receiving_Yds,Receiving_Td,Receiving_1st,Return_Yds,Return_Td,2PT,Total,Lost
0,T. Hill,WR,KC,15,0,0,0,0,0,0,0,0,13,123,2,5,135,87,1276,15,57,0,0,0,0,0
1,J. Jefferson,WR,Min,16,0,0,0,0,0,0,0,0,1,2,0,0,125,88,1400,7,58,0,0,2,1,0
2,D. Metcalf,WR,Sea,16,0,0,0,0,0,0,0,0,0,0,0,0,129,83,1303,10,63,0,0,0,1,1
3,D. Adams,WR,GB,14,0,0,0,0,0,0,0,0,0,0,0,0,149,115,1374,18,73,0,0,0,1,1
4,N. Chubb,RB,Cle,12,0,0,0,0,0,0,0,0,190,1067,12,55,18,16,150,0,6,0,0,0,1,1


In [70]:
y

Unnamed: 0,Player,PPG,Pos
0,J. Allen,24.563529,QB
1,J. Herbert,23.280000,QB
2,P. Mahomes,22.038824,QB
3,T. Brady,22.749412,QB
4,J. Burrow,20.515000,QB
...,...,...,...
370,E. St. Brown,0.861538,WR
371,P. Taylor,1.688889,RB
372,K. Smith,0.511765,RB
373,G. Gilbert,7.760000,QB


##### Great, it seems as though we can merge the target variable by the players name

In [71]:
merged = df.merge(y, on=['Player', 'Pos'], how='left').fillna(0)

##### Let's examine the merged dataset:

In [75]:
merged

Unnamed: 0,Player,Pos,Tm,GP,Comp,Inc,Passing_Yds,Passing_Td,Int,Pic6,Sks,Passing_1st,Att,Rushing_Yds,Rushing_Td,Rushing_1st,Tgt,Rec,Receiving_Yds,Receiving_Td,Receiving_1st,Return_Yds,Return_Td,2PT,Total,Lost,PPG
0,T. Hill,WR,KC,15,0,0,0,0,0,0,0,0,13,123,2,5,135,87,1276,15,57,0,0,0,0,0,10.911765
1,J. Jefferson,WR,Min,16,0,0,0,0,0,0,0,0,1,2,0,0,125,88,1400,7,58,0,0,2,1,0,13.082353
2,D. Metcalf,WR,Sea,16,0,0,0,0,0,0,0,0,0,0,0,0,129,83,1303,10,63,0,0,0,1,1,9.958824
3,D. Adams,WR,GB,14,0,0,0,0,0,0,0,0,0,0,0,0,149,115,1374,18,73,0,0,0,1,1,13.831250
4,N. Chubb,RB,Cle,12,0,0,0,0,0,0,0,0,190,1067,12,55,18,16,150,0,6,0,0,0,1,1,13.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,D. Eskridge,WR,Sea,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.830000
314,T. Fulgham,WR,Phi,13,0,0,0,0,0,0,0,0,0,0,0,0,67,38,539,4,26,0,0,0,0,0,0.000000
315,M. Alie-Cox,TE,Ind,15,0,0,0,0,0,0,0,0,0,0,0,0,39,31,394,2,19,0,0,0,1,1,3.270588
316,J. Johnson,TE,NO,7,0,0,0,0,0,0,0,0,0,0,0,0,10,4,39,0,3,0,0,0,0,0,2.850000


##### Let's remove all players who had 0 fantasy points. This could reflect a season ending injury, a retirement, or that the database did not include them in the next year's stats.

In [78]:
merged = merged.loc[merged['PPG'] != '-']

In [79]:
merged.head()

Unnamed: 0,Player,Pos,Tm,GP,Comp,Inc,Passing_Yds,Passing_Td,Int,Pic6,Sks,Passing_1st,Att,Rushing_Yds,Rushing_Td,Rushing_1st,Tgt,Rec,Receiving_Yds,Receiving_Td,Receiving_1st,Return_Yds,Return_Td,2PT,Total,Lost,PPG
0,T. Hill,WR,KC,15,0,0,0,0,0,0,0,0,13,123,2,5,135,87,1276,15,57,0,0,0,0,0,10.911765
1,J. Jefferson,WR,Min,16,0,0,0,0,0,0,0,0,1,2,0,0,125,88,1400,7,58,0,0,2,1,0,13.082353
2,D. Metcalf,WR,Sea,16,0,0,0,0,0,0,0,0,0,0,0,0,129,83,1303,10,63,0,0,0,1,1,9.958824
3,D. Adams,WR,GB,14,0,0,0,0,0,0,0,0,0,0,0,0,149,115,1374,18,73,0,0,0,1,1,13.83125
4,N. Chubb,RB,Cle,12,0,0,0,0,0,0,0,0,190,1067,12,55,18,16,150,0,6,0,0,0,1,1,13.95


##### Let's make sure to remove rookies, as we will take care of their predictions in a different notebook.

In [81]:
merged = merged.loc[merged['GP'] != "-"]

##### Let's add in strength of schedule for each player

In [91]:
strength = pd.read_csv("strength2020.csv", header=0)

# Invert matrix for merging purposes
inverted_strength = pd.melt(strength, id_vars=['Team'], value_vars=['QB', 'RB', 'WR', 'TE'], var_name='Pos', value_name='Rank')

# Make sure the Team columns are the same for merging
inverted_strength = inverted_strength.rename(columns={'Team': 'Tm'})

# Lets capitalize each first letter of team
inverted_strength['Tm'] = inverted_strength['Tm'].str.capitalize()

inverted_strength

Unnamed: 0,Tm,Pos,Rank
0,Ari,QB,20
1,Atl,QB,23
2,Bal,QB,7
3,Buf,QB,31
4,Car,QB,2
...,...,...,...
123,Sf,TE,3
124,Sea,TE,14
125,Tb,TE,22
126,Ten,TE,25


##### Now, let's update our dataset to include these ranks  

To perform this merge, we will need to make our original dataset team values all uppercase

In [92]:
merged['Tm'] = merged['Tm'].str.capitalize()

# Lets perform the merge, fill FA as average strength
updated_df = pd.merge(merged, inverted_strength, on=['Tm', 'Pos'], how='left').fillna(16)

##### We now have our strength of schedule included in our dataset:

In [93]:
updated_df

Unnamed: 0,Player,Pos,Tm,GP,Comp,Inc,Passing_Yds,Passing_Td,Int,Pic6,Sks,Passing_1st,Att,Rushing_Yds,Rushing_Td,Rushing_1st,Tgt,Rec,Receiving_Yds,Receiving_Td,Receiving_1st,Return_Yds,Return_Td,2PT,Total,Lost,PPG,Rank
0,T. Hill,WR,Kc,15,0,0,0,0,0,0,0,0,13,123,2,5,135,87,1276,15,57,0,0,0,0,0,10.911765,15.0
1,J. Jefferson,WR,Min,16,0,0,0,0,0,0,0,0,1,2,0,0,125,88,1400,7,58,0,0,2,1,0,13.082353,22.0
2,D. Metcalf,WR,Sea,16,0,0,0,0,0,0,0,0,0,0,0,0,129,83,1303,10,63,0,0,0,1,1,9.958824,5.0
3,D. Adams,WR,Gb,14,0,0,0,0,0,0,0,0,0,0,0,0,149,115,1374,18,73,0,0,0,1,1,13.831250,3.0
4,N. Chubb,RB,Cle,12,0,0,0,0,0,0,0,0,190,1067,12,55,18,16,150,0,6,0,0,0,1,1,13.950000,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,C. Conley,WR,Hou,15,0,0,0,0,0,0,0,0,0,0,0,0,63,40,471,2,24,0,0,0,1,1,2.662500,27.0
219,D. Njoku,TE,Cle,13,0,0,0,0,0,0,0,0,0,0,0,0,29,19,213,2,13,0,0,0,0,0,4.475000,16.0
220,M. Alie-Cox,TE,Ind,15,0,0,0,0,0,0,0,0,0,0,0,0,39,31,394,2,19,0,0,0,1,1,3.270588,13.0
221,J. Johnson,TE,No,7,0,0,0,0,0,0,0,0,0,0,0,0,10,4,39,0,3,0,0,0,0,0,2.850000,17.0


##### Now, lets divide all game-statistics columns by total games played so we receive the per-game stats. This will help factor out injuries and instead focus on production.

In [94]:
updated_df.iloc[:, 4:-2] = updated_df.iloc[:, 4:-2].div(updated_df['GP'], axis=0)

In [95]:
updated_df

Unnamed: 0,Player,Pos,Tm,GP,Comp,Inc,Passing_Yds,Passing_Td,Int,Pic6,Sks,Passing_1st,Att,Rushing_Yds,Rushing_Td,Rushing_1st,Tgt,Rec,Receiving_Yds,Receiving_Td,Receiving_1st,Return_Yds,Return_Td,2PT,Total,Lost,PPG,Rank
0,T. Hill,WR,Kc,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.866667,8.200000,0.133333,0.333333,9.000000,5.800000,85.066667,1.000000,3.800000,0.0,0.0,0.000,0.000000,0.000000,10.911765,15.0
1,J. Jefferson,WR,Min,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062500,0.125000,0.000000,0.000000,7.812500,5.500000,87.500000,0.437500,3.625000,0.0,0.0,0.125,0.062500,0.000000,13.082353,22.0
2,D. Metcalf,WR,Sea,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,8.062500,5.187500,81.437500,0.625000,3.937500,0.0,0.0,0.000,0.062500,0.062500,9.958824,5.0
3,D. Adams,WR,Gb,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,10.642857,8.214286,98.142857,1.285714,5.214286,0.0,0.0,0.000,0.071429,0.071429,13.831250,3.0
4,N. Chubb,RB,Cle,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.833333,88.916667,1.000000,4.583333,1.500000,1.333333,12.500000,0.000000,0.500000,0.0,0.0,0.000,0.083333,0.083333,13.950000,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,C. Conley,WR,Hou,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,4.200000,2.666667,31.400000,0.133333,1.600000,0.0,0.0,0.000,0.066667,0.066667,2.662500,27.0
219,D. Njoku,TE,Cle,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,2.230769,1.461538,16.384615,0.153846,1.000000,0.0,0.0,0.000,0.000000,0.000000,4.475000,16.0
220,M. Alie-Cox,TE,Ind,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,2.600000,2.066667,26.266667,0.133333,1.266667,0.0,0.0,0.000,0.066667,0.066667,3.270588,13.0
221,J. Johnson,TE,No,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.428571,0.571429,5.571429,0.000000,0.428571,0.0,0.0,0.000,0.000000,0.000000,2.850000,17.0


##### Now, lets split up our dataset into positional groups to perform advanced analytics on each group.

In [96]:
RB_data = football.splitByPos(updated_df, "RB")
WR_data = football.splitByPos(updated_df, "WR")
TE_data = football.splitByPos(updated_df, "TE")
QB_data = football.splitByPos(updated_df, "QB")

In [98]:
%store RB_data
%store WR_data
%store QB_data
%store TE_data

Stored 'RB_data' (DataFrame)
Stored 'WR_data' (DataFrame)
Stored 'QB_data' (DataFrame)
Stored 'TE_data' (DataFrame)


Now, We will split up by position in different notebooks and model each separately