# EDA & Model Development 

In [1]:
import pandas as pd
import numpy as np
from python_pkg import python_udf as udf
import matplotlib.pyplot as plt
import seaborn as sb
import random

## Initial Setup

In [2]:
#https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/full_data.csv
df = pd.read_csv('https://github.com/jchristo12/fantasy_football/blob/master/data/full_data.csv?raw=true')

In [3]:
df.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'fname', 'lname', 'full_name',
       'team', 'pos1',
       ...
       'ou', 'sprv', 'ptsv', 'ptsh', 'udog', 'gen_cond', 'udog_binary',
       'gen_dv', 'def_team', 'f_pts'],
      dtype='object', length=171)

In [4]:
#remove rows that have NaN for the shifted variables
df_clean = df[~df.loc[:,'seas_pa':'seas_tdret'].isna().all(axis=1)]

##### We are only focused on offensive players right now. Therefore, we will specifiy the positions we want to retain in the data and remove everyone else

In [5]:
#store positions we are concerned about; will use these to filter out 
pos_of_interest = ['QB', 'RB', 'WR', 'TE']
#filter out positions we don't care about
df_clean2 = df_clean[df_clean['pos1'].isin(pos_of_interest)]

##### Convert each column to the appropriate data type (i.e. make the categorical data categories)

In [6]:
#set the column types
col_dtypes = {'category': ['seas', 'wk', 'pos1', 'team', 'udog', 'v', 'h', 'day', 'stad', 'wdir',
                          'surf', 'gen_cond', 'gen_dv', 'def_team']}
#flip the key and values around so they will work in the argument for 'astype()'
col_dtypes_alt = {old: new for new, old_all in col_dtypes.items() for old in old_all}

In [7]:
df_clean2 = df_clean2.astype(col_dtypes_alt)

##### Remove rookies

In [8]:
#store dataframe of non-rookies
df_clean2_vet = df_clean2.loc[df_clean2['exp']!=1, :]
#store dataframe of rookie data
df_clean2_rook = df_clean2.loc[df_clean2['exp']==1, :]

##### Segment out the WR data

Non-rookies data only

In [9]:
df_wr = df_clean2_vet.loc[df_clean2_vet['pos1']=='WR', :]

##### Only focus on week 10 data

In [10]:
df_wr10 = df_wr.loc[df_wr['wk'] == 10, :]

In [11]:
df_wr10.shape

(1860, 171)

## EDA

Only focusing on the WR data right now. Will need to build this so that each action is extendable to the other positions

### Training/Test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
#set the random seed for reproducability
random.seed(837)

In [14]:
#break out the data between training and test
train_wr, test_wr = train_test_split(df_wr10, train_size=0.75, test_size=0.25, shuffle=True)

In [15]:
#shape of the data
print(train_wr.shape, test_wr.shape)

(1395, 171) (465, 171)


In [16]:
#reset index on both dataframes
train_wr = train_wr.reset_index(drop=True)
test_wr = test_wr.reset_index(drop=True)

### Missing Data analysis

#### Basic analysis

In [17]:
#create a series of percent of missing data
missing_data_pct = train_wr.isna().sum() / train_wr.shape[0]

In [18]:
#list all columns with missing data that is greater than 25%
missing_data_pct[missing_data_pct > 0.25].sort_values(ascending=False)

bench       0.715412
hand        0.619355
arm         0.607885
cone        0.524731
shuttle     0.513978
broad       0.486022
vertical    0.443011
forty       0.374194
dtype: float64

In [19]:
#columns with missing values but less than or equal to 25%
impute_cols = missing_data_pct[(missing_data_pct <= 0.25) & (missing_data_pct > 0)].sort_values(ascending=False)
impute_cols

temp    0.151971
humd    0.028674
wdir    0.024373
dtype: float64

#### Take action on missing data

##### Drop columns with too many missing values

In [20]:
#drop columns with missing data greater than 25%
#store the column names
missing_cols_del = missing_data_pct[missing_data_pct > 0.25].sort_values(ascending=False).index
#drop the columns and store as new dataframe
train_wr_miss = train_wr.drop(missing_cols_del, axis=1, inplace=False)

In [21]:
#check the shape
train_wr_miss.shape

(1395, 163)

##### Impute the rest of the missing values

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
#Build simple imputers for both numeric and categorical features
numeric_impute = SimpleImputer(missing_values=np.NaN, strategy='median')
cat_impute = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

In [24]:
#create a dataframe of all of the features to impute
missing_values_df = train_wr_miss.drop(train_wr_miss.columns.difference(impute_cols.index), axis=1, inplace=False)

In [25]:
#store the columns that need to use the numeric imputation and the categorical imputation
impute_numeric_col = missing_values_df.select_dtypes(include=np.number).columns
impute_cat_col = missing_values_df.select_dtypes(exclude=np.number).columns

This code below attempts to handle infinity values; This is not an issue right now but something to be aware of

In [26]:
#Boolean if the column has 'inf' values or not
#inf_cols = np.isinf(train_wr_miss.loc[:, impute_numeric_col]).any()

In [27]:
#list of columns that have 'inf' values
#inf_cols2 = list(train_wr_miss.loc[:, impute_numeric_col].columns.to_series()[inf_cols].values)

Continue with simple imputation

In [28]:
#impute numeric columns
imputed_numeric_df = pd.DataFrame(numeric_impute.fit_transform(train_wr_miss.loc[:, impute_numeric_col]), columns=impute_numeric_col).add_prefix('imp_')

In [29]:
#impute categorical columns
imputed_cat_df = pd.DataFrame(cat_impute.fit_transform(train_wr_miss.loc[:, impute_cat_col]), columns=impute_cat_col).add_prefix('imp_')

In [49]:
#add imputed columns to original data frame
train_wr_imputed = pd.concat([train_wr_miss, imputed_numeric_df, imputed_cat_df], axis=1)

In [50]:
#drop the original columns that have missing data
train_wr_imputed = train_wr_imputed.drop(list(impute_cat_col) + list(impute_numeric_col), axis=1)

### Feature Exploration

Remove the features that we won't know at time of analysis

In [56]:
#remove current game stats
drop_stat_cols = list(train_wr_imputed.loc[:, 'pa':'tdret'].columns)

In [57]:
#drop from the data frame
wr10_subset1 = train_wr_imputed.drop(drop_stat_cols, axis=1)

In [62]:
#correlation between numerical variables
udf.corr_to_df_summary(wr10_subset1.select_dtypes(include=np.number)).reset_index()

Unnamed: 0,Var1,Var2,Pearson R
0,exp,age,0.904277
1,last_pa,last_pc,0.997346
2,last_pa,last_py,0.985184
3,last_pa,last_ints,0.902541
4,last_pa,recent_ints,0.961991
5,last_pa,recent_pa,0.997589
6,last_pa,recent_pc,0.996850
7,last_pa,recent_py,0.988444
8,last_pa,recent_tdp,0.753741
9,last_pa,seas_pa,0.996386


## Baseline Model

Non modeling variables need to be removed

In [132]:
#non modeling variables
non_model_cols = list(wr10_subset1.loc[:, 'pk':'full_name']) + ['pos1', 'nflid', 'udog', 'v', 'h', 'dob']

In [133]:
model_subset = wr10_subset1.drop(non_model_cols, axis=1)

### Regression Random Forest

In [134]:
dummies = pd.get_dummies(model_subset.select_dtypes(exclude=np.number), drop_first=True)

In [135]:
model_subset1 = pd.concat([model_subset.drop('f_pts', axis=1), dummies], axis=1)

In [136]:
model_subset1 = model_subset1.drop(model_subset.select_dtypes(exclude=np.number).columns, axis=1)

In [137]:
model_subset1_x = model_subset1.values

In [138]:
model_subset1_y = model_subset['f_pts'].values

In [139]:
from sklearn.ensemble import RandomForestRegressor

In [140]:
rf_object = RandomForestRegressor(criterion='mse', random_state=67, n_estimators=100)

In [141]:
rf_object.fit(model_subset1_x, model_subset1_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=67, verbose=0, warm_start=False)

In [142]:
pd.Series(rf_object.feature_importances_, index=list(model_subset1.columns)).sort_values(ascending=False)

seas_recy                              1.386463e-01
recent_trg                             4.839708e-02
recent_recy                            4.726439e-02
ptsh                                   3.729182e-02
ptsv                                   3.550916e-02
seas_rec                               3.531893e-02
seas_trg                               2.595905e-02
career_rec_to_td                       2.091694e-02
last_recy                              1.746059e-02
seas_catch_pct                         1.680756e-02
weight                                 1.669825e-02
career_catch_pct                       1.660198e-02
seas_yds_per_rec                       1.657556e-02
ou                                     1.613455e-02
career_yds_per_rec                     1.591980e-02
last_yds_per_rec                       1.532331e-02
career_ryds_per_carry                  1.423890e-02
recent_rec                             1.412236e-02
roll_yds_per_rec                       1.340144e-02
roll_catch_p

##### Set up the test data

In [107]:
test_wr

Unnamed: 0,pk,gid,seas,wk,player,fname,lname,full_name,team,pos1,...,ou,sprv,ptsv,ptsh,udog,gen_cond,udog_binary,gen_dv,def_team,f_pts
0,926&JM-3500,926,2003,10,JM-3500,James,McKnight,James McKnight,MIA,WR,...,40.0,5.0,7,31,MIA,fair_cond,True,Other,TEN,0.000000
1,3067&JN-0900,3067,2011,10,JN-0900,Jordy,Nelson,Jordy Nelson,GB,WR,...,50.5,15.5,7,45,MIN,fair_cond,False,Big 12,MIN,18.300000
2,4133&DT-0850,4133,2015,10,DT-0850,De'Anthony,Thomas,De'Anthony Thomas,KC,WR,...,41.5,4.5,29,13,KC,fair_cond,True,Pacific 12,DEN,3.871429
3,1457&DB-3600,1457,2005,10,DB-3600,Deion,Branch,Deion Branch,NE,WR,...,42.0,-2.5,23,16,MIA,fair_cond,False,Other,MIA,8.200000
4,4393&WS-0925,4393,2016,10,WS-0925,Willie,Snead,Willie Snead,NO,WR,...,49.0,3.0,25,23,DEN,indoor_cond,False,Other,DEN,16.700000
5,2252&TW-3200,2252,2008,10,TW-3200,Troy,Williamson,Troy Williamson,JAC,WR,...,44.0,-6.5,38,14,DET,indoor_cond,False,Southeastern (SEC),DET,7.285714
6,4929&DP-0350,4929,2018,10,DP-0350,DeVante,Parker,DeVante Parker,MIA,WR,...,47.5,10.0,12,31,MIA,fair_cond,True,Atlantic Coast (ACC),GB,4.300000
7,918&BS-1700,918,2003,10,BS-1700,Bobby,Shaw,Bobby Shaw,BUF,WR,...,37.0,4.0,6,10,BUF,fair_cond,True,Pacific 12,DAL,0.800000
8,3599&AR-1100,3599,2013,10,AR-1100,Andre,Roberts,Andre Roberts,ARI,WR,...,41.5,3.0,24,27,HOU,indoor_cond,False,Other,HOU,13.200000
9,1728&TW-0400,1728,2006,10,TW-0400,Troy,Walters,Troy Walters,ARI,WR,...,43.5,-7.0,27,10,ARI,fair_cond,True,Pacific 12,DAL,0.114286
