# EDA & Model Development 

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import random

## Initial Setup

In [91]:
#source data: full_data.csv from the jchristo12/fantasy_football repo (master branch)
#equivalent raw link: https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/full_data.csv
data_url = 'https://github.com/jchristo12/fantasy_football/blob/master/data/full_data.csv?raw=true'
df = pd.read_csv(data_url)

In [92]:
#display the column labels of the loaded dataframe
df.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'fname', 'lname', 'full_name',
       'team', 'pos1',
       ...
       'ou', 'sprv', 'ptsv', 'ptsh', 'udog', 'gen_cond', 'udog_binary',
       'gen_dv', 'def_team', 'f_pts'],
      dtype='object', length=171)

In [93]:
#keep only the rows where at least one shifted variable ('seas_pa' through 'seas_tdret') is present,
#i.e. drop the rows where every one of those columns is NaN
has_shifted_values = df.loc[:, 'seas_pa':'seas_tdret'].notna().any(axis=1)
df_clean = df[has_shifted_values]

##### We are only focused on offensive players right now. Therefore, we will specify the positions we want to retain in the data and remove everyone else.

In [94]:
#offensive positions to keep; every other position is filtered out of the data
pos_of_interest = ['QB', 'RB', 'WR', 'TE']
#boolean mask of the rows whose primary position is one of the positions above
is_position_of_interest = df_clean['pos1'].isin(pos_of_interest)
df_clean2 = df_clean[is_position_of_interest]

##### Convert each column to the appropriate data type (i.e. make the categorical data categories)

In [95]:
#target dtype -> columns that should be cast to that dtype
col_dtypes = {
    'category': [
        'seas', 'wk', 'pos1', 'team', 'udog', 'v', 'h', 'day', 'stad', 'wdir',
        'surf', 'gen_cond', 'gen_dv', 'def_team',
    ],
}
#invert the mapping to column -> dtype, which is the form 'astype()' accepts
col_dtypes_alt = dict(
    (column, dtype)
    for dtype, columns in col_dtypes.items()
    for column in columns
)

In [96]:
#cast each column named in col_dtypes_alt to its mapped dtype; df_clean2 is rebound to the converted frame
df_clean2 = df_clean2.astype(col_dtypes_alt)

##### Remove rookies

In [97]:
#rows where 'exp' equals 1 are treated as rookie rows
is_rookie = df_clean2['exp'] == 1
#store dataframe of non-rookies
df_clean2_vet = df_clean2.loc[~is_rookie, :]
#store dataframe of rookie data
df_clean2_rook = df_clean2.loc[is_rookie, :]

##### Segment out the WR data

Non-rookie data only

In [98]:
#keep only the non-rookie rows whose primary position is wide receiver
is_wr = df_clean2_vet['pos1'] == 'WR'
df_wr = df_clean2_vet.loc[is_wr, :]

##### Only focus on week 10 data (the week filter is currently commented out, so all weeks are retained)

In [99]:
#the week-10 filter is commented out, so df_wr10 currently refers to the full df_wr (every week)
df_wr10 = df_wr#.loc[df_wr['wk'] == 10, :]

In [100]:
#(rows, columns) of the WR dataframe used for the rest of the analysis
df_wr10.shape

(28605, 171)

## EDA

This section currently focuses only on the WR data. Each step will need to be built so that it can be extended to the other positions.

### Training/Test split

In [101]:
from sklearn.model_selection import train_test_split

In [102]:
#set the random seeds for reproducibility
#scikit-learn draws from NumPy's global random generator when no random_state is passed, so seeding
#only the stdlib 'random' module does not make the train/test split repeatable; seed both generators
random.seed(837)
np.random.seed(837)

In [103]:
#break out the data between training (75%) and test (25%)
#random_state pins the shuffle so the same rows land in each split on every run,
#independent of the state of any global random generator
train_wr, test_wr = train_test_split(df_wr10, train_size=0.75, test_size=0.25, shuffle=True, random_state=837)

In [104]:
#(rows, columns) of the training split followed by the test split
print(train_wr.shape, test_wr.shape)

(21453, 171) (7152, 171)


### Missing Data analysis

#### Basic analysis

In [105]:
#fraction (0-1) of training rows that are missing, one entry per column
n_train_rows = train_wr.shape[0]
missing_counts = train_wr.isna().sum()
missing_data_pct = missing_counts / n_train_rows

In [106]:
#columns where more than 25% of the training rows are missing, largest share first
over_threshold = missing_data_pct > 0.25
missing_data_pct[over_threshold].sort_values(ascending=False)

Series([], dtype: float64)

In [107]:
#columns that have some missing values, but in no more than 25% of the training rows
has_missing = missing_data_pct > 0
within_limit = missing_data_pct <= 0.25
impute_cols = missing_data_pct[has_missing & within_limit].sort_values(ascending=False)
impute_cols

temp            0.169673
last_pa         0.120076
last_tdr        0.120076
last_ret        0.120076
last_tdrec      0.120076
last_recy       0.120076
last_rec        0.120076
last_trg        0.120076
last_fuml       0.120076
last_ry         0.120076
last_tdret      0.120076
last_sra        0.120076
last_ra         0.120076
last_tdp        0.120076
last_ints       0.120076
last_py         0.120076
last_pc         0.120076
last_rety       0.120076
humd            0.039388
career_pa       0.033422
career_trg      0.033422
career_tdret    0.033422
career_rety     0.033422
career_ret      0.033422
career_tdrec    0.033422
career_recy     0.033422
career_rec      0.033422
career_fuml     0.033422
career_tdr      0.033422
career_ry       0.033422
career_sra      0.033422
career_ra       0.033422
career_tdp      0.033422
career_ints     0.033422
career_py       0.033422
career_pc       0.033422
wdir            0.029693
dtype: float64

#### Take action on missing data

##### Drop columns with too many missing values

In [108]:
#drop columns with missing data greater than 25%
#names of the columns above the threshold, ordered from most to least missing
too_sparse = missing_data_pct[missing_data_pct > 0.25]
missing_cols_del = too_sparse.sort_values(ascending=False).index
#drop returns a new dataframe; train_wr itself is left untouched
train_wr_miss = train_wr.drop(columns=missing_cols_del)

In [109]:
#check the shape; the column count only changes if some column exceeded the 25% threshold
train_wr_miss.shape

(21453, 171)

##### Impute the rest of the missing values

In [110]:
from sklearn.impute import SimpleImputer

In [111]:
#Build simple imputers for both numeric and categorical features
#np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
#numeric features: fill missing values with the column median
numeric_impute = SimpleImputer(missing_values=np.nan, strategy='median')
#categorical features: fill missing values with the most frequent value in the column
cat_impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [112]:
#create a dataframe holding only the features that need imputation
#first collect every column that is NOT in impute_cols, then drop those columns
cols_not_imputed = train_wr_miss.columns.difference(impute_cols.index)
missing_values_df = train_wr_miss.drop(columns=cols_not_imputed)

In [113]:
#display the columns selected for imputation
missing_values_df.columns

Index(['last_pa', 'last_pc', 'last_py', 'last_ints', 'last_tdp', 'last_ra',
       'last_sra', 'last_ry', 'last_tdr', 'last_fuml', 'last_trg', 'last_rec',
       'last_recy', 'last_tdrec', 'last_ret', 'last_rety', 'last_tdret',
       'career_pa', 'career_pc', 'career_py', 'career_ints', 'career_tdp',
       'career_ra', 'career_sra', 'career_ry', 'career_tdr', 'career_fuml',
       'career_trg', 'career_rec', 'career_recy', 'career_tdrec', 'career_ret',
       'career_rety', 'career_tdret', 'temp', 'wdir', 'humd'],
      dtype='object')

In [114]:
#split the features to impute by dtype: numeric columns use the numeric imputer,
#every other column uses the categorical imputer
numeric_features = missing_values_df.select_dtypes(include=np.number)
non_numeric_features = missing_values_df.select_dtypes(exclude=np.number)
impute_numeric_col = numeric_features.columns
impute_cat_col = non_numeric_features.columns

The code below attempts to handle infinity values. This is not an issue right now, but it is something to be aware of.

In [115]:
#Boolean per numeric column: whether the column has any 'inf' values (disabled; kept for reference)
#inf_cols = np.isinf(train_wr_miss.loc[:, impute_numeric_col]).any()

In [116]:
#list of the column names that inf_cols flags as having 'inf' values (disabled; kept for reference)
#inf_cols2 = list(train_wr_miss.loc[:, impute_numeric_col].columns.to_series()[inf_cols].values)

Continue with simple imputation

In [117]:
#impute numeric columns
#fit_transform returns a bare array, so the column labels and the row index are re-attached here;
#without index=... the result would get a fresh 0..n-1 index that no longer lines up with the
#shuffled row labels of train_wr_miss when the two are joined or compared
numeric_values = numeric_impute.fit_transform(train_wr_miss.loc[:, impute_numeric_col])
imputed_numeric_df = pd.DataFrame(numeric_values, columns=impute_numeric_col, index=train_wr_miss.index)

In [118]:
#impute categorical columns
#fit_transform returns a bare array, so the column labels and the row index are re-attached here;
#without index=... the result would get a fresh 0..n-1 index that no longer lines up with the
#shuffled row labels of train_wr_miss when the two are joined or compared
cat_values = cat_impute.fit_transform(train_wr_miss.loc[:, impute_cat_col])
imputed_cat_df = pd.DataFrame(cat_values, columns=impute_cat_col, index=train_wr_miss.index)

Some analysis on the missing values

In [120]:
#export the training rows whose 'last_pa' value is missing so they can be inspected outside the notebook
#NOTE(review): the destination is an absolute path on one machine; this cell will fail wherever
#that folder does not exist - consider a path relative to the project
rows_missing_last_pa = train_wr_miss[train_wr_miss['last_pa'].isna()]
rows_missing_last_pa.to_csv('C:/Users/Joe/Desktop/missing.csv', index=False)