# EDA & Model Development 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import random

## Initial Setup

In [2]:
#https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/full_data.csv
df = pd.read_csv('https://github.com/jchristo12/fantasy_football/blob/master/data/full_data.csv?raw=true')

In [3]:
df.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'fname', 'lname', 'full_name',
       'team', 'pos1',
       ...
       'ou', 'sprv', 'ptsv', 'ptsh', 'udog', 'gen_cond', 'udog_binary',
       'gen_dv', 'def_team', 'f_pts'],
      dtype='object', length=171)

In [4]:
#remove rows that have NaN for the shifted variables
df_clean = df[~df.loc[:,'seas_pa':'seas_tdret'].isna().all(axis=1)]

##### We are only focused on offensive players right now. Therefore, we will specifiy the positions we want to retain in the data and remove everyone else

In [5]:
#store positions we are concerned about; will use these to filter out 
pos_of_interest = ['QB', 'RB', 'WR', 'TE']
#filter out positions we don't care about
df_clean2 = df_clean[df_clean['pos1'].isin(pos_of_interest)]

##### Convert each column to the appropriate data type (i.e. make the categorical data categories)

In [8]:
#set the column types
col_dtypes = {'category': ['seas', 'wk', 'pos1', 'team', 'udog', 'v', 'h', 'day', 'stad', 'wdir',
                          'surf', 'gen_cond', 'gen_dv', 'def_team']}
#flip the key and values around so they will work in the argument for 'astype()'
col_dtypes_alt = {old: new for new, old_all in col_dtypes.items() for old in old_all}

In [9]:
df_clean2 = df_clean2.astype(col_dtypes_alt)

##### Segment out the WR data

In [10]:
df_wr = df_clean2.loc[df_clean2['pos1']=='WR', :]

##### Only focus on week 10 data

In [11]:
df_wr10 = df_wr.loc[df_wr['wk'] == 10, :]

## EDA

Only focusing on the WR data right now. Will need to build this so that each action is extendable to the other positions

### Training/Test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
#set the random seed for reproducability
random.seed(837)

In [14]:
#break out the data between training and test
train_wr, test_wr = train_test_split(df_wr10, train_size=0.75, test_size=0.25, shuffle=True)

In [15]:
#shape of the data
print(train_wr.shape, test_wr.shape)

(1625, 171) (542, 171)


### Missing Data analysis

#### Basic analysis

In [16]:
#create a series of percent of missing data
missing_data_pct = train_wr.isna().sum() / train_wr.shape[0]

In [17]:
#list all columns with missing data that is greater than 25%
missing_data_pct[missing_data_pct > 0.25].sort_values(ascending=False)

roll_td_to_int           0.992615
seas_td_to_int           0.990769
roll_yds_per_comp        0.987077
seas_yds_per_comp        0.982769
roll_comp_pct            0.975385
seas_comp_pct            0.963077
last_td_to_int           0.910769
last_yds_per_comp        0.901538
last_comp_pct            0.896615
last_ret_to_td           0.860308
last_avg_ret             0.860308
career_td_to_int         0.850462
career_yds_per_comp      0.838154
career_comp_pct          0.789538
roll_carry_to_td         0.725538
roll_ryds_per_carry      0.724923
roll_ret_to_td           0.675692
roll_avg_ret             0.675077
roll_carry_to_fuml       0.659692
last_ryds_per_carry      0.654769
last_carry_to_td         0.654154
seas_ret_to_td           0.646769
seas_avg_ret             0.646154
last_carry_to_fuml       0.641846
seas_carry_to_td         0.624615
seas_ryds_per_carry      0.624000
career_avg_ret           0.588308
career_ret_to_td         0.588308
seas_carry_to_fuml       0.535385
last_yds_per_r

In [18]:
#columns with missing values but less than or equal to 25%
impute_cols = missing_data_pct[(missing_data_pct <= 0.25) & (missing_data_pct > 0)].sort_values(ascending=False)
impute_cols

career_carry_to_fuml    0.230154
temp                    0.145846
career_rec_to_td        0.136000
career_yds_per_rec      0.132308
last_pa                 0.123692
last_ra                 0.123692
last_rety               0.123692
last_ret                0.123692
last_tdrec              0.123692
last_recy               0.123692
last_rec                0.123692
last_trg                0.123692
last_fuml               0.123692
last_tdr                0.123692
last_ry                 0.123692
last_sra                0.123692
last_py                 0.123692
last_tdp                0.123692
last_ints               0.123692
last_pc                 0.123692
last_tdret              0.123692
career_catch_pct        0.120615
roll_yds_per_rec        0.068308
roll_rec_to_td          0.068308
seas_yds_per_rec        0.060923
seas_rec_to_td          0.060923
roll_catch_pct          0.045538
seas_catch_pct          0.040000
career_pa               0.033846
career_sra              0.033846
career_pc 

#### Take action on missing data

##### Drop columns with too many missing values

In [23]:
#drop columns with missing data greater than 25%
#store the column names
missing_cols_del = missing_data_pct[missing_data_pct > 0.25].sort_values(ascending=False).index
#drop the columns and store as new dataframe
train_wr_miss = train_wr.drop(missing_cols_del, axis=1, inplace=False)

In [24]:
#check the shape
train_wr_miss.shape

(1625, 137)

##### Impute the rest of the missing values

In [25]:
from sklearn.impute import SimpleImputer

In [26]:
#Build simple imputers for both numeric and categorical features
numeric_impute = SimpleImputer(missing_values=np.NaN, strategy='median')
cat_impute = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

In [27]:
#create a dataframe of all of the features to impute
missing_values_df = train_wr_miss.drop(train_wr_miss.columns.difference(impute_cols.index), axis=1, inplace=False)

In [28]:
missing_values_df.columns

Index(['last_pa', 'last_pc', 'last_py', 'last_ints', 'last_tdp', 'last_ra',
       'last_sra', 'last_ry', 'last_tdr', 'last_fuml', 'last_trg', 'last_rec',
       'last_recy', 'last_tdrec', 'last_ret', 'last_rety', 'last_tdret',
       'career_pa', 'career_pc', 'career_py', 'career_ints', 'career_tdp',
       'career_ra', 'career_sra', 'career_ry', 'career_tdr', 'career_fuml',
       'career_trg', 'career_rec', 'career_recy', 'career_tdrec', 'career_ret',
       'career_rety', 'career_tdret', 'career_carry_to_fuml',
       'career_catch_pct', 'seas_catch_pct', 'roll_catch_pct',
       'career_yds_per_rec', 'seas_yds_per_rec', 'roll_yds_per_rec',
       'career_rec_to_td', 'seas_rec_to_td', 'roll_rec_to_td', 'temp', 'wdir',
       'humd'],
      dtype='object')

In [38]:
#store the columns that need to use the numeric imputation and the categorical imputation
impute_numeric_col = missing_values_df.select_dtypes(include=np.number).columns
impute_cat_col = missing_values_df.select_dtypes(exclude=np.number).columns

In [80]:
#Boolean if the column has 'inf' values or not
inf_cols = np.isinf(train_wr_miss.loc[:, impute_numeric_col]).any()

In [87]:
#list of columns that have 'inf' values
inf_cols2 = list(train_wr_miss.loc[:, impute_numeric_col].columns.to_series()[inf_cols].values)

In [94]:
train_wr_miss.loc[:, inf_cols2]

Unnamed: 0,career_carry_to_fuml,career_yds_per_rec,career_rec_to_td,seas_rec_to_td,roll_rec_to_td
10142,72.000000,6.645161,inf,25.000000,15.000000
30496,inf,2.500000,inf,inf,inf
4909,,12.000000,inf,inf,inf
50940,,3.000000,inf,7.800000,7.666667
27982,,4.166667,inf,3.250000,2.750000
74615,inf,4.416667,inf,8.000000,15.000000
1122,inf,9.900000,10.000000,inf,inf
86204,0.400000,12.512500,8.727273,1.000000,1.000000
79686,,3.750000,inf,6.500000,6.000000
63917,,9.788235,12.142857,17.250000,37.000000


In [86]:
def tree_impute(df, cols):
    #import necessary packages
    try:
        from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
        from sklearn.impute import SimpleImputer
        import pandas as pd
    except:
        print('Necessary packages are not installed. Please install:')
        print('scikit-learn & pandas')
    
    #initialize a list to store imputed features
    imputed_features = []
    
    #set up the dataframe to handle categorical features in Decision Trees
        #store the categorical and bool columns as OneHotEncoded dummies

    
    #loop thru all features that have missing values
    for c in cols:
        #store the column datatype
        data_type = df[c].dtype
        #store the target variable separate
        target = df[c].values
        
        
        #store the features separate
        features_df = df.drop(c, axis=1)
        #create one hot encoded dummy variables
        dummies = pd.get_dummies(features_df.select_dtypes(include=['category', 'bool']), drop_first=True)
        #drop original columns (as well as strings) and concate the dummies
        interim_df = features_df.drop(features_df.select_dtypes(include=['category', 'bool']), axis=1).select_dtypes(exclude='object')
        new_df = pd.concat([interim_df, dummies], axis=1)
        #convert to values
        features = new_df.values
        
        
        
        #perform the imputation
        if (data_type == 'category') | (data_type == 'bool'):
            #Decision tree for categorical variable
            Ctree = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=5, min_samples_leaf=3)
            Ctree.fit(features, target)
            output = Ctree.predict(target)
        else:
            try:
                #Decision tree for regression variables
                Rtree = DecisionTreeRegressor(criterion='mse', random_state=100, max_depth=5, min_samples_leaf=3)
                Rtree.fit(features, target)
                output = Rtree.predict(target)
            except:
                #use the average value to impute missing values
                print('Decision tree could not be used for imputation')
                print('Average value used')
                mean_impute = SimpleImputer(strategy='mean')
                output = mean_impute.fit_transform(target)
        
        #store the missing values
        result = pd.Series(output).rename('imp_' + c)
        imputed_features.append(result)
    
    #concat all series into a dataframe
    final_df = pd.concat(imputed_features, axis=1)
    
    return final_df

In [19]:
#store the columns with missing data
low_missing_cols = missing_data_pct[(missing_data_pct <= 0.25) & (missing_data_pct > 0)].index

Testing

In [43]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.impute import SimpleImputer
import pandas as pd

In [44]:
low_missing_cols

Index(['dv', 'last_pa', 'last_pc', 'last_py', 'last_ints', 'last_tdp',
       'last_ra', 'last_sra', 'last_ry', 'last_tdr', 'last_fuml', 'last_trg',
       'last_rec', 'last_recy', 'last_tdrec', 'last_ret', 'last_rety',
       'last_tdret', 'career_pa', 'career_pc', 'career_py', 'career_ints',
       'career_tdp', 'career_ra', 'career_sra', 'career_ry', 'career_tdr',
       'career_fuml', 'career_trg', 'career_rec', 'career_recy',
       'career_tdrec', 'career_ret', 'career_rety', 'career_tdret',
       'career_carry_to_fuml', 'career_catch_pct', 'seas_catch_pct',
       'roll_catch_pct', 'career_yds_per_rec', 'seas_yds_per_rec',
       'roll_yds_per_rec', 'career_rec_to_td', 'seas_rec_to_td',
       'roll_rec_to_td', 'cond', 'temp', 'wdir', 'humd'],
      dtype='object')

In [45]:
#store the argments for testing
c = low_missing_cols[0]
df = train_wr_miss

In [46]:
#store the column type
col_data_type = df[c].dtype

In [47]:
#create dummy variables
#dummies = pd.get_dummies(df.select_dtypes(include=['category', 'bool']), drop_first=True)

In [30]:
#drop original columns (as well as strings) and concate the dummies
interim_df = df.drop(df.select_dtypes(include=['category', 'bool']), axis=1).select_dtypes(exclude='object')
new_df = pd.concat([interim_df, dummies], axis=1)
#drop identifier columns
new_df = new_df.drop(['gid', 'nflid'], axis=1)
#drop ultimate response variable
new_df = new_df.drop('f_pts', axis=1)

In [31]:
for i in new_df.columns: print(i)

pa
pc
py
ints
tdp
ra
sra
ry
tdr
fuml
trg
rec
recy
tdrec
ret
rety
tdret
exp
height
weight
forty
bench
vertical
broad
shuttle
cone
arm
hand
seas_pa
seas_pc
seas_py
seas_ints
seas_tdp
seas_ra
seas_sra
seas_ry
seas_tdr
seas_fuml
seas_trg
seas_rec
seas_recy
seas_tdrec
seas_ret
seas_rety
seas_tdret
last_pa
last_pc
last_py
last_ints
last_tdp
last_ra
last_sra
last_ry
last_tdr
last_fuml
last_trg
last_rec
last_recy
last_tdrec
last_ret
last_rety
last_tdret
career_pa
career_pc
career_py
career_ints
career_tdp
career_ra
career_sra
career_ry
career_tdr
career_fuml
career_trg
career_rec
career_recy
career_tdrec
career_ret
career_rety
career_tdret
recent_fuml
recent_ints
recent_pa
recent_pc
recent_py
recent_ra
recent_rec
recent_recy
recent_ret
recent_rety
recent_ry
recent_sra
recent_tdp
recent_tdr
recent_tdrec
recent_tdret
recent_trg
career_carry_to_fuml
career_catch_pct
seas_catch_pct
roll_catch_pct
career_yds_per_rec
seas_yds_per_rec
roll_yds_per_rec
career_rec_to_td
seas_rec_to_td
roll_rec_to_td
ag

In [48]:
#store the target
target = df[c].values

In [49]:
#store the features
features_df = df.drop(c, axis=1)

In [50]:
#create one hot encoded dummy variables
dummies = pd.get_dummies(features_df.select_dtypes(include=['category', 'bool']), drop_first=False)

In [51]:
#drop original columns (as well as strings) and concate the dummies
interim_df = features_df.drop(features_df.select_dtypes(include=['category', 'bool']), axis=1).select_dtypes(exclude='object')
new_df = pd.concat([interim_df, dummies], axis=1)

In [52]:
#convert to values
features = new_df.values

In [53]:
#tree classifier
tree = DecisionTreeClassifier(criterion='gini', random_state=212, min_samples_leaf=3)

In [54]:
#fit the tree
tree.fit(features, target)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').