# EDA & Model Development 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import random

## Initial Setup

In [2]:
#load the full dataset directly from the GitHub repository
#alternate url kept for reference:
#https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/full_data.csv
df = pd.read_csv(
    'https://github.com/jchristo12/fantasy_football/blob/master/data/full_data.csv?raw=true'
)

In [3]:
#list the column labels of the loaded dataframe
df.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'fname', 'lname', 'full_name',
       'team', 'pos1',
       ...
       'humd', 'ou', 'sprv', 'ptsv', 'ptsh', 'udog', 'gen_cond', 'udog_binary',
       'def_team', 'f_pts'],
      dtype='object', length=161)

In [4]:
#keep only the rows that have at least one non-missing value among the shifted
#variables (columns 'seas_pa' through 'seas_tdret'); rows where all are NaN are removed
df_clean = df.loc[df.loc[:, 'seas_pa':'seas_tdret'].notna().any(axis=1)]

##### We are only focused on offensive players right now. Therefore, we will specify the positions we want to retain in the data and remove everyone else.

In [5]:
#offensive positions to retain
pos_of_interest = ['QB', 'RB', 'WR', 'TE']
#keep only the rows whose primary position is one of the positions above
df_clean2 = df_clean.loc[df_clean['pos1'].isin(pos_of_interest)]

##### Convert each column to the appropriate data type (i.e. make the categorical data categories)

In [6]:
#target dtype -> columns that should be stored with that dtype
col_dtypes = {
    'category': [
        'seas', 'wk', 'pos1', 'team', 'udog', 'dv', 'v', 'h', 'day', 'cond',
        'stad', 'wdir', 'surf', 'gen_cond', 'def_team',
    ],
}
#invert the mapping to column -> dtype, which is the shape 'astype()' expects
col_dtypes_alt = dict(
    (column, dtype_name)
    for dtype_name, columns in col_dtypes.items()
    for column in columns
)

In [7]:
#cast each column named in col_dtypes_alt to its mapped dtype ('category')
#NOTE(review): this rebinds df_clean2 to its own converted copy instead of creating a new name
df_clean2 = df_clean2.astype(col_dtypes_alt)

##### Segment out the WR data

In [8]:
#rows whose primary position is wide receiver (all columns kept)
df_wr = df_clean2[df_clean2['pos1'].eq('WR')]

##### Only focus on week 10 data

In [9]:
#wide receiver rows from week 10 only (all columns kept)
df_wr10 = df_wr[df_wr['wk'].eq(10)]

## EDA

For now the analysis focuses only on the WR data. Each step will need to be built so that it can be extended to the other positions.

### Training/Test split

In [12]:
from sklearn.model_selection import train_test_split

In [31]:
#set the random seeds for reproducibility
#scikit-learn draws from NumPy's global random state (not Python's `random` module)
#when no random_state is given, so both generators are seeded here
random.seed(837)
np.random.seed(837)

In [32]:
#break out the data between training and test (75% / 25%)
#random_state pins the shuffle so the same split is produced on every run
train_wr, test_wr = train_test_split(df_wr10, train_size=0.75, test_size=0.25, shuffle=True, random_state=837)

In [33]:
#(rows, columns) of the training set followed by the test set
print(f'{train_wr.shape} {test_wr.shape}')

(1625, 161) (542, 161)


### Missing Data analysis

#### Basic analysis

In [34]:
#fraction of training rows that are missing, per column
missing_data_pct = train_wr.isna().sum().div(len(train_wr))

In [35]:
#columns where more than 25% of the values are missing, worst first
missing_data_pct.loc[missing_data_pct.gt(0.25)].sort_values(ascending=False)

roll_td_to_int           0.992000
seas_td_to_int           0.990154
roll_yds_per_comp        0.985846
seas_yds_per_comp        0.980923
roll_comp_pct            0.972923
seas_comp_pct            0.960000
career_td_to_int         0.857846
career_yds_per_comp      0.846769
career_comp_pct          0.798769
roll_carry_to_td         0.737231
roll_ryds_per_carry      0.736615
roll_avg_ret             0.685538
roll_ret_to_td           0.685538
roll_carry_to_fuml       0.662769
seas_ret_to_td           0.653538
seas_avg_ret             0.653538
seas_carry_to_td         0.633231
seas_ryds_per_carry      0.632615
career_avg_ret           0.600000
career_ret_to_td         0.600000
seas_carry_to_fuml       0.533538
career_ryds_per_carry    0.328615
career_carry_to_td       0.327385
dtype: float64

In [45]:
#columns that have missing values, but no more than 25% of them, worst first
missing_data_pct.loc[missing_data_pct.gt(0) & missing_data_pct.le(0.25)].sort_values(ascending=False)

career_carry_to_fuml    0.246154
humd                    0.230154
wdir                    0.217846
temp                    0.140923
career_rec_to_td        0.136615
career_yds_per_rec      0.132923
last_tdr                0.131692
last_rety               0.131692
last_ret                0.131692
last_tdrec              0.131692
last_recy               0.131692
last_rec                0.131692
last_trg                0.131692
last_fuml               0.131692
last_tdret              0.131692
last_ry                 0.131692
last_sra                0.131692
last_ra                 0.131692
last_tdp                0.131692
last_ints               0.131692
last_py                 0.131692
last_pc                 0.131692
last_pa                 0.131692
career_catch_pct        0.125538
dv                      0.074462
roll_rec_to_td          0.072000
roll_yds_per_rec        0.072000
seas_rec_to_td          0.062154
seas_yds_per_rec        0.062154
roll_catch_pct          0.048000
seas_catch

#### Take action on missing data

##### Drop columns with too many missing values

In [36]:
#names of the columns where more than 25% of the values are missing
missing_cols_del = (
    missing_data_pct.loc[missing_data_pct.gt(0.25)]
    .sort_values(ascending=False)
    .index
)
#new dataframe without those columns; train_wr itself is left untouched
train_wr_miss = train_wr.drop(columns=missing_cols_del)

In [37]:
#check the shape of the training data after the high-missing columns were dropped
train_wr_miss.shape

##### Impute the rest of the missing values

In [85]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.impute import SimpleImputer

In [None]:
def _encode_features(frame):
    """Return a float-only, NaN-free copy of ``frame`` that scikit-learn trees can consume.

    Numeric and boolean columns are cast to float; every other column (strings,
    categories, ...) is replaced by its integer category codes. Remaining gaps are
    filled with the column median, or with 0 when a column has no observed value.
    """
    encoded = {}
    for name in frame.columns:
        col = frame[name]
        if pd.api.types.is_numeric_dtype(col.dtype):
            encoded[name] = col.astype(float)
        else:
            #missing entries get code -1, i.e. their own "unknown" level
            encoded[name] = col.astype('category').cat.codes.astype(float)
    numeric = pd.DataFrame(encoded, index=frame.index)
    return numeric.fillna(numeric.median()).fillna(0.0)


def tree_impute(df, cols):
    """Fill the missing values of the given columns using decision trees.

    For every column in ``cols`` a tree is fit on the rows where that column is
    observed, using all other columns of ``df`` (numerically encoded) as features,
    and is then used to predict the rows where the column is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the columns to impute and the predictor columns.
    cols : iterable
        Labels of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols``, named ``'imp_<col>'`` and sharing the
        index of ``df``. Observed values are kept as-is and only the missing
        entries are replaced. A column without missing values is returned
        unchanged; a column with no observed value at all is left missing
        because there is nothing to learn from.

    Notes
    -----
    Category, boolean and object (string) columns are imputed with a
    ``DecisionTreeClassifier``; everything else uses a ``DecisionTreeRegressor``.
    If the regressor cannot be fit (``ValueError``/``TypeError``) the column mean
    is used instead and a message is printed.
    """
    cols = list(cols)
    #nothing to impute: pd.concat would raise on an empty list
    if not cols:
        return pd.DataFrame(index=df.index)

    #encode once up front; each target column is dropped from the copy inside the loop
    encoded = _encode_features(df)

    #initialize a list to store imputed features
    imputed_features = []

    #loop thru all features that have missing values
    for c in cols:
        column = df[c]
        missing = column.isna().to_numpy()
        observed = ~missing
        filled = column.copy()

        #a model can only be trained when there are both gaps and observed values
        if missing.any() and observed.any():
            feature_matrix = encoded.drop(columns=c).to_numpy()
            train_features = feature_matrix[observed]
            missing_features = feature_matrix[missing]

            #store the column datatype
            data_type = column.dtype
            is_categorical = (
                isinstance(data_type, pd.CategoricalDtype)
                or pd.api.types.is_bool_dtype(data_type)
                or pd.api.types.is_object_dtype(data_type)
            )

            #perform the imputation
            if is_categorical:
                #Decision tree for categorical variable
                #labels are passed as integer codes so any label type is accepted
                labels = column[observed].astype('category')
                Ctree = DecisionTreeClassifier(criterion='gini', random_state=100, min_samples_leaf=3)
                Ctree.fit(train_features, labels.cat.codes.to_numpy())
                predicted_codes = Ctree.predict(missing_features).astype(int)
                filled.loc[missing] = labels.cat.categories.take(predicted_codes).to_numpy()
            else:
                try:
                    #Decision tree for regression variables
                    #the default criterion (squared error) is used because its name
                    #differs between scikit-learn versions
                    Rtree = DecisionTreeRegressor(random_state=100, min_samples_leaf=3)
                    Rtree.fit(train_features, column[observed].to_numpy(dtype=float))
                    predictions = Rtree.predict(missing_features)
                except (ValueError, TypeError):
                    #use the average value to impute missing values
                    print('Decision tree could not be used for imputation')
                    print('Average value used')
                    mean_impute = SimpleImputer(strategy='mean')
                    #SimpleImputer expects a 2-D array, hence the reshape
                    values = mean_impute.fit_transform(column.to_numpy(dtype=float).reshape(-1, 1))
                    filled = pd.Series(values.ravel(), index=df.index)
                else:
                    filled.loc[missing] = predictions

        #store the imputed column, keeping the original index so rows stay aligned
        imputed_features.append(filled.rename(f'imp_{c}'))

    #concat all series into a dataframe
    final_df = pd.concat(imputed_features, axis=1)

    return final_df

In [46]:
#labels of the columns that have missing values, but no more than 25% of them
low_missing_cols = missing_data_pct.loc[missing_data_pct.gt(0) & missing_data_pct.le(0.25)].index

Testing

In [61]:
#store the arguments for testing
#c is the first of the columns that still have some missing values
c = low_missing_cols[0]
#NOTE(review): this rebinds `df`, which held the raw csv data since the load cell,
#to the training frame; re-running earlier cells afterwards will see the wrong data
df = train_wr_miss

In [63]:
#store the column type of the column being imputed
col_data_type = df[c].dtype

In [82]:
#store the target (values of the column being imputed)
#NOTE(review): c was taken from the columns with missing data, so target still contains missing entries
target = df[c].values

In [76]:
#predictor matrix: every column from 'team' onward, excluding the target column itself
features = df.drop(columns=c).loc[:, 'team':].to_numpy()

In [79]:
#decision tree classifier used for the imputation experiment
tree = DecisionTreeClassifier(
    min_samples_leaf=3,
    random_state=212,
    criterion='gini',
)

In [80]:
#fit the tree
#NOTE(review): this call raises the ValueError recorded below -- features still holds raw
#string values (e.g. 'NE'), which scikit-learn cannot convert to float; the features need to
#be numerically encoded (and missing values handled) before fitting
fit = tree.fit(features, target)

ValueError: could not convert string to float: 'NE'