# EDA & Model Development 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import random

## Initial Setup

In [2]:
#alternate link kept from the original notebook:
#https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/full_data.csv
data_url = 'https://github.com/jchristo12/fantasy_football/blob/master/data/full_data.csv?raw=true'
#read the full data set straight from the URL into a DataFrame
df = pd.read_csv(data_url)

In [3]:
#display the column labels of the loaded frame
df.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'fname', 'lname', 'full_name',
       'team', 'pos1',
       ...
       'humd', 'ou', 'sprv', 'ptsv', 'ptsh', 'udog', 'gen_cond', 'udog_binary',
       'def_team', 'f_pts'],
      dtype='object', length=161)

In [4]:
#flag missing values across the shifted variables (columns 'seas_pa' through 'seas_tdret')
shifted_missing = df.loc[:, 'seas_pa':'seas_tdret'].isna()
#keep a row unless every one of those columns is NaN
rows_to_keep = ~shifted_missing.all(axis=1)
df_clean = df.loc[rows_to_keep]

##### We are only focused on offensive players right now. Therefore, we will specify the positions we want to retain in the data and remove everyone else.

In [5]:
#positions we are concerned about; every other position is filtered out below
pos_of_interest = ['QB', 'RB', 'WR', 'TE']
#boolean mask: True where the row's 'pos1' is one of the positions of interest
is_pos_of_interest = df_clean['pos1'].isin(pos_of_interest)
#keep only the matching rows
df_clean2 = df_clean.loc[is_pos_of_interest]

##### Convert each column to the appropriate data type (i.e., cast the categorical columns to the `category` dtype)

In [6]:
#columns grouped under the dtype they should be cast to
col_dtypes = {
    'category': [
        'seas', 'wk', 'pos1', 'team', 'udog', 'dv', 'v', 'h', 'day', 'cond', 'stad', 'wdir',
        'surf', 'gen_cond', 'def_team',
    ],
}
#invert the mapping to {column: dtype}, the form used as the argument to 'astype()'
col_dtypes_alt = {}
for dtype_name, dtype_columns in col_dtypes.items():
    col_dtypes_alt.update(dict.fromkeys(dtype_columns, dtype_name))

In [7]:
#cast each column named in col_dtypes_alt to its mapped dtype; the converted frame is rebound to the same name
df_clean2 = df_clean2.astype(dtype=col_dtypes_alt)

##### Segment out the WR data

In [8]:
#boolean mask: True for rows whose 'pos1' is 'WR'
is_wr = df_clean2['pos1'] == 'WR'
#keep all columns for the WR rows only
df_wr = df_clean2.loc[is_wr]

## EDA

We are only focusing on the WR data right now. Each step will eventually need to be built so that it is extendable to the other positions.

### Training/Test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
#set the random seeds for reproducibility
#seed the standard-library generator
random.seed(837)
#also seed NumPy's global generator: scikit-learn draws from it when no random_state is passed,
#so seeding only the 'random' module would leave the train/test split unseeded
np.random.seed(837)

In [11]:
#break out the data between training (70%) and test (30%)
#pin random_state so the shuffle is identical on every run; without it scikit-learn falls back to
#NumPy's global random state, which random.seed() does not control
train_wr, test_wr = train_test_split(df_wr, train_size=0.70, test_size=0.30, shuffle=True, random_state=837)

### Missing Data analysis

In [12]:
#number of rows in the training set
n_train_rows = train_wr.shape[0]
#count of missing values in each column
missing_counts = train_wr.isna().sum()
#series of the share of missing data per column (a fraction between 0 and 1, not 0-100)
missing_data_pct = missing_counts / n_train_rows

In [13]:
#columns where more than 25% of the training rows are missing, largest share first
high_missing = missing_data_pct > 0.25
missing_data_pct.loc[high_missing].sort_values(ascending=False)

roll_td_to_int           0.994700
seas_td_to_int           0.991238
roll_yds_per_comp        0.990640
seas_yds_per_comp        0.983246
roll_comp_pct            0.978202
seas_comp_pct            0.963884
career_td_to_int         0.851477
career_yds_per_comp      0.834039
career_comp_pct          0.787024
roll_ryds_per_carry      0.745480
roll_carry_to_td         0.744925
roll_ret_to_td           0.683421
roll_avg_ret             0.683293
roll_carry_to_fuml       0.678933
seas_ryds_per_carry      0.665427
seas_carry_to_td         0.664530
seas_ret_to_td           0.652007
seas_avg_ret             0.651793
career_ret_to_td         0.599308
career_avg_ret           0.599179
seas_carry_to_fuml       0.574732
career_ryds_per_carry    0.318331
career_carry_to_td       0.317733
humd                     0.274138
wdir                     0.264863
dtype: float64

In [15]:
#columns whose missing share is non-zero, in their original column order
has_missing = missing_data_pct != 0
missing_data_pct.loc[has_missing]

dv                       0.080737
last_pa                  0.122879
last_pc                  0.122879
last_py                  0.122879
last_ints                0.122879
last_tdp                 0.122879
last_ra                  0.122879
last_sra                 0.122879
last_ry                  0.122879
last_tdr                 0.122879
last_fuml                0.122879
last_trg                 0.122879
last_rec                 0.122879
last_recy                0.122879
last_tdrec               0.122879
last_ret                 0.122879
last_rety                0.122879
last_tdret               0.122879
career_pa                0.034278
career_pc                0.034278
career_py                0.034278
career_ints              0.034278
career_tdp               0.034278
career_ra                0.034278
career_sra               0.034278
career_ry                0.034278
career_tdr               0.034278
career_fuml              0.034278
career_trg               0.034278
career_rec    