In [1]:
import pandas as pd
import numpy as np
import os

### Read in the data

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/predictor/player_offense.csv')

In [3]:
#convert column types to appropriate data types
df = df.astype({'seas': 'category',
          'wk': 'category',
          'team': 'category',
          'pos1': 'category',
          'dv': 'category',
          'seas.1': 'category'})

In [4]:
#convert dob to datetime
#df['dob'] = pd.to_datetime(df['dob'])

In [5]:
#rename seas.1 columns
df.rename(columns={'seas.1': 'exp'}, inplace=True)

### Cumulative Stats

In [6]:
df_sorted = df.sort_values(by=['player', 'gid'], ascending=True)

In [7]:
df_sorted.loc[:, 'pa':'tdrec'].cumsum().head()

Unnamed: 0,pa,pc,py,ints,tdp,ra,sra,ry,tdr,fuml,trg,rec,recy,tdrec
74708,0,0,0,0,0,7,4,50,1,0,4,4,44,0
74935,0,0,0,0,0,13,7,59,1,0,5,5,53,0
75490,0,0,0,0,0,21,12,82,1,0,10,7,72,1
75814,0,0,0,0,0,34,17,115,1,0,15,9,83,1
76021,0,0,0,0,0,40,20,131,1,1,17,10,92,1


In [8]:
#initialize a list to hold all cumulative stats
cusum_stats = []

In [9]:
#loop through all unique players and calculate cumulative stats
for i in df_sorted['player'].unique():
    x = df_sorted[df_sorted['player'] == i].loc[:, 'pa':'tdrec'].cumsum()
    cusum_stats.append(x)

In [10]:
#check if cusum stats is same length as original dataframe
cusum_df = pd.concat(cusum_stats, axis=0)
cusum_df.shape

(94759, 14)

Proof that this method is calculating correctly (lengths match)

In [11]:
#rename columns to add a cusum prefix
cusum_df = cusum_df.add_prefix('cusum_')

In [12]:
#add the cusum data to the sorted data frame
df_sorted_cusum = pd.concat([df_sorted, cusum_df], axis=1)

#### 4-game rolling

In [47]:
#initialize a list to hold each players rolling sum
recent_stats = []

In [50]:
#loop thru each unique player calculating rolling sum
for i in df_sorted['player'].unique():
    x = df_sorted[df_sorted['player'] == i].loc[:, 'pa':'tdrec'].rolling(window=4, min_periods=1).sum()
    recent_stats.append(x)

In [52]:
recent_df = pd.concat(recent_stats, axis=0)
recent_df.shape

(94827, 14)

### Add other features

In [188]:
df_sorted_cusum.columns

Index(['pk', 'gid', 'seas', 'wk', 'player', 'team', 'pos1', 'pa', 'pc', 'py',
       'ints', 'tdp', 'ra', 'sra', 'ry', 'tdr', 'fuml', 'trg', 'rec', 'recy',
       'tdrec', 'exp', 'height', 'weight', 'dob', 'dv', 'forty', 'bench',
       'vertical', 'broad', 'shuttle', 'cone', 'arm', 'hand', 'cusum_pa',
       'cusum_pc', 'cusum_py', 'cusum_ints', 'cusum_tdp', 'cusum_ra',
       'cusum_sra', 'cusum_ry', 'cusum_tdr', 'cusum_fuml', 'cusum_trg',
       'cusum_rec', 'cusum_recy', 'cusum_tdrec', 'age', 'career_comp_pct'],
      dtype='object')

#### Age

In [176]:
#strip out the year from DOB
dob_year = df_sorted_cusum['dob'].apply(lambda x: int(str(x[-4:])))

In [181]:
#add age to dataframe
df_sorted_cusum['age'] = df_sorted_cusum['seas'].astype(int) - dob_year

#### Passing summaries

In [187]:
#Passing completion
df_sorted_cusum['career_comp_pct'] = df_sorted_cusum['cusum_pc'] / df_sorted_cusum['cusum_pa']

In [190]:
#touchdown to interception ratio
df_sorted_cusum['career_td_to_int'] = df_sorted_cusum['cusum_tdp'] / df_sorted_cusum['cusum_ints']

In [191]:
#Passing yards per completion
df_sorted_cusum['career_yds_per_comp'] = df_sorted_cusum['cusum_py'] / df_sorted_cusum['cusum_pc']

#### Running summaries

In [192]:
#Yards per rush
df_sorted_cusum['career_ryds_per_carry'] = df_sorted_cusum['cusum_ry'] / df_sorted_cusum['cusum_ra']

In [195]:
#Carries to touchdown ratio
df_sorted_cusum['career_carry_to_td'] = df_sorted_cusum['cusum_ra'] / df_sorted_cusum['tdr']

In [196]:
#Carries to fumbles lost ratio
df_sorted_cusum['career_carry_to_fuml'] = df_sorted_cusum['cusum_ra'] / df_sorted_cusum['cusum_fuml']

## Game features

#### Read in data

In [198]:
game = pd.read_csv('https://raw.githubusercontent.com/jchristo12/fantasy_football/master/data/predictor/game.csv')

In [202]:
#change the data types to the correct format
game = game.astype({'v': 'category',
                   'h': 'category',
                   'day': 'category',
                   'cond': 'category',
                   'stad': 'category',
                   'wdir': 'category',
                   'surf': 'category'})

### Feature creation

In [212]:
#add underdog category
game['udog'] = pd.Series(np.where(game['sprv'] > 0, game['v'], game['h']))
game = game.astype({'udog': 'category'})

In [214]:
#combine game and offensive player data
test = df_sorted_cusum.join(game, on='gid', how='left', lsuffix='_poff', rsuffix='_game')

In [216]:
test.head()

Unnamed: 0,pk,gid_poff,seas,wk,player,team,pos1,pa,pc,py,...,stad,temp,wdir,surf,humd,ou,sprv,ptsv,ptsh,udog
74708,3999&AA-0025,3999,2015,1,AA-0025,DET,RB,0,0,0,...,Raymond James Stadium,84.0,NNE,Grass,66.0,41.0,3.0,42,14,TEN
74935,4010&AA-0025,4010,2015,2,AA-0025,DET,RB,0,0,0,...,Soldier Field,70.0,S,Grass,47.0,46.0,-2.0,48,23,CHI
75490,4036&AA-0025,4036,2015,3,AA-0025,DET,RB,0,0,0,...,Lambeau Field,67.0,N,DD GrassMaster,93.0,49.0,6.5,28,38,KC
75814,4052&AA-0025,4052,2015,4,AA-0025,DET,RB,0,0,0,...,NRG Stadium,,,Grass,,46.0,-3.0,27,20,HOU
76021,4062&AA-0025,4062,2015,5,AA-0025,DET,RB,0,0,0,...,AT&T Stadium,,,AstroTurf,,49.5,-8.5,30,6,DAL


This doesn't look right... will have to dig into this more