In [2]:
from collections import defaultdict
from glob import glob

import numpy as np
import pandas as pd
import patsy
import re

# Missing-value marker for numeric columns.
NAN = float("nan")

# Path templates; "{}" is replaced with a file stem (or glob pattern) via str.format().
# INPUT_PATH is used when reading player CSVs, OUTPUT_PATH when writing the merged CSVs.
INPUT_PATH = "data/1/{}.csv"
OUTPUT_PATH = "data/2/{}.csv"

1. Ingest player stats, Injury, and Division CSVs.
2. Add next_fp column to player stats.
3. Discard rows for which next_fp can't be determined. 
4. Merge Division and FFToday datasets (add division column to player info). 
5. Transform division column into dummy (0/1 indicator) columns.
6. Add score column for Injury data.
7. Merge player and injury datasets (player name, week, season, injury, status, score).
8. Write combined datasets to output path. 

# Import player data

In [18]:
def player_df_from_files(files):
    """Load player stat CSVs and attach each row's next-game fantasy points.

    For every file the rows are sorted by ``name``, ``season`` and ``week``;
    players with a single row are dropped; ``next_fp`` is set to the ``fp``
    of the same player's following row; rows that have no following row are
    discarded.  ``rush_att``, ``rush_yd`` and ``rush_td`` are added as
    all-NaN columns when a file lacks them, so every frame has them before
    concatenation.

    Note: "next" means the player's next *row* after sorting.  The code does
    not check that this row is the immediately following week or that it
    belongs to the same season.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read.  Each file must provide at least the
        columns ``name``, ``season``, ``week`` and ``fp``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the processed per-file frames.  Row labels are the
        ones each file had when it was read, so they can repeat across files.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``files`` is empty.
    """
    frames = []
    for file in files:
        raw = pd.read_csv(file)

        # get rid of 'Unnamed: 0' column (no-op when the column is absent)
        raw = raw.drop(columns='Unnamed: 0', errors='ignore')

        # group chronologically by player
        ordered = raw.sort_values(['name', 'season', 'week'])

        # remove players that have only 1 row, we can't use these for anything
        len_before = len(ordered)
        rows_per_player = ordered.groupby('name')['name'].transform('size')
        # .copy() gives an independent frame, so the column assignments below
        # never write into a slice of `ordered` (avoids SettingWithCopyWarning)
        usable = ordered[rows_per_player > 1].copy()
        len_lost = len_before - len(usable)
        # a file without rows would otherwise raise ZeroDivisionError here
        lost_share = len_lost / len_before if len_before else 0.0
        print(file, ":", len_before, "rows, {0:.0%}".format(lost_share), "unusable")

        # add column for next week's points: the following row of the same
        # player; the last row of each player gets NaN and is dropped
        usable['next_fp'] = usable.groupby('name')['fp'].shift(-1)
        usable = usable.dropna(subset=['next_fp']).copy()

        # Make sure number of columns is consistent (TE don't have rush data)
        for rush_col in ('rush_att', 'rush_yd', 'rush_td'):
            if rush_col not in usable.columns:
                usable[rush_col] = np.nan
        frames.append(usable)
    return pd.concat(frames)

# Merge functions

In [97]:
# TODO: team name from abbr is a mess

def get_full_team_name(abbr, divisions=None):
    """Return the full team name for a team abbreviation.

    Parameters
    ----------
    abbr : str
        Abbreviation to look up in the ``abbr`` column.
    divisions : pandas.DataFrame, optional
        Lookup table with ``abbr`` and ``team`` columns.  Defaults to the
        notebook-level ``div_df``.

    Returns
    -------
    The ``team`` value of the first row whose ``abbr`` equals ``abbr``.

    Raises
    ------
    IndexError
        If no row matches ``abbr``.
    """
    if divisions is None:
        divisions = div_df
    matches = divisions.loc[divisions['abbr'] == abbr, 'team']
    if matches.empty:
        raise IndexError("unknown team abbreviation: {!r}".format(abbr))
    # .iloc[0] replaces Series.get_values()[0]; get_values() no longer exists
    # in pandas >= 1.0
    return matches.iloc[0]

def merge_divisions(player_df):
    """Attach division indicator columns and full team names to player rows.

    Steps, in order:

    1. Look up each row's ``div`` by joining ``player_df['team']`` against the
       ``abbr`` column of the notebook-level ``div_df``.
    2. Expand ``div`` into indicator columns with ``patsy.dmatrix``.
    3. Replace the abbreviation in ``team`` with the full team name.
    4. Rename the indicator columns to short names (``afc_n``, ``nfc_e``, ...)
       and drop the ``div`` and ``Intercept`` columns.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Player rows with a ``team`` column holding team abbreviations.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 integer index.

    Raises
    ------
    IndexError
        From ``get_full_team_name`` when a value in ``team`` has no match in
        ``div_df``.
    """
    # Add division column.  The index is rebuilt (and the old one discarded)
    # so every row has a unique label; the index-aligned join with the dummy
    # matrix below relies on that.
    division_lookup = div_df[['div', 'abbr']].set_index('abbr')
    player_df = player_df.join(division_lookup, on='team').reset_index(drop=True)

    # Transform division column into dummy matrix
    dummy_df = patsy.dmatrix('div', data=player_df, return_type='dataframe')
    player_df = player_df.join(dummy_df)

    # Translate team abbreviation to the full name: one lookup per distinct
    # abbreviation instead of one per row
    full_names = {abbr: get_full_team_name(abbr)
                  for abbr in player_df['team'].unique()}
    player_df['team'] = player_df['team'].map(full_names)

    # NOTE(review): there is no entry for "div[T.AFC E]" -- presumably that
    # level is the reference category absorbed by the intercept; confirm.
    player_df = player_df.rename(columns={
        "div[T.AFC N]": "afc_n",
        "div[T.AFC W]": "afc_w",
        "div[T.AFC S]": "afc_s",
        "div[T.NFC N]": "nfc_n",
        "div[T.NFC W]": "nfc_w",
        "div[T.NFC S]": "nfc_s",
        "div[T.NFC E]": "nfc_e"
    })

    return player_df.drop(columns=['div', 'Intercept'])

def merge_weather_data(player_df, games=None, how='outer'):
    """Add per-game weather columns to player rows.

    Rows are matched on ``team``, ``season`` and ``week``; the columns
    ``temperature``, ``wind`` and ``precipitation`` are taken from ``games``.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Player rows with ``team``, ``season`` and ``week`` columns.
    games : pandas.DataFrame, optional
        Game table providing the key columns plus the three weather columns.
        Defaults to the notebook-level ``games_df``.
    how : str, default 'outer'
        Join type passed to ``pd.merge``.  With the default, games that match
        no player row are also returned, with NaN in the player columns;
        pass ``'left'`` to keep player rows only.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    if games is None:
        games = games_df
    weather_cols = ['season', 'week', 'team', 'temperature', 'wind', 'precipitation']
    return pd.merge(player_df,
                    games[weather_cols],
                    on=['team', 'season', 'week'],
                    how=how)

def merge_injury_data(player_df, injuries=None, how='outer'):
    """Add the injury ``status_code`` column to player rows.

    Rows are matched on ``name``, ``season`` and ``week``.  A missing
    ``status_code`` after the merge is replaced with 0.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Player rows with ``name``, ``season`` and ``week`` columns.
    injuries : pandas.DataFrame, optional
        Injury table with ``status_code``, ``season``, ``week`` and ``name``
        columns.  Defaults to the notebook-level ``injury_df``.
    how : str, default 'outer'
        Join type passed to ``pd.merge``.  With the default, injury records
        that match no player row are also returned, with NaN in the player
        columns; pass ``'left'`` to keep player rows only.

    Returns
    -------
    pandas.DataFrame
        The merged frame with ``status_code`` free of NaN.
    """
    if injuries is None:
        injuries = injury_df
    # add status_code by joining player and injury on week, season, player
    merged = pd.merge(player_df,
                      injuries[['status_code', 'season', 'week', 'name']],
                      on=['name', 'season', 'week'],
                      how=how)
    # if status code is NaN, then they were not injured
    merged['status_code'] = merged['status_code'].fillna(0)
    return merged

# run everything... 

In [95]:
# Player files
RB_files = glob(INPUT_PATH.format("fftoday*_pos20"))
WR_files = glob(INPUT_PATH.format("fftoday*_pos30"))
TE_files = glob(INPUT_PATH.format("fftoday*_pos40"))
assert len(RB_files) > 0, "where are my files??? path = "
assert len(WR_files) > 0, "where are my files??? path = "
assert len(TE_files) > 0, "where are my files??? path = "

RB_df = player_df_from_files(RB_files)
WR_df = player_df_from_files(WR_files)
TE_df = player_df_from_files(TE_files)
assert len(RB_df) > 500, "RB too small"
assert len(WR_df) > 500, "WR too small"
assert len(TE_df) > 500, "TE too small"

# Injury data -- injury_df
%run ./Prepare_InjuryData.ipynb
assert len(injury_df) > 0, "where is my injury data???"

# Games data -- games_df
%run ./Prepare_GamesData.ipynb
assert len(games_df) > 0, "where is my games/weather data???"

# Division data -- div_df
div_df = pd.read_csv('data/team_divisions.csv')
assert len(div_df) > 0, "where is my divisions data???"

data/1/fftoday_2016-2017_pos20.csv : 1700 rows, 2% unusable
data/1/fftoday_2014-2015_pos20.csv : 1700 rows, 1% unusable
data/1/fftoday_2014-2015_pos30.csv : 1700 rows, 2% unusable
data/1/fftoday_2016-2017_pos30.csv : 1700 rows, 2% unusable
data/1/fftoday_2016-2017_pos40.csv : 1700 rows, 1% unusable
data/1/fftoday_2014-2015_pos40.csv : 1695 rows, 1% unusable


In [98]:
# Deep copies, so the frames loaded in the previous cell stay untouched
RB, WR, TE = (loaded.copy(deep=True) for loaded in (RB_df, WR_df, TE_df))
dfs = []

# make mega DF from player, weather, division, and injury data
for position_df, out_name in ((RB, "RB_2014-2017"),
                              (WR, "WR_2014-2017"),
                              (TE, "TE_2014-2017")):
    merged = merge_injury_data(merge_weather_data(merge_divisions(position_df)))

    # Drop extra columns, then rows lacking either points value
    # (the outer merges above can add rows that have no player stats)
    player_df = (merged
                 .drop(columns=['g', 'fpg'])
                 .dropna(subset=['fp', 'next_fp']))

    dfs.append(player_df)

    # write to csv
    player_df.to_csv(OUTPUT_PATH.format(out_name))

dfs[0].head()

Unnamed: 0,name,team,rush_att,rush_yd,rush_td,rec_target,rec_rec,rec_yd,rec_td,fp,...,afc_s,afc_w,nfc_e,nfc_n,nfc_s,nfc_w,temperature,wind,precipitation,status_code
0,Aaron Jones,Packers,13.0,49.0,1.0,0.0,0.0,0.0,0.0,10.9,...,0.0,0.0,0.0,1.0,0.0,0.0,64,4,0.0,0.0
1,Aaron Jones,Packers,19.0,125.0,1.0,1.0,1.0,9.0,0.0,19.4,...,0.0,0.0,0.0,1.0,0.0,0.0,88,2,0.0,0.0
2,Aaron Jones,Packers,13.0,41.0,0.0,4.0,1.0,1.0,0.0,4.2,...,0.0,0.0,0.0,1.0,0.0,0.0,DOME,0,0.0,0.0
3,Ty Montgomery,Packers,10.0,28.0,0.0,3.0,1.0,3.0,0.0,3.1,...,0.0,0.0,0.0,1.0,0.0,0.0,DOME,0,0.0,1.0
4,Aaron Jones,Packers,17.0,131.0,1.0,5.0,3.0,7.0,0.0,19.8,...,0.0,0.0,0.0,1.0,0.0,0.0,51,5,0.0,0.0


In [103]:
# Seasons of the rows in the first merged frame (RB) that have no precipitation value
test = dfs[0]
test.loc[test['precipitation'].isna(), 'season']

1771    2014
1772    2015
1773    2015
1774    2015
1775    2015
1776    2015
1777    2015
1778    2015
1779    2015
1780    2015
1781    2015
1782    2015
1783    2015
1784    2015
1785    2015
1786    2015
1787    2015
1788    2015
1789    2015
1790    2015
1791    2015
1792    2015
1793    2015
1794    2014
1795    2014
1796    2014
1797    2014
1798    2014
1799    2014
1800    2014
        ... 
3268    2015
3269    2015
3270    2015
3271    2015
3272    2014
3273    2014
3274    2014
3275    2014
3276    2015
3277    2015
3278    2015
3279    2015
3280    2015
3281    2015
3282    2015
3283    2015
3284    2015
3285    2015
3286    2015
3287    2015
3288    2015
3289    2015
3290    2014
3291    2014
3292    2014
3293    2014
3294    2015
3295    2014
3296    2014
3297    2014
Name: season, Length: 1527, dtype: int64