This notebook ingests the data files produced by our feature engineering and data collection steps and aggregates them into a single feature set for each season. The resulting feature files are saved as CSV files in the associated processed_data folder.

In [1]:
import numpy as np
import pandas as pd
import datetime

In [107]:
#Import basic game stats for each NFL game from 2013-2017 seasons
#NOTE: this cell calls weekly_team_stats, which is defined in a later cell of this
#notebook; that cell (and the team_stat_priors cell it relies on) must be run first.

#Per-team stat columns requested from weekly_team_stats ('Key' identifies the game)
cols = ['Key','Pts','FD','Fum','FumL','PY','PA','PI','PS', 
        'PSY', 'RA', 'RY', 'LWHG', 'DaysSLG',
       'qb_relinquished','rb_relinquished','wrte_relinquished',
        'qb_val_lost','rb_val_lost','wrte_val_lost','non_key_relinquished']

#Team codes found in the game files that are looked up under a different code
#in the injury ('abbr') and starters ('Team_ID') files
team_aliases = {'STL': 'LAR', 'SDG': 'LAC'}

injuries = pd.read_csv("n_missed_games_by_annoucement_date.csv", index_col = 0)
injuries['Date'] = pd.to_datetime(injuries['datestring'], format = '%Y%m%d')
priors = pd.read_csv("starters.csv", index_col = 0)

for y in range(2013,2018):
    #Imports and creates initial season game files
    game_temp = pd.read_csv("base_data/nflstats" + str(y) + ".csv")
    #The first HPS/HPSY pair is relabelled as the away team's columns and the
    #'.1' pair becomes the home team's
    game_temp.rename(index = str, columns = {'HPS': 'APS', 'HPSY': 'APSY', 'HPS.1':'HPS', 'HPSY.1':'HPSY'}, inplace = True)
    game_temp['Datetime'] = pd.to_datetime(game_temp['Start'])
    game_temp['Date'] = game_temp['Datetime'].dt.date
    #Game key = kickoff date + home team code
    game_temp['Key'] = game_temp['Datetime'].dt.strftime('%Y%m%d') + game_temp['Home']
    #Spread is the final margin from the home team's point of view
    game_temp['Spread'] = game_temp['HPts'] - game_temp['APts']

    #Filter injury data to the year (15 Aug of season y through 15 Feb of y + 1)
    start_date = datetime.datetime(year = y, month = 8, day = 15)
    end_date = datetime.datetime(year = y + 1, month = 2, day  = 15)
    year_mask = injuries['Date'].between(start_date,end_date)
    year_injuries = injuries[year_mask]
    year_priors = priors.loc[priors['Season'] == y]

    #New base dataframe to join everything onto
    game_final = game_temp[['Key','Datetime','Home','Away','Spread']]
    first_team = True

    #Calculate all the cumulative stats
    teams = game_temp['Home'].unique()
    for t in teams:
        lookup_code = team_aliases.get(t, t)
        team_injuries = year_injuries.loc[year_injuries['abbr'] == lookup_code]
        team_prior = year_priors.loc[year_priors['Team_ID'] == lookup_code]

        team_stats = weekly_team_stats(game_temp, team_injuries, team_prior, t, cols, y)

        #Merge this team's rows twice: as the home side (H-prefixed columns), then
        #as the away side (A-prefixed columns). Only games where the team really
        #was on that side match, because the side column is part of the join key.
        for prefix, side in (('H', 'Home'), ('A', 'Away')):
            right = team_stats.add_prefix(prefix).rename(columns = {prefix + 'Key': 'Key'})
            right[side] = t
            game_final = game_final.merge(right, how = 'left', on = ['Key', side])

        #Take care of dup columns
        if first_team:
            #The first team's columns cannot collide with anything yet
            first_team = False
            continue

        #From the second team on, every stat column collides with the one already
        #present and pandas suffixes the pair with _x/_y. Each game row is filled
        #by at most one of the two, so the row-wise sum (NaN counts as 0)
        #collapses them back into a single column.
        for col in cols:
            if col == 'Key':
                continue
            for prefix in ('A', 'H'):
                dup_cols = [prefix + col + '_x', prefix + col + '_y']
                game_final[prefix + col] = game_final[dup_cols].sum(axis = 1)
                game_final.drop(dup_cols, axis = 1, inplace = True)

    #Timezone Features
    timezones = pd.read_csv("timezone_data/timezones" + str(y) + ".csv")

    Away_timezones = timezones[['Team_Code','Team_Zone','Zone_Value']]
    Away_timezones = Away_timezones.rename(index=str, columns = {"Team_Code":"Away","Team_Zone":"AZone","Zone_Value":"AZoneVal"})
    game_final = game_final.join(Away_timezones.set_index('Away'), on='Away',how = 'left')

    Home_timezones = timezones[['Team_Code','Team_Zone','Zone_Value']]
    Home_timezones = Home_timezones.rename(index=str, columns = {"Team_Code":"Home","Team_Zone":"HZone","Zone_Value":"HZoneVal"})
    game_final = game_final.join(Home_timezones.set_index('Home'), on='Home',how = 'left')

    #Time zone and last week home diffs (home value minus away value)
    game_final = game_final.assign(AZDiff = lambda x: x.HZoneVal - x.AZoneVal)
    game_final = game_final.assign(H_LWGAdv = lambda x: x.HLWHG - x.ALWHG)
    game_final.drop(['HLWHG', 'ALWHG', 'HZone', 'HZoneVal', 'AZone', 'AZoneVal'], axis = 1, inplace = True)

    #Stadium Features (only join on Home)
    stadiums = pd.read_csv("stadiums_data/stadiums" + str(y) + ".csv")
    stadiums = stadiums[['Team_Code','Turf','Grass','Hybrid','Roof_Open','Roof_Fixed','Roof_Retract']]
    stadiums = stadiums.rename(columns={'Team_Code':'Home'})
    game_final = game_final.join(stadiums.set_index('Home'), on = 'Home', how = 'left')

    #One processed feature file per season
    game_final.to_csv('processed_data/'+str(y)+"processed_extra2.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  interactivity=interactivity, compiler=compiler, result=result)


In [106]:
#Takes game stats for each week and breaks them into dataframes of cumulative prior week averages for each team
#This is the data we want to use for prediction
def weekly_team_stats(stats, injuries, prior, team, cols, year):
    """Build one row per game for `team`, holding the stats known BEFORE that game.

    Each row contains the team's running stat averages (seeded by
    team_stat_priors and updated after every game), the number of days since
    the previous game, whether the previous game was at home, and the summed
    injury figures announced since the previous game.

    Parameters
    ----------
    stats : DataFrame of the season's games. Needs 'Home', 'Away', 'Key' and
        'Date' columns plus the H-/A-prefixed per-game stat columns.
    injuries : DataFrame of this team's injury rows. Needs 'datestring',
        'abbr' and 'Date' columns; every other column is summed per game.
    prior : passed straight through to team_stat_priors.
    team : team code as it appears in stats['Home'] / stats['Away'].
    cols : names of the stat columns to track (including 'Key').
    year : season year, passed through to team_stat_priors.

    Returns
    -------
    DataFrame with one row per game of `team`, in the row order of `stats`.
    """
    t_stats = stats.query('Home == @team | Away == @team')

    #Get our "prior" for the season
    avg_stats = team_stat_priors(team, cols, year, prior)
    #The prior is weighted as if three games had already been played
    count = 3

    #Columns of the injury frame that identify a row rather than measure it
    injury_id_cols = ['datestring', 'abbr', 'Date']

    #Rows are collected here and turned into a frame once at the end, instead of
    #growing a DataFrame row by row
    records = []

    for _, week in t_stats.iterrows():
        week = week.to_dict()
        #Normalise both dates to Timestamps so the comparison against the
        #datetime64 'Date' column does not depend on how pandas treats
        #datetime.date objects. Until the first game, 'DaysSLG' holds the
        #start date supplied by the prior.
        game_date = pd.Timestamp(week['Date'])
        last_game_date = pd.Timestamp(avg_stats['DaysSLG'])

        avg_stats['Key'] = week['Key']
        avg_stats['DaysSLG'] = (game_date - last_game_date).days

        #Get Injuries between previous game and current:
        injury_mask = (injuries['Date'] < game_date) & (injuries['Date'] >= last_game_date)
        #drop() returns a new frame here; dropping in place on the .loc slice
        #triggers pandas' SettingWithCopyWarning
        game_injuries = injuries.loc[injury_mask].drop(injury_id_cols, axis = 1)
        if not game_injuries.empty:
            for key, val in game_injuries.sum().items():
                avg_stats[key] = val

        #Add avg stats so far as entry for current week (snapshot, because
        #avg_stats keeps being updated below)
        records.append(dict(avg_stats))

        #Get stats for this week: keep the columns carrying this team's side
        #prefix and strip the prefix; 'Home'/'Away' are team codes, not stats
        prefix = 'H' if team == week['Home'] else 'A'
        w_stats = {k[1:]: v for k, v in week.items()
                   if k.startswith(prefix) and k not in ('Home', 'Away')}

        #Compute cumulative avgs including this week
        count += 1
        weight = 1/count
        for key, val in list(avg_stats.items()):
            if key == 'Key':
                #Placeholder; overwritten with the real key at the next game
                avg_stats[key] = "Bad Key"
            elif key == 'LWHG':
                #1 if the game just played was at home
                avg_stats['LWHG'] = int(prefix == 'H')
            elif key == 'DaysSLG':
                #Temporarily holds the date of the game just played
                avg_stats['DaysSLG'] = week['Date']
            elif '_' in key:
                #Injury figures are per-game, not cumulative: reset them
                avg_stats[key] = 0
            else:
                avg_stats[key] = val*(1-weight) + w_stats[key]*weight

    if not records:
        #No games for this team: keep the empty, correctly-labelled frame
        return pd.DataFrame(columns = cols)
    return pd.DataFrame(records)

In [105]:
#Generates 3 weeks of "prior" data to use in the cumulative stat averaging
#This prior is based on avg data in the training set and the distribution of
#player scores at each position for each team
#Helps to make predictions early in the season; its contribution to the
#average diminishes as the season progresses
def team_stat_priors(team, cols, year, prior):
    """Return the starting ("prior") stat line used to seed a team's running averages.

    Parameters
    ----------
    team : team code; currently unused, kept for interface compatibility.
    cols : names of the stat columns to initialise; each starts at 0.
    year : season year; 'DaysSLG' is initialised to 15 August of that year.
    prior : the team's row(s) from the starters data; currently unused because
        the talent-based adjustments below are disabled. It is deliberately
        not read, so a team with no prior row no longer raises IndexError.

    Returns
    -------
    dict mapping each name in `cols` to 0, with fixed starting values for
    'DaysSLG', 'RY', 'RA', 'PY', 'PA', 'PI' and 'Pts'.
    """
    temp = dict.fromkeys(cols, 0)
    temp.update({
        'DaysSLG': datetime.date(year = year, month = 8, day = 15),
        #Disabled talent adjustments are kept next to each value for reference
        'RY': 115,   #+ rush_talent * 26
        'RA': 29,    #+ 4 * rush_talent
        'PY': 235,   #+ (pass_talent - 2.25) * 18
        'PA': 35,    #+ 3 * (pass_talent - 2.25)
        'PI': 0.5,   #* 1/np.abs(pass_talent + 1)
        'Pts': 23,   #+ 5*(rush_talent + pass_talent - 2)
    })
    return temp

In [62]:
#Summary statistics of the raw 2013 game file, with the duplicated HPS/HPSY
#columns relabelled and the non-stat columns removed
temp = (
    pd.read_csv("base_data/nflstats2013.csv")
    .rename(index = str, columns = {'HPS': 'APS', 'HPSY': 'APSY', 'HPS.1':'HPS', 'HPSY.1':'HPSY'})
    .drop(['Season', 'Week', 'OverUnder', 'VegasLine'], axis = 1)
)
temp.describe()

Unnamed: 0,APts,HPts,AFD,AFum,AFumL,APY,APA,API,APS,APSY,...,HFD,HFum,HFumL,HPY,HPA,HPI,HPS,HPSY,HRA,HRY
count,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,...,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0
mean,21.977528,24.891386,19.337079,1.217228,0.588015,231.696629,35.636704,1.014981,2.636704,17.355805,...,20.595506,1.333333,0.614232,239.842697,35.011236,0.93633,2.385768,15.602996,28.089888,117.681648
std,9.680199,10.582623,4.94468,1.1229,0.747632,75.744536,8.857638,1.058336,1.725266,12.932619,...,5.124141,1.212921,0.75907,83.419841,8.823875,0.996078,1.700581,11.973956,7.972189,52.795817
min,0.0,0.0,6.0,0.0,0.0,89.0,16.0,0.0,0.0,0.0,...,9.0,0.0,0.0,46.0,15.0,0.0,0.0,0.0,9.0,18.0
25%,16.0,19.0,16.0,0.0,0.0,176.5,29.0,0.0,1.0,7.0,...,17.5,0.0,0.0,180.0,28.0,0.0,1.0,7.0,22.0,81.5
50%,21.0,24.0,19.0,1.0,0.0,221.0,35.0,1.0,2.0,16.0,...,20.0,1.0,0.0,231.0,35.0,1.0,2.0,14.0,29.0,111.0
75%,27.0,31.0,23.0,2.0,1.0,283.0,41.0,2.0,4.0,25.5,...,24.0,2.0,1.0,296.0,41.0,1.0,3.0,23.0,34.0,149.0
max,56.0,55.0,34.0,7.0,5.0,428.0,62.0,5.0,9.0,63.0,...,40.0,6.0,3.0,480.0,59.0,5.0,9.0,54.0,55.0,299.0


In [19]:
#Debugging
#Builds a dict mapping each home-team code to its weekly_team_stats frame.
#NOTE(review): game_stats is not defined in the visible cells, and this call passes
#two arguments while weekly_team_stats is defined with six parameters
#(stats, injuries, prior, team, cols, year) - the cell looks stale; confirm before re-running
teams = game_stats['Home'].unique()
team_stats = {}
for t in teams:
    team_stats.update({t:weekly_team_stats(game_stats, t)})

In [21]:
#Debugging
#Prints the entry stored under 'NYJ' in team_stats.
#NOTE(review): this expects team_stats to be the dict built by the debugging cell;
#the season aggregation cell rebinds the same name to a single DataFrame
print(team_stats['NYJ'])

            Key      Pts       FD      Fum      FumL       PY       PA  \
0             0        0        0        0         0        0        0   
1   20140907NYJ       19       20        2         1      190       29   
2   20140914GNB     21.5     19.5      1.5       0.5      178     30.5   
3   20140922NYJ  20.6667       20  2.66667  0.666667  218.667  34.6667   
4   20140928NYJ    19.75    19.25      2.5      0.75      215    34.25   
5   20141005SDG     15.8     17.6      2.2       0.8      184     33.6   
6   20141012NYJ       16  17.1667  2.16667  0.833333  182.167  35.1667   
7   20141016NWE  17.2857  18.7143        2  0.714286  185.429       35   
8   20141026NYJ       18   19.125     2.25     0.875  179.375   36.125   
9   20141102KAN  17.1111  19.4444  2.11111  0.777778  184.444  36.1111   
10  20141109NYJ     17.4       19      1.9       0.7    178.5     34.3   
11  20141124BUF  16.0909  18.2727  1.72727  0.636364  173.727       34   
12  20141201NYJ  15.8333    18.25  1.6

In [25]:
#List the distinct team codes used in the injury file.
#This reads the file without index_col and without the parsed 'Date' column,
#and rebinds the global name `injuries`.
injuries = pd.read_csv("n_missed_games_by_annoucement_date.csv")
print(pd.unique(injuries['abbr']))

['CIN' 'PHI' 'ATL' 'DAL' 'DEN' 'HOU' 'IND' 'JAX' 'LAR' 'MIN' 'NOR' 'NYG'
 'SFO' 'TAM' 'GNB' 'SEA' 'ARI' 'BUF' 'LAC' 'NWE' 'OAK' 'PIT' 'TEN' 'CHI'
 'NYJ' 'CAR' 'DET' 'WAS' 'MIA' 'BAL' 'KAN' 'CLE']
