In [12]:
%run ../initialize.ipynb

ERROR:root:File `u'../initialize.ipynb.py'` not found.


## Time features
* #### key: (game_id)
* #### table name: features.time_date
* #### __game_time_eastern__
* #### __game_time_local__
* #### __is_sunday__
* #### __day_of_week__
* #### __week_id__
* #### __is_playoffs__
* #### __is_superbowl__

In [14]:
key = 'game_id'
out_tbl = 'features.time_date'
features_list = ['game_time_eastern','game_time_local']

In [15]:
gametime = spark.table('game_metadata').join(
    spark.table('game'), on='game_id'
).join(
    spark.table('stadium'), on='stadium'
).select('game_id','time','stadium','timezone',
         'day_of_week','week_id',
         'game_time_eastern','game_time_local').cache()

### time of day
* raw times are eastern
* can use timezone to adjust
* issue with timezones: daylight savings and Wembley
 * not going to worry about adjusting for this
 * since raw times are eastern, local time adjustment is (tz + 5)
 * e.g. SF is -8 --> -8 + 5 = -3, so a "1pm" game is 10am local

In [16]:
gametime.filter(col('stadium') == 'AT&T Stadium').limit(5).toPandas()

Unnamed: 0,game_id,time,stadium,timezone,day_of_week,week_id,game_time_eastern,game_time_local
0,201409070dal,4:25pm,AT&T Stadium,-6.0,6,1,16.0,15.0
1,201409070dal,4:25pm,AT&T Stadium,-6.0,6,1,16.0,15.0
2,201409070dal,4:25pm,AT&T Stadium,-6.0,6,1,16.0,15.0
3,201409070dal,4:25pm,AT&T Stadium,-6.0,6,1,16.0,15.0
4,201409070dal,4:25pm,AT&T Stadium,-6.0,6,1,16.0,15.0


#### day of week -- raw
 * very sparse other than Sunday. 
 * could encode as "is_sunday" feature
 * could leave as is
  * upside: since Sunday (6) is the max #, could allow tree to split how it sees fit
    * group (Mon, Thurs, Sat) vs. Sun... or (Mon) vs. (Thurs, Sat, Sun), etc.
  * downside: this might be overfitting
 * will go with both

In [17]:
gametime = gametime.withColumn(
    'day_of_week', col('day_of_week').cast(FloatType())
).withColumn(
    'is_sunday', F.when(col('day_of_week') == 6., 1.).otherwise(0.)
)

features_list += ['day_of_week','is_sunday']

In [18]:
gametime.groupby('is_sunday').count().toPandas()

Unnamed: 0,is_sunday,count
0,0.0,3574
1,1.0,20338


#### is playoffs
#### week # -- raw

In [19]:
gametime.groupby('week_id').count().toPandas()

Unnamed: 0,week_id,count
0,19,385
1,7,1256
2,6,1228
3,9,1134
4,17,1394
5,5,1224
6,1,1401
7,10,1327
8,3,1434
9,12,1392


In [20]:
gametime = gametime.withColumn(
    'week_id', col('week_id').cast(FloatType())
).withColumn(
    'is_playoffs', F.when(col('week_id') >= 18, 1.).otherwise(0.)
).withColumn(
    'is_superbowl', F.when(col('week_id') == 21, 1.).otherwise(0.)
)

features_list += ['week_id','is_playoffs','is_superbowl']

### Write table

In [21]:
gametime.select(*([key] + features_list))\
        .write.mode('overwrite').saveAsTable(out_tbl)

In [22]:
spark.table(out_tbl).limit(5).toPandas()

Unnamed: 0,game_id,game_time_eastern,game_time_local,day_of_week,is_sunday,week_id,is_playoffs,is_superbowl
0,200810050det,13.0,13.0,6.0,1.0,5.0,0.0,0.0
1,200810050det,13.0,13.0,6.0,1.0,5.0,0.0,0.0
2,200810050det,13.0,13.0,6.0,1.0,5.0,0.0,0.0
3,200810050det,13.0,13.0,6.0,1.0,5.0,0.0,0.0
4,200810050det,13.0,13.0,6.0,1.0,5.0,0.0,0.0
