In [1]:
%run ../config/initialize.ipynb

## Team Rank Features
* key: game_id (one row per game; home and visitor team features are the `h__` / `v__` columns)
* table name: features.rankings
* __h__dave_or_wtddvoa__
* __h__def_rank__
* __h__defensedvoa__
* __h__estim_winrate__
* __h__off_rank__
* __h__offensedvoa__
* __h__s_t_dvoa__
* __h__s_t_rank__
* __h__totaldvoa__
* __v__dave_or_wtddvoa__
* __v__def_rank__
* __v__defensedvoa__
* __v__estim_winrate__
* __v__off_rank__
* __v__offensedvoa__
* __v__s_t_dvoa__
* __v__s_t_rank__
* __v__totaldvoa__

In [2]:
# Column that identifies a row of the output table.
key = "game_id"
# Destination table, written by the last cell of the notebook.
out_tbl = "features.rankings"
# Names of the feature columns to write; filled in by the cells below.
features_list = list()

In [3]:
# Load the `dvoa` table, keeping the team/season/week identifiers plus the
# columns used to build features further down.
dvoa_id_cols = ['team_id', 'season', 'week_id']
dvoa_value_cols = [
    'w_l', 'estim_wins',             # wins/losses
    'dave_or_wtddvoa', 'totaldvoa',  # total
    'offensedvoa', 'off_rank',       # offense
    'defensedvoa', 'def_rank',       # defense
    's_t_dvoa', 's_t_rank',          # special teams
]
dvoa = spark.table('dvoa').select(*(dvoa_id_cols + dvoa_value_cols))

# Load the `game` table: one id per game plus the home/visitor team ids that
# the DVOA rows are joined on later.
game_cols = ['game_id', 'season', 'week_id', 'date', 'h_team_id', 'v_team_id']
game = spark.table('game').select(*game_cols)

### Estimated winrate
* estimated wins divided by number of games

In [4]:
def _games_played(w_l):
    """Return the number of games encoded in a dash-separated record string.

    The numeric parts of ``w_l`` are summed, e.g. '5-3' -> 8.0 and
    '5-3-1' -> 9.0.

    Returns None when ``w_l`` is NULL or empty, so that a missing record
    produces a NULL game count (and therefore a NULL winrate) instead of
    raising inside the UDF and failing the whole Spark job.
    """
    if not w_l:
        return None
    return sum(map(float, w_l.split('-')))


ngames_udf = udf(_games_played, FloatType())

# estim_winrate = estimated wins / games played; the helper columns are
# dropped once the ratio has been computed.
dvoa_features = dvoa.withColumn(
    'ngames', ngames_udf('w_l')
).withColumn(
    'estim_winrate', col('estim_wins') / col('ngames')
).drop(
    'w_l', 'estim_wins', 'ngames'
)

# Preview a handful of rows.
dvoa_features.limit(5).toPandas()

Unnamed: 0,team_id,season,week_id,dave_or_wtddvoa,totaldvoa,offensedvoa,off_rank,defensedvoa,def_rank,s_t_dvoa,s_t_rank,estim_winrate
0,mia,2008,8,6.9,9.8,22.2,3.0,2.3,16.0,-10.0,31.0,0.616667
1,mia,2008,9,13.6,13.3,20.8,2.0,-1.4,12.0,-8.9,31.0,0.671429
2,mia,2008,10,10.6,9.4,14.9,8.0,-3.0,11.0,-8.5,31.0,0.625
3,mia,2008,11,10.1,7.7,15.3,9.0,-0.5,13.0,-8.2,31.0,0.544444
4,mia,2008,12,6.6,6.8,16.7,6.0,0.5,13.0,-9.4,31.0,0.54


### Use raw rankings and DVOA for now
* prefix each feature column with `h__` (home team) or `v__` (visitor team)

In [5]:
# Identifier columns that must keep their original names at this stage.
base = set(['team_id', 'season', 'week_id'])


def _prefix_columns(df, prefix, keep):
    """Return `df` with every column not in `keep` renamed to '<prefix>__<name>'.

    Column positions are unchanged; only the names differ.
    """
    for name in df.columns:
        if name not in keep:
            df = df.withColumnRenamed(name, '{}__{}'.format(prefix, name))
    return df


dvoa_home = _prefix_columns(dvoa_features, 'h', base)
dvoa_visitor = _prefix_columns(dvoa_features, 'v', base)

# Register the new feature names. Sorting gives a deterministic column order,
# and skipping names that are already present keeps this cell idempotent:
# re-running it no longer duplicates entries in `features_list`, which would
# otherwise produce duplicate columns in the final select.
new_features = sorted(
    (set(dvoa_home.columns) | set(dvoa_visitor.columns)) - base
)
features_list += [name for name in new_features if name not in features_list]

# Rename the team key so each side joins on the matching column of `game`.
dvoa_home = dvoa_home.withColumnRenamed('team_id', 'h_team_id')
dvoa_visitor = dvoa_visitor.withColumnRenamed('team_id', 'v_team_id')

In [6]:
# Attach the home team's features, then the visitor team's, matching on
# season, week and the respective team id. Both joins use the default join
# type (inner), so a game without a matching row on either side is dropped.
home_join_keys = ['season', 'week_id', 'h_team_id']
visitor_join_keys = ['season', 'week_id', 'v_team_id']

game_with_home = game.join(dvoa_home, on=home_join_keys)
rankings_features = game_with_home.join(dvoa_visitor, on=visitor_join_keys)

# The joined frame must have exactly as many rows as the game table.
n_joined = rankings_features.count()
n_games = game.count()
assert n_joined == n_games

### Fill NULLs
* fill each feature's NULLs with that feature's mean over the rows where it is present
* __TODO__: come up with something better/more appropriate

In [7]:
# Replace missing feature values with the per-feature mean.
#
# All means are computed in a single Spark aggregation instead of one job
# (plus a pandas round trip) per feature. NaN entries are excluded from the
# mean explicitly and NULLs are ignored by F.mean, so no sentinel value is
# needed and a genuine value can never be mistaken for "missing".
if features_list:
    mean_exprs = [
        F.mean(F.when(~F.isnan(col(name)), col(name))).alias(name)
        for name in features_list
    ]
    # A global aggregate always yields exactly one row, even on empty input.
    feature_means = rankings_features.agg(*mean_exprs).first().asDict()

    # A feature with no usable values has no mean (None); leave it untouched
    # rather than passing an invalid replacement value to fillna.
    fill_values = dict(
        (name, float(mean))
        for name, mean in feature_means.items()
        if mean is not None
    )
    rankings_features = rankings_features.fillna(fill_values)

### Write Table

In [8]:
# Emit the feature names as a markdown bullet list with bold entries, sorted
# alphabetically, ready to paste into the header cell of this notebook.
sorted_feature_names = sorted(features_list)
print('* __' + '__\n* __'.join(sorted_feature_names) + '__')

* __h__dave_or_wtddvoa__
* __h__def_rank__
* __h__defensedvoa__
* __h__estim_winrate__
* __h__off_rank__
* __h__offensedvoa__
* __h__s_t_dvoa__
* __h__s_t_rank__
* __h__totaldvoa__
* __v__dave_or_wtddvoa__
* __v__def_rank__
* __v__defensedvoa__
* __v__estim_winrate__
* __v__off_rank__
* __v__offensedvoa__
* __v__s_t_dvoa__
* __v__s_t_rank__
* __v__totaldvoa__


In [9]:
# Persist the key column followed by every feature column, replacing any
# existing contents of the output table.
output_columns = [key] + features_list
output_frame = rankings_features.select(*output_columns)
output_frame.write.mode('overwrite').saveAsTable(out_tbl)