In [1]:
## NOTE(review): presumably defines `spark`, `F`, `col` and `np`, which the
## cells below use without importing them -- confirm against initialize.ipynb
%run ../config/initialize.ipynb

## Global functions

In [2]:
def get_window(window_type, n, partition_cols, date_orderby_col):
    '''Build a Spark window spec that looks back from the current row.

    window_type      -- one of 'games', 'days' or 'seasons'
    n                -- int, size of the window in units of window_type
    partition_cols   -- list of column names to partition by
    date_orderby_col -- str, name of the date column used for ordering

    The frame never contains the current row itself: the 'games' frame
    starts one row after it (rows are ordered newest first), and the
    range frames end one second before it.
    Raises AssertionError when an argument has the wrong type or
    window_type is not recognised.'''
    assert type(partition_cols) is list
    assert type(date_orderby_col) is str
    assert type(n) is int
    assert window_type in ['games','days','seasons']

    from pyspark.sql.window import Window

    partitioned = Window.partitionBy(*partition_cols)

    if window_type == 'games':
        ## fixed number of games: with the newest row first, the n rows
        ## following the current one are the n previous games
        newest_first = col(date_orderby_col).desc()
        return partitioned.orderBy(newest_first).rowsBetween(1, n)

    if window_type == 'days':
        ## fixed number of days
        n_days = n
    else:
        ## fixed number of seasons, including current; n = 1 is YTD
        ## NOTE(review): 180 days presumably approximates the length of
        ## one season -- confirm
        n_days = 180 + 365 * (n - 1)

    ## range frames are expressed in seconds since the epoch
    seconds = 24 * 60 * 60 * n_days
    epoch_seconds = col(date_orderby_col).cast('timestamp').cast('long')
    return partitioned.orderBy(epoch_seconds).rangeBetween(-seconds, -1)

## Matchup Features
* key: (game_id)
* table name: features.matchup
* __is_same_division__
* __is_same_conf__
* __h_wr_past_5_games__
* __h_wr_ytd__
* __h_wr_past_2_seasons__
* __h_wr_past_3_seasons__

In [3]:
## key column of the output table, and the table the features are saved to
key, out_tbl = 'game_id', 'features.matchup'
## names of the feature columns; filled in by the cells below
features_list = list()

In [4]:
## one row per game, with both team ids and the season to join on
games = spark.table('game').select(
    'game_id', 'h_team_id', 'date', 'v_team_id', 'season', 'matchup_id'
)

## team_season attributes under home-team (h_) column names...
h_team = spark.table('team_season').select(
    col('team_id').alias('h_team_id'),
    'season',
    col('division').alias('h_division'),
    col('name').alias('h_name')
)

## ...and again under visiting-team (v_) column names
v_team = spark.table('team_season').select(
    col('team_id').alias('v_team_id'),
    'season',
    col('division').alias('v_division'),
    col('name').alias('v_name')
)

## attach each side's attributes for the season the game was played in
teams = games.join(h_team, on=['h_team_id', 'season'])
teams = teams.join(v_team, on=['v_team_id', 'season'])

## the joins must leave the number of games unchanged
assert teams.count() == games.count()

* ### is intra-division
* ### is intra-conference

In [5]:
## flag games whose two teams share a division, and games whose two
## teams share the first word of their division name
## NOTE(review): the first word is presumably the conference
## (e.g. 'AFC East' -> 'AFC') -- confirm against team_season
teams = teams.withColumn(
        'is_same_division',
        F.when(
            col('h_division') == col('v_division'),
            1.
        ).otherwise(0.)
    ).withColumn(
        'is_same_conf',
        F.when(
            F.split('h_division', ' ')[0] == F.split('v_division', ' ')[0],
            1.
        ).otherwise(0.)
    )

## register each feature only once, so that re-running this cell does not
## put duplicate column names into features_list
for _new_feat in ['is_same_division', 'is_same_conf']:
    if _new_feat not in features_list:
        features_list.append(_new_feat)

## sanity check: game counts by (same division, same conference);
## kept as the last expression of the cell so that it is displayed
teams.groupby('is_same_division').pivot('is_same_conf').count()\
    .toPandas().fillna(0)

Unnamed: 0,is_same_division,0.0,1.0
0,0.0,715.0,1153
1,1.0,0.0,1069


## Head-to-head record recent history

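* home-team win rate in this matchup over the last 5 games, the current season (YTD), and the last 2 and 3 seasons
* requires a window
* __h_wr_past_5_games__
* __h_wr_ytd__
* __h_wr_past_2_seasons__
* __h_wr_past_3_seasons__
* a 365-day window is also exercised in the test cases below, but is not written out as a feature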

In [6]:
def get_wr_over_window(df, w, feat_name):
    '''Add the home team's win rate over a window as a new column.

    df        -- Spark DF; the columns matchup_id, winner and h_team_id
                 are read
    w         -- Spark Window obj to aggregate over
    feat_name -- name of the column to add

    Returns df with feat_name added. A win counts 1, a tie (winner
    is the empty string) counts 0.5, anything else counts 0.'''
    ## the rate is first computed for the team named by the first three
    ## characters of matchup_id
    first_team = F.substring('matchup_id', 0, 3)

    ## credit earned by first_team in each game
    credit = F.when(col('winner') == first_team, 1.)\
              .when(col('winner') == '', 0.5)\
              .otherwise(0.)

    ## total credit divided by the number of games in the window
    first_team_wr = F.sum(credit).over(w) / F.count(F.lit(1)).over(w)
    with_first_team_wr = df.withColumn(feat_name, first_team_wr)

    ## when the home team is the other team, its rate is the complement
    home_wr = F.when(first_team == col('h_team_id'), col(feat_name))\
               .otherwise(1 - col(feat_name))
    return with_first_team_wr.withColumn(feat_name, home_wr)

In [7]:
## final score of each side, per game
scores = spark.table('game_outcome').select('game_id', 'h_final', 'v_final')

## id of the team with the higher final score; empty string when
## neither comparison holds
winner_id = F.when(col('h_final') > col('v_final'), col('h_team_id'))\
             .when(col('h_final') < col('v_final'), col('v_team_id'))\
             .otherwise('')

## the raw scores are only needed to derive the winner
teams_score = teams.join(scores, on='game_id')\
                   .withColumn('winner', winner_id)\
                   .drop('h_final', 'v_final')

In [8]:
## one head-to-head win-rate feature per (window size, window type)
for n, window_type in [(5, 'games'),
                       (1, 'seasons'),
                       (2, 'seasons'),
                       (3, 'seasons')]:
    feat_name = 'h_wr_past_{}_{}'.format(n, window_type)
    ## the one-season window is published as year-to-date
    feat_name = feat_name.replace('past_1_seasons','ytd')
    w = get_window(window_type, n, ['matchup_id'], 'date')
    ## rows left null by the window computation get a neutral 0.5
    teams_score = get_wr_over_window(teams_score, w, feat_name)\
                      .fillna(0.5, feat_name)

    ## register each feature only once, so that re-running this cell does
    ## not put duplicate column names into features_list
    if feat_name not in features_list:
        features_list.append(feat_name)

#### test some matchups in pandas

In [9]:
def check_test_cases(df, test_cases, feat_name):
    '''Assert that a feature column holds the expected value per game.

    df         -- Spark DF with a game_id column and the feat_name column
    test_cases -- dict mapping game_id to the expected value; None means
                  the feature is expected to be null for that game
    feat_name  -- name of the column to check

    Raises AssertionError when a game is missing from df or when its
    value does not match.'''
    ## .items() works on both Python 2 and Python 3
    for game_id, expected in test_cases.items():
        rows = df.filter(df['game_id'] == game_id)\
                 .select(feat_name)\
                 .toPandas()
        assert len(rows) > 0, \
            'no row found for game_id {}'.format(game_id)
        actual = rows.iloc[0, 0]

        if expected is None:
            ## a null may come back from toPandas() as None or as NaN
            assert actual is None or np.isnan(actual), \
                '{}: expected null, got {}'.format(game_id, actual)
        else:
            ## every number, including 0.0, is compared with a tolerance
            assert actual is not None and np.isclose(actual, expected), \
                '{}: expected {}, got {}'.format(game_id, expected, actual)

In [10]:
## expected home-team win rates, keyed by (window size, window type) and
## then by game_id; None means the feature is expected to be null
test_cases = {}

test_cases[(5, 'games')] = {
    '200710140jax': None,
    '200712300htx': 0.,
    '200809280jax': 1./2,
    '201709100htx': 1.,
    '201712170jax': 1./5,
    '201802040nwe': 2./3,
    '201410020gnb': 3.5/5
}
test_cases[(365, 'days')] = {
    '200710140jax': None,
    '200712300htx': 0.,
    '200809280jax': 1./2,
    '201709100htx': 1.,
    '201712170jax': 1./2,
    '201802040nwe': None,
    '201410020gnb': 3./4
}

## recompute the win rate into a scratch column 'val' for each window and
## compare it with the expected values
## (.items() and print(...) behave the same on Python 2 and Python 3)
test_sdf = teams_score
for (n, window_type), v in test_cases.items():
    print('checking {}...'.format(window_type))
    w = get_window(window_type, n, ['matchup_id'], 'date')
    test_sdf = get_wr_over_window(test_sdf, w, 'val')
    check_test_cases(test_sdf, v, 'val')

checking games...
checking days...


## Write Table

In [11]:
## print the feature names as a markdown bullet list;
## print(...) with one argument behaves the same on Python 2 and Python 3
print('* __' + '__\n* __'.join(features_list) + '__')

* __is_same_division__
* __is_same_conf__
* __h_wr_past_5_games__
* __h_wr_ytd__
* __h_wr_past_2_seasons__
* __h_wr_past_3_seasons__


In [12]:
## keep the key plus the registered feature columns, and overwrite the
## output table with them
(teams_score
    .select(key, *features_list)
    .write
    .mode('overwrite')
    .saveAsTable(out_tbl))

In [13]:
## read back a few rows of the table that was just written
(spark.table(out_tbl)
      .limit(5)
      .toPandas())

Unnamed: 0,game_id,is_same_division,is_same_conf,h_wr_past_5_games,h_wr_ytd,h_wr_past_2_seasons,h_wr_past_3_seasons
0,200709160oti,1.0,1.0,0.5,0.5,0.5,0.5
1,200712300clt,1.0,1.0,1.0,1.0,1.0,1.0
2,200810270oti,1.0,1.0,0.5,0.5,0.5,0.5
3,200812280clt,1.0,1.0,0.333333,0.0,0.333333,0.333333
4,200910110oti,1.0,1.0,0.5,0.5,0.5,0.5
