In [None]:
# only need to run once for each new instance
!pip install pyathena

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

from pyathena import connect
from pyathena.pandas_cursor import PandasCursor

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVR

# warnings.simplefilter(action='ignore', category=FutureWarning)

# Notebook-wide pandas options.
# NOTE: silencing chained-assignment warnings also hides genuine
# "assignment on a copy" mistakes in later cells.
pd.set_option('mode.chained_assignment', None)  # default='warn'
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

# Athena connection; query results are staged in S3 and every cursor
# returned by this connection is a PandasCursor.
conn = connect(s3_staging_dir='s3://aws-athena-query-results-323906537337-us-east-1/',
               region_name='us-east-1',
               cursor_class=PandasCursor)
cursor = conn.cursor()

In [None]:
# # thoughts

# Override values exist only for the 2018~2019 seasons, so only those seasons can be used for validation.
# More seasons could be included without validation data, but only for model-building purposes (R2, MAE, cross-validation).

# 0. Data

## 0.1 Game data

In [None]:
# # This implementation has issues because data is missing from the source tables

# # Note: TE rarely gets rush carry

# simple_query = f'''
# with teamquery as
# (
# select season,
#        eventmetadata.week week,
#        eventmetadata.gameCode gamecode,  
#        teamid, 
#        totalrushingattempts
# from datalakefootball.team_aggregated_game_stats
# where season>='2017' and eventmetadata.eventtypeid in (1,2)
# order by season, week, gamecode, teamid
# )

# select
#     cast(t1.season as integer) season,
#     t1.eventmetadata.week week,
#     t1.eventmetadata.gamecode gamecode,
#     t1.teamid teamid,
#     t1.player.playerid,
#     t1.player.positionid,
#     t1.rushertotalrushingattempts,
#     t2.totalrushingattempts,
    
#     case when t2.totalrushingattempts = 0 
#         then null
#         else t1.rushertotalrushingattempts / cast (t2.totalrushingattempts as double)
#         end as rushingShare,
#     row_number() over (PARTITION BY
#                          t1.season, t1.teamid, t1.eventmetadata.week, t1.player.positionid
#                      ORDER BY
#                         t1.rushertotalrushingattempts DESC) Rank
# from
#     datalakefootball.player_aggregated_game_stats as t1
#     left join
#     teamquery t2
#     on t1.season = t2.season and 
#        t1.eventmetadata.gamecode = t2.gamecode and 
#        t1.teamid = t2.teamid
# where t1.season >= '2017' and 
#      t1.eventmetadata.eventtypeid in (1,2) and
#      t1.player.positionid in (1,8,9) -- 1:WR, 7:TE, 8:QB, 9:RB
# order by season, week, gamecode, teamid
# '''

# if True:
#     game_df2 = cursor.execute(simple_query).as_pandas()
#     print(game_df2.info())
# else:
#     print("Failed to query!")

# print(game_df2.rushingShare.describe())

In [None]:
# Work-around: per-player game rushing stats joined to the team totals and to
# each player's first listed position.
# Caveat: positionid may be wrong for games played before a player changed position!
# `Rank` orders players within season/team/week/position by carries in that game.

simple_query = '''
with teamquery as
(
    select season,
           eventmetadata.week week,
           eventmetadata.gameCode gamecode,  
           teamid, 
           totalrushingattempts
    from datalakefootball.team_aggregated_game_stats
    where season>='2017' and eventmetadata.eventtypeid in (1,2)
    order by season, week, gamecode, teamid
),
playerquery as 
(
  select playerid,
           positions[1].positionid positionid
  from datalakefootball.players
)

select
    cast(t1.season as integer) season,
    t1.eventmetadata.week week,
    t1.eventmetadata.gamecode gamecode,
    t1.eventmetadata.eventtypeid eventType,
    t1.teamid teamid,
    t1.playerid,
    t3.positionid,
    t1.playerstats.rushingstats.attempts rushertotalrushingattempts,
    t2.totalrushingattempts,
    
    case when t2.totalrushingattempts = 0 
        then null
        else t1.playerstats.rushingstats.attempts / cast (t2.totalrushingattempts as double)
        end as rushingShare,
    row_number() over (PARTITION BY
                         t1.season, t1.teamid, t1.eventmetadata.week, t3.positionid
                     ORDER BY
                        t1.playerstats.rushingstats.attempts DESC) Rank,
    if (playerstats.inactives is not null, False, True) as isActive
from
    datalakefootball.player_stats_game as t1
    left join teamquery t2
    on t1.season = t2.season and 
       t1.eventmetadata.gamecode = t2.gamecode and 
       t1.teamid = t2.teamid
    left join playerquery t3
    on t1.playerid = t3.playerid 
where t1.season >= '2017' and 
     t1.eventmetadata.eventtypeid in (1,2) and
     t3.positionid in (1,8,9) -- 1:WR, 7:TE, 8:QB, 9:RB
order by season, week, gamecode, teamid
'''

# A failed query raises, so no "else" branch is needed.
game_df = cursor.execute(simple_query).as_pandas()
game_df.info()  # info() prints by itself; wrapping it in print() adds a stray "None"

print(game_df.rushingShare.describe())

In [None]:
# Number of player-game rows whose team recorded at least one rushing attempt.
(game_df.totalrushingattempts > 0).sum()

In [None]:
# Distribution of player-games with at least one carry, by position:
# raw counts first, then each position's fraction of those player-games.

id = game_df.rushertotalrushingattempts.gt(0)  # the name `id` is read again by the next cell
rushCounts = game_df.loc[id, 'positionid'].value_counts()
print(rushCounts, rushCounts.div(rushCounts.sum()), sep='\n\n')

In [None]:
# Share of all rushing attempts taken by each position (RB / QB / WR).
# Self-contained: the totals are computed straight from game_df rather than
# through a boolean mask left behind by the previous cell.
totalRushAttempts = game_df.rushertotalrushingattempts.sum()
print(totalRushAttempts)

# Rows without a carry (0 or missing) add nothing to a position's total.
attemptsByPosition = game_df.groupby('positionid').rushertotalrushingattempts.sum()

RB_rushAttempts = attemptsByPosition.get(9, 0)
QB_rushAttempts = attemptsByPosition.get(8, 0)
WR_rushAttempts = attemptsByPosition.get(1, 0)

# The format strings differ only in padding, so the percentages line up.
for template, attempts in [("RB rush share: {:}, {:.2%}", RB_rushAttempts),
                           ("QB rush share: {:},  {:.2%}", QB_rushAttempts),
                           ("WR rush share: {:},   {:.2%}", WR_rushAttempts)]:
    print(template.format(attempts, attempts / totalRushAttempts))


# Pie chart, counter-clockwise:
labels = ['RB', 'QB', 'WR']
sizes = [RB_rushAttempts, QB_rushAttempts, WR_rushAttempts]
explode = (0.1, 0, 0)  # pull out only the first slice (RB)

plt.figure(figsize=[9,9])
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Rushing attempts by position')
plt.show()

## 0.2 Expected data

In [None]:
# Only two seasons (2018 ~ 2019) of expected-rate data are available for research.
# The query keeps version='override' rows and exposes rushingpercentage as exp_rushingShare.

simple_query = '''
select
    eventmetadata.gamecode, -- may not be complete for some game
    player.playerid,
    rushingpercentage exp_rushingShare
    -- overriderushingpercentage exp_rushingShare
from datalakefootball.player_expected_rates
where 
    season >= '2018' and 
    eventmetadata.eventtypeid in (1,2) and
    player.positionid in (1,8,9) and version='override'
order by season
'''

# A failed query raises, so no "else" branch is needed.
exp_df = cursor.execute(simple_query).as_pandas()
exp_df.info()  # info() prints by itself; wrapping it in print() adds a stray "None"

In [None]:
# Attach the expected (override) rushing share to every player-game; rows
# without a matching (gamecode, playerid) keep NaN in exp_rushingShare.
game_df = game_df.merge(exp_df, how='left', on=['gamecode', 'playerid'])
game_df.info()

## 0.3 Ytd data

In [None]:
# # # This implementation has issue due to data missing in tables

# # In order to account for injury situation, team_aggregated_game_stats is used instead of team_aggregated_ytd_stats

# simple_query = f'''
# with teamquery as
# (
# select eventmetadata.gameCode gamecode,  
#        teamid, 
#        totalrushingattempts game_totalRushingAttempts
# from datalakefootball.team_aggregated_game_stats
# where season>='2017' and eventmetadata.eventtypeid in (1,2)
# )

# select 
#   cast(t1.season as integer) season, 
#   t1.eventmetadata.week, 
#   t1.eventmetadata.gamecode, 
#   t1.teamid,
#   t1.player.playerid,
#   t1.player.positionid,
#   t1.rushertotalrushingattempts ytd_rushertotalrushingattempts,
#   t2.game_totalRushingAttempts,
#   row_number() over (PARTITION BY
#                          t1.season, t1.teamid, t1.eventmetadata.week, t1.player.positionid
#                      ORDER BY
#                          t1.rushertotalrushingattempts DESC) ytd_rank
# from datalakefootball.player_aggregated_ytd_stats t1
#     left join teamquery t2
#     on t1.eventmetadata.gamecode = t2.gamecode and 
#        t1.teamid=t2.teamid
# where 
#     t1.season >='2017' and 
#     t1.player.positionid in (1,8,9) and 
#     t1.eventmetadata.eventtypeid in (1,2) and
#     t1.eventmetadata.week is not null -- when a week is missing, the player may not be active
# order by season, week, gamecode, teamid
# '''

# if True:
#     ytd_df = cursor.execute(simple_query).as_pandas()
#     print(ytd_df.info())
# else:
#     print("Failed to query!")
    

# # prepare ytd_rushingattempts for each player. in this way, no update for missed games
# gd = ytd_df.groupby(['season','playerid'])

# ytd_df['ytd_totalRushingAttempts'] = gd.game_totalrushingattempts.cumsum()
# ytd_df['ytd_totalRushingAttempts'] = gd.ytd_totalRushingAttempts.shift(1)

# ytd_df['ytd_rushingShare'] = ytd_df.ytd_rushertotalrushingattempts / ytd_df.ytd_totalRushingAttempts


# # merge ytd data into game data
# game_df = pd.merge(game_df, ytd_df[['gamecode','playerid',
#                                     'ytd_rank',
#                                    'ytd_rushertotalrushingattempts',
#                                    'ytd_totalRushingAttempts',
#                                    'ytd_rushingShare']], 
#                     on=['gamecode','playerid'], how='left')
# game_df.info()

In [None]:
# Work-around: pull each team's rushing attempts per game and attach them to
# game_df as game_totalRushingAttempts (input to the year-to-date sums below).

simple_query = '''
select eventmetadata.gameCode gamecode,  
       teamid, 
       totalrushingattempts game_totalRushingAttempts
from datalakefootball.team_aggregated_game_stats
where season>='2017' and eventmetadata.eventtypeid in (1,2)
'''

# A failed query raises, so no "else" branch is needed.
ytd_team_df = cursor.execute(simple_query).as_pandas()
ytd_team_df.info()  # info() prints by itself; wrapping it in print() adds a stray "None"

game_df = pd.merge(game_df, ytd_team_df, on=['gamecode','teamid'], how='left')
game_df.info()

In [None]:
# Year-to-date (ytd) rushing totals and share per player and season.
#
# A missing rushertotalrushingattempts may mean the player missed the game, so
# it is not filled here; the team total for that game is blanked as well, which
# keeps it out of the player's ytd denominator.
# Series.mask is used instead of `df[col][mask] = ...`: chained assignment can
# write to a temporary copy and leave game_df unchanged.
missed_game = game_df.rushertotalrushingattempts.isna()
game_df['game_totalRushingAttempts'] = game_df['game_totalRushingAttempts'].mask(missed_game)

player_season_keys = ['season', 'playerid']
ytd_cols = ['ytd_totalRushingAttempts', 'ytd_rushertotalrushingattempts', 'ytd_rushingShare']

by_player_season = game_df.groupby(player_season_keys)
game_df['ytd_totalRushingAttempts'] = by_player_season.game_totalRushingAttempts.cumsum()
game_df['ytd_rushertotalrushingattempts'] = by_player_season.rushertotalrushingattempts.cumsum()

game_df['ytd_rushingShare'] = game_df.ytd_rushertotalrushingattempts / game_df.ytd_totalRushingAttempts

# For missed games, carry the player's previous ytd values forward ...
game_df[ytd_cols] = game_df.groupby(player_season_keys)[ytd_cols].ffill()

# ... then shift by one game so each row only sees games played before it.
game_df[ytd_cols] = game_df.groupby(player_season_keys)[ytd_cols].shift(1)

game_df.info()

In [None]:
# Any rushing count / share that is still missing at this point is treated as zero.
zero_fill_cols = ['game_totalRushingAttempts',
                  'rushertotalrushingattempts',
                  'rushingShare',
                  'ytd_rushingShare',
                  'ytd_totalRushingAttempts',
                  'ytd_rushertotalrushingattempts']
game_df[zero_fill_cols] = game_df[zero_fill_cols].fillna(0)

game_df.info()

## 0.4 Previous game data

In [None]:
# Rushing share of the player's previous row within the same season,
# regardless of team id; a player's first row of a season gets 0.0.

previous_share = game_df.groupby(['season', 'playerid']).rushingShare.shift(1)
game_df['prev_rushingShare'] = previous_share
game_df.info()
game_df['prev_rushingShare'] = game_df['prev_rushingShare'].fillna(0.0)

## 0.5 Prepare ytd data by position rank

In [None]:
# Year-to-date rushing share per position-rank slot (season, team, position, Rank).
# Only ytd data is used here, no baseline information.
# 'gamecode' and 'playerid' do not take part in the calculation; they are only carried along.

rank_slot_keys = ['season', 'teamid', 'positionid', 'Rank']

ytd_byPosRank_df = game_df.loc[:, ['season', 'week', 'gamecode', 'teamid', 'playerid', 'positionid',
                                   'rushertotalrushingattempts', 'totalrushingattempts', 'Rank']].copy()

slots = ytd_byPosRank_df.groupby(rank_slot_keys)
cumulative_share = slots.rushertotalrushingattempts.cumsum() / slots.totalrushingattempts.cumsum()
ytd_byPosRank_df['ytd_rushingShareByPositionRank'] = cumulative_share

# Shift by one row within each slot so a game only sees the games before it.
ytd_byPosRank_df['ytd_rushingShareByPositionRank'] = (
    ytd_byPosRank_df.groupby(rank_slot_keys).ytd_rushingShareByPositionRank.shift(1)
)

ytd_byPosRank_df.info()

## 0.6 Identify weekly injury

In [None]:
# Identify weekly injuries. If a top player is out, the remaining players at that position use the ytd share of their adjusted position rank.

# To-be-addressed (TBA) cases:
# (1) Player 835814, with zero baseline info, joined team 334 in the middle of 2019 and became the lead RB.
#     The current algorithm has difficulty catching up and promoting this player to the new #1 RB.
#     Later, 880548 (a top RB of the team who was temporarily out due to injury) returns -> percentage normalization issue!!!
# (2) Player normalization
#     All active players' shares should be scaled according to the total rushing share of the individual RBs and the total rushing share of the RB position.

In [None]:
# Function to calculate the share by rank for each position type when there is a roster change.
# Returns a list of DataFrames, one per game in which a roster change happens, each holding the shares of that game's players.
# Note: using the share by position rank to estimate each individual player's performance introduces a bias!!!

# For rush share, we may have to rely on the previous game's ranking instead of the ytd ranking!

def calculateRushingShareAdjByRank(game_df, positionIds, seasons, printDetails=False,
                                   ytd_byPosRank=None):
    """Build rank-based rushing shares for games where a top-ranked player did not rush.

    For each (season, team, position) the weeks are walked in order of
    appearance while a running total of carries per player is kept. When one of
    the two top-ranked players has no carry in a week, that week's active
    players are re-ranked ("onFieldRank") and joined to the year-to-date share
    of the matching position-rank slot.

    Parameters
    ----------
    game_df : DataFrame
        Player-game rows with columns season, teamid, positionid, week,
        isActive, playerid, gamecode, rushertotalrushingattempts, rushingShare
        and ytd_rushingShare.
    positionIds : iterable
        Position ids to process.
    seasons : iterable
        Seasons to process.
    printDetails : bool, default False
        Print the team being processed and, for each affected week, the missing
        players together with the current ranking table.
    ytd_byPosRank : DataFrame, optional
        Table with columns teamid, gamecode, positionid, Rank and
        ytd_rushingShareByPositionRank. Defaults to the notebook-level
        ``ytd_byPosRank_df``, so existing calls keep working.

    Returns
    -------
    list of DataFrame
        One frame per affected (team, game); each holds that game's active
        players with ``onFieldRank`` and ``ytd_rushingShareByPositionRank``.
    """
    if ytd_byPosRank is None:
        # Backward-compatible default: the table prepared earlier in the notebook.
        ytd_byPosRank = ytd_byPosRank_df

    # Loop-invariant: only these columns of the rank table are ever joined.
    rankShares = ytd_byPosRank[['teamid', 'gamecode', 'positionid', 'Rank',
                                'ytd_rushingShareByPositionRank']]

    adjustedRates = []

    teams = game_df.teamid.unique()

    for season in seasons:
        for team in teams:
            for positionId in positionIds:
                if printDetails:
                    print(team)

                teamRows = (game_df.season == season) & (game_df.teamid == team) & \
                           (game_df.positionid == positionId)
                one_team = game_df[teamRows].copy()

                # ranking_data tracks cumulative carries and the resulting rank
                # of every player seen so far, indexed by playerid
                ranking_data = None

                for i, week in enumerate(one_team.week.unique()):
                    activeThisWeek = (one_team.week == week) & (one_team.isActive)

                    if i == 0:
                        # the first week only seeds the running totals
                        ranking_data = one_team.loc[activeThisWeek,
                                                    ['rushertotalrushingattempts', 'playerid']].copy()
                        ranking_data['ytd_rank'] = -1
                        ranking_data.set_index('playerid', inplace=True)
                        continue

                    data = one_team.loc[activeThisWeek, ['teamid', 'gamecode', 'positionid', 'playerid',
                                                         'rushertotalrushingattempts', 'rushingShare',
                                                         'ytd_rushingShare']].copy()
                    data.set_index('playerid', inplace=True)

                    # add this week's carries, aligned on the playerid index
                    # NOTE(review): the ranking is updated with the current
                    # week's carries before it is used to rank this same week,
                    # so ytd_rank already contains the outcome being estimated
                    # - confirm whether this look-ahead is intended.
                    ranking_data = ranking_data.add(data[['rushertotalrushingattempts']], fill_value=0)
                    ranking_data.sort_values('rushertotalrushingattempts', inplace=True, ascending=False)
                    ranking_data.loc[:, 'ytd_rank'] = np.arange(len(ranking_data)) + 1

                    current_week_data = one_team[activeThisWeek]
                    activeMajorPlayers = set(
                        current_week_data.playerid[current_week_data.rushertotalrushingattempts > 0])

                    # top-2 ranked players without a carry this week
                    topRanked = ranking_data.index[ranking_data.ytd_rank <= 2]
                    missingPlayers = [player for player in topRanked
                                      if player not in activeMajorPlayers]

                    if not missingPlayers:
                        continue

                    if printDetails:
                        for player in missingPlayers:
                            print(week, player, ranking_data.loc[player].ytd_rank)
                            print(ranking_data)

                    # re-rank this week's active players among themselves
                    data['onFieldRank'] = ranking_data.loc[data.index].ytd_rank.\
                                            rank(method='first', na_option='bottom')
                    data = data.astype({'onFieldRank': 'int64'})
                    data.reset_index(inplace=True)

                    # attach the ytd share of the rank slot each player now occupies
                    data = pd.merge(data,
                                    rankShares,
                                    left_on=['teamid', 'gamecode', 'positionid', 'onFieldRank'],
                                    right_on=['teamid', 'gamecode', 'positionid', 'Rank'], how='left')

                    adjustedRates.append(data)

    return adjustedRates

In [None]:
# The position-rank adjustment is applied to running backs (position id 9) only.

adjusted_frames = calculateRushingShareAdjByRank(
    game_df,
    positionIds=[9],
    seasons=[2018, 2019],
    printDetails=True,
)

# One frame per affected game -> a single table with a fresh index.
adjustedRates = pd.concat(adjusted_frames, ignore_index=True)

In [None]:
# Bring the rank-based shares back onto the player-game table; games without
# an adjustment keep NaN in onFieldRank / ytd_rushingShareByPositionRank.
adjustment_keys = ['teamid', 'gamecode', 'playerid']
adjustment_values = ['onFieldRank', 'ytd_rushingShareByPositionRank']

game_df = game_df.merge(adjustedRates[adjustment_keys + adjustment_values],
                        how='left',
                        on=adjustment_keys)
game_df.info()

In [None]:
# 'ytd_rushingShareAdj' holds the ytd share adjusted for the injury situation:
# the rank-based share where one was computed, the plain ytd share otherwise.
# A single fillna is used instead of `df.col[mask] = ...`: chained assignment
# can write to a temporary copy and leave game_df unchanged.
game_df['ytd_rushingShareAdj'] = game_df.ytd_rushingShareByPositionRank.fillna(game_df.ytd_rushingShare)

game_df.info()

In [None]:
# The new method shows some improvement over the ytd baseline for running backs.
# Each printed number is the mean absolute error of one estimate against the
# realised rushingShare.

rb_with_override = (game_df.season.isin([2018, 2019]) & game_df.positionid.isin([9]) &
                    game_df.isActive & game_df.exp_rushingShare.notnull())

# 1) only the RB games that received a rank-based adjustment
adjusted_games = game_df[rb_with_override & game_df.ytd_rushingShareByPositionRank.notnull()]
for template, estimate in [('{:.3f}', 'exp_rushingShare'),
                           ('{:.3f}', 'ytd_rushingShare'),
                           ('{:.3f}\n', 'ytd_rushingShareByPositionRank')]:
    print(template.format((adjusted_games.rushingShare - adjusted_games[estimate]).abs().mean()))


# 2) every active RB game that has an override value
all_games = game_df[rb_with_override]
for estimate in ['exp_rushingShare', 'ytd_rushingShare', 'ytd_rushingShareAdj']:
    print('{:.3f}'.format((all_games.rushingShare - all_games[estimate]).abs().mean()))

## 0.7 Prepare baseline data

In [None]:
# Baseline = each player's last ytd row of a season, attached to the following season.
# Note: every year many new players join the league without baseline info,
#       and many players retire so their baseline is never used.

baseline_names = {
    'ytd_totalRushingAttempts': 'base_totalRushingAttempts',
    'ytd_rushertotalrushingattempts': 'base_rushertotalrushingattempts',
    'ytd_rushingShare': 'base_rushingShare',
    'ytd_rushingShareAdj': 'base_rushingShareAdj',
}

baseline_df = (
    game_df[['season', 'playerid', *baseline_names]]
    .groupby(['season', 'playerid'])
    .tail(1)
    .rename(columns=baseline_names)
    .assign(season=lambda last_rows: last_rows.season + 1)
)

# Merge the baseline into game_df; 2017 has no prior season, so it gets no baseline.

game_df = game_df.merge(baseline_df, how='left', on=['season', 'playerid'])
game_df.info()

In [None]:
# Keep the two seasons that have override data. The explicit copy makes the
# column assignment below act on an independent frame rather than on a slice
# of the old one (that warning is globally silenced in this notebook).
game_df = game_df[game_df.season.isin([2018, 2019])].copy()

# Players without a previous-season baseline get zeros.
base_cols = ['base_rushingShare', 'base_rushingShareAdj',
             'base_totalRushingAttempts', 'base_rushertotalrushingattempts']
game_df[base_cols] = game_df[base_cols].fillna(0)

game_df.info()

## 0.8 weighted ytd data

In [None]:
# Blend the current-season ytd share with the previous-season baseline.
# The weight on the ytd share grows with the ytd team attempts, scaled by alpha;
# note that the NA fill happened after the shares themselves were calculated.

alpha = 5.0

scaled_ytd_attempts = game_df.ytd_totalRushingAttempts * alpha
w = scaled_ytd_attempts / (scaled_ytd_attempts + game_df.base_totalRushingAttempts)

# No attempts in either season gives 0/0; put the full weight on the ytd share.
no_history = game_df.ytd_totalRushingAttempts.eq(0) & game_df.base_totalRushingAttempts.eq(0)
w = w.mask(no_history, 1.0)

game_df['w_rushingShareAdj'] = w * game_df.ytd_rushingShareAdj + (1 - w) * game_df.base_rushingShareAdj

game_df.info()

## 0.9 Share Normalization

In [None]:
# Rush share by position id, per team and week:
#   teamPositionRushShare   - the position's carries / the team's carries in that week
#   m_teamPositionRushShare - expanding mean of teamPositionRushShare over the
#                             earlier weeks of the same season/team/position
# Author's note: there seems to be some issue with this algo.
# It should be: the average of team's rush share for RB / the sum of ytd_rushingShareAdj for active RB

position_keys = ['season', 'teamid', 'positionid']

by_week = game_df.groupby(position_keys + ['week'])
weekly_share = (by_week.rushertotalrushingattempts.sum() /
                by_week.totalrushingattempts.median()).rename('teamPositionRushShare')

# Expanding mean followed by a one-week shift, both inside each group, so a
# week only sees the weeks before it.
prior_mean = weekly_share.groupby(level=position_keys).transform(
    lambda weekly: weekly.expanding().mean().shift(1))

# reset_index() returns a new frame; its result must be kept for the keys to
# become ordinary columns.
team_pos_share = pd.DataFrame({
    'teamPositionRushShare': weekly_share,
    'm_teamPositionRushShare': prior_mean,
}).reset_index()

game_df = pd.merge(game_df, team_pos_share, on=['season','teamid','week','positionid'], how='left')

game_df.info()

In [None]:
# Normalised w_rushingShareAdj, only for RB.
#
# The previous scale factor was teamPositionRushShare / m_teamPositionRushShare.
# teamPositionRushShare is built from the carries of the very game being
# predicted, so the realised outcome leaked into this feature.
# The scale now follows the formula noted in the cell that builds
# m_teamPositionRushShare: the team's average RB share over earlier weeks
# divided by the summed projected share of the RBs active in the game.
# NOTE(review): the sum is taken over w_rushingShareAdj (the column being
# scaled) rather than ytd_rushingShareAdj - confirm this is the intended base.
game_df['w_rushingShareAdj_norm'] = game_df.w_rushingShareAdj

active_rb = (game_df.positionid == 9) & (game_df.isActive)
rb_rows = game_df[active_rb]

projected_rb_total = rb_rows.groupby(['gamecode', 'teamid']).w_rushingShareAdj.transform('sum')
scale = rb_rows.m_teamPositionRushShare / projected_rb_total

# No prior weeks (NaN) or nothing projected (division by zero): leave the share unscaled.
scale = scale.where(np.isfinite(scale), 1.0)

game_df.loc[active_rb, 'w_rushingShareAdj_norm'] = rb_rows.w_rushingShareAdj * scale

game_df[game_df.positionid == 9].w_rushingShareAdj_norm.describe()

# 1. Feature preparation

In [None]:
# Keep a snapshot of the fully prepared frame so the filtering steps below can
# be re-run. The snapshot is taken only the first time this cell runs; later
# runs restore game_df from it. After re-running the data-preparation cells,
# run `del game_df_full` to refresh the snapshot.
if 'game_df_full' not in globals():
    game_df_full = game_df.copy()
game_df = game_df_full.copy()

In [None]:
# Keep only rows where the player is active, has an override prediction, the
# week is after week 1 and the player had at least one carry.
# Skipping the starting weeks and the zero-output games is very important!!!

usable_rows = (game_df.exp_rushingShare.notnull()
               & game_df.isActive
               & game_df.week.gt(1)
               & game_df.rushertotalrushingattempts.gt(0))  # & (game_df.positionid==9)

# The model treats week and ytd carries as continuous features.
game_df = game_df[usable_rows].astype({'week': 'float64',
                                       'ytd_rushertotalrushingattempts': 'float64'})

#game_df.info()
game_df.rushingShare.describe()

## 1.1 Feature transformation

In [None]:
# Column overview (dtypes, non-null counts) of the filtered frame before feature transformation.
game_df.info()

In [None]:
num_fields = [
                'w_rushingShareAdj_norm',
                'prev_rushingShare',
                'ytd_rushertotalrushingattempts',
                'week'
             ]

cat_fields = [
                'eventType',
                'positionid',
             ]

label = game_df.rushingShare


def _one_hot_feature_names(encoder):
    """Return the fitted encoder's output column names as a list.

    Uses get_feature_names_out where the installed scikit-learn provides it
    and falls back to the older get_feature_names otherwise.
    """
    getter = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    return list(getter())


# StandardScaler version
transform_pipeline = ColumnTransformer(transformers=[
                                            ('num', StandardScaler(), num_fields),
                                            ('cat', OneHotEncoder(categories='auto'), cat_fields)
                                        ])
features_transformed = transform_pipeline.fit_transform(game_df)


# Non-StandardScaler version
transform_pipeline_2 = ColumnTransformer(transformers=[
                                            ('num', 'passthrough', num_fields),
                                            ('cat', OneHotEncoder(categories='auto'), cat_fields)
                                        ])
features_transformed_2 = transform_pipeline_2.fit_transform(game_df)

# Build a NEW list: `feature_names = num_fields` followed by .extend() would
# append the one-hot names to num_fields itself and corrupt it for later cells.
feature_names = list(num_fields) + _one_hot_feature_names(transform_pipeline_2.named_transformers_.cat)

# fit_transform may return a sparse matrix; densify before wrapping in a DataFrame.
if hasattr(features_transformed_2, 'toarray'):
    features_transformed_2 = features_transformed_2.toarray()
features_transformed_2 = pd.DataFrame(features_transformed_2, columns=feature_names)

cat_one_hot_fields = _one_hot_feature_names(transform_pipeline.named_transformers_.cat)
print(cat_one_hot_fields)
pd.DataFrame(features_transformed).info()

# 2. Model study

### 2.1 Benchmark model study

In [None]:
# The benchmark summaries are printed by the next cell, directly after
# printBenchmarkModelPerformance is defined. Calling the function here, before
# its definition, raises NameError on a fresh kernel (Restart & Run All).

In [None]:
def printBenchmarkModelPerformance(data, positionId):
    """Print MAE and R2 of the benchmark estimates against the actual rushing share.

    Parameters
    ----------
    data : DataFrame
        Needs the columns positionid, rushingShare, exp_rushingShare,
        ytd_rushingShare, w_rushingShareAdj and w_rushingShareAdj_norm.
    positionId : list
        Position ids to include in the evaluation.

    Returns
    -------
    tuple
        Always the empty tuple; the results are only printed.
    """
    subset = data[data.positionid.isin(positionId)].copy()

    print(subset.rushingShare.describe(), '\n')

    def _mae_and_r2(estimate_column):
        # residuals of one estimate against the realised share
        residual = subset.rushingShare - subset[estimate_column]
        total_ss = sum((subset.rushingShare - np.mean(subset.rushingShare))**2)
        return abs(residual).mean(), 1 - sum(residual**2)/total_ss

    print("                    {}     {}".format('MAE', 'R2') )

    # (line template, column holding the estimate), in print order
    report_lines = (
        ("Override model:     {:.4f}  {:.1%}", 'exp_rushingShare'),
        ("ytd model:          {:.4f}  {:.1%}", 'ytd_rushingShare'),
        ("Weighted ytd model: {:.4f}, {:.1%}", 'w_rushingShareAdj'),
        ("W/N ytd model:      {:.4f}, {:.1%}", 'w_rushingShareAdj_norm'),
    )
    for template, estimate_column in report_lines:
        print(template.format(*_mae_and_r2(estimate_column)))

    return ()

# Benchmark report for all modelled positions together and for each position
# alone (1: WR, 8: QB, 9: RB). Driving the calls from one table replaces four
# copy-pasted call pairs and keeps the function's empty-tuple return value
# from showing up as the cell output.
benchmark_groups = [
    ("Rush share summary:", [1, 8, 9]),
    ("\nRush share summary for WRs:", [1]),
    ("\nRush share summary for QBs:", [8]),
    ("\nRush share summary for RBs:", [9]),
]

for heading, position_ids in benchmark_groups:
    print(heading)
    printBenchmarkModelPerformance(game_df, position_ids)

#game_df.rushingShare.describe()

### 2.2 Machine learning model study

In [None]:
# instantiate models
MAE = make_scorer(mean_absolute_error)  # used for reporting only
folds = 5

# random_state is fixed on every estimator that accepts one, so repeated runs
# report the same cross-validation scores.
model_linear = SGDRegressor(max_iter=10000, tol=1e-4, random_state=0)

model_svr = LinearSVR(random_state=0, tol=1e-4, max_iter=100000)

model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=0)

In [None]:
print('                   MAE     R2' )

# (report label, estimator, feature matrix). The random forest is fed the
# unscaled feature frame; the two linear models use the standardised matrix.
cv_candidates = [
    ('Linear regression:', model_linear, features_transformed),
    ('RF regression:    ', model_rf, features_transformed_2),
    ('SV regression:    ', model_svr, features_transformed),
]

# One cross_validate call scores both metrics on the same fitted folds, so each
# model is trained once per fold instead of once per fold and metric.
cv_results = {}
for report_label, model, feature_matrix in cv_candidates:
    scores = cross_validate(model,
                            feature_matrix,
                            label,
                            cv=folds,
                            scoring={'mae': MAE, 'r2': 'r2'})
    cv_results[report_label.rstrip(': ')] = scores
    print('{} {:.4f}  {:.1%}'.format(report_label,
                                     np.mean(scores['test_mae']),
                                     np.mean(scores['test_r2'])))

In [None]:
# feature importance study

regr = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=0)
regr.fit(features_transformed, label)

# Prefer get_feature_names_out where the installed scikit-learn provides it,
# otherwise fall back to the older get_feature_names.
encoder = transform_pipeline.named_transformers_.cat
names_getter = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
cat_one_hot_fields = list(names_getter())

# The transformer emits the numeric columns first, then the one-hot columns.
# Only the leading numeric names are taken from num_fields, so the labels stay
# aligned with the importances even if num_fields has had names appended to it.
n_numeric = len(regr.feature_importances_) - len(cat_one_hot_fields)
feature_labels = list(num_fields)[:n_numeric] + cat_one_hot_fields

# Column-wise construction raises on a length mismatch instead of padding with NaN.
feature_score = pd.DataFrame({'feature': feature_labels,
                              'importance': regr.feature_importances_})
feature_score.sort_values(by='importance',ascending=False).reset_index(drop=True)

In [None]:
# linear regression model interpretation
# (statsmodels is imported as `sm` together with the other imports at the top of the notebook)

# Ordinary least squares on the unscaled feature frame; rows containing
# missing values are dropped by statsmodels (missing='drop').
mod = sm.OLS(np.array(label), features_transformed_2, missing='drop')
res = mod.fit()

# in-sample mean absolute error of the fit
mae = np.abs(res.resid).mean()

print('{:.3f}'.format(mae) )
res.summary()

In [None]:
# OLS residuals in row order; labelled so the figure can be read on its own.
ax = res.resid.plot()
ax.set(title='OLS residuals', xlabel='observation', ylabel='residual');

In [None]:
# Fitted vs. actual rushing share; the diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=[9,9])
ax.plot(game_df.rushingShare, res.fittedvalues, 'o')
ax.set_xlabel('rushing share')
ax.set_xlim(0.0, 1.05)
ax.set_ylabel('prediction')
ax.set_ylim(0.0, 1.05)
ax.plot([0,1], [0,1])
plt.show()

In [None]:
# Store the OLS fitted values next to the inputs and write everything to CSV.
game_df = game_df.assign(fitted_rushingShare=res.fittedvalues.values)
game_df.to_csv("rush_share_modeling_results.csv")
game_df.info()

# 3. Large error analysis

In [None]:
# 2019 rows where the actual share exceeds the weighted ytd estimate by more than 0.4.
large_error = (game_df.rushingShare - game_df.w_rushingShareAdj > 0.4) & (game_df.season == 2019)

# A bare expression in the middle of a cell is discarded, so print the count explicitly.
print(large_error.sum())
game_df[large_error]