In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyathena import connect
from pyathena.pandas_cursor import PandasCursor

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import xgboost as xgb

# Pandas session options: silence the chained-assignment warning (pandas default
# is 'warn') and show up to 30 columns when a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 30)

# Open the Athena connection. Every query below runs through `cursor`, and the
# PandasCursor class is what provides `.as_pandas()` on the executed query.
conn = connect(cursor_class=PandasCursor,
               region_name='us-east-1',
               s3_staging_dir='s3://aws-athena-query-results-323906537337-us-east-1/')
cursor = conn.cursor()

In [None]:
# # thoughts

# Only the 2018~2019 seasons contain override values, so only those seasons can be used for validation.
# More seasons (without validation data) could still be considered for building the model.

# 0. Data

## 0.1 Game data

In [2]:
# Player-level game stats for WR/TE/RB since the 2017 season, joined to the
# team's true pass attempts in the same game (the `teamquery` CTE).
#   targetShare : player targets / team true pass attempts (NULL when the team has 0 attempts)
#   Rank        : row_number within (season, team, week, position), most targets first
simple_query = '''
with teamquery as
(
select season,
       eventmetadata.week week,
       eventmetadata.gameCode gamecode,  
       teamid, 
       totaltruepassattempts
from datalakefootball.team_aggregated_game_stats
where season>='2017' and eventmetadata.eventtypeid in (1,2)
order by season, week, gamecode, teamid
)

select
    cast(t1.season as integer) season,
    t1.eventmetadata.week week,
    t1.eventmetadata.gamecode gamecode,
    t1.eventmetadata.eventtypeid eventType,
    t1.teamid teamid,
    t1.player.playerid,
    t1.player.positionid,
    t1.receivertotaltargetsontruepassattempts,
    t2.totaltruepassattempts,
    case when t2.totaltruepassattempts = 0 
        then null
        else t1.receivertotaltargetsontruepassattempts / cast (t2.totaltruepassattempts as double)
        end as targetShare,
    row_number() over (PARTITION BY
                         t1.season, t1.teamid, t1.eventmetadata.week, t1.player.positionid
                     ORDER BY
                        t1.receivertotaltargetsontruepassattempts DESC) Rank
from
    datalakefootball.player_aggregated_game_stats as t1
    left join
    teamquery t2
    on t1.season = t2.season and 
       t1.eventmetadata.gamecode = t2.gamecode and 
       t1.teamid = t2.teamid
where t1.season >= '2017' and 
     t1.eventmetadata.eventtypeid in (1,2) and
     t1.player.positionid in (1,7,9) -- 1:WR, 7:TE, 9:RB
order by season, week, gamecode, teamid
'''

# Run the query directly so that a failure raises here. The previous
# `if True: ... else: print("Failed to query!")` could never reach its else branch.
game_df = cursor.execute(simple_query).as_pandas()
game_df.info()  # info() prints on its own; wrapping it in print() also printed "None"

# still some issue with missing data in "totaltruepassattempts"
print(game_df.totaltruepassattempts.describe())
game_df.targetShare.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18855 entries, 0 to 18854
Data columns (total 11 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   season                                  18855 non-null  Int64  
 1   week                                    18855 non-null  Int64  
 2   gamecode                                18855 non-null  Int64  
 3   eventType                               18855 non-null  Int64  
 4   teamid                                  18855 non-null  Int64  
 5   playerid                                18855 non-null  Int64  
 6   positionid                              18855 non-null  Int64  
 7   receivertotaltargetsontruepassattempts  18855 non-null  Int64  
 8   totaltruepassattempts                   18855 non-null  Int64  
 9   targetShare                             18834 non-null  float64
 10  Rank                                    18855 non-null  In

count    18834.000000
mean         0.082664
std          0.091931
min          0.000000
25%          0.000000
50%          0.052632
75%          0.136364
max          0.583333
Name: targetShare, dtype: float64

In [None]:
# Preview the game-level frame (not displayed, since it is not the last
# expression), then total the 2019 targets of team 323 as a spot check.
game_df.head()
id = (game_df.teamid==323) & (game_df.season==2019)
game_df.loc[id, 'receivertotaltargetsontruepassattempts'].sum()

In [None]:
# Median of totaltruepassattempts over the player-game rows of each (season, week).
gd = game_df.loc[:, ['season','week','totaltruepassattempts']].groupby(['season','week'])
tmp = gd.median().reset_index()
tmp

In [None]:
# Cross-check: sum of true pass attempts for team 323 over the 2019 season,
# taken straight from the team-level table.
simple_query = '''
select -- season,
       -- eventmetadata.week week,
       -- eventmetadata.gameCode gamecode,  
       -- teamid, 
       sum(totaltruepassattempts)
from datalakefootball.team_aggregated_game_stats
where season='2019' and eventmetadata.eventtypeid in (1,2) and
    teamid = 323
-- order by season, week, gamecode, teamid
'''

executed = cursor.execute(simple_query)
team_df = executed.as_pandas()

team_df

## 0.2 Expected data

In [3]:
# we only have 2 years data 2018 ~ 2019 for research purpose

# Expected target share ('override' version) per player-game for WR/TE/RB.
simple_query = '''
select 
    eventmetadata.gamecode,
    player.playerid,
    targetpercentage exp_targetShare
from datalakefootball.player_expected_rates
where 
    season >= '2017' and 
    eventmetadata.eventtypeid in (1,2) and
    player.positionid in (1,7,9) and version='override'
order by season
'''

# Run the query directly so that a failure raises here; the former
# `if True: ... else:` wrapper had an unreachable else branch.
exp_df = cursor.execute(simple_query).as_pandas()
exp_df.info()  # info() prints on its own; print(exp_df.info()) also printed "None"

# merge expected value (override version) into player game stats
# Note that 2018 season missing about 20% records as well!
# when we do model calibration, only the data entry with exp_values should be used!

game_df = pd.merge(game_df, exp_df, on=['gamecode','playerid'], how='left')
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16687 entries, 0 to 16686
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gamecode         16687 non-null  Int64  
 1   playerid         16687 non-null  Int64  
 2   exp_targetShare  16687 non-null  float64
dtypes: Int64(2), float64(1)
memory usage: 423.8 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18855 entries, 0 to 18854
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   season                                  18855 non-null  Int64  
 1   week                                    18855 non-null  Int64  
 2   gamecode                                18855 non-null  Int64  
 3   eventType                               18855 non-null  Int64  
 4   teamid                                  18855 non-null  Int64  
 5   playerid

In [None]:
# Preview the first rows of the expected-rate data.
exp_df.head(n=5)

## 0.3 Ytd data

In [12]:
# Year-to-date (ytd) stats per player-game for WR/TE/RB, joined to the team's
# true pass attempts in that game (the `teamquery` CTE). ytd_rank is the
# row_number within (season, team, week, position), most ytd targets first.
simple_query = '''
with teamquery as
(
select eventmetadata.gameCode gamecode,  
       teamid, 
       totaltruepassattempts game_totalTruePassAttempts
from datalakefootball.team_aggregated_game_stats
where season>='2017' and eventmetadata.eventtypeid in (1,2)
)

select 
  cast(t1.season as integer) season, 
  t1.eventmetadata.week, 
  t1.eventmetadata.gamecode, 
  t1.teamid,
  t1.player.playerid,
  t1.player.positionid,
  t1.onfieldtotaltruepassattempts ytd_onFieldTotalTruePassAttempts,
  t1.receivertotaltargetsontruepassattempts ytd_totalTargetsOnTruePassAttempts,
  t2.game_totaltruepassattempts,
  row_number() over (PARTITION BY
                         t1.season, t1.teamid, t1.eventmetadata.week, t1.player.positionid
                     ORDER BY
                         t1.receivertotaltargetsontruepassattempts DESC) ytd_rank
from datalakefootball.player_aggregated_ytd_stats t1
    left join teamquery t2
    on t1.eventmetadata.gamecode = t2.gamecode and 
       t1.teamid=t2.teamid
where 
    t1.season >='2017' and 
    t1.player.positionid in (1,7,9) and 
    t1.eventmetadata.eventtypeid in (1,2) and
    t1.eventmetadata.week is not null -- when a week is missing, the player may not be active
order by season, week, gamecode, teamid
'''

# Run the query directly so that a failure raises here; the former
# `if True: ... else:` wrapper had an unreachable else branch.
ytd_df = cursor.execute(simple_query).as_pandas()

# prepare ytd_truepassattempts for each player. in this way, no update for missed games
# Step 1: running total of the team's pass attempts over the rows of each (season, player).
gd = ytd_df.groupby(['season','playerid'])
ytd_df['ytd_totaltruepassattempts'] = gd['game_totaltruepassattempts'].cumsum()

# Step 2: shift by one row within each (season, player) so the total excludes the
# current game. The groupby is rebuilt so it is created after the column exists,
# rather than reusing an object made before the column was added.
gd = ytd_df.groupby(['season','playerid'])
ytd_df['ytd_totaltruepassattempts'] = gd['ytd_totaltruepassattempts'].shift(1)

# NOTE(review): rows whose ytd total is 0 have no defined share here - confirm how
# they should be treated downstream.
ytd_df['ytd_targetShare'] = ytd_df.ytd_totalTargetsOnTruePassAttempts / ytd_df.ytd_totaltruepassattempts
ytd_df.info()  # info() prints on its own; print(ytd_df.info()) also printed "None"




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18855 entries, 0 to 18854
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   season                              18855 non-null  Int64  
 1   week                                18855 non-null  Int64  
 2   gamecode                            18855 non-null  Int64  
 3   teamid                              18855 non-null  Int64  
 4   playerid                            18855 non-null  Int64  
 5   positionid                          18855 non-null  Int64  
 6   ytd_onFieldTotalTruePassAttempts    17212 non-null  Int64  
 7   ytd_totalTargetsOnTruePassAttempts  17212 non-null  Int64  
 8   game_totaltruepassattempts          18855 non-null  Int64  
 9   ytd_rank                            18855 non-null  Int64  
 10  ytd_totaltruepassattempts           17212 non-null  Int64  
 11  ytd_targetShare                     17211

In [16]:
# Count the rows whose ytd pass-attempt total is exactly zero; that total is the
# denominator of ytd_targetShare. (The dangling `|` that followed this condition
# was a syntax error and has been removed.)
id = (ytd_df.ytd_totaltruepassattempts == 0)
np.sum(id)

1

In [11]:
# Left-join the year-to-date columns onto the game rows by (gamecode, playerid).
game_df = game_df.merge(
    ytd_df.loc[:, ['gamecode','playerid',
                   'ytd_rank',
                   'ytd_onFieldTotalTruePassAttempts',
                   'ytd_totalTargetsOnTruePassAttempts',
                   'ytd_totaltruepassattempts',
                   'ytd_targetShare']],
    how='left',
    on=['gamecode','playerid'],
)
game_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18855 entries, 0 to 18854
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   season                                  18855 non-null  Int64  
 1   week                                    18855 non-null  Int64  
 2   gamecode                                18855 non-null  Int64  
 3   eventType                               18855 non-null  Int64  
 4   teamid                                  18855 non-null  Int64  
 5   playerid                                18855 non-null  Int64  
 6   positionid                              18855 non-null  Int64  
 7   receivertotaltargetsontruepassattempts  18855 non-null  Int64  
 8   totaltruepassattempts                   18855 non-null  Int64  
 9   targetShare                             18834 non-null  float64
 10  Rank                                    18855 non-null  In

## 0.4 Prepare ytd data by position rank

In [None]:
# create ytd targetShare by position rank
# only ytd data is used, no baseline information is used!

rank_df = game_df.loc[:, ['season','week','gamecode','teamid','playerid','positionid',
                          'receivertotaltargetsontruepassattempts','totaltruepassattempts',
                          'Rank']].copy()

gd = rank_df.groupby(['season','teamid','positionid','Rank'])

# Running targets divided by running team pass attempts within each
# (season, team, position, Rank) group ...
rank_df['ytd_targetShareByPositionRank'] = (
    gd['receivertotaltargetsontruepassattempts'].cumsum()
    / gd['totaltruepassattempts'].cumsum()
)
# ... then moved down one row inside the group, so a row does not see its own game.
rank_df['ytd_targetShareByPositionRank'] = gd['ytd_targetShareByPositionRank'].shift(1)

rank_df.info()

## 0.5 Identify weekly injury

In [None]:
# identify weekly injury and adjust ytd_targetShare accordingly

# example:
# Calvin Ridley (884013), Atlanta Falcons (323), 2019 game 14 out -> what is the impact on the other WRs?
# 822014, previously ranked 3rd, now moves to #2 for games #14 ~ #17
## what if 884013 is not out right after game #14?
# the ytd for 822014 is much lower before game #14, so it would be better to use the byRank value instead
# for the rest of the season
# a step further is to use a combination of the reset ytd and the byRank value

In [None]:
# Function to calculate target share by rank for each position type when there is a roster change
# returns a list of DataFrames, each containing the targetShares of the players in a game where a roster change happens
# Note: using targetShare by position to estimate each player's performance introduces a bias!!!

def calculateTargetShareAdjByRank(game_df, positionIds, seasons, rank_df, printDetails=False):
    """Collect rank-based target-share values for games where a top-ranked player is absent.

    For every (season, team, position) combination the weeks are walked in the
    order they appear in ``game_df``. Targets are accumulated per player and the
    players are ranked by that running total. Whenever a player ranked 1 or 2
    has no row in the current week, the players who do have a row are re-ranked
    among themselves (``onFieldRank``) and ``ytd_targetShareByPositionRank`` is
    looked up in ``rank_df`` for that rank. The first week of each combination
    only seeds the running totals and never produces an adjustment.

    Parameters
    ----------
    game_df : DataFrame
        Read through the columns season, teamid, positionid, week, playerid,
        gamecode and receivertotaltargetsontruepassattempts.
    positionIds : iterable
        Position ids to process.
    seasons : iterable
        Seasons to process.
    rank_df : DataFrame
        Read through the columns teamid, gamecode, positionid, Rank and
        ytd_targetShareByPositionRank.
    printDetails : bool, default False
        When True, print (week, player, ytd_rank) for every missing top player.

    Returns
    -------
    list of DataFrame
        One frame per (season, team, position, week) in which a top-2 player is
        missing; each holds that week's rows plus ``onFieldRank``, ``Rank`` and
        ``ytd_targetShareByPositionRank``. Empty if no such week exists.
    """

    adjustedRates = []
    
    teams = game_df.teamid.unique()

    for season in seasons:
        for team in teams:
            for positionId in positionIds:
                #print(team)

                # rows of one team / season / position (note: `id` shadows the builtin)
                id = (game_df.season==season) & (game_df.teamid==team) & (game_df.positionid==positionId)
                one_team = game_df[id]

                #print(one_team.shape)

                ranking_data = []

                for i,week in enumerate(one_team.week.unique()):
                    id = one_team.week == week

                    if i == 0:
                        # first week: seed the running target totals; -1 is a placeholder rank
                        ranking_data = one_team.loc[id, ['receivertotaltargetsontruepassattempts','playerid']].copy()
                        ranking_data['ytd_rank'] = -1
                        ranking_data.set_index('playerid', inplace=True)
                        continue

                    data = one_team.loc[id, ['teamid','gamecode','positionid','playerid',
                                        'receivertotaltargetsontruepassattempts']].copy()
                    data.set_index('playerid', inplace=True)

                    # add according to playerid index
                    # DataFrame.add aligns on index and on column names, so teamid/gamecode/positionid
                    # are summed into ranking_data as well; only the target total and ytd_rank are read later.
                    # NOTE(review): the running totals now include this week's targets, so the ranks
                    # below are not purely "entering the game" - confirm this is intended.
                    ranking_data = ranking_data.add(data, fill_value=0)
                    ranking_data.sort_values('receivertotaltargetsontruepassattempts', inplace=True, ascending=False)
                    ranking_data.loc[:,'ytd_rank'] = np.arange(len(ranking_data)) + 1

                    current_week_data = one_team[one_team.week == week]

                    # check whether any player ranked 1 or 2 has no row this week
                    missingPlayers = [player for player in ranking_data.index.values
                                     if player not in current_week_data.playerid.values and
                                        ranking_data.loc[player].ytd_rank <= 2]

                    if missingPlayers:
                        if printDetails:
                            for player in missingPlayers:
                                print(week, player, ranking_data.loc[player].ytd_rank)

                        # re-arrange ranks of active players to reflect currently predicted rank
                        # (ties keep their existing order because of method='first')
                        data['onFieldRank'] = ranking_data.loc[data.index].ytd_rank.\
                                                rank(method='first', na_option='bottom')
                        data = data.astype({'onFieldRank':'int64'})
                        data.reset_index(inplace=True)

                        # merge target%_by_rank into data: onFieldRank is matched against rank_df.Rank
                        data = pd.merge(data, 
                                        rank_df[['teamid','gamecode','positionid','Rank',
                                                 'ytd_targetShareByPositionRank']], 
                                        left_on=['teamid','gamecode','positionid','onFieldRank'],
                                        right_on=['teamid','gamecode','positionid','Rank'], how='left')
                        
                        # adjustment
                        #data.ytd_targetShareByPositionRank = data.ytd_targetShareByPositionRank * 0.9

                        adjustedRates.append(data)

    return(adjustedRates)

In [None]:
# Collect the rank-based adjustments for every season / team / position.
# rank_df is a required parameter of calculateTargetShareAdjByRank; leaving it
# out raises a TypeError, so it is passed explicitly here.
re = calculateTargetShareAdjByRank(game_df, positionIds=[1,7,9], seasons=[2017, 2018,2019],
                                   rank_df=rank_df)

# Stack the per-game frames into a single frame with a fresh index.
adjustedRates = pd.concat(re, ignore_index=True)

In [None]:
# Left-join the rank-based adjustments onto the game rows; rows without an
# adjustment keep missing values in the two new columns.
game_df = game_df.merge(
    adjustedRates.loc[:, ['teamid','gamecode','playerid','onFieldRank','ytd_targetShareByPositionRank']],
    how='left',
    on=['teamid','gamecode','playerid'],
)
game_df.shape

In [None]:
# we create a new column 'ytd_targetShareAdj' column to contain ytd data with adjustment by injury situation
game_df['ytd_targetShareAdj'] = game_df.ytd_targetShare

# Where a rank-based value exists it replaces the plain ytd value.
# A single .loc assignment is used because the chained form
# `game_df.ytd_targetShareAdj[mask] = ...` may write to a temporary copy and
# leave game_df unchanged (the warning for that is disabled in this notebook).
id = game_df.ytd_targetShareByPositionRank.isnull()
game_df.loc[~id, 'ytd_targetShareAdj'] = game_df.loc[~id, 'ytd_targetShareByPositionRank']

game_df.shape

In [None]:
# new method has some improvement from ytd baseline

id1 = game_df.season.isin([2018,2019]) & game_df.positionid.isin([1,7,9])
id2 = game_df.ytd_targetShareByPositionRank.isnull()

# Mean absolute error against the realised target share, restricted to the
# rows that received a rank-based adjustment.
adjusted_rows = game_df[id1 & ~id2]

print((adjusted_rows.targetShare - adjusted_rows.exp_targetShare).abs().mean())

print((adjusted_rows.targetShare - adjusted_rows.ytd_targetShare).abs().mean())

print((adjusted_rows.targetShare - adjusted_rows.ytd_targetShareByPositionRank).abs().mean(),'\n')

# Same comparison over every 2018-2019 row that has both a ytd and an adjusted ytd value.
id = game_df.ytd_targetShare.notnull() & game_df.ytd_targetShareAdj.notnull() & \
    game_df.season.isin([2018,2019]) & game_df.positionid.isin([1,7,9])

print((game_df.targetShare - game_df.exp_targetShare)[id].abs().mean())

print((game_df.targetShare - game_df.ytd_targetShare)[id].abs().mean())

print((game_df.targetShare - game_df.ytd_targetShareAdj)[id].abs().mean())

## 0.6 Prepare baseline data

In [None]:
# create baseline case from ytd data

# The last row of every (season, player) group supplies the baseline values; the
# ytd_* columns are renamed to base_* and the season is moved forward by one so
# that the values line up with the following season.
baseline_df = (
    game_df[['season','playerid','ytd_onFieldTotalTruePassAttempts',
             'ytd_totalTargetsOnTruePassAttempts','ytd_totaltruepassattempts',
             'ytd_targetShare','ytd_targetShareAdj']]
    .copy()
    .groupby(['season','playerid'])
    .tail(1)
    .rename(columns={'ytd_onFieldTotalTruePassAttempts':'base_onFieldTotalTruePassAttempts',
                     'ytd_totalTargetsOnTruePassAttempts':'base_totalTargetsOnTruePassAttempts',
                     'ytd_totaltruepassattempts':'base_totaltruepassattempts',
                     'ytd_targetShare':'base_targetShare',
                     'ytd_targetShareAdj':'base_targetShareAdj'})
)
baseline_df['season'] = baseline_df['season'] + 1

# merge baseline info into game_df, in this case, we will lose 2017

game_df = game_df.merge(baseline_df, how='left', on=['season','playerid'])
game_df.info()

In [None]:
# Keep only the rows of the 2018 and 2019 seasons.
id = game_df['season'].isin([2018, 2019])
game_df = game_df.loc[id]
game_df.shape

# 1. Feature preparation

In [None]:
# Snapshot of game_df so the following steps can be re-run from this point;
# restore it with `game_df = tmp.copy()`.
tmp = game_df.copy(deep=True)

In [None]:
# Rows without a rank-based value get the sentinel -1; every other missing value
# becomes 0. The results are assigned back explicitly: fillna(inplace=True) on an
# attribute-accessed column may act on a temporary and leave game_df unchanged.
game_df['ytd_targetShareByPositionRank'] = game_df['ytd_targetShareByPositionRank'].fillna(-1)

game_df = game_df.fillna(0)

game_df.info()  # info() prints on its own; print(game_df.info()) also printed "None"

In [None]:
# Weighted historical values
# note the fill of na is after the target percentage has been calculated

alpha = 5.0

# Weight of the current-season value: alpha-scaled ytd on-field attempts relative
# to that same quantity plus the baseline on-field attempts.
scaled_ytd_attempts = game_df.ytd_onFieldTotalTruePassAttempts * alpha
w = scaled_ytd_attempts / (scaled_ytd_attempts + game_df.base_onFieldTotalTruePassAttempts)

# When both attempt counts are zero the ratio is 0/0; use full weight on the ytd value.
id = (game_df.base_onFieldTotalTruePassAttempts==0) & (game_df.ytd_onFieldTotalTruePassAttempts==0)
w[id] = 1.0

game_df['w_targetShareAdj'] = game_df.ytd_targetShareAdj * w + game_df.base_targetShare * (1-w)

In [None]:
# Number of rows that still contain a missing value (evaluated, but only the
# shape on the last line is displayed).
game_df.isna().any(axis=1).sum()
game_df.shape

In [None]:
# Seasons remaining in the modelling frame.
game_df['season'].unique()

## 1.1 Feature transformation

In [None]:
# Model inputs: two numeric features passed through unchanged plus the one-hot
# encoded position id. The label is the realised target share.
game_df['ytd_rank'] = game_df['ytd_rank'].astype('float64')

num_fields = [
                'w_targetShareAdj',
                'ytd_rank'
             ]

cat_fields = [
                #'eventType',
                #'teamid',
                'positionid',
             ]

label = game_df.targetShare

# First pipeline. StandardScaler is currently disabled (commented out), so this
# pipeline and transform_pipeline_2 below are configured identically.
transform_pipeline = ColumnTransformer(transformers=[
                                            #('num', StandardScaler(), num_fields),
                                            ('num', 'passthrough', num_fields),
                                            ('cat', OneHotEncoder(categories='auto'), cat_fields)
                                        ])
features_transformed = transform_pipeline.fit_transform(game_df)


# Second pipeline (no scaling); its output is wrapped in a named DataFrame below.
transform_pipeline_2 = ColumnTransformer(transformers=[
                                            #('num', StandardScaler(), num_fields),
                                            ('num', 'passthrough', num_fields),
                                            ('cat', OneHotEncoder(categories='auto'), cat_fields)
                                        ])
features_transformed_2 = transform_pipeline_2.fit_transform(game_df)

# Names of the one-hot columns. get_feature_names_out is used when the installed
# scikit-learn provides it; otherwise fall back to get_feature_names.
cat_encoder = transform_pipeline_2.named_transformers_.cat
if hasattr(cat_encoder, 'get_feature_names_out'):
    cat_feature_names = cat_encoder.get_feature_names_out(cat_fields)
else:
    cat_feature_names = cat_encoder.get_feature_names(input_features=cat_fields)

# Build a new list instead of extending num_fields in place: both
# ColumnTransformers above were given that same list object, so mutating it
# would also change the column lists they hold.
feature_names = list(num_fields) + list(cat_feature_names)
print(feature_names)

# fit_transform may return a dense array or a sparse matrix; densify if needed.
if isinstance(features_transformed_2, np.ndarray):
    features_transformed_2 = pd.DataFrame(features_transformed_2, columns=feature_names)
else:
    features_transformed_2 = pd.DataFrame(features_transformed_2.toarray(), columns=feature_names)

pd.DataFrame(features_transformed).info()

# 2. Model study

In [None]:
def printBenchmarkModelPerformance(data, positionId):
    """Print MAE and R2 of three benchmark target-share estimates.

    The rows of ``data`` whose ``positionid`` is in ``positionId`` are kept.
    After a ``describe()`` of the realised ``targetShare``, one line is printed
    for each of ``exp_targetShare`` (override model), ``ytd_targetShare`` and
    ``w_targetShareAdj`` (weighted ytd model).

    Returns an empty tuple.
    """
    subset = data[data.positionid.isin(positionId)].copy()
    actual = subset.targetShare

    def score(column):
        # (MAE, R2) of `column` measured against the realised target share
        residual = actual - subset[column]
        r2 = 1 - sum(residual**2)/sum((actual - np.mean(actual))**2)
        return abs(residual).mean(), r2

    print(actual.describe(), '\n')

    override_score = score('exp_targetShare')
    print("                    {}     {}".format('MAE', 'R2') )
    print("Override model:     {:.4f}  {:.1%}".format(*override_score) )

    print("ytd model:          {:.4f}  {:.1%}".format(*score('ytd_targetShare')) )

    print("Weighted ytd model: {:.4f}, {:.1%}".format(*score('w_targetShareAdj')) )

    return ()

# Benchmark summary for all three positions together, then for WRs and TEs.
for heading, position_ids in [("Target share summary:", [1,7,9]),
                              ("\nTarget share summary for WRs:", [1]),
                              ("\nTarget share summary for TEs:", [7])]:
    print(heading)
    printBenchmarkModelPerformance(game_df, position_ids)

# The RB summary stays the final expression of the cell, as before.
print("\nTarget share summary for RBs:")
printBenchmarkModelPerformance(game_df, [9])


In [None]:
# instantiate models
# MAE scorer and fold count shared by the cross-validation runs.
MAE = make_scorer(mean_absolute_error)
folds = 5

# random_state is fixed for SGDRegressor as well, so that all three models give
# repeatable results from run to run.
model_linear = SGDRegressor(max_iter=10000, tol=1e-4, random_state=42)

model_svr = LinearSVR(random_state=42, tol=1e-6, max_iter=10000)

model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=0)

In [None]:
# Cross-validated MAE and R2 for each model. For every model the MAE run is
# executed first and the R2 run second.
print('Mean absolute error:' )

MAE_linear, R2_linear = (
    cross_val_score(model_linear, features_transformed, label, cv=folds, scoring=metric)
    for metric in (MAE, 'r2')
)
print('Linear regression: {:.4f}  {:.1%}'.format(np.mean(MAE_linear), np.mean(R2_linear)))


MAE_rf, R2_rf = (
    cross_val_score(model_rf, features_transformed_2, label, cv=folds, scoring=metric)
    for metric in (MAE, 'r2')
)
print('RF regression:     {:.4f}  {:.1%}'.format(np.mean(MAE_rf), np.mean(R2_rf)))

MAE_svr, R2_svr = (
    cross_val_score(model_svr, features_transformed, label, cv=folds, scoring=metric)
    for metric in (MAE, 'r2')
)
print('SV regression:     {:.4f}  {:.1%}'.format(np.mean(MAE_svr), np.mean(R2_svr)))

In [None]:
# feature importance study

# Fit a random forest on all rows and list the features by importance, highest first.
regr = RandomForestRegressor(random_state=0, max_depth=20, n_estimators=200)
regr.fit(features_transformed, label)

cat_one_hot_fields = list(transform_pipeline.named_transformers_.cat.get_feature_names())
feature_score = pd.DataFrame([feature_names, regr.feature_importances_],
                             index=['feature','importance']).T
feature_score.sort_values(by='importance', ascending=False).reset_index(drop=True)

In [None]:
# linear regression model interpretation

import statsmodels.api as sm

# Ordinary least squares on the named feature frame; rows with missing values are dropped.
mod = sm.OLS(np.array(label), features_transformed_2, missing='drop')
res = mod.fit()

# In-sample mean absolute error of the OLS fit.
mae = np.mean(np.abs(res.resid))

print('{:.3f}'.format(mae) )
res.summary()

In [None]:
# Fit the linear SVR on all rows. `res` is rebound to the fitted estimator
# (scikit-learn's fit returns the estimator itself).
model_svr.fit(features_transformed, label)
res = model_svr

In [None]:
# Scatter of realised target share against the prediction of `res`, with the
# line through (0,0) and (1,1) as the perfect-prediction reference.
fig, ax = plt.subplots(figsize=(9, 9))
ax.plot(game_df.targetShare, res.predict(features_transformed), 'o')
ax.set_xlabel('target share')
ax.set_xlim(0.0, 0.65)
ax.set_ylabel('prediction')
ax.set_ylim(0.0, 0.65)
ax.plot( [0,1],[0,1] )
plt.show()

In [None]:
# Store the model predictions next to the inputs and export everything.
# `res` is rebound to the fitted LinearSVR by the SVR fit cell, and that estimator
# has no `fittedvalues` attribute, so the predictions are obtained with predict(),
# the same call the scatter plot uses.
game_df['fitted_targetShare'] = res.predict(features_transformed)
game_df.to_csv("target_share_modeling_results.csv")
game_df.info()

## 3. Large error analysis

In [None]:
# 2019 rows where the realised share exceeds the weighted estimate by more than 0.25.
id = (game_df.season == 2019) & ((game_df.targetShare - game_df.w_targetShareAdj) > 0.25)
id.sum()
game_df.loc[id]

In [None]:
# study how much normalization is needed for target shares (sum of all target shares should be near 100%)