In [None]:
# these are common imports for every notebook
# pandas and numpy are for analysis
# matplotlib and seaborn are for visualization
# scipy is for distribution statistics and spline fitting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.interpolate import InterpolatedUnivariateSpline, UnivariateSpline
from scipy.stats import kurtosis, skew

## Initial Datasets

In [1]:
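# idea behind this chapter is to build QB age curves from season-level passing stats
# we measure QBs with adjusted net yards per attempt (anya) and compare the delta method to the peak method
# NOTE(review): the cells below use a `qb` dataframe that is not loaded in this part of the notebook -- confirm where it comes from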

In [None]:
qb.dtypes

In [None]:
# we don't need a few of these columns
qb = qb.drop(['gs', 'pos', 'pass_cmp_perc'], axis=1)

In [None]:
# drop seasons with less than 100 pass attempts
# this should filter out non-QBs who threw some passes
# as well as very marginal players
qb = qb.loc[qb['pass_att'] >= 100, :]

In [None]:
# rename some columns
renames = {
    'source_player_name': 'player',
    'source_player_id': 'player_id',
    'pass_adj_yds_per_att': 'aya',
    'pass_adj_net_yds_per_att': 'anya'
}

qb = qb.rename(columns=renames)

# convert columns to string
qb['player'] = qb['player'].astype('string')
qb['player_id'] = qb['player_id'].astype('string')

In [None]:
# check missing values
qb.loc[qb.isna().any(axis=1), :]

## QB Metrics: Adjusted Net Yards Per Attempt

In [None]:
# anya identifies all-time greats like Manning, Brady, Rodgers
# also highlights massive seasons like Mahomes 2018, Ryan 2016, Foles 2013
qb.sort_values('anya', ascending=False).head(10)

In [None]:
# let's look at how anya is distributed
# we have 960 QB seasons
# 25th percentile is is 4.6, median is 5.5, 75th is 6.44
qb['anya'].describe()

In [None]:
# looks like anya is normally distributed 
# skew and kurtosis near zero, histogram looks normal
from scipy.stats import skew, kurtosis
print(kurtosis(qb['anya']))
print(skew(qb['anya']))
qb['anya'].hist()

## Create Age Curves with "Delta Method"

### Unadjusted Delta Method

In [None]:
# delta method starts with calculating the change or delta in a metric
# from one year to the next
# here, we will start with adjusted net yards per attempt
# will be easier if we sort the data at the beginning
qb = qb.sort_values(['player_id', 'season_year'])

In [None]:
# create two new columns
# anya_lag shows the anya from the previous year
# anya_d shows the change in anya from the previous year
# a positive anya_d means improved, negative means regressed
qb['anya_lag'] = qb.groupby(['player_id'])['anya'].shift(1)
qb['anya_d'] = qb['anya'] - qb['anya_lag']

In [None]:
# the delta method doesn't allow for gaps in seasons
# so we also need to measure the change in season_year
qb['season_lag'] = qb.groupby(['player_id'])['season_year'].shift(1)
qb['season_d'] = qb['season_year'] - qb['season_lag']

In [None]:
# now we can filter out the na rows
# which are the first row of that player in the dataset
qb = qb.loc[~qb.isna().any(axis=1), :]

In [None]:
# we can also filter out rows where season_d > 1
# so we ensure consecutive seasons
qb = qb.loc[qb['season_d'] == 1, :]

In [None]:
# now we'll make a dataframe of age and anya_d
qb_age_curve = (
    qb.groupby('age')['anya_d']
    .agg(['count', 'mean'])
    .reset_index()
)

In [None]:
qb_age_curve.plot(x='age', y='mean', kind='scatter')

### Weighted Delta Method

In [None]:
# as before, we will use adjusted net yards / attempt as the metric
# will be easier if we sort the data at the beginning
# that way we can visually see the lag
qb = qb.sort_values(['player_id', 'season_year'])

In [None]:
# create two new columns
# anya_lag shows the anya from the previous year
# anya_d shows the change in anya from the previous year
# a positive anya_d means improved, negative means regressed
qb['anya_lag'] = qb.groupby(['player_id'])['anya'].shift(1)
qb['anya_d'] = qb['anya'] - qb['anya_lag']

In [None]:
# the delta method doesn't allow for gaps in seasons
# so we also need to measure the change in season_year
qb['season_lag'] = qb.groupby(['player_id'])['season_year'].shift(1)
qb['season_d'] = qb['season_year'] - qb['season_lag']

In [None]:
# now we can filter out the na rows
# which are the first row of that player in the dataset
qb = qb.loc[~qb.isna().any(axis=1), :]

In [None]:
# we can also filter out rows where season_d > 1
# so we ensure consecutive seasons
qb = qb.loc[qb['season_d'] == 1, :]

In [None]:
qb_age_curve['anya_d_wm'] = (
 qb
 .groupby('age')
 .apply(lambda df_: np.average(df_.anya_d, weights=df_.pass_att))
)

In [None]:
qb_age_curve

In [None]:
qb_age_curve.reset_index().plot(x='age', y='weighted_mean', kind='scatter')

In [None]:
# polynomial fit

In [None]:
poly_params = np.polyfit(qb_age_curve.index, qb_age_curve.anya_d_mean, 3)
poly_3 = np.poly1d(poly_params)
xpoly = np.linspace(x.min(), x.max(), 100)
ypoly = poly_3(xpoly)

In [None]:
plt.plot(x, y, 'o', xpoly, ypoly)

## Create Age Curves with Peak Method

In [None]:
# idea here is to identify the player's peak year and then
# express every other season as a % of the player's peak
# so if Manning's best season was 10 anya
# a season with 9.2 anya would be 92 (we are using 1-100 scale)

In [None]:
# as before, we will use adjusted net yards / attempt as the metric
# will be easier if we sort the data at the beginning
# that way we can visually check the calculations
qb = qb.sort_values(['player_id', 'season_year'])

In [None]:
# create two new columns
# peak shows the maximum anya for the player
# normally, groupby produces one row per group
# but we want the peak value for every row
# tranform produces series of the same length as the original series
# so if there are 5 Aikman rows, it sets the peak in all of those rows
display(qb.groupby(['player_id'])['anya'].max().head())
display(qb.groupby(['player_id'])['anya'].transform('max').head())
qb['peak'] = qb.groupby(['player_id'])['anya'].transform('max')

In [None]:
# anya_d shows the difference between peak and anya for this row
from math import floor
qb['anya_d'] = qb.apply(lambda df_: floor((df_.anya / df_.peak) * 100), axis=1)

In [None]:
# now we'll make a dataframe of age and anya_d
# we want to use the weighted average of anya_d
# meaning that a QB that throws 600 passes will contribute
# more to the average than one who throws 350 passes.
qb_age_curve = (
    qb.query('(age > 21) & (age < 40)')
    .groupby('age')
    .agg({'anya_d': ['count', 'mean']})
)

In [None]:
qb_age_curve.columns = ['_'.join([el for el in c if el]) 
                        for c in qb_age_curve.columns.to_flat_index()]

In [None]:
poly_params = np.polyfit(qb_age_curve.index, qb_age_curve.anya_d_mean, 3)
poly_3 = np.poly1d(poly_params)
xpoly = np.linspace(x.min(), x.max(), 100)
ypoly = poly_3(xpoly)

In [None]:
fig, ax = plt.subplots(figsize=(9, 5))
plt.plot(x, y, 'o', xpoly, ypoly)
plt.xticks(range(21, 40))

In [None]:
# try the same plot with a spline
x = qb_age_curve.index
y = qb_age_curve['anya_d_mean']
spl = UnivariateSpline(x, y, s=25)
xx = np.linspace(x.min(), x.max(), 100)
plt.plot(x, y, 'bo', xx, spl(xx))

In [None]:
x = qb_age_curve.index
y = qb_age_curve['anya_d_mean']
spl = InterpolatedUnivariateSpline(x, y)
xx = np.linspace(x.min(), x.max(), 100)
plt.plot(x, y, 'bo', xx, spl(xx))

In [None]:
# weighted mean
qb_age_curve['anya_d_wm'] = (
    qb
    .groupby('age')
    .apply(lambda df_: np.average(df_.anya_d, weights=df_.pass_att))
)

In [None]:
x = qb_age_curve.index
y = qb_age_curve.anya_d_wm
poly_params = np.polyfit(x, y, 3)
poly_3 = np.poly1d(poly_params)
xx = np.linspace(x.min(), x.max(), 100)
yy = poly_3(xx)

In [None]:
fig, ax = plt.subplots(figsize=(9, 5))
plt.plot(x, y, 'o', xx, yy)
plt.xticks(range(21, 40))

In [None]:
# try the same plot with a spline
x = qb_age_curve.index
y = qb_age_curve['anya_d_wm']
spl = UnivariateSpline(x, y, s=25)
xx = np.linspace(x.min(), x.max(), 100)
yy = spl(xx)
fig, ax = plt.subplots(figsize=(9, 5))
plt.plot(x, y, 'o', xx, yy)
plt.xticks(range(21, 40))

In [None]:
x = qb_age_curve.index
y = qb_age_curve['anya_d_wm']
spl = InterpolatedUnivariateSpline(x, y)
xx = np.linspace(x.min(), x.max(), 100)
yy = spl(xx)
fig, ax = plt.subplots(figsize=(9, 5))
plt.plot(x, y, 'o', xx, yy)
plt.xticks(range(21, 40))

## Helper Functions

In [None]:
# calculate fantasy points
def qb_points(row, add_bonus=False):
    """Calculates qb fantasy points from row in dataframe

    row needs pass_yds, pass_td, pass_int, rush_yds and rush_td attributes.
    When add_bonus is True, 3 extra points are added if pass_yds is 300 or more.
    """
    # points awarded per unit of each stat, applied in this order
    # 1 point per 25 pass yards, 4 per pass TD, -1 per interception,
    # 1 point per 10 rush yards, 6 per rush TD
    # NOTE: our dataset does not have fumbles
    scoring = (
        ('pass_yds', .04),
        ('pass_td', 4),
        ('pass_int', -1),
        ('rush_yds', .10),
        ('rush_td', 6),
    )
    total = 0
    for stat, per_unit in scoring:
        total += getattr(row, stat) * per_unit
    earned_bonus = add_bonus and row.pass_yds >= 300
    return total + 3 if earned_bonus else total

In [None]:
# add fantasy points
def add_fantasy_points(df):
    """Adds fantasy points columns to dataframe

    fpts is qb_points without the bonus, dkpts is qb_points with add_bonus=True.
    The columns are added to df itself, and df is returned.
    """
    standard = df.apply(lambda row: qb_points(row), axis=1)
    df['fpts'] = standard
    with_bonus = df.apply(lambda row: qb_points(row, True), axis=1)
    df['dkpts'] = with_bonus
    return df

In [None]:
def yearly_stats(df):
    """Sums the stat columns for each nflid / player / season_year group

    Returns a dataframe indexed by (nflid, player, season_year).
    """
    group_keys = ['nflid', 'player', 'season_year']
    passing = ['pass_att', 'pass_cmp', 'pass_int', 'pass_td', 'pass_yds']
    rushing = ['rush_att', 'rush_td', 'rush_yds']
    other = ['air_yards', 'fpts', 'dkpts']
    totals = df.groupby(group_keys)[passing + rushing + other].sum()
    return totals

In [None]:
def age_as_of_game(df):
    """Player age in years as of game date

    Expects game_date and birthdate columns. Both are passed through
    pd.to_datetime, so datetime columns, date objects and date strings all work.
    Returns a float Series aligned to df's index, using 365-day years.
    """
    # subtract the whole columns at once instead of applying row by row
    # this also keeps the result a Series when df has no rows
    elapsed = pd.to_datetime(df.game_date) - pd.to_datetime(df.birthdate)
    # convert the timedelta to days, then divide by 365
    return elapsed.dt.days / 365

In [None]:
def age_as_of_season(df):
    """Player age as of season start (Sept 1)

    Expects nflid, season_year and birthdate columns.
    Builds every combination of the nflids and season_years found in df,
    so a player gets an age for each season in df, not only the ones he played.
    Returns a dataframe indexed by (nflid, season_year) with one column, age,
    measured in 365-day years. Players with no birthdate get a missing age.
    """
    # cross join of nflid and seasons, as regular columns so we can merge on nflid
    idx = pd.MultiIndex.from_product(
        [df.nflid.unique(), df.season_year.unique()],
        names=["nflid", "season_year"],
    )
    seasons = idx.to_frame(index=False)

    # one birthdate per player (first non-missing value), otherwise the merge
    # would repeat every season once for each of the player's rows in df
    birthdates = (
        df[['nflid', 'birthdate']]
        .dropna(subset=['birthdate'])
        .drop_duplicates('nflid')
    )
    merged = seasons.merge(birthdates, how='left', on='nflid')

    # Sept 1 of each season, assembled from year / month / day columns
    start_date = pd.to_datetime(
        pd.DataFrame({'year': merged.season_year, 'month': 9, 'day': 1})
    )
    age = (start_date - pd.to_datetime(merged.birthdate)).dt.days / 365
    return (
        merged
        .assign(age=age)
        .drop(['birthdate'], axis=1)
        .set_index(['nflid', 'season_year'])
    )
