Read in the offensive stats of the 2020-2022 seasons

We will grab receiving stats and passing stats to show the QBRs of the QBs that will be throwing to the WRs

In [1]:
import copy, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helper import compute_cost, compute_gradient, gradient_descent

# Receiving tables, one per season (2020, 2021, 2022).
rcv_2020_df, rcv_2021_df, rcv_2022_df = map(
    pd.read_csv,
    ('receiving_stats_2020.csv', 'receiving_stats_2021.csv', 'receiving_stats_2022.csv'),
)

# Passing tables for the same three seasons.
pass_2020_df, pass_2021_df, pass_2022_df = map(
    pd.read_csv,
    ('passing_stats_2020.csv', 'passing_stats_2021.csv', 'passing_stats_2022.csv'),
)

We want to clean the tables here: drop the repeated header rows, then keep only WRs in the receiving tables and only QBs in the passing tables

We will reindex here to make our filtered data look nice

In [2]:
def _rows_for_position(stats_df, position):
    """Return an independent copy of the rows of ``stats_df`` for one position.

    Rows whose 'Rk' cell holds the literal text 'Rk' (header lines repeated
    inside the table) are discarded, and only rows whose 'Pos' equals
    ``position`` are kept.

    The result is an explicit ``.copy()`` so that later column assignments
    (e.g. converting 'G' to numeric) operate on a real frame instead of a
    slice of the raw table, which would trigger SettingWithCopyWarning.
    """
    keep = (stats_df['Rk'] != 'Rk') & (stats_df['Pos'] == position)
    return stats_df[keep].copy()


# Receiving tables: wide receivers only, renumbered from 0 for nicer display.
rcv_2020_df = _rows_for_position(rcv_2020_df, 'WR').reset_index(drop=True)
rcv_2021_df = _rows_for_position(rcv_2021_df, 'WR').reset_index(drop=True)
rcv_2022_df = _rows_for_position(rcv_2022_df, 'WR').reset_index(drop=True)

# Passing tables: quarterbacks only (the original index is kept, as before).
pass_2020_df = _rows_for_position(pass_2020_df, 'QB')
pass_2021_df = _rows_for_position(pass_2021_df, 'QB')
pass_2022_df = _rows_for_position(pass_2022_df, 'QB')

In [3]:
def _primary_qb_per_team(pass_df):
    """Pick one quarterback row per team: the one with the most games played.

    Args:
      pass_df (DataFrame): passing table with at least 'Tm' and 'G' columns.

    Returns:
      (DataFrame, DataFrame): the passing table with 'G' converted to numeric
      (unparseable values become NaN), and the per-team leader rows.

    'G' is converted with ``assign`` rather than in-place column assignment so
    the function never writes into a slice of another frame.

    NOTE(review): the ranking uses 'G' (games played). If "games started" was
    the intent, the column to rank by would presumably be 'GS' -- confirm.
    """
    pass_df = pass_df.assign(G=pd.to_numeric(pass_df['G'], errors='coerce'))
    # Sort teams alphabetically and, within a team, by games descending;
    # keeping the first row per 'Tm' then yields that team's leader.
    leaders = pass_df.sort_values(by=['Tm', 'G'], ascending=[True, False]).drop_duplicates('Tm')
    return pass_df, leaders


pass_2020_df, qb_leaders_2020 = _primary_qb_per_team(pass_2020_df)
pass_2021_df, qb_leaders_2021 = _primary_qb_per_team(pass_2021_df)
pass_2022_df, qb_leaders_2022 = _primary_qb_per_team(pass_2022_df)



We only want the QBR for each QB so we will only keep the Tm and QBR columns. The Tm column is kept to merge with the WRs dfs

In [4]:
# Attach the team's leading-QB 'QBR' to every receiver row of the same season.
# A left join keeps every receiver; teams without a matching QB row get NaN.
# NOTE(review): multi-team codes such as '2TM' are matched like any other 'Tm'
# value, so those receivers presumably pick up an unrelated QB's QBR -- confirm.
rcv_2020_df = pd.merge(rcv_2020_df, qb_leaders_2020[['Tm', 'QBR']], how='left', on='Tm')
rcv_2021_df = pd.merge(rcv_2021_df, qb_leaders_2021[['Tm', 'QBR']], how='left', on='Tm')
rcv_2022_df = pd.merge(rcv_2022_df, qb_leaders_2022[['Tm', 'QBR']], how='left', on='Tm')

In [5]:
# Inspect the column names after the merge; 'QBR' should now appear as the last column.
rcv_2020_df.columns

Index(['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'Tgt', 'Rec', 'Ctch%',
       'Yds', 'Y/R', 'TD', '1D', 'Succ%', 'Lng', 'Y/Tgt', 'R/G', 'Y/G', 'Fmb',
       'QBR'],
      dtype='object')

In [6]:
# Only the 2021 and 2022 tables need fantasy points: they serve as the
# "next season" targets for the 2020 and 2021 feature rows respectively.
for season_df in (rcv_2021_df, rcv_2022_df):
    # The scoring inputs may have been read as text; coerce them to numbers (bad values -> NaN).
    for stat in ('Rec', 'Yds', 'TD', 'Fmb'):
        season_df[stat] = pd.to_numeric(season_df[stat], errors='coerce')

for season_df in (rcv_2021_df, rcv_2022_df):
    # Scoring: 1 per reception, 0.1 per yard, 6 per touchdown, -2 per fumble.
    season_df['fpts'] = season_df['Rec'] + season_df['Yds'] * 0.1 + season_df['TD'] * 6 - season_df['Fmb'] * 2

In [7]:
# Display the 2021 WR table to eyeball the merged 'QBR' and the new 'fpts' column.
rcv_2021_df

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,...,TD,1D,Succ%,Lng,Y/Tgt,R/G,Y/G,Fmb,QBR,fpts
0,1,Cooper Kupp*+,LAR,28,WR,17,17,191,145,75.9%,...,16,89,63.4,59,10.2,8.5,114.5,0,63.8,435.7
1,2,Davante Adams*+,GNB,29,WR,16,16,169,123,72.8%,...,11,84,59.8,59,9.2,7.7,97.1,0,69.1,344.3
2,3,Tyreek Hill*,KAN,27,WR,17,16,159,111,69.8%,...,9,75,61.6,75,7.8,6.5,72.9,2,62.2,284.9
3,4,Justin Jefferson*,MIN,22,WR,17,17,167,108,64.7%,...,10,75,55.7,56,9.7,6.4,95.1,1,52.3,327.6
4,6,Diontae Johnson*,PIT,25,WR,16,14,169,107,63.3%,...,8,59,47.3,50,6.9,6.7,72.6,2,35.6,267.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,518,Tyrie Cleveland,DEN,24,WR,7,0,1,0,0.0%,...,0,0,0.0,0,0.0,0.0,0.0,0,47.4,0.0
474,519,Isaiah Coulter,CHI,23,WR,3,0,1,0,0.0%,...,0,0,0.0,0,0.0,0.0,0.0,0,26.4,0.0
475,526,David Moore,2TM,26,WR,3,0,2,0,0.0%,...,0,0,0.0,0,0.0,0.0,0.0,0,51.7,0.0
476,530,Devin Smith,JAX,29,WR,4,0,1,0,0.0%,...,0,0,0.0,0,0.0,0.0,0.0,0,33.5,0.0


In [8]:
def _letters_and_spaces(raw_name):
    """Return ``raw_name`` with every non-alphabetic, non-whitespace character removed.

    This strips the award markers appended to names (e.g. '*' and '+'), and
    also any other punctuation or digits, so names match across seasons.
    """
    return ''.join(filter(lambda ch: ch.isalpha() or ch.isspace(), raw_name))


# Remove the columns that are not used as model features.
for wr_table in (rcv_2020_df, rcv_2021_df, rcv_2022_df):
    wr_table.drop(columns=['Fmb', 'Succ%', '1D'], inplace=True)

# Normalize player names so the same player can be joined between seasons.
for wr_table in (rcv_2020_df, rcv_2021_df, rcv_2022_df):
    wr_table['Player'] = wr_table['Player'].map(_letters_and_spaces)

rcv_2021_df

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,QBR,fpts
0,1,Cooper Kupp,LAR,28,WR,17,17,191,145,75.9%,1947,13.4,16,59,10.2,8.5,114.5,63.8,435.7
1,2,Davante Adams,GNB,29,WR,16,16,169,123,72.8%,1553,12.6,11,59,9.2,7.7,97.1,69.1,344.3
2,3,Tyreek Hill,KAN,27,WR,17,16,159,111,69.8%,1239,11.2,9,75,7.8,6.5,72.9,62.2,284.9
3,4,Justin Jefferson,MIN,22,WR,17,17,167,108,64.7%,1616,15.0,10,56,9.7,6.4,95.1,52.3,327.6
4,6,Diontae Johnson,PIT,25,WR,16,14,169,107,63.3%,1161,10.9,8,50,6.9,6.7,72.6,35.6,267.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,518,Tyrie Cleveland,DEN,24,WR,7,0,1,0,0.0%,0,,0,0,0.0,0.0,0.0,47.4,0.0
474,519,Isaiah Coulter,CHI,23,WR,3,0,1,0,0.0%,0,,0,0,0.0,0.0,0.0,26.4,0.0
475,526,David Moore,2TM,26,WR,3,0,2,0,0.0%,0,,0,0,0.0,0.0,0.0,51.7,0.0
476,530,Devin Smith,JAX,29,WR,4,0,1,0,0.0%,0,,0,0,0.0,0.0,0.0,33.5,0.0


Now we want to place the FPts for the next year into the current year and remove rows that have any NaN entries

In [9]:
# 2020 feature rows receive the 2021 fantasy points as their target.
# One row per player name is kept, then any row with a missing value
# (including players with no following-season entry) is discarded.
# NOTE(review): different players sharing the same cleaned name are collapsed
# to a single row by drop_duplicates -- confirm this is acceptable.
rcv_2020_df = (
    pd.merge(rcv_2020_df, rcv_2021_df[['Player', 'fpts']], how='left', on='Player')
    .drop_duplicates('Player')
    .dropna()
)

# 2021 feature rows: discard their own-season points first, then attach the 2022 points.
rcv_2021_df = (
    pd.merge(rcv_2021_df.drop(columns='fpts'), rcv_2022_df[['Player', 'fpts']], how='left', on='Player')
    .drop_duplicates('Player')
    .dropna()
)

# Turn catch percentage text such as '75.9%' into a float, then drop the
# identifier columns that are not model features.
rcv_2020_df = (
    rcv_2020_df
    .assign(**{'Ctch%': rcv_2020_df['Ctch%'].str.rstrip('%').astype('float')})
    .drop(columns=['Rk', 'Tm', 'Pos', 'Player'])
)
rcv_2021_df = (
    rcv_2021_df
    .assign(**{'Ctch%': rcv_2021_df['Ctch%'].str.rstrip('%').astype('float')})
    .drop(columns=['Rk', 'Tm', 'Pos', 'Player'])
)

rcv_2020_df

Unnamed: 0,Age,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,QBR,fpts
0,27,16,15,166,127,76.5,1535,12.1,8,55,9.2,7.9,95.9,76.6,283.5
2,28,14,14,149,115,77.2,1374,11.9,18,56,9.2,8.2,98.1,79.8,344.3
4,28,16,16,160,115,71.9,1407,12.2,6,60,8.8,7.2,87.9,61.9,147.2
6,27,16,16,151,102,67.5,1250,12.3,6,42,8.3,6.4,78.1,53.8,85.0
8,28,14,13,147,100,68.0,992,9.9,8,28,6.7,7.1,70.9,62.6,253.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,28,6,0,1,1,100.0,22,22.0,1,22,22.0,0.2,3.7,76.6,4.8
380,28,5,0,3,1,33.3,5,5.0,0,5,1.7,0.2,1.0,68.3,1.7
383,24,14,0,3,1,33.3,14,14.0,0,14,4.7,0.1,1.0,67.3,36.2
386,24,7,0,3,1,33.3,13,13.0,0,13,4.3,0.1,1.9,32.9,0.0


In [10]:
# Separate the target from the features. `pop` removes 'fpts' from the frame
# itself, so this cell can only be run once per kernel session.
y_train = rcv_2020_df.pop('fpts').to_numpy()

# Remaining columns are the features; remember their names and order.
X_features = rcv_2020_df.columns
# Columns read as text are coerced to numbers (unparseable values become NaN).
X_train = rcv_2020_df.apply(pd.to_numeric, errors='coerce').to_numpy()

print(X_train)
rcv_2020_df

[[27.  16.  15.  ...  7.9 95.9 76.6]
 [28.  14.  14.  ...  8.2 98.1 79.8]
 [28.  16.  16.  ...  7.2 87.9 61.9]
 ...
 [24.  14.   0.  ...  0.1  1.  67.3]
 [24.   7.   0.  ...  0.1  1.9 32.9]
 [27.   2.   0.  ...  0.5  2.  44. ]]


Unnamed: 0,Age,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,QBR
0,27,16,15,166,127,76.5,1535,12.1,8,55,9.2,7.9,95.9,76.6
2,28,14,14,149,115,77.2,1374,11.9,18,56,9.2,8.2,98.1,79.8
4,28,16,16,160,115,71.9,1407,12.2,6,60,8.8,7.2,87.9,61.9
6,27,16,16,151,102,67.5,1250,12.3,6,42,8.3,6.4,78.1,53.8
8,28,14,13,147,100,68.0,992,9.9,8,28,6.7,7.1,70.9,62.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,28,6,0,1,1,100.0,22,22.0,1,22,22.0,0.2,3.7,76.6
380,28,5,0,3,1,33.3,5,5.0,0,5,1.7,0.2,1.0,68.3
383,24,14,0,3,1,33.3,14,14.0,0,14,4.7,0.1,1.0,67.3
386,24,7,0,3,1,33.3,13,13.0,0,13,4.3,0.1,1.9,32.9


In [11]:
def zscore_normalize_features(X):
    """
    computes X, z-score normalized by column

    Each column is shifted to zero mean and scaled to unit (population)
    standard deviation. A constant column has a standard deviation of 0;
    dividing by it would yield NaN/inf, so such columns use a divisor of 1
    and therefore normalize to all zeros.

    Args:
      X (ndarray (m,n))     : input data, m examples, n features
                              (any array-like convertible to float is accepted)

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature, with zeros
                              replaced by 1 so it can be reused to normalize
                              new examples without dividing by zero
    """
    X = np.asarray(X, dtype=float)
    # find the mean of each column/feature
    mu = np.mean(X, axis=0)                     # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma = np.std(X, axis=0)                   # sigma will have shape (n,)
    # guard against constant features: a zero divisor becomes 1
    sigma = np.where(sigma == 0, 1.0, sigma)
    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma

    return (X_norm, mu, sigma)
 
#check our work
#from sklearn.preprocessing import scale
#scale(X_orig, axis=0, with_mean=True, with_std=True, copy=True)


In [12]:
# Z-score the training matrix, keeping the per-column statistics for later reuse.
X_norm, X_mu, X_sigma = zscore_normalize_features(X_train)
print(f"X_mu = {X_mu}, \nX_sigma = {X_sigma}")

# Compare each column's spread (max - min) before and after scaling.
raw_span = np.ptp(X_train, axis=0)
norm_span = np.ptp(X_norm, axis=0)
print(f"Peak to Peak range by column in Raw        X:{raw_span}")
print(f"Peak to Peak range by column in Normalized X:{norm_span}")

X_mu = [ 25.40361446  12.63253012   6.6626506   56.85542169  37.30722892
  65.35301205 474.92771084  12.6626506    3.05421687  42.02409639
   8.16746988   2.79156627  35.57590361  56.87349398], 
X_sigma = [  2.7811303    4.02894971   5.51619108  43.73734103  29.99069447
  13.02296246 382.18522864   3.42547898   3.26399863  19.52352668
   2.48787946   1.91671184  24.95495986  13.50724111]
Peak to Peak range by column in Raw        X:[  14.    15.    16.   165.   126.    71.4 1532.    19.    18.    89.
   20.3    8.1   97.9   51.6]
Peak to Peak range by column in Normalized X:[5.03392452 3.72305466 2.90055217 3.77252014 4.20130318 5.4826235
 4.0085275  5.54666957 5.5147082  4.55860263 8.15955931 4.22598735
 3.92306782 3.82017316]


In [13]:
# Start from an all-zero model. The weight vector is sized from the data
# (one weight per feature column) instead of a hard-coded count, so the cell
# keeps working if the feature set changes.
init_w = np.zeros(X_norm.shape[1])
init_b = 0.
iterations = 20000   # number of iterations passed to gradient_descent
alpha = 2.0e-3       # learning rate passed to gradient_descent

# Fit on the normalized features; the returned weights therefore apply to
# z-scored inputs, not raw stats.
w_norm, b_norm, hist = gradient_descent(X_norm, y_train, init_w, init_b, compute_cost, compute_gradient, alpha, iterations)

Iteration    0: Cost  7863.06   
Iteration 2000: Cost  1738.89   
Iteration 4000: Cost  1729.85   
Iteration 6000: Cost  1728.12   
Iteration 8000: Cost  1727.60   
Iteration 10000: Cost  1727.37   
Iteration 12000: Cost  1727.23   
Iteration 14000: Cost  1727.12   
Iteration 16000: Cost  1727.03   
Iteration 18000: Cost  1726.95   


In [14]:
# List the feature names in column order of X_train (presumably also the order of the learned weights).
print(X_features)

Index(['Age', 'G', 'GS', 'Tgt', 'Rec', 'Ctch%', 'Yds', 'Y/R', 'TD', 'Lng',
       'Y/Tgt', 'R/G', 'Y/G', 'QBR'],
      dtype='object')


In [15]:
# Three hand-entered feature rows, in X_features column order. They match
# rows shown in the training table above, so this is a sanity check on
# training data rather than an evaluation on unseen players.
p1 = np.array([27, 16, 15, 166, 127, 76.5, 1535, 12.1, 8, 55, 9.2, 7.9, 95.9, 76.6])
p2 = np.array([28, 14, 14, 149, 115, 77.2, 1374, 11.9, 18, 56, 9.2, 8.2, 98.1, 79.8])
p3 = np.array([28, 14, 13, 147, 100, 68.0, 992, 9.9, 8, 28, 6.7, 7.1, 70.9, 62.6])

# Apply the same z-score statistics that were computed on the training set.
p1_norm, p2_norm, p3_norm = [(row - X_mu) / X_sigma for row in (p1, p2, p3)]

# Linear model: prediction = x . w + b
p1_predict, p2_predict, p3_predict = [
    np.dot(row_norm, w_norm) + b_norm for row_norm in (p1_norm, p2_norm, p3_norm)
]

# Print each prediction next to the known next-season value for that row.
for predicted, actual_line in zip(
    (p1_predict, p2_predict, p3_predict),
    ("Actual = 283.5", "Actual = 344.3", "Actual = 253.8"),
):
    print(f"Predicted fantasy points for the next season = {predicted:0.0f} pts")
    print(actual_line)


Predicted fantasy points for the next season = 268 pts
Actual = 283.5
Predicted fantasy points for the next season = 271 pts
Actual = 344.3
Predicted fantasy points for the next season = 203 pts
Actual = 253.8


In [16]:
# Show the learned weights for the normalized features (presumably one per entry of X_features, in that order).
print(w_norm)

[-10.74300955   0.32695927 -10.78607513  11.08953432  18.76981582
   0.68158267  10.64741278   1.57440268   5.28229147   5.01449373
  -4.00121603  14.10322688  10.51696343   7.8430291 ]
