In [125]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import statsmodels.api as sm
import requests
import io
import re

### Very rough plan

data:
- standardize data types for each predictor
- dummy variables for categorical
- for NA values: either 0 or mean values of category or eliminate player?
- graphs
    - CarAV by position, CarAV versus all factors, counts of categorical factors, means of numerical factors



analysis:
- linear reg for entire model, offense/defense, by position 
    - approximate value (CAV) as outcome 
- refit dropping factors with low p-values
- normalize features so coefficient values are meaningful
- lasso regression
- classification into groups based on cut off of CAV for labels such as pro-bowl level, starter, bench, bad
- cluster?


Use best model on most recent draft class to predict success

### Raw data processing

In [101]:
#access data from github repo
url1 = "https://raw.githubusercontent.com/kirkhach/nfl-draft/main/data/predictor_data.csv"
download1 = requests.get(url1).content
url2 = "https://raw.githubusercontent.com/kirkhach/nfl-draft/main/data/outcome_data.csv"
download2 = requests.get(url2).content

#read in predictor data (ncaa, combine)
predictor2 = pd.read_csv(io.StringIO(download1.decode('utf-8')))

#limit data for 2000 (start of data) to 2016
predictor = predictor2[predictor2['year']<=2016]
# test 1
#read in outcome data (nfl)
outcome2 = pd.read_csv(io.StringIO(download2.decode('utf-8')))
outcome2 = outcome2.rename(columns={'Player':'player'})

#select career approximate value metric for each player
outcome3 = outcome2[['player','CarAV']]

#add approximate value metric for each player in predictor dataset
df = pd.merge(predictor,outcome3,on='player')
df = df[df['missing_combine_vals']==0]
df = df.drop(columns=['url','missing_combine_vals'])
df.reset_index(inplace=True,drop=True)
df

Unnamed: 0,pick,team,year,pos,player,college,height_inches,weight,forty,vertical,...,rushing.rec.yards,rushing.receptions,rushing.rush.att,rushing.rush.td,rushing.rush.yds,rushing.scrim.plays,rushing.scrim.tds,rushing.scrim.yds,rushing.seasons,CarAV
0,300.0,,2010,OLB,A.J. Edds,Iowa,76,246,4.62,33.0,...,0,0,0,0,0,0,0,0,0,0.0
1,4.0,CIN,2011,WR,A.J. Green,Georgia,76,211,4.48,34.5,...,0,0,0,0,0,0,0,0,0,53.0
2,5.0,GNB,2006,OLB,A.J. Hawk,Ohio State,73,248,4.59,40.0,...,0,0,0,0,0,0,0,0,0,59.0
3,300.0,,2009,C,A.Q. Shipley,Penn State,73,304,5.19,31.0,...,0,0,0,0,0,0,0,0,0,12.0
4,46.0,DET,2016,DT,A'Shawn Robinson,Alabama,76,307,5.20,26.0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,210.0,NOR,2006,OT,Zach Strief,Northwestern,80,330,5.38,21.0,...,0,0,0,0,0,0,0,0,0,43.0
1424,16.0,DAL,2014,OG,Zack Martin,Notre Dame,76,308,5.22,28.0,...,0,0,0,0,0,0,0,0,0,22.0
1425,97.0,TEN,2013,OLB,Zaviar Gooden,Missouri,73,234,4.47,34.0,...,0,0,0,0,0,0,0,0,0,3.0
1426,144.0,BUF,2012,OT,Zebrie Sanders,Florida State,78,320,5.33,27.0,...,0,0,0,0,0,0,0,0,0,0.0


In [104]:
#convert numerical predictors to numeric data types
X = ['year', 'height_inches','weight','forty','vertical','bench','broad','threecone','shuttle',
    'defense.ast.tackles','defense.fum.forced','defense.fum.rec','defense.fum.tds','defense.fum.yds',
    'defense.games','defense.int','defense.int.td','defense.int.yards','defense.loss.tackles','defense.pd',
    'defense.sacks','defense.seasons','defense.solo.tackes','defense.tackles','passing.attempts',
    'passing.comp.pct','passing.completions','passing.games','passing.pass.ints','passing.pass.tds',
    'passing.pass.yards','passing.seasons','receiving.games','receiving.rec.td','receiving.rec.yards',
    'receiving.receptions','receiving.rush.att','receiving.rush.td','receiving.rush.yds','receiving.scrim.plays',
    'receiving.scrim.tds','receiving.scrim.yds','receiving.seasons','rushing.games','rushing.rec.td',
    'rushing.rec.yards','rushing.receptions','rushing.rush.att','rushing.rush.td','rushing.rush.yds',
    'rushing.scrim.plays','rushing.scrim.tds','rushing.scrim.yds','rushing.seasons']

for col in X:
    X2 = pd.to_numeric(df[col],errors='coerce').fillna(0.0)
    df[col]=X2
    
#change N/A team to undrafted
df['team']=df['team'].fillna("Undrafted")

df.dtypes

pick                     float64
team                      object
year                       int64
pos                       object
player                    object
college                   object
height_inches              int64
weight                     int64
forty                    float64
vertical                 float64
bench                    float64
broad                    float64
threecone                float64
shuttle                  float64
defense.ast.tackles        int64
defense.fum.forced         int64
defense.fum.rec            int64
defense.fum.tds            int64
defense.fum.yds            int64
defense.games              int64
defense.int                int64
defense.int.td             int64
defense.int.yards          int64
defense.loss.tackles       int64
defense.pd                 int64
defense.sacks              int64
defense.seasons            int64
defense.solo.tackes        int64
defense.tackles            int64
passing.attempts           int64
passing.co

In [119]:
#full dataset: 1,428 unique players and 59 predictors (player name is not a predictor)
df

Unnamed: 0,pick,team,year,pos,player,college,height_inches,weight,forty,vertical,...,rushing.rec.yards,rushing.receptions,rushing.rush.att,rushing.rush.td,rushing.rush.yds,rushing.scrim.plays,rushing.scrim.tds,rushing.scrim.yds,rushing.seasons,CarAV
0,300.0,Undrafted,2010,OLB,A.J. Edds,Iowa,76,246,4.62,33.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0
1,4.0,CIN,2011,WR,A.J. Green,Georgia,76,211,4.48,34.5,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,53.0
2,5.0,GNB,2006,OLB,A.J. Hawk,Ohio State,73,248,4.59,40.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,59.0
3,300.0,Undrafted,2009,C,A.Q. Shipley,Penn State,73,304,5.19,31.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,12.0
4,46.0,DET,2016,DT,A'Shawn Robinson,Alabama,76,307,5.20,26.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,210.0,NOR,2006,OT,Zach Strief,Northwestern,80,330,5.38,21.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,43.0
1424,16.0,DAL,2014,OG,Zack Martin,Notre Dame,76,308,5.22,28.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,22.0
1425,97.0,TEN,2013,OLB,Zaviar Gooden,Missouri,73,234,4.47,34.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,3.0
1426,144.0,BUF,2012,OT,Zebrie Sanders,Florida State,78,320,5.33,27.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0


### Data Visualizations

### Initial Linear Regression

In [123]:
#add dummy variables for categorical predictors
dfd = df.drop(columns=['player'])
dfd = pd.get_dummies(dfd)
dfd.reset_index(inplace=True,drop=True)
dfd

Unnamed: 0,pick,year,height_inches,weight,forty,vertical,bench,broad,threecone,shuttle,...,college_Virginia,college_Virginia Tech,college_Wake Forest,college_Washington,college_Washington State,college_West Virginia,college_Western Kentucky,college_Western Michigan,college_Wisconsin,college_Wyoming
0,300.0,2010,76,246,4.62,33.0,16.0,117.0,7.19,4.28,...,0,0,0,0,0,0,0,0,0,0
1,4.0,2011,76,211,4.48,34.5,18.0,126.0,6.91,4.21,...,0,0,0,0,0,0,0,0,0,0
2,5.0,2006,73,248,4.59,40.0,24.0,115.0,6.82,3.96,...,0,0,0,0,0,0,0,0,0,0
3,300.0,2009,73,304,5.19,31.0,33.0,100.0,7.46,4.40,...,0,0,0,0,0,0,0,0,0,0
4,46.0,2016,76,307,5.20,26.0,22.0,106.0,7.80,4.74,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,210.0,2006,80,330,5.38,21.0,19.0,94.0,8.01,4.83,...,0,0,0,0,0,0,0,0,0,0
1424,16.0,2014,76,308,5.22,28.0,29.0,106.0,7.65,4.59,...,0,0,0,0,0,0,0,0,0,0
1425,97.0,2013,73,234,4.47,34.0,27.0,131.0,6.71,4.18,...,0,0,0,0,0,0,0,0,0,0
1426,144.0,2012,78,320,5.33,27.0,28.0,100.0,8.15,4.99,...,0,0,0,0,0,0,0,0,0,0


In [124]:
print(dfd.columns.tolist())

['pick', 'year', 'height_inches', 'weight', 'forty', 'vertical', 'bench', 'broad', 'threecone', 'shuttle', 'defense.ast.tackles', 'defense.fum.forced', 'defense.fum.rec', 'defense.fum.tds', 'defense.fum.yds', 'defense.games', 'defense.int', 'defense.int.td', 'defense.int.yards', 'defense.loss.tackles', 'defense.pd', 'defense.sacks', 'defense.seasons', 'defense.solo.tackes', 'defense.tackles', 'passing.attempts', 'passing.comp.pct', 'passing.completions', 'passing.games', 'passing.pass.ints', 'passing.pass.tds', 'passing.pass.yards', 'passing.seasons', 'receiving.games', 'receiving.rec.td', 'receiving.rec.yards', 'receiving.receptions', 'receiving.rush.att', 'receiving.rush.td', 'receiving.rush.yds', 'receiving.scrim.plays', 'receiving.scrim.tds', 'receiving.scrim.yds', 'receiving.seasons', 'rushing.games', 'rushing.rec.td', 'rushing.rec.yards', 'rushing.receptions', 'rushing.rush.att', 'rushing.rush.td', 'rushing.rush.yds', 'rushing.scrim.plays', 'rushing.scrim.tds', 'rushing.scrim.yds

In [135]:
#predictors are all columns except CarAV
X = df
X = X.drop(columns=['CarAV','player','pos','team','college'])
X = sm.add_constant(X)
#CarAV is the outcome
Y = dfd['CarAV']

#test/train split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [136]:
linreg = linear_model.LinearRegression()
scores = cross_validate(linreg, X_train, Y_train, scoring=('r2', 'neg_mean_squared_error'), cv=10, return_train_score=True)
model_final = linreg.fit(X_train,Y_train)

In [141]:
final_predict = model_final.predict(X_test)
print('Coefficients: \n', model_final.coef_)
print('Mean squared error: %.2f'
      % mean_squared_error(Y_test, final_predict))
print('R-squared: %.2f'
      % r2_score(Y_test, final_predict))

Coefficients: 
 [-7.83321834e-12 -5.34251906e-02 -1.37057540e+00  1.28701597e-01
  1.01984922e-01 -1.37699044e+01 -2.78313761e-01  7.53753038e-02
  1.84660663e-02 -7.10236536e+00  5.09678046e+00 -2.41528469e-03
 -5.69416578e-01 -2.74494397e-01 -2.05377999e+00  6.75488395e-02
  1.19269581e-01  1.42183686e-01 -1.21937046e-02 -1.21913664e-02
  3.16008029e-01  1.87096225e-02 -3.61526503e-01 -3.26141203e+00
  1.29403662e-02  1.05250815e-02  6.50080247e+00 -1.02652068e+00
 -9.92465164e+00 -4.43837262e-01  1.14021368e+01 -5.78453042e+00
 -4.85769480e-01 -4.61844700e-01 -3.93566389e-01 -1.91820996e-01
 -7.29852429e-02  7.66207548e-02 -9.75476381e-02  3.35738438e-01
  2.52381928e-02 -2.09268833e-02  1.43917443e-01  6.94877437e-02
  1.03027670e+00 -3.40861053e-01  8.40514165e-02  7.87901802e-03
  2.21044225e-02 -8.78179428e-03  7.52660114e-02 -6.11497857e-03
  1.33226282e-02  1.59317428e-01  1.69517052e-03 -7.76595456e-01]
Mean squared error: 257.03
R-squared: 0.07


### Linear Regression by Position

Group into position groups: quarterback, running back/full back, wide reciever/tight end, offensive line, defensive line, linebacker, secondary