In [10]:
#Basic Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score

import warnings
warnings.filterwarnings('ignore') 

In [11]:
data0 = pd.read_excel('wrs.xlsx',header=[0,1])

In [12]:
#Set up some nans for cleaning
data = data0.replace('-',np.nan)
data = data.replace('UDFA',np.nan)

#Replace nans with median/maximum/0 depending on the variable
for col in data.columns:
    if 'Career' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    if 'NFL Combine' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    if 'Pro Day' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    if 'Dominator' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    if 'Breakout' in col[0]:
        max = np.max(data[col])+1
        data[col] = data[col].fillna(max)

    if 'Draft' in col[0]:
        max = np.max(data[col])+1
        data[col] = data[col].fillna(max)

    if 'Finishes' in col[0]:
        data[col] = data[col].fillna(0)

#One hot encode conference data
one_hot_conf = pd.get_dummies(data[('School','Conf')])
data = pd.concat([data,one_hot_conf],axis=1)

#Set up target variable
target = np.zeros(data.shape[0])
for nv,val in enumerate(data[('NFL Stats, Finishes, & Milestones', 'Top 24 WR')].values):
    if val > 0:
        target[nv]=1

data['target'] = target

#List of features to be dropped.
drop_list = [
    ('School', 'NFL Player ID'),
    ('School', 'School'),
    ('School', 'Conf'),
    ('Draft', 'Draft Year'),
    ('NFL Stats, Finishes, & Milestones', 'Top 5 WR'),
    ('NFL Stats, Finishes, & Milestones', 'Top 12 WR'),
    ('NFL Stats, Finishes, & Milestones', 'Top 24 WR'),
    ('NFL Stats, Finishes, & Milestones', 'AVG PPG YR 1-3'),
    ('NFL Combine', '40 time'),
    ('NFL Combine', 'Bench'),
    ('NFL Combine', 'Vertical'),
    ('NFL Combine', 'Broad'),
    ('NFL Combine', 'Shuttle'),
    ('NFL Combine', '3 Cone')
    ]

drop_list = drop_list+[col for col in data.columns if 'Career Average' in col[0] or 'Pro Day' in col[0] or 'Career Average' in col[0]]

#Features to keep
feats = [col for col in data.columns if col not in drop_list]

#Data split
rookies = data.copy().loc[data[('Draft','Draft Year')] == 2022][feats]
young = data.copy().loc[(data[('Draft','Draft Year')] >= 2019)&(data[('Draft','Draft Year')] < 2022)][feats]
base = data.copy().loc[data[('Draft','Draft Year')] <= 2018][feats]

In [14]:
def scale_df(df,discrete_cols,scaler_obj,fit_scaler=False,return_scaler=False,drop_discrete=False):

    continuous_cols = [col for col in df.columns if col not in discrete_cols]
    discrete = df[discrete_cols]
    continuous = df[continuous_cols]

    scaler = scaler_obj
    if fit_scaler:
        continuous_scaled = scaler.fit_transform(continuous)
    else:
        continuous_scaled = scaler.transform(continuous)

    out = np.concatenate([continuous_scaled,discrete],axis=1)
    out = pd.DataFrame(out,columns=list(continuous_cols)+list(discrete_cols))

    if drop_discrete:
        out = out[continuous_cols]

    if return_scaler:
        return out, scaler
    else:
        return out

In [15]:
X = base.drop(['target',('WR', 'Player')],axis=1)
y = base['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)


scaler = MinMaxScaler()

X_train, xscaler = scale_df(df=X_train,discrete_cols=one_hot_conf.columns,scaler_obj=scaler,fit_scaler=True,return_scaler=True)

X_test = scale_df(df=X_test,discrete_cols=one_hot_conf.columns,scaler_obj=xscaler)



clfs = [
    KNeighborsClassifier(),
    LogisticRegression(max_iter=100,verbose=0),
    xgb.XGBClassifier(objective='binary:logistic')
]

params=[

    #knn
    {
        'n_neighbors': [3,5,7,9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    #log reg
    {
        'penalty' : ['l1', 'l2'],
        'solver' : ['liblinear'],
        'fit_intercept': [True, False]
    },

    #xgboost
    {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators': [10, 100, 1000]
        }
]

perf=0
for nc,clf in enumerate(clfs):

    cv = RandomizedSearchCV(clf,params[nc],cv=2,n_iter=100,verbose=-3,scoring='precision')

    cv.fit(X_train,y_train)
    newperf=np.max(pd.DataFrame(cv.cv_results_)['mean_test_score'])
    if newperf > perf:
        perf=newperf
        model = cv
    print('CV Mean Score: '+str(newperf))

CV Mean Score: 0.35
CV Mean Score: 0.5166666666666666
CV Mean Score: 0.5961538461538461


In [16]:
test_pred = cv.predict_proba(X_test)[:,1]

ts = [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 ]
for thresh in ts:
    prec = precision_score(y_test,test_pred>thresh)
    rec = recall_score(y_test,test_pred>thresh)
    f1 = 2*((prec*rec)/(rec+prec))

    print(thresh,prec,rec,f1)

0.1 0.28846153846153844 0.75 0.4166666666666667
0.2 0.37142857142857144 0.65 0.4727272727272728
0.3 0.44 0.55 0.48888888888888893
0.4 0.47619047619047616 0.5 0.4878048780487805
0.5 0.5833333333333334 0.35 0.4375
0.6 0.5 0.2 0.28571428571428575
0.7 0.6666666666666666 0.2 0.30769230769230765
0.8 0.0 0.0 nan
0.9 0.0 0.0 nan


In [17]:
yg_names = young.iloc[:,0].values
yg_preds = model.predict_proba(young.drop(['target',('WR', 'Player')],axis=1))[:,1]

yg = np.zeros((len(yg_names),2))
yg = pd.DataFrame(yg)
yg.columns = ['Player', 'Prob']
yg['Player'] = yg_names
yg['Prob'] = yg_preds
yg = yg.sort_values('Prob',ascending=False)

yg.to_csv('yg_wr.csv')

yg

Unnamed: 0,Player,Prob
108,Lynn Bowden,0.229288
129,Ashton Dulin,0.155296
147,Jalen Hurd,0.155141
140,Felton Davis,0.148556
154,Keelan Doss,0.148556
...,...,...
153,Juwann Winfree,0.050459
46,Racey McMath,0.050459
68,Austin Mack,0.050459
54,Tarik Black,0.050459


In [18]:
rk_names = rookies.iloc[:,0].values
rk_preds = model.predict_proba(rookies.drop(['target',('WR', 'Player')],axis=1))[:,1]

out = np.zeros((len(rk_names),2))
out = pd.DataFrame(out)
out.columns = ['Player', 'Prob']
out['Player'] = rk_names
out['Prob'] = rk_preds
out = out.sort_values('Prob',ascending=False)

out.to_csv('rk_wr.csv')

out

Unnamed: 0,Player,Prob
5,Calvin Turner,0.160469
17,Erik Ezukanma,0.148556
12,David Bell,0.148556
21,Jahan Dotson,0.143603
9,Christian Watson,0.142191
4,Calvin Austin,0.142191
26,Jaquarii Roberson,0.140379
33,Kevin Austin,0.140379
15,Drake London,0.140379
3,Britain Covey,0.139165
