In [83]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [84]:
data0 = pd.read_excel('rbs.xlsx',header=[0,1])

In [85]:
data = data0.replace('-',np.nan)
data = data.replace('UDFA',np.nan)

for col in data.columns:
    if 'Career' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    elif 'NFL Combine' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    elif 'Pro Day' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    elif 'Dominator' in col[0]:
        med = np.median(data[col].dropna())
        data[col] = data[col].fillna(med)

    elif 'Breakout' in col[0]:
        max = np.max(data[col])+1
        data[col] = data[col].fillna(max)

    elif 'Draft' in col[0]:
        max = np.max(data[col])+1
        data[col] = data[col].fillna(max)

    elif 'Finishes' in col[0]:
        data[col] = data[col].fillna(0)

    # else:
    #     print(col)
    
target = []
for val in data[('NFL Stats, Finishes, & Milestones', 'Top 24 RB')].values:
    if val > 0:
        target.append(1)
    else:
        target.append(0)

data['target'] = target

drop_list = [('School', 'NFL Player ID'),('School', 'School'),('School', 'Conf'),('Draft', 'Draft Year')]

for col in data.columns:
    if 'Finishes' in col[0]:
        drop_list.append(col)

one_hot_conf = pd.get_dummies(data[('School','Conf')])

data = pd.concat([data,one_hot_conf],axis=1)

In [86]:
rookies = data.copy().loc[data[('Draft','Draft Year')] == 2022][[col for col in data.columns if col not in drop_list]].dropna()
young = data.copy().loc[(data[('Draft','Draft Year')] >= 2017)&(data[('Draft','Draft Year')] < 2022)][[col for col in data.columns if col not in drop_list]].dropna()
base = data.copy().loc[data[('Draft','Draft Year')] <= 2016][[col for col in data.columns if col not in drop_list]].dropna()

  return array(a, dtype, copy=False, order=order)


In [98]:
from sklearn.model_selection import RandomizedSearchCV

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators': [10, 100, 1000]
        }

clf = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic')

cv = RandomizedSearchCV(clf,params,cv=5,n_iter=100,verbose=1,scoring='precision')

X_train = base.drop([('target',''),('RB', 'Player')],axis=1)
y_train = base[('target','')]

In [99]:
cv.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=XGBClassifier(), n_iter=100,
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],
                                        'gamma': [0.5, 1, 1.5, 2, 5],
                                        'max_depth': [3, 4, 5],
                                        'min_child_weight': [1, 5, 10],
                                        'n_estimators': [10, 100, 1000],
                                        'subsample': [0.6, 0.8, 1.0]},
                   scoring='precision', verbose=1)

In [100]:
np.max(pd.DataFrame(cv.cv_results_)['mean_test_score'])

0.7235964912280701

In [101]:
yg_names = young.iloc[:,0].values
yg_preds = cv.predict_proba(young.drop([('target',''),('RB', 'Player')],axis=1))[:,1]

val = np.zeros((len(yg_names),2))
val = pd.DataFrame(val)
val.columns = ['Player', 'Prob']
val['Player'] = yg_names
val['Prob'] = yg_preds
val = val.sort_values('Prob',ascending=False)

val.to_csv('val_rb.csv')

val

Unnamed: 0,Player,Prob
138,Rashaad Penny,0.958013
143,Saquon Barkley,0.948892
38,Travis Etienne,0.944834
31,Najee Harris,0.933674
164,Joe Mixon,0.930040
...,...,...
33,Rakeem Boyd,0.017052
14,Greg McCrae,0.016328
102,Patrick Laird,0.015104
70,Rico Dowdle,0.014824


In [102]:
rk_names = rookies.iloc[:,0].values
rk_preds = cv.predict_proba(rookies.drop([('target',''),('RB', 'Player')],axis=1))[:,1]

out = np.zeros((len(rk_names),2))
out = pd.DataFrame(out)
out.columns = ['Player', 'Prob']
out['Player'] = rk_names
out['Prob'] = rk_preds
out = out.sort_values('Prob',ascending=False)

out

Unnamed: 0,Player,Prob
29,Rachaad White,0.714174
11,Hassan Haskins,0.692788
14,James Cook,0.592504
21,Kenneth Walker,0.578917
3,Breece Hall,0.553587
39,Tyler Allgeier,0.549246
4,Brian Robinson,0.461939
12,Isaiah Spiller,0.459494
40,Tyler Badie,0.441499
38,Ty Davis-Price,0.317261


In [104]:
from sklearn.metrics import precision_score, recall_score

# thresh=0.5
ts = [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 ]
for thresh in ts:
    prec = precision_score(young[('target', '')].values,((yg_preds>thresh)*1))
    rec = recall_score(young[('target', '')].values,((yg_preds>thresh)*1))
    f1 = 2*((prec*rec)/(rec+prec))

    print(thresh,prec,rec,f1)

0.1 0.25663716814159293 0.90625 0.4
0.2 0.38461538461538464 0.78125 0.5154639175257733
0.3 0.43478260869565216 0.625 0.5128205128205128
0.4 0.47368421052631576 0.5625 0.5142857142857142
0.5 0.5666666666666667 0.53125 0.5483870967741935
0.6 0.7142857142857143 0.46875 0.5660377358490566
0.7 0.7058823529411765 0.375 0.48979591836734687
0.8 0.7 0.21875 0.3333333333333333
0.9 0.6666666666666666 0.125 0.21052631578947367


In [105]:
out.to_csv('rk_rb.csv')

In [None]:
s