In [1]:
import pandas as pd
import os
from functools import partial

from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime
import pickle
import numpy as np

In [8]:
# Load the pickled regression data set.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load files produced by this project.
with open('../data/created/reg_set.p', 'rb') as handle:
    reg_set1 = pickle.load(handle)

# Rows for 2022 are set aside as `final_pred`; every other year stays in
# `final_reg`, the frame the models are trained and tested on.
final_reg = reg_set1.loc[reg_set1['Year'] != 2022]
final_pred = reg_set1.loc[reg_set1['Year'] == 2022]

# Optional filter: keep only years after 2015 in the modelling frame.
exclude1415 = True
if exclude1415:
    final_reg = final_reg.loc[final_reg['Year'] > 2015].copy()

In [9]:
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Hold out 20% of the modelling rows for testing. The split is seeded so that
# "Restart & Run All" reproduces the same partition (and therefore the same
# scores) on every run; previously each run drew a different random split.
reg_train, reg_test = train_test_split(final_reg, test_size = 0.2, random_state = 0)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# Baseline feature set: ADP terms plus the four position columns.
base_vars = ['AverageDraftPositionPPR',
            'AverageDraftPositionPPRSq',
            'foundAdp',
            'QB','RB','TE','WR']
# Every feature/target container below is a (train, test) tuple.
base_x = (reg_train[base_vars].copy(), reg_test[base_vars].copy())

# Full feature set. 'Year' is selected here but dropped again for `x_0`.
x_vars = ['Year',
            'AverageDraftPositionPPR',
            'AverageDraftPositionPPRSq',
            'foundAdp',
            'PrvPts_PPR',
            'foundLastYearStats',
            'qbDiff',
            'Draft',
            'Age',
            'PrvYrTmPts',
            'PlayersAtPosition',
            'PrvYrPtsShare',
            'QB','RB','TE','WR']
x = (reg_train[x_vars].copy(), reg_test[x_vars].copy())


# Targets: 'Pts_PPR' for the regression models, 'aboveBase' for the classifiers.
y_pts = (reg_train['Pts_PPR'].copy(), reg_test['Pts_PPR'].copy())
y_cl = (reg_train['aboveBase'].copy(), reg_test['aboveBase'].copy())

# Disabled experiment: one-hot encode 'Year' instead of dropping it.
# def encodeYears(x):
#     encoder = OneHotEncoder()
#     y = x.copy()
#     yrNames = [str(i) for i in list(y['Year'].unique())]
#     yrNames.sort()
#     y[yrNames] = encoder.fit_transform(y[['Year']]).toarray()
#     return y.drop('Year', axis = 1)


# Full feature frames without the 'Year' column.
x_0 = tuple(frame.drop('Year', axis = 1) for frame in x)
# x_a = tuple([encodeYears(i) for i in list(x)])

# x_b1, y_pts_b1, y_cl_b1 = 
# x_b2 = encodeYears(x_b1)

### 1. Predict points scored

In [10]:
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

def try_lassos(X, y, polys = 2):
    """Fit a polynomial-feature LassoCV pipeline and print its R^2 scores.

    Parameters
    ----------
    X : tuple
        ``(X_train, X_test)`` feature frames.
    y : tuple
        ``(y_train, y_test)`` targets matching ``X`` row for row.
    polys : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    The fitted pipeline (PolynomialFeatures -> MaxAbsScaler -> LassoCV).
    """
    X_train, X_test  = X
    y_train, y_test = y
    # drop = True discards the old index outright. The previous
    # reset_index().drop('index') raised KeyError for a named index and would
    # have dropped a real feature column if one were ever called 'index'.
    X_train = X_train.reset_index(drop = True)
    y_train = np.array(y_train, dtype = float)
    model = make_pipeline(PolynomialFeatures(polys), MaxAbsScaler(), LassoCV(cv = 5, random_state = 0, max_iter = 500000))
    model.fit(X_train, y_train)
    print(f"R^2 score on training data {model.score(X_train, y_train)}")
    print(f"R^2 score on test data {model.score(X_test, y_test)}")
    print()
    return model

# Points model on the full feature set (degree-2 polynomial features).
full_pts_model = try_lassos(X = x_0, y = y_pts, polys = 2)
# try_lassos(x_a, y_pts, polys = 2)
# Points model on the ADP-only baseline features (degree-3 polynomial features).
adp_pts_model = try_lassos(X = base_x, y = y_pts, polys = 3)

R^2 score on training data 0.6802859088101214
R^2 score on test data 0.674309232119654

R^2 score on training data 0.5783228258371138
R^2 score on test data 0.5801931855422693



### 2. Predict starter percentage

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def try_classifier(classifier, X, y, polys = 2):
    """Return the mean 5-fold cross-validation score of a classifier pipeline.

    Only the training halves of ``X`` and ``y`` are used; the test halves are
    ignored here so they stay untouched during hyper-parameter selection.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator placed at the end of the pipeline.
    X : tuple
        ``(X_train, X_test)`` feature frames.
    y : tuple
        ``(y_train, y_test)`` labels.
    polys : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    float
        Mean of the scores returned by ``cross_val_score`` with ``cv = 5``.
    """
    X_train, _ = X
    y_train, _ = y
    # drop = True discards the old index outright; reset_index().drop('index')
    # fails on a named index and can drop a real column called 'index'.
    X_train = X_train.reset_index(drop = True)
    y_train = np.array(y_train, dtype = float)
    model = make_pipeline(PolynomialFeatures(polys), MaxAbsScaler(), classifier)
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    # print(scores)
    return scores.mean()

def run_model(classifier, X, y, polys = 2):
    """Fit a classifier pipeline on the training split and print both scores.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator placed at the end of the pipeline.
    X : tuple
        ``(X_train, X_test)`` feature frames.
    y : tuple
        ``(y_train, y_test)`` labels.
    polys : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    The fitted pipeline (PolynomialFeatures -> MaxAbsScaler -> classifier).
    """
    X_train, X_test  = X
    y_train, y_test = y
    # drop = True discards the old index outright; reset_index().drop('index')
    # fails on a named index and can drop a real column called 'index'.
    X_train = X_train.reset_index(drop = True)
    y_train = np.array(y_train, dtype = float)
    model = make_pipeline(PolynomialFeatures(polys), MaxAbsScaler(), classifier)
    model.fit(X_train, y_train)
    print(f"Score on training data {model.score(X_train, y_train)}")
    print(f"Score on test data {model.score(X_test, y_test)}")
    return model

#### 2a. Try Logistic Regression

In [12]:
def testLogReg(x, y):
    """Pick the L2 logistic-regression ``C`` by cross-validation, then refit.

    Each candidate ``C`` is scored with ``try_classifier``; on ties the later
    candidate wins. The chosen ``C`` and its score are printed, and the model
    refit by ``run_model`` with that ``C`` is returned.
    """
    candidate_cs = [0.01, 0.1, 0.25, 0.33, 0.5, 0.95, 0.995, 1, 1.005, 1.05, 1.5]
    best_c, c_max = None, 0
    for c in candidate_cs:
        cv_score = try_classifier(
            LogisticRegression(penalty = 'l2', C = c, max_iter = 1000), x, y)
        if cv_score >= c_max:
            best_c, c_max = c, cv_score

    # Same three output lines as before: best C, its score, then a blank line.
    print(best_c, c_max, '', sep = '\n')

    return run_model(
        LogisticRegression(penalty = 'l2', C = best_c, max_iter = 1000), x, y)

# Logistic regression on the full feature set, then on the ADP-only baseline.
final_lr_model = testLogReg(x = x_0, y = y_cl)
base_lr_model = testLogReg(x = base_x, y = y_cl)

1.5
0.9309139784946237

Score on training data 0.9354838709677419
Score on test data 0.9398496240601504
0.25
0.9309139784946237

Score on training data 0.9314516129032258
Score on test data 0.9377013963480129


#### 2b. Try SVM

In [10]:
def testSVM(x, y):
    """Pick the SVC ``C`` by cross-validation, then refit with it.

    Each candidate ``C`` is scored with ``try_classifier``; on ties the later
    candidate wins. The chosen ``C`` and its score are printed, and the model
    refit by ``run_model`` with that ``C`` is returned.

    Raises
    ------
    ValueError
        If no candidate produced a comparable score (e.g. every
        cross-validation score was NaN).
    """
    # Initialised up front, as in testLogReg, so a run where no score passes
    # the comparison cannot end in an UnboundLocalError.
    best_c = None
    c_max = 0
    for c in [0.01, 0.1, 0.25, 0.33, 0.5, 0.95, 0.995, 1, 1.005, 1.05, 1.5]:
        a = SVC(C = c)
        res = try_classifier(a, x, y)
        if res >= c_max:
            best_c = c
            c_max = res

    if best_c is None:
        raise ValueError("testSVM: no candidate C produced a valid cross-validation score")

    print(best_c)
    print(c_max)
    print()

    final_model = SVC(C = best_c)
    final_model = run_model(final_model, x, y)
    return final_model
# SVM on the full feature set, then on the ADP-only baseline.
final_svm_model = testSVM(x = x_0, y = y_cl)
base_svm_model = testSVM(x = base_x, y = y_cl)

# base_x[0].columns
# base_x[1].columns

1.5
0.932258064516129

Score on training data 0.9362903225806452
Score on test data 0.9215896885069818
1.05
0.9336021505376344

Score on training data 0.935752688172043
Score on test data 0.9119226638023631


#### 2c. Try Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
def testRFC(x, y):
    """Grid-search a random forest by cross-validation, then refit the winner.

    Every ``(max_depth, n_estimators)`` combination is scored with
    ``try_classifier``; on ties the later combination wins. The chosen pair and
    its score are printed, and the model refit by ``run_model`` is returned.

    The forests are created without ``random_state``, so scores vary from run
    to run.

    Raises
    ------
    ValueError
        If no combination produced a comparable score (e.g. every
        cross-validation score was NaN).
    """
    best_c = None
    best_d = None
    c_max = 0
    for c in [2, 5]:
        for d in [100, 250, 500]:
            a = RandomForestClassifier(max_depth = c, n_estimators = d)
            res = try_classifier(a, x, y)
            if res >= c_max:
                best_c = c
                best_d = d
                c_max = res

    if best_c is None:
        raise ValueError("testRFC: no parameter combination produced a valid cross-validation score")

    print(best_c, best_d)
    print(c_max)
    print()

    # Refit with the selected n_estimators. The loop variable `d` always holds
    # the last grid value (500) here, so using it ignored the search result.
    final_model = RandomForestClassifier(max_depth = best_c, n_estimators = best_d)
    final_model = run_model(final_model, x, y)
    return final_model

# Random forest on the full feature set, then on the ADP-only baseline.
final_rfc_model = testRFC(x = x_0, y = y_cl)
base_rfc_model = testRFC(x = base_x, y = y_cl)

5 500
0.9341397849462366

Score on training data 0.9475806451612904
Score on test data 0.9194414607948442
5 500
0.9346774193548386

Score on training data 0.943010752688172
Score on test data 0.9140708915145005


### 3a. ADP-based projections

In [13]:
# Columns used by the ADP-only baseline models, in model input order.
base_vars = [
    'AverageDraftPositionPPR',
    'AverageDraftPositionPPRSq',
    'foundAdp',
    'QB',
    'RB',
    'TE',
    'WR',
]

def prepare_draft_template(n_picks = 200):
    """Build one synthetic ADP feature matrix per position.

    Each matrix has ``n_picks`` rows and seven columns, in the order the
    notebook later names them: AverageDraftPositionPPR (1..n_picks), its
    square, foundAdp (always 1), then the QB, RB, TE and WR columns, exactly
    one of which is 1 in a given matrix.

    Parameters
    ----------
    n_picks : int, default 200
        Number of draft positions to generate, starting at 1.

    Returns
    -------
    list of numpy.ndarray
        Four float arrays of shape ``(n_picks, 7)``, ordered QB, RB, TE, WR.
    """
    picks = np.arange(1, n_picks + 1)
    arrs = []
    for position in range(4):
        # Explicit one-hot block; replaces the earlier trick of repeatedly
        # swapping adjacent columns of the previous matrix.
        one_hot = np.zeros((n_picks, 4))
        one_hot[:, position] = 1
        arrs.append(np.column_stack((picks, np.square(picks), np.ones(n_picks), one_hot)))
    return arrs

# Stack the per-position templates row-wise into a single frame.
arrs = prepare_draft_template()
final = pd.DataFrame(np.vstack(arrs))

In [14]:
adp_columns = ['AverageDraftPositionPPR', 'AverageDraftPositionPPRSq', 'foundAdp', 'QB','RB','TE','WR']

# Keep only the template columns before naming them. Without this the cell
# fails on a second run, because `final` then already carries the two
# prediction columns and the 7-name assignment no longer matches its width.
final = final.iloc[:, :len(adp_columns)].copy()
final.columns = adp_columns

# Predicted points from the ADP regression model, and column 1 of
# predict_proba from the ADP logistic model (presumably the probability of
# aboveBase == 1; confirm against base_lr_model.classes_).
final_additions = pd.DataFrame({
    'Predicted Points': adp_pts_model.predict(final),
    'Predicted Prob': base_lr_model.predict_proba(final)[:, 1],
})
final = final.join(final_additions)

In [38]:
# excelBook = load_workbook(EXPORT_FILE)
# with pd.ExcelWriter(EXPORT_FILE, engine='xlsxwriter') as writer:
#     print(writer)
#     # Save your file workbook as base
#     writer.sheets = dict((ws.title, ws) for ws in excelBook.worksheets)

#     # Now here add your new sheets
#     final.to_excel(writer,'saw', index = False)

#     # Save the file
#     writer.save()

### 3b. 2022 Player Projections

In [18]:
# Feature frames for the 2022 rows: full feature set (without 'Year') and the
# ADP-only baseline columns.
x_pred_0 = final_pred[x_vars].drop('Year', axis = 1)
x_pred_adp = final_pred[['AverageDraftPositionPPR', 'AverageDraftPositionPPRSq', 'foundAdp', 'QB','RB','TE','WR']]

# Renumber the rows, then attach the predictions; the model outputs are plain
# arrays, so they are assigned by position.
final_pred = final_pred.reset_index().drop(columns = 'index')
final_pred['Preds'] = full_pts_model.predict(x_pred_0)
final_pred['Preds_adp'] = adp_pts_model.predict(x_pred_adp)
final_pred['Prob'] = final_lr_model.predict_proba(x_pred_0)[:, 1]
final_pred = final_pred.sort_values('AverageDraftPositionPPR', ascending = True)

# Columns kept in the exported projections.
projection_columns = ['Player','Tm','FantPos','PrvPts_PPR','AverageDraftPositionPPR','OldQBs',
                      'NewQBs','DOB','Draft','Age','PrvYrTmPts','PlayersAtPosition',
                      'PrvYrPtsShare','Preds','Preds_adp','Prob']
preds = final_pred[projection_columns].copy()

# Persist the projections as a pickle.
with open('../projections/player_proj_2022.p', 'wb') as handle:
    pickle.dump(preds, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
EXPORT_FILE = './projections2022.xlsx'

# Append mode needs an existing workbook and raises FileNotFoundError
# otherwise, so fall back to write mode on the first export.
# `if_sheet_exists` is only accepted in append mode.
if os.path.exists(EXPORT_FILE):
    writer_options = {'mode': 'a', 'if_sheet_exists': 'overlay'}
else:
    writer_options = {'mode': 'w'}

# NOTE(review): 'overlay' writes over the existing sheet without clearing it,
# so rows left from a longer earlier export may remain below the new data.
with pd.ExcelWriter(EXPORT_FILE, engine='openpyxl', **writer_options) as writer:
    final.to_excel(writer, sheet_name = 'ADPBase', index = False)
    preds.to_excel(writer, sheet_name = 'Players2022', index = False)