In [55]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from io import StringIO
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import math

In [38]:
# Load the per-player season table and tidy it up in a single chain.
stats = (
    pd.read_csv("fullstats.csv")
    .drop(columns="Unnamed: 0")  # presumably a leftover index column from an earlier to_csv
    .head(14627)                 # weird stuff happened at the end of the df, so keep only the rows before it
    .fillna(0)                   # every missing value (including MVP vote share) becomes 0
)
stats.head()

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,Pts Won,Share,TeamFull,W,L,W/L%,GB,PS/G,PA/G,SRS
0,170.0,Antoine Carr,27.0,ATL,PF,78.0,12.0,19.1,2.9,6.0,...,0.0,0.0,Atlanta Hawks,52.0,30.0,0.634,11.0,111.0,106.1,5.26
1,143.0,Cliff Levingston,28.0,ATL,PF,80.0,52.0,27.3,3.8,7.1,...,0.0,0.0,Atlanta Hawks,52.0,30.0,0.634,11.0,111.0,106.1,5.26
2,91.0,Doc Rivers,27.0,ATL,PG,76.0,76.0,32.4,4.9,10.7,...,0.0,0.0,Atlanta Hawks,52.0,30.0,0.634,11.0,111.0,106.1,5.26
3,7.0,Dominique Wilkins,29.0,ATL,SF,80.0,80.0,37.5,10.2,22.0,...,0.0,0.0,Atlanta Hawks,52.0,30.0,0.634,11.0,111.0,106.1,5.26
4,300.0,Duane Ferrell,23.0,ATL,SF,41.0,0.0,5.6,0.9,2.0,...,0.0,0.0,Atlanta Hawks,52.0,30.0,0.634,11.0,111.0,106.1,5.26


In [39]:
# Show every column label in the loaded frame.
stats.columns

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Awards', 'Year', 'Pts Won', 'Share', 'TeamFull', 'W', 'L', 'W/L%',
       'GB', 'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [40]:
# Numeric columns fed to the model. "Share" is the target, so it is left out of this list.
predictors = [
    'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'Year',
    'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
]

# Split by season: everything before 2024 is training data, 2024 itself is the hold-out.
train = stats.loc[stats["Year"] < 2024]
test = stats.loc[stats["Year"] == 2024]

In [41]:
# Ridge regression with regularisation strength alpha=0.1, fit on the
# training seasons with the "Share" column as the target.
reg = Ridge(alpha=0.1)
reg.fit(train.loc[:, predictors], train.loc[:, "Share"])

In [42]:
# Predict the target for the hold-out rows and keep the hold-out index so the
# predictions can be lined up with the original rows later.
predictions = pd.DataFrame(
    {"Predictions": reg.predict(test[predictors])},
    index=test.index,
)
predictions.head()

Unnamed: 0,Predictions
500,-0.028303
501,0.002377
502,-0.004372
503,0.006914
504,-0.00254


In [47]:
# Put the real vote share and the model output side by side (aligned on the index).
combined = pd.concat([test[["Player", "Share"]], predictions], axis=1)

# Rk: position when ordered by the real share, best first.
combined = combined.sort_values("Share", ascending=False)
combined["Rk"] = range(1, len(combined) + 1)

# Predicted_Rk: position when ordered by the model output, best first.
combined = combined.sort_values("Predictions", ascending=False)
combined["Predicted_Rk"] = range(1, len(combined) + 1)

combined.head(10)
# Embiid is ranked first by the model despite having no share; the model is arguably still doing
# well here, since he had an MVP-level season but did not play enough games to be considered.

Unnamed: 0,Player,Share,Predictions,Rk,Predicted_Rk
10852,Joel Embiid,0.0,0.222221,372,1
8027,Giannis Antetokounmpo,0.194,0.222129,4,2
3157,Luka Dončić,0.572,0.203762,3,3
3644,Nikola Jokić,0.935,0.187863,1,4
9876,Shai Gilgeous-Alexander,0.646,0.178363,2,5
6675,Anthony Davis,0.0,0.141029,213,6
6686,LeBron James,0.0,0.132319,224,7
1005,Jayson Tatum,0.087,0.121937,6,8
12837,Victor Wembanyama,0.0,0.110933,445,9
7017,Ja Morant,0.0,0.108134,234,10


In [52]:
# Time to make an error metric.
# Methodology:
# * reward the model when a player who actually finished in the real top 5 of the voting is ranked near the top of the predictions
# * if a real top-5 player is predicted lower down, penalize based on how far down the predicted list they appear
def find_error(comb, top_n=5):
    """Score how early the real top vote-getters show up in the predicted ranking.

    Despite the name, this is a score rather than an error: 1.0 is perfect and
    lower values mean the real top players sit further down the predicted list.

    The rows of ``comb`` are walked in descending order of ``"Predictions"``.
    Each time a player from the real top ``top_n`` (by ``"Share"``) is
    encountered, the running precision ``found / seen`` is recorded, where
    ``seen`` is the 1-based position in the predicted list and ``found`` is the
    number of real top players met so far. The result is the mean of those
    recorded precisions.

    Parameters
    ----------
    comb : pandas.DataFrame
        Must contain the columns ``"Player"``, ``"Share"`` and ``"Predictions"``.
    top_n : int, default 5
        How many of the highest-``"Share"`` rows count as the real top players.

    Returns
    -------
    float
        Mean precision in ``[0, 1]``. Returns ``0.0`` when no real top player
        is found (for example when ``comb`` has no rows) instead of dividing
        by zero.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Build the lookup once; matching is by player name, as before.
    top_actual = set(
        comb.sort_values(by="Share", ascending=False).head(top_n)["Player"]
    )
    ranked_players = comb.sort_values(by="Predictions", ascending=False)["Player"]

    precisions = []
    for seen, player in enumerate(ranked_players.tolist(), start=1):
        if player in top_actual:
            # len(precisions) + 1 is the number of real top players found so far.
            precisions.append((len(precisions) + 1) / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)
# Score the ranked frame held in `combined` (the 2024 hold-out when the cells are run top to bottom).
print(find_error(combined))
# 1 is perfect; a lower number means we have to go further down the predicted list to find the real MVP candidates

0.6202564102564103


In [53]:
# Backtesting: for every evaluated season, train on all earlier seasons only,
# predict that season, rank the players and score the ranking with find_error.
#
# The work is done inside functions with local names so that this cell no
# longer overwrites the notebook-level `train`, `test`, `predictions` and
# `combined` created by the 2024 cells above.

# Number of leading seasons skipped so the first evaluated season has some history to train on.
MIN_TRAIN_SEASONS = 5


def add_ranks(ranked):
    """Return ``ranked`` with ``Rk`` and ``Predicted_Rk`` columns added.

    ``Rk`` is the 1-based position when ordered by ``"Share"`` (descending) and
    ``Predicted_Rk`` is the 1-based position when ordered by ``"Predictions"``
    (descending). The returned frame is left sorted by ``"Predictions"``.
    """
    ranked = ranked.sort_values(by="Share", ascending=False)
    ranked["Rk"] = list(range(1, ranked.shape[0] + 1))
    ranked = ranked.sort_values(by="Predictions", ascending=False)
    ranked["Predicted_Rk"] = list(range(1, ranked.shape[0] + 1))
    return ranked


def backtest(stats, model, predictors, years):
    """Walk forward through ``years`` and evaluate ``model`` on each one.

    Parameters
    ----------
    stats : pandas.DataFrame
        Full table with a ``"Year"`` column, the ``predictors`` columns,
        ``"Player"`` and the ``"Share"`` target.
    model : estimator with ``fit`` / ``predict``
        Refit in place for every season, so after the call it holds the fit
        for the last evaluated season.
    predictors : list of str
        Feature column names.
    years : iterable of int
        Seasons to evaluate, in order.

    Returns
    -------
    (list of pandas.DataFrame, list of float)
        One ranked frame and one find_error score per evaluated season.
        Seasons with no training rows or no rows of their own are skipped.
    """
    season_preds = []
    season_scores = []
    for year in years:
        train_rows = stats[stats["Year"] < year]
        test_rows = stats[stats["Year"] == year]
        if train_rows.empty or test_rows.empty:
            # Nothing to fit on or nothing to score; fitting or scoring would fail.
            continue
        model.fit(train_rows[predictors], train_rows["Share"])
        predicted = pd.DataFrame(
            model.predict(test_rows[predictors]),
            columns=["Predictions"],
            index=test_rows.index,
        )
        ranked = add_ranks(
            pd.concat([test_rows[["Player", "Share"]], predicted], axis=1)
        )
        season_preds.append(ranked)
        season_scores.append(find_error(ranked))
    return season_preds, season_scores


years = list(range(1989, 2024))
# `reg` is passed in directly, so it ends up fit on the data before the last
# backtested season, exactly as the inline loop used to leave it.
preds, scores = backtest(stats, reg, predictors, years[MIN_TRAIN_SEASONS:])

In [58]:
# Average the per-season scores collected by the backtest.
print(sum(scores) / len(scores))
# not too shabby: the algorithm works decently well across past seasons, without Joel Embiid's 2024 season skewing the result

0.7581544496221915


In [64]:
# Diff = real rank minus predicted rank. A large negative value means the
# model placed the player much lower than the voters did.
for ranked in preds:
    ranked["Diff"] = ranked["Rk"].sub(ranked["Predicted_Rk"])

# Real top 10 of the first backtested season, largest Diff first.
print(preds[0].loc[preds[0]["Rk"] < 11].sort_values(by="Diff", ascending=False))

                 Player  Share  Predictions  Rk  Predicted_Rk  Diff
10928   Charles Barkley  0.005     0.135402  10             5     5
13589       Karl Malone  0.017     0.135139   8             6     2
9942   Shaquille O'Neal  0.286     0.161528   4             3     1
12384    David Robinson  0.723     0.204884   2             1     1
4718    Hakeem Olajuwon  0.880     0.170842   1             2    -1
1740     Scottie Pippen  0.386     0.139689   3             4    -1
12911        Shawn Kemp  0.017     0.108850   7             8    -1
9232      Patrick Ewing  0.252     0.134021   5             7    -2
2234         Mark Price  0.007     0.072534   9            12    -3
12904       Gary Payton  0.020     0.033434   6            47   -41


In [69]:
# Stack every backtested season, keep players with a real rank of 10 or better,
# and list the ten the model under-ranked the most (most negative Diff first).
allpreds = pd.concat(preds)
allpreds.loc[allpreds["Rk"].lt(11)].sort_values("Diff").head(10)
# Future work: look into these players and see what could be throwing the model off.

Unnamed: 0,Player,Share,Predictions,Rk,Predicted_Rk,Diff
1440,Glen Rice,0.117,0.032254,5,52,-47
12904,Gary Payton,0.02,0.033434,6,47,-41
11167,Steve Nash,0.006,0.034497,9,46,-37
11080,Steve Nash,0.839,0.039342,1,37,-36
13064,Ray Allen,0.032,0.034032,10,41,-31
11095,Steve Nash,0.739,0.059485,1,31,-30
791,Paul Pierce,0.017,0.0506,6,35,-29
8693,Jason Kidd,0.712,0.038708,2,31,-29
12044,Peja Stojaković,0.228,0.042534,4,31,-27
831,Rajon Rondo,0.01,0.045316,8,34,-26


In [73]:
# Pair each predictor name (column 0) with its fitted coefficient (column 1),
# largest coefficient first.
coef_table = pd.concat([pd.Series(predictors), pd.Series(reg.coef_)], axis=1)
coef_table.sort_values(by=1, ascending=False)

Unnamed: 0,0,1
12,eFG%,0.103467
28,W/L%,0.038313
17,DRB,0.029821
9,2P,0.028974
16,ORB,0.016009
20,STL,0.013214
21,BLK,0.010373
14,FTA,0.00896
6,3P,0.00835
24,PTS,0.008327


In [None]:
# future: add more predictors for the model, run on the upcoming season?