In [None]:
import pandas as pd
# Load player_mvp_stats.csv and drop the 'Unnamed: 0' column
# (presumably a saved index column — TODO confirm).
stats = pd.read_csv('player_mvp_stats.csv',encoding='latin-1')
del stats['Unnamed: 0']
stats

# Count the null values in each column.
pd.isnull(stats).sum()
# Drop the last 11 rows.
# NOTE(review): the reason is not visible in this notebook — confirm what those rows contain.
stats = stats[:-11]

# Look at the rows whose "3P%" is null. Per the author, the nulls sit in percentage
# columns for players who attempted 0 shots of that type.
stats[pd.isnull(stats["3P%"])][["Player", "3P%"]]

# Replace every remaining null with 0.
stats = stats.fillna(0)
stats.head()

stats.columns



Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [None]:
# Feature columns passed to the models. Compared with stats.columns this leaves out
# 'Player', 'Pos', 'Tm' and 'Team', as well as the voting columns 'Pts Won', 'Pts Max'
# and 'Share' ('Share' is the prediction target).
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [None]:

# Hold out the rows with Year == 2021 as the test set and train on every row
# from an earlier year.
train = stats[stats['Year'] < 2021]
test = stats[stats['Year'] == 2021]

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression model; alpha=.1 is its regularization strength.
reg = Ridge(alpha=.1)

In [None]:
# Fit the ridge model on the training rows, using the predictor columns to predict "Share".
reg.fit(train[predictors], train["Share"])

# Predict Share for the held-out rows.
predictions = reg.predict(test[predictors])

# Wrap the raw predictions in a DataFrame that reuses test's index, so they can be
# joined back to the test rows by label.
predictions = pd.DataFrame(predictions, columns=['predictions'], index = test.index)
predictions

Unnamed: 0,predictions
630,0.013543
631,-0.013747
632,0.002437
633,-0.004468
634,0.010707
...,...
13897,-0.012564
13898,-0.011559
13899,0.016469
13900,-0.020427


In [None]:
# Place the test rows' "Player" and actual "Share" next to the model's predictions
# (column-wise concat, matched on the shared index).
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.013543
631,Austin Rivers,0.0,-0.013747
632,Bol Bol,0.0,0.002437
633,Facundo Campazzo,0.0,-0.004468
634,Greg Whittington,0.0,0.010707
...,...,...,...
13897,Patty Mills,0.0,-0.012564
13898,Quinndary Weatherspoon,0.0,-0.011559
13899,Rudy Gay,0.0,0.016469
13900,Tre Jones,0.0,-0.020427


In [None]:
# Show the 10 rows with the highest actual Share together with their predictions.
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
641,Nikola JokiÛ,0.961,0.154287
8624,Joel Embiid,0.58,0.162746
3651,Stephen Curry,0.449,0.142361
9907,Giannis Antetokounmpo,0.345,0.207471
1389,Chris Paul,0.138,0.072276
10997,Luka DonÛiÛ,0.042,0.151395
7464,Damian Lillard,0.038,0.116263
3536,Julius Randle,0.02,0.0889
3531,Derrick Rose,0.01,0.033035
11358,Rudy Gobert,0.008,0.095352


In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error between actual and predicted Share; this metric ignores ranking.
mean_squared_error(combination["Share"], combination["predictions"])

0.0026670698853649977

In [None]:
# Count how often each actual Share value occurs among the test rows.
combination["Share"].value_counts()

Share
0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: count, dtype: int64

In [None]:
# Add the actual rank "Rk": sort by Share (highest first) and number the rows 1..N.
# Rows with equal Share still get distinct consecutive ranks, in whatever order
# the sort leaves them.
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola JokiÛ,0.961,0.154287,1
8624,Joel Embiid,0.58,0.162746,2
3651,Stephen Curry,0.449,0.142361,3
9907,Giannis Antetokounmpo,0.345,0.207471,4
1389,Chris Paul,0.138,0.072276,5
10997,Luka DonÛiÛ,0.042,0.151395,6
7464,Damian Lillard,0.038,0.116263,7
3536,Julius Randle,0.02,0.0889,8
3531,Derrick Rose,0.01,0.033035,9
11358,Rudy Gobert,0.008,0.095352,10


In [None]:
# Add the predicted rank "pred_Rk": sort by the model's prediction (highest first)
# and number the rows 1..N.
combination = combination.sort_values("predictions", ascending=False)
combination["pred_Rk"] = list(range(1,combination.shape[0]+1))
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,pred_Rk
9907,Giannis Antetokounmpo,0.345,0.207471,4,1
8624,Joel Embiid,0.58,0.162746,2,2
641,Nikola JokiÛ,0.961,0.154287,1,3
10997,Luka DonÛiÛ,0.042,0.151395,6,4
3736,LeBron James,0.001,0.147549,15,5
3651,Stephen Curry,0.449,0.142361,3,6
4177,Kevin Durant,0.0,0.141431,531,7
4174,James Harden,0.001,0.1406,13,8
11784,Zion Williamson,0.0,0.127925,251,9
3876,Russell Westbrook,0.005,0.120263,11,10


In [None]:
# Average precision (AP) for the MVP race: take the 5 players with the highest actual Share,
# walk the players from highest to lowest prediction, record found/seen each time one of
# those 5 appears, and average the recorded values (1.0 means all 5 were ranked first).

def find_ap(combination):
  """Average precision of the predicted ordering against the actual top 5.

  The 5 rows with the highest "Share" are the relevant players. Rows are then
  visited from highest to lowest "predictions"; each time a relevant player is
  met, the fraction (relevant players found so far) / (rows seen so far) is
  recorded. The result is the mean of those fractions.

  Args:
    combination: DataFrame with "Player", "Share" and "predictions" columns.

  Returns:
    The average precision as a float, or 0.0 when no relevant player is found
    (for example when the frame is empty) instead of dividing by zero.
  """
  top_actual = combination.sort_values("Share", ascending=False).head(5)
  ranked = combination.sort_values("predictions", ascending=False)
  # One vectorized membership test instead of a per-row lookup inside iterrows().
  is_hit = ranked["Player"].isin(top_actual["Player"]).tolist()
  precisions = []
  for position, hit in enumerate(is_hit, start=1):
    if hit:
      # len(precisions) + 1 is the number of relevant players found so far.
      precisions.append((len(precisions) + 1) / position)
  if not precisions:
    return 0.0
  return sum(precisions) / len(precisions)

# Average precision for the rows currently held in `combination`.
find_ap(combination)


0.7636363636363636

In [None]:
# Walk forward through the seasons: for each test year, fit on all rows from earlier
# years, predict that year, and record its average precision; then average over years.

years = list(range(1991,2022))
aps = []
all_predictions = []
# years[5:] starts at 1996, so 1991-1995 are only ever used for training.
for year in years[5:]:
  train = stats[stats["Year"] < year]
  test = stats[stats["Year"] == year]
  reg.fit(train[predictors], train["Share"])
  predictions = reg.predict(test[predictors])
  predictions = pd.DataFrame(predictions, columns=['predictions'], index = test.index)
  combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
  # Keep each year's frame; all_predictions is a list with one DataFrame per test year.
  all_predictions.append(combination)
  aps.append(find_ap(combination))

# Mean average precision over all test years.
sum(aps)/len(aps)


0.7112884360789578

In [None]:
def add_ranks(combination):
  """Return a copy of `combination` with actual and predicted rank columns.

  Adds "Rk" (1 = highest "Share"), "pred_Rk" (1 = highest "predictions") and
  "diff" (Rk - pred_Rk). The returned frame is ordered by "predictions",
  highest first; the frame passed in is left untouched.
  """
  # Both rank columns number the same rows, so one 1..N sequence serves for both.
  positions = list(range(1, len(combination) + 1))

  by_share = combination.sort_values("Share", ascending=False)
  by_share["Rk"] = positions

  by_prediction = by_share.sort_values("predictions", ascending=False)
  by_prediction["pred_Rk"] = positions
  by_prediction["diff"] = by_prediction["Rk"] - by_prediction["pred_Rk"]
  return by_prediction

# Rank the second backtested season (all_predictions[1], i.e. test year 1997) and
# order the rows by diff = Rk - pred_Rk, largest first.
add_ranks(all_predictions[1]).sort_values("diff", ascending=False)



Unnamed: 0,Player,Share,predictions,Rk,pred_Rk,diff
4680,Matt Fish,0.0,0.038264,440,46,394
4669,Bruce Bowen,0.0,0.077537,398,13,385
3583,Rasheed Wallace,0.0,0.049905,414,29,385
5125,Anfernee Hardaway,0.0,0.058405,391,22,369
4675,James Scott,0.0,0.040917,392,42,350
...,...,...,...,...,...,...
9850,Sean Elliott,0.0,-0.019072,23,374,-351
85,A.C. Green,0.0,-0.024921,38,394,-356
10383,Hubert Davis,0.0,-0.026893,43,403,-360
9497,Lorenzo Williams,0.0,-0.036818,64,428,-364


In [None]:
# Rank the same season again and keep the result for inspection.
ranking = add_ranks(all_predictions[1])

# Restrict to the actual top 5 (Rk <= 5) to see where the model placed them.
ranking[ranking["Rk"] <= 5].sort_values("diff", ascending=False)


Unnamed: 0,Player,Share,predictions,Rk,pred_Rk,diff
1600,Karl Malone,0.857,0.192318,1,2,-1
10524,Michael Jordan,0.832,0.167629,2,3,-1
908,Grant Hill,0.327,0.128646,3,6,-3
4682,Tim Hardaway,0.207,0.059984,4,20,-16
8248,Glen Rice,0.117,0.03311,5,53,-48


In [None]:
# Wrap the per-year prediction logic in a single backtesting function that returns the
# mean average precision (AP) across all test years as one summary score (higher is better).

def backtest(stats, model, year, predictors):
  """Walk-forward backtest of `model` over the given test years.

  For each test year the model is fitted on every row of `stats` from an
  earlier year and used to predict "Share" for that year's rows. The
  predictions are ranked with add_ranks and scored with find_ap.

  Args:
    stats: DataFrame with "Year", "Player" and "Share" columns plus every
      column named in `predictors`.
    model: estimator exposing fit(X, y) and predict(X); it is refitted for
      every test year.
    year: the test years to evaluate, as an iterable (e.g. years[5:]); a
      single scalar year is also accepted.
    predictors: list of feature column names.

  Returns:
    A tuple (mean_ap, aps, all_predictions): the mean of the per-year average
    precisions, the list of per-year values, and one DataFrame holding the
    ranked predictions of all test years concatenated.

  Raises:
    ValueError: if `year` contains no test years.
  """
  # Use the years passed by the caller rather than a notebook-level global, and
  # train the model that was passed in rather than a notebook-level estimator.
  test_years = [year] if pd.api.types.is_scalar(year) else list(year)
  if not test_years:
    raise ValueError("backtest needs at least one test year")

  aps = []
  all_predictions = []
  for test_year in test_years:
    train = stats[stats["Year"] < test_year]
    test = stats[stats["Year"] == test_year]
    model.fit(train[predictors], train["Share"])
    predictions = model.predict(test[predictors])
    # Reuse test's index so the predictions line up with the test rows in concat.
    predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    combination = add_ranks(combination)
    all_predictions.append(combination)
    aps.append(find_ap(combination))
  return sum(aps)/len(aps), aps, pd.concat(all_predictions)

# Backtest the ridge model from 1996 onward. This rebinds `aps` and `all_predictions`;
# all_predictions is now one concatenated DataFrame rather than a list of frames.
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

mean_ap


0.7112884360789578

In [None]:
# Among rows whose actual rank within their season is 5 or better, show the 10 with the
# most negative diff (Rk - pred_Rk), i.e. the players the model ranked furthest too low.
all_predictions[all_predictions["Rk"] <= 5].sort_values("diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,pred_Rk,diff
1224,Jason Kidd,0.712,0.02821,2,52,-50
8248,Glen Rice,0.117,0.03311,5,53,-48
5175,Steve Nash,0.839,0.034099,1,45,-44
8516,Peja StojakoviÛ,0.228,0.036262,4,38,-34
5193,Steve Nash,0.739,0.054125,1,34,-33
12726,Joakim Noah,0.258,0.046969,4,37,-33
3657,Chauncey Billups,0.344,0.052694,5,35,-30
1389,Chris Paul,0.138,0.072276,5,33,-28
5208,Steve Nash,0.785,0.074418,2,21,-19
4682,Tim Hardaway,0.207,0.059984,4,20,-16


In [None]:
# Pair each coefficient of the most recently fitted `reg` with its predictor name
# and sort from the largest coefficient to the smallest.
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

# Express each player's PTS/AST/STL/BLK/3P as a ratio to that season's mean value.
# transform("mean") returns the per-Year mean for every row while keeping the index
# of `stats`, so the ratios stay attached to the right player when they are later
# assigned back by index label. (A groupby-apply followed by reset_index(drop=True)
# can return rows in Year-group order and then discards the original labels.)
# The "Year" column is no longer carried along; its ratio was a constant 1.0.
ratio_columns = ["PTS", "AST", "STL", "BLK", "3P"]
season_means = stats.groupby("Year")[ratio_columns].transform("mean")
stat_ratios = stats[ratio_columns] / season_means
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14076,1.207728,1.207491,0.971223,0.000000,2.393794,1.0
14077,0.257201,0.201248,0.647482,0.240964,0.099741,1.0
14078,1.274824,0.704370,1.133094,1.445783,1.695604,1.0
14079,0.279567,0.553433,0.323741,0.000000,0.099741,1.0


In [None]:
# Reindex the ratios to the labels of stats.
# NOTE(review): `aligned_stat_ratios` is not used by any cell shown in this notebook;
# the assignment below reads `stat_ratios` directly.
aligned_stat_ratios = stat_ratios.reindex(stats.index)

# Attach the ratio columns to stats. pandas matches the right-hand side to stats by
# index label, not by position. The points ratio is stored as "PTS_T" while the other
# columns use the "_R" suffix; a later cell refers to it by that name.
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [None]:
# Add the season-ratio features to the predictor list and re-run the ridge backtest.
# A new list is built and names already present are skipped, so running this cell
# more than once does not add duplicate feature columns (an in-place `+=` would grow
# the list on every run).
ratio_predictors = ["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]
predictors = predictors + [col for col in ratio_predictors if col not in predictors]
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)
mean_ap

0.713278284329526

In [None]:
# List the distinct position labels.
stats["Pos"].unique()
# Encode position ("Pos") and team ("Tm") as integer category codes.
# NOTE(review): "NPos" and "NTm" are not added to `predictors` in the cells shown,
# so the models below do not use them — confirm whether that is intended.
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R,NPos
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,2
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,12
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,2
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,2
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,8


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 50 trees, a fixed random_state for repeatable results, and
# min_samples_split=5.
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

# Backtest the forest, passing only the last three seasons (years[28:] is 2019-2021).
# NOTE(review): the output saved beneath this cell is identical to the ridge result
# saved earlier for years[5:], which is suspicious for a different model and year
# range — re-run and confirm that the forest is what is being scored.
mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)
mean_ap

0.713278284329526

In [None]:
# Linear (ridge) model, called with the same year slice as the random forest cell
# (years[28:], i.e. 2019-2021) so the two can be compared.
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)
mean_ap

0.713278284329526