In [5]:
# Install the third-party packages this notebook imports (pandas, scikit-learn).
# No versions are pinned, so whatever release pip resolves at run time is installed.
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
# Load the player/MVP dataset; the relative path is resolved against the kernel's working directory.
stats = pd.read_csv('player_mvp_stats.csv')

In [4]:
# Replace every missing value (in all columns) with 0 and rebind `stats` to the result.
stats = stats.fillna(0)
# Show the available column names.
stats.columns

Index(['Unnamed: 0', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'Year', 'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%',
       'GB', 'PS/G', 'PA/G', 'SRS'],
      dtype='object')

## Filtering Numerical Columns for Predictor Variables

In this section, we keep only numerical columns of the dataset as predictors and leave out the non-numeric columns we don't need (such as `Player`, `Pos`, `Tm` and `Team`). We also leave out the MVP voting columns `Pts Won`, `Pts Max` and `Share`: `Share` is the target we are trying to predict, and the MVP points columns directly influence it, so keeping them as predictors would leak the answer into the model.

### Steps:
1. Select only the numerical columns from the dataset.
2. Exclude the MVP voting columns (`Pts Won`, `Pts Max` and the target `Share`) from the predictors to avoid data leakage.
3. Prepare the predictor variables `X` for model training.


In [5]:
# 'Unnamed: 0' is not used as a feature (presumably a row index saved into the CSV);
# drop it when present and do nothing when it is already gone.
stats = stats.drop(columns=['Unnamed: 0'], errors='ignore')

# Feature columns fed to the model. Compared with the full column list, this omits
# the text columns ('Player', 'Pos', 'Tm', 'Team') and the MVP voting columns
# ('Pts Won', 'Pts Max', 'Share').
predictors = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'Year',
    'W', 'L', 'W/L%',
    'GB', 'PS/G', 'PA/G', 'SRS',
]

In [6]:
# Chronological hold-out: fit on every season before 2021, evaluate on the 2021 season.
train = stats.loc[stats['Year'].lt(2021)]
test = stats.loc[stats['Year'].eq(2021)]

In [7]:
from sklearn.linear_model import Ridge

# Ridge (L2-regularized linear) regression; alpha=0.1 sets the regularization strength.
reg = Ridge(alpha=0.1)

In [8]:
# Fit on the training rows (seasons before 2021), using the "Share" column as the target.
reg.fit(train[predictors], train["Share"])

In [9]:
# Predict "Share" for the held-out 2021 rows.
predictions = reg.predict(test[predictors])

In [10]:
# Wrap the predictions in a one-column DataFrame that reuses test's index,
# so each prediction can be lined up with its test row.
# Note: this rebinds `predictions` from the predict() result to a DataFrame.
predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)

In [11]:
# Inspect the predictions. Some values are negative: the linear model's output
# is not constrained to be non-negative.
predictions

Unnamed: 0,predictions
630,0.013567
631,-0.013756
632,0.002414
633,-0.004421
634,0.010734
...,...
13897,-0.012571
13898,-0.011575
13899,0.016424
13900,-0.020434


In [12]:
# Put each player's actual Share next to the predicted value
# (column-wise concat, aligned on the shared index).
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [13]:
# Preview actual vs. predicted share per player.
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.013567
631,Austin Rivers,0.0,-0.013756
632,Bol Bol,0.0,0.002414
633,Facundo Campazzo,0.0,-0.004421
634,Greg Whittington,0.0,0.010734
...,...,...,...
13897,Patty Mills,0.0,-0.012571
13898,Quinndary Weatherspoon,0.0,-0.011575
13899,Rudy Gay,0.0,0.016424
13900,Tre Jones,0.0,-0.020434


In [14]:
# Order by actual Share to see how the real top vote-getters were scored.
# Display only: `combination` itself is not reassigned here.
combination.sort_values("Share", ascending=False)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.154307
8624,Joel Embiid,0.580,0.162713
3651,Stephen Curry,0.449,0.142386
9907,Giannis Antetokounmpo,0.345,0.207436
1389,Chris Paul,0.138,0.072294
...,...,...,...
4171,Chris Chiozza,0.000,0.007047
4170,Bruce Brown,0.000,0.003656
4169,Blake Griffin,0.000,0.011746
4168,Andre Roberson,0.000,-0.026907


In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual and predicted Share on the 2021 test rows.
mean_squared_error(combination["Share"], combination["predictions"])

np.float64(0.0026668954567104337)

In [16]:
# Almost all players have a Share of 0 (they received no MVP votes), so the MSE
# above is dominated by those rows and says little about how well the top
# candidates are ordered. This makes MSE a poor metric for this task.
combination["Share"].value_counts()

Share
0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: count, dtype: int64

In [22]:
# Rank players by their actual MVP vote share (1 = highest Share).
# Rows with equal Share (e.g. the many players at 0.0) still receive distinct,
# consecutive ranks in whatever order the sort leaves them.
combination = (
    combination
    .sort_values("Share", ascending=False)
    .assign(Rk=lambda ranked: list(range(1, len(ranked) + 1)))
)

In [23]:
# Top 10 rows; `combination` is currently ordered by actual Share.
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.154307,1
8624,Joel Embiid,0.58,0.162713,2
3651,Stephen Curry,0.449,0.142386,3
9907,Giannis Antetokounmpo,0.345,0.207436,4
1389,Chris Paul,0.138,0.072294,5
10997,Luka Dončić,0.042,0.15143,6
7464,Damian Lillard,0.038,0.116303,7
3536,Julius Randle,0.02,0.088878,8
3531,Derrick Rose,0.01,0.033,9
11358,Rudy Gobert,0.008,0.095349,10


In [24]:
# Rank players by the model's predicted share (1 = highest prediction),
# re-ordering the frame by prediction in the process.
combination = (
    combination
    .sort_values("predictions", ascending=False)
    .assign(Predicted_Rk=lambda ranked: list(range(1, len(ranked) + 1)))
)

In [25]:
# Top 10 rows; `combination` is now ordered by predicted share, so Rk (actual rank)
# can be compared against Predicted_Rk.
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
9907,Giannis Antetokounmpo,0.345,0.207436,4,1
8624,Joel Embiid,0.58,0.162713,2,2
641,Nikola Jokić,0.961,0.154307,1,3
10997,Luka Dončić,0.042,0.15143,6,4
3736,LeBron James,0.001,0.147511,15,5
3651,Stephen Curry,0.449,0.142386,3,6
4177,Kevin Durant,0.0,0.14135,529,7
4174,James Harden,0.001,0.140598,13,8
11784,Zion Williamson,0.0,0.127926,253,9
3876,Russell Westbrook,0.005,0.120228,11,10


In [32]:
# Average precision (AP) metric
def find_ap(combination, top_n=5):
    """Average precision of the predicted ranking against the true top vote-getters.

    The ``top_n`` rows with the highest ``Share`` are treated as the relevant
    players. Rows are then walked in descending order of ``predictions``; each
    time a relevant player is met, the precision so far (hits / rows seen) is
    recorded. The result is the mean of the recorded precisions.

    Parameters
    ----------
    combination : pandas.DataFrame
        Must contain the columns ``"Player"``, ``"Share"`` and ``"predictions"``.
    top_n : int, default 5
        How many of the highest-``Share`` rows count as relevant.

    Returns
    -------
    float
        Mean precision over the relevant players, or ``0.0`` when no relevant
        player is found (for example, an empty frame or ``top_n=0``).
    """
    actual = combination.sort_values("Share", ascending=False).head(top_n)
    predicted = combination.sort_values("predictions", ascending=False)

    # Built once, outside the loop, so each membership test is a cheap set lookup.
    top_players = set(actual["Player"])

    ps = []
    found = 0
    # Only the Player column is needed, so iterate it directly; `seen` is the
    # 1-based position in the predicted ranking.
    for seen, player in enumerate(predicted["Player"], start=1):
        if player in top_players:
            found += 1
            ps.append(found / seen)

    # Guard against dividing by zero when nothing relevant was encountered.
    if not ps:
        return 0.0
    return sum(ps) / len(ps)

In [33]:
# Score how well the predicted ordering surfaces the actual top-5 MVP vote-getters of the 2021 test season.
find_ap(combination)

0.7636363636363637