In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
stats = pd.read_csv('Data/player_mvp_stats.csv')

In [5]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14088,14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14089,14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14090,14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [6]:
del stats['Unnamed: 0']

Comprobamos si tenemos algún dato faltante:

In [7]:
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          50
3P            0
3PA           0
3P%        2042
2P            0
2PA           0
2P%          84
eFG%         50
FT            0
FTA           0
FT%         462
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [8]:
stats[pd.isnull(stats['3P%'])][['Player',"3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14061,Evan Eschmeyer,0.0
14062,Gheorghe Mureșan,0.0
14064,Jim McIlvaine,0.0
14070,Mark Hendrickson,0.0


In [9]:
stats[pd.isnull(stats['FT%'])][['Player',"FTA"]]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
13951,Mark McNamara,0.0
13979,Luke Zeller,0.0
14032,Myron Brown,0.0
14054,Malcolm Lee,0.0


Con esto nos damos cuenta de que los datos faltantes de los porcentajes de tiro se deben a que dicho jugador no realizó ningún tiro de ese tipo. Entonces vamos a modificar los `NaN` por 0

In [10]:
stats = stats.fillna(0.0)
pd.isnull(stats).sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

In [11]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

Vamos a empezar con el modelo de Machine Learning. Para ello, tenemos que elegir que variables vamos a tener en cuenta y cúales no. No vamos a tener en cuenta aquellos campos de tipo **string**, por ejemplo `'Player'`, `'Team'` ... Asimismo quitamos `'Pts Won'`, `'Pts Max'`, `'Share'`. Estas variables le indican al algoritmo lo que estamos queriendo predecir, y no queremos eso. 

In [12]:
predictors = [ 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

Vamos a intentar predecir el `Share` que obtiene un jugador para el MVP

In [13]:
train = stats[stats['Year'] < 2021]
test = stats[stats['Year'] == 2021]

In [14]:
reg = Ridge(alpha=.1)

Con fit entrenamos el modelo, al cual le indicamos que intente predecir el `Share`

In [15]:
reg.fit(train[predictors], train['Share'])             

Ridge(alpha=0.1)

In [16]:
predictions =  reg.predict(test[predictors])

In [17]:
predictions = pd.DataFrame(predictions, columns=['predictions'],index=test.index)

In [18]:
predictions

Unnamed: 0,predictions
630,0.013567
631,-0.013756
632,0.002414
633,-0.004421
634,0.010734
...,...
13897,-0.012571
13898,-0.011575
13899,0.016424
13900,-0.020434


Vamos a comparar los valores actuales con los obtenidos. Combinamos entonces los dos DataFrames

In [19]:
combination = pd.concat([test[['Player','Share']], predictions], axis=1)

In [20]:
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.013567
631,Austin Rivers,0.0,-0.013756
632,Bol Bol,0.0,0.002414
633,Facundo Campazzo,0.0,-0.004421
634,Greg Whittington,0.0,0.010734
...,...,...,...
13897,Patty Mills,0.0,-0.012571
13898,Quinndary Weatherspoon,0.0,-0.011575
13899,Rudy Gay,0.0,0.016424
13900,Tre Jones,0.0,-0.020434


In [21]:
combination.sort_values("Share",ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.154307
8624,Joel Embiid,0.58,0.162713
3651,Stephen Curry,0.449,0.142386
9907,Giannis Antetokounmpo,0.345,0.207436
1389,Chris Paul,0.138,0.072294
10997,Luka Dončić,0.042,0.15143
7464,Damian Lillard,0.038,0.116303
3536,Julius Randle,0.02,0.088878
3531,Derrick Rose,0.01,0.033
11358,Rudy Gobert,0.008,0.095349


In [22]:
# Error metric
mse = mean_squared_error
mse(combination['Share'],combination['predictions'])

0.002666895456710407

In [23]:
combination['Share'].value_counts()

0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: Share, dtype: int64

La mayoria de los jugadores no han recibido votos. Por tanto el dato obtenido anteriormente nos engaña un poco.  
Por tanto, creamos una columna para indicar el puesto en la votación del MVP.

In [24]:
combination = combination.sort_values('Share', ascending=False)
combination['Rk'] = list(range(1,combination.shape[0]+1))
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.154307,1
8624,Joel Embiid,0.58,0.162713,2
3651,Stephen Curry,0.449,0.142386,3
9907,Giannis Antetokounmpo,0.345,0.207436,4
1389,Chris Paul,0.138,0.072294,5
10997,Luka Dončić,0.042,0.15143,6
7464,Damian Lillard,0.038,0.116303,7
3536,Julius Randle,0.02,0.088878,8
3531,Derrick Rose,0.01,0.033,9
11358,Rudy Gobert,0.008,0.095349,10


Ordenamos según el resultado predicho por el modelo y añadimos una columna para indicar dicho resultado.

In [25]:
combination = combination.sort_values('predictions', ascending=False)
combination['Predicted_rk'] = list(range(1,combination.shape[0]+1))
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_rk
9907,Giannis Antetokounmpo,0.345,0.207436,4,1
8624,Joel Embiid,0.58,0.162713,2,2
641,Nikola Jokić,0.961,0.154307,1,3
10997,Luka Dončić,0.042,0.15143,6,4
3736,LeBron James,0.001,0.147511,15,5
3651,Stephen Curry,0.449,0.142386,3,6
4177,Kevin Durant,0.0,0.14135,531,7
4174,James Harden,0.001,0.140598,13,8
11784,Zion Williamson,0.0,0.127926,251,9
3876,Russell Westbrook,0.005,0.120228,11,10


In [26]:
combination.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_rk
641,Nikola Jokić,0.961,0.154307,1,3
8624,Joel Embiid,0.58,0.162713,2,2
3651,Stephen Curry,0.449,0.142386,3,6
9907,Giannis Antetokounmpo,0.345,0.207436,4,1
1389,Chris Paul,0.138,0.072294,5,33
10997,Luka Dončić,0.042,0.15143,6,4
7464,Damian Lillard,0.038,0.116303,7,12
3536,Julius Randle,0.02,0.088878,8,24
3531,Derrick Rose,0.01,0.033,9,76
11358,Rudy Gobert,0.008,0.095349,10,19


In [27]:
def find_ap(combination):
    actual = combination.sort_values('Share', ascending=False).head(5)
    predicted = combination = combination.sort_values('predictions', ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps)/len(ps)

In [28]:
find_ap(combination)

0.7636363636363636

In [29]:
years =  list(range(1991,2022))

In [30]:
aps = []
all_predictions = []
for year in years[5:]: 
    train = stats[stats['Year'] < year]
    test = stats[stats['Year'] == year]
    reg.fit(train[predictors], train['Share'])
    predictions =  reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=['predictions'],index=test.index)
    combination = pd.concat([test[['Player','Share']], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [31]:
sum (aps) / len(aps)

0.7112884360789578

In [32]:
def add_ranks(combination):
    combination = combination.sort_values('Share', ascending=False)
    combination['Rk'] = list(range(1,combination.shape[0]+1))
    combination = combination.sort_values('predictions', ascending=False)
    combination['Predicted_rk'] = list(range(1,combination.shape[0]+1))
    combination['Diff'] = combination['Rk']-combination['Predicted_rk']
    return combination

In [33]:
ranking = add_ranks(all_predictions[1])
ranking[ranking['Rk']<5].sort_values('Diff',ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_rk,Diff
1600,Karl Malone,0.857,0.192318,1,2,-1
10524,Michael Jordan,0.832,0.167629,2,3,-1
908,Grant Hill,0.327,0.128646,3,6,-3
4682,Tim Hardaway,0.207,0.059984,4,20,-16


In [34]:
def backtest(stats,model,year,predictors):
    aps = []
    all_predictions = []
    for year in years[5:]: 
        train = stats[stats['Year'] < year]
        test = stats[stats['Year'] == year]
        reg.fit(train[predictors], train['Share'])
        predictions =  reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=['predictions'],index=test.index)
        combination = pd.concat([test[['Player','Share']], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [35]:
mean_ap, ap, all_predictions = backtest(stats,reg,years[5:],predictors)

In [36]:
mean_ap

0.7112884360789578

In [37]:
all_predictions[all_predictions['Rk']<=5].sort_values('Diff').head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_rk,Diff
1224,Jason Kidd,0.712,0.028209,2,52,-50
8248,Glen Rice,0.117,0.03311,5,53,-48
5175,Steve Nash,0.839,0.0341,1,45,-44
8516,Peja Stojaković,0.228,0.036269,4,38,-34
5193,Steve Nash,0.739,0.054128,1,34,-33
12726,Joakim Noah,0.258,0.046968,4,37,-33
3657,Chauncey Billups,0.344,0.052698,5,35,-30
1389,Chris Paul,0.138,0.072294,5,33,-28
5208,Steve Nash,0.785,0.074421,2,21,-19
4682,Tim Hardaway,0.207,0.059984,4,20,-16


In [38]:
# Con esto vemos cúal es la variable mas importante para el algoritmo
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)],axis=1).sort_values(0,ascending=False)

Unnamed: 0,0,1
13,0.070002,eFG%
18,0.035041,DRB
29,0.027126,W/L%
17,0.02161,ORB
10,0.016945,2P
21,0.011635,STL
15,0.011351,FTA
22,0.011234,BLK
20,0.007456,AST
25,0.005893,PTS


In [39]:
stat_ratios = stats[['PTS','AST','STL','BLK','3P','Year']].groupby('Year').apply(lambda x: x/x.mean())

In [40]:
# Esto nos muestra las estadísticas de un jugador con respecto a la media
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14087,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
14088,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
14089,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
14090,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [41]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [42]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [43]:
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [44]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [45]:
mean_ap

0.7208380973034985

In [46]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,2,15
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,12,15
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,2,15
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,2,15
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,8,15


In [47]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors + ["NPos", "NTm"])
mean_ap

0.7184008100935044

In [48]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)
mean_ap
0.7981818181818182
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [49]:
def backtest(stats, model, years, predictors):
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year].copy()
        test = stats[stats["Year"] == year].copy()
        sc.fit(train[predictors])
        train[predictors] = sc.transform(train[predictors])
        test[predictors] = sc.transform(test[predictors])
        model.fit(train[predictors],train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [50]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)
mean_ap

0.7981818181818182

In [51]:
sc.transform(stats[predictors])

array([[ 0.04758869,  1.15207811, -0.1657698 , ..., -0.05582165,
        -0.27719578, -0.38154061],
       [ 0.51919526,  1.15207811,  1.91917626, ...,  0.93001944,
        -0.27719578,  2.77745153],
       [-1.13142771, -0.03932364, -0.88353811, ..., -1.04166274,
         0.48509261, -0.77641463],
       ...,
       [-0.42401787,  0.04010314, -0.47338479, ...,  0.17152606,
         1.31341113, -0.37208834],
       [-1.83883755,  0.15924332,  0.27856297, ..., -0.97670625,
         0.23224953, -0.27100677],
       [-0.42401787,  1.07265133,  1.85081738, ...,  0.17152606,
        -0.41644743,  1.04305367]])

In [52]:
all_predictions.head(20)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_rk,Diff
11443,Giannis Antetokounmpo,0.932,0.204018,1,1,0
9634,James Harden,0.768,0.190997,2,2,0
3698,LeBron James,0.001,0.169241,12,3,9
5460,Joel Embiid,0.049,0.150977,7,4,3
6990,Anthony Davis,0.0,0.140179,410,5,405
4143,Kevin Durant,0.025,0.12769,8,6,2
13037,Kawhi Leonard,0.013,0.127146,9,7,2
1363,Russell Westbrook,0.008,0.122346,10,8,2
4149,Stephen Curry,0.173,0.120651,5,9,-4
1361,Paul George,0.352,0.117182,3,10,-7


A continuación vamos a aplicar un par de algoritmos procedentes de la librería de `Sklearn`

In [53]:
train = stats[stats['Year'] <= 2020]
test = stats[stats['Year'] == 2021]

x_train = np.array(train[predictors])      
y_train = np.array(train['Share'])     
x_test = np.array(test[predictors])
y_test = np.array(test['Share'])    

El primero de ellos va a ser el de bosques aleatorios. Asimismo, vamos a obtener los valores de los parámetros para los cúales nuestro modelo obtiene un mejor resultado. Para ello, usamos `GridSearchCV`.

In [55]:
rfr = RandomForestRegressor(random_state=1)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : range(1,11),
}
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5)
CV_rfr.fit(x_train, y_train)
CV_rfr.best_params_

{'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 200}

In [58]:
predictions_cv = CV_rfr.predict(x_test)
rf_val_mae = mean_absolute_error(predictions_cv, y_test)
rf_accuracy = CV_rfr.score(x_test, y_test)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))
print("Score for Random Forest Model: {}".format(rf_accuracy))

Validation MAE for Random Forest Model: 0.006441682591249825
Score for Random Forest Model: 0.4181924306211092


In [59]:
prediction = pd.DataFrame(predictions_cv)
prediction['Player'] = np.array(stats[stats['Year'] == 2021]['Player'])
prediction['Predicted Share'] = prediction[0]
prediction['Share'] = y_test
del prediction[0]
prediction_sorted = prediction.sort_values(by='Predicted Share',ascending=False)
prediction_sorted.head(10)

Unnamed: 0,Player,Predicted Share,Share
350,Giannis Antetokounmpo,0.370775,0.345
314,Joel Embiid,0.308653,0.58
138,Stephen Curry,0.277551,0.449
11,Nikola Jokić,0.213056,0.961
416,Donovan Mitchell,0.183318,0.0
185,Kyrie Irving,0.181934,0.0
159,Bradley Beal,0.17818,0.0
388,Luka Dončić,0.175971,0.042
184,Kevin Durant,0.171838,0.0
181,James Harden,0.159127,0.001


In [60]:
def add_ranks(combination):
    combination = combination.sort_values('Share', ascending=False)
    combination['Rk'] = list(range(1,combination.shape[0]+1))
    combination = combination.sort_values('Predicted Share', ascending=False)
    combination['Predicted_rk'] = list(range(1,combination.shape[0]+1))
    combination['Diff'] = combination['Rk']-combination['Predicted_rk']
    return combination

In [62]:
rank_prediction = add_ranks(prediction_sorted)
rank_prediction.set_index("Predicted_rk", inplace=True)
rank_prediction[rank_prediction['Predicted Share']>0].head(10)

Unnamed: 0_level_0,Player,Predicted Share,Share,Rk,Diff
Predicted_rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Giannis Antetokounmpo,0.370775,0.345,4,3
2,Joel Embiid,0.308653,0.58,2,0
3,Stephen Curry,0.277551,0.449,3,0
4,Nikola Jokić,0.213056,0.961,1,-3
5,Donovan Mitchell,0.183318,0.0,375,370
6,Kyrie Irving,0.181934,0.0,374,368
7,Bradley Beal,0.17818,0.0,373,366
8,Luka Dončić,0.175971,0.042,6,-2
9,Kevin Durant,0.171838,0.0,372,363
10,James Harden,0.159127,0.001,13,3


Podemos comprobar entonces que este algoritmo mejora a la solución planteada previamente. Obtiene un menor error medio absoluto y el 'score' del modelo es mayor. Si nos paramos a observar la tabla podemos ver como los integrantes del top4 son los mismos integrantes que en el top4 que nuestro modelo ha predicho (con un par de cambios entre ellos). 

Por último, vamos a aplicar el algoritmo de `XGBoost`. 

In [None]:
#!pip install xgboost
from xgboost import XGBRegressor

xgb = XGBRegressor(n_jobs=-1)
param_grid = { 
    'n_estimators': [500,600],
    'learning_rate': [0.01,0.02,0.03,0.04,0.05,0.1,0.15],
    'max_depth': range(5,16),
    'booster': ['gbtree', 'gblinear','dart'],
}
CV_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
CV_xgb.fit(x_train, y_train)
CV_xgb.best_params_

In [None]:
#help(xgb)

In [73]:
predictions_xgb = CV_xgb.predict(x_test)
xgb_val_mae = mean_absolute_error(predictions_xgb, y_test)
xgb_accuracy = CV_xgb.score(x_test, y_test)

print("Validation MAE for XGBoost Model: {}".format(xgb_val_mae))
print("Score for XGBoost Model: {}".format(xgb_accuracy))

Validation MAE for XGBoost Model: 0.006424027835334479
Score for XGBoost Model: 0.43988894680001833


In [74]:
prediction = pd.DataFrame(predictions_xgb)
prediction['Player'] = np.array(stats[stats['Year'] == 2021]['Player'])
prediction['Predicted Share'] = prediction[0]
prediction['Share'] = y_test
del prediction[0]
prediction_sorted = prediction.sort_values(by='Predicted Share',ascending=False)
prediction_sorted.head(10)

Unnamed: 0,Player,Predicted Share,Share
350,Giannis Antetokounmpo,0.470269,0.345
138,Stephen Curry,0.371654,0.449
11,Nikola Jokić,0.355355,0.961
184,Kevin Durant,0.229316,0.0
172,Russell Westbrook,0.219464,0.005
388,Luka Dončić,0.217179,0.042
181,James Harden,0.197531,0.001
60,Devin Booker,0.1936,0.0
465,Zion Williamson,0.190636,0.0
256,Damian Lillard,0.187904,0.038


In [77]:
rank_prediction = add_ranks(prediction_sorted)
rank_prediction.set_index("Predicted_rk", inplace=True)
rank_prediction[rank_prediction['Predicted Share']>0].head(10)

Unnamed: 0_level_0,Player,Predicted Share,Share,Rk,Diff
Predicted_rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Giannis Antetokounmpo,0.470269,0.345,4,3
2,Stephen Curry,0.371654,0.449,3,1
3,Nikola Jokić,0.355355,0.961,1,-2
4,Kevin Durant,0.229316,0.0,375,371
5,Russell Westbrook,0.219464,0.005,11,6
6,Luka Dončić,0.217179,0.042,6,0
7,James Harden,0.197531,0.001,13,6
8,Devin Booker,0.1936,0.0,374,366
9,Zion Williamson,0.190636,0.0,373,364
10,Damian Lillard,0.187904,0.038,7,-3
