In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import pickle
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from ngboost.ngboost import NGBoost
from sklearn.tree import DecisionTreeRegressor
from ngboost.distns import Normal

from sklearn.metrics import mean_squared_error, mean_absolute_error

seed = 24

In [2]:
with open('../pickle_files/lineup_data2.pkl', 'rb') as f:
    lineup_data2 = pickle.load(f)
lineup_data2

Unnamed: 0,PLAYER1,PLAYER2,PLAYER3,PLAYER4,PLAYER5,TEAM,OFFRTG,Season,STB,ISA,PUB,SBH,TRA,PBH,SUS,RCB,OSS,WWH
0,E. Ilyasova16,Marcus Morris16,R. Jackson16,A. Drummond16,K. Caldwell-Pope16,DET,105.200000,2015-16,1.023980,0.044395,0.970417,0.010700,0.115062,0.999668,0.010615,0.095730,0.007772,1.721660
1,K. Korver16,P. Millsap16,A. Horford16,J. Teague16,K. Bazemore16,ATL,104.300000,2015-16,1.229448,0.000245,0.769246,0.000248,0.464523,0.999968,0.000644,0.000580,1.000249,0.534849
2,K. Durant16,R. Westbrook16,S. Ibaka16,A. Roberson16,S. Adams16,OKC,114.300000,2015-16,1.009495,1.083898,0.000460,0.014805,0.955392,0.900081,0.031655,1.000501,0.000648,0.003066
3,Carmelo Anthony16,J. Calderon16,A. Afflalo16,R. Lopez16,K. Porzingis16,NYK,103.600000,2015-16,1.020470,0.782798,1.076187,1.016864,0.081544,0.007902,0.010472,0.077015,0.699647,0.227102
4,A. Johnson16,A. Bradley16,I. Thomas16,J. Sullinger16,J. Crowder16,BOS,106.400000,2015-16,0.534016,0.896970,0.471984,0.027941,0.999700,0.073625,0.000315,0.993749,0.999543,0.002156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2265,T. Hardaway Jr.23,S. Dinwiddie23,C. Wood23,M. Kleber23,Josh Green23,DAL,112.194000,2022-23,0.959157,0.064763,0.043090,0.069731,0.869240,0.864663,2.118477,0.000333,0.001794,0.008751
2266,T. McConnell23,M. Turner23,B. Hield23,A. Nesmith23,C. Duarte23,IND,113.817000,2022-23,0.756754,0.001136,0.241460,0.007032,0.239464,0.992947,0.999470,0.003061,0.273525,1.485151
2267,A. Burks23,Bojan Bogdanovic23,I. Stewart23,J. Ivey23,J. Duren23,DET,108.400000,2022-23,0.999285,0.248650,0.000226,1.452039,0.006033,0.467037,0.000792,0.999842,0.054783,0.771315
2268,L. James23,R. Westbrook23,A. Davis23,L. Walker IV23,A. Reaves23,LAL,115.416667,2022-23,0.006535,1.839374,1.001077,0.134090,0.017973,0.021218,0.014181,0.000280,0.003531,1.961742


In [3]:
TEAMS = ["BOS",
        "BKN",
        "NYK",
        "PHI",
        "TOR",
        "CHI",
        "CLE",
        "DET",
        "IND",
        "MIL",
        "ATL",
        "CHA",
        "MIA",
        "ORL",
        "WAS",
        "DAL",
        "HOU",
        "MEM",
        "NOP",
        "SAS",
        "DEN",
        "MIN",
        "OKC",
        "POR",
        "UTA",
        "GSW",
        "LAC",
        "LAL",
        "PHX",
        "SAC"]

In [4]:
svr = SVR(kernel = "rbf", gamma = 0.1, C=0.1, epsilon = 0.1)
params = {'objective': 'regression',  
          'metric': 'mae',
          'random_state': seed,
          'boosting_type': 'gbdt',  
          'n_estimators': 10000,
          'verbose': -1,  
          'early_stopping_round': 10,
          'feature_fraction': 0.5,
          'num_leaves': 24,
          'min_child_samples': 5
          }
lgb = LGBMRegressor(**params)
ngb = NGBoost(Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, random_state=seed),
              Dist=Normal, 
              natural_gradient=True, 
              verbose=False,
              random_state=seed,
              col_sample=0.6
              )

In [5]:
lineup_data2_and_predictions = pd.DataFrame(columns=["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", 
                                                     "TEAM",
                                                     "OFFRTG_true", 
                                                     "STB",
                                                     "ISA",
                                                     "PUB",
                                                     "SBH",
                                                     "TRA",
                                                     "PBH",
                                                     "SUS",
                                                     "RCB",
                                                     "OSS",
                                                     "WWH",
                                                     "OFFRTG_pred"
                                                     ])
svr_y_pred_list = []
lgb_y_pred_list = []
ngb_y_pred_list = []
y_dist_list = []

svr_test_RMSE_list = []
svr_test_MAE_list = []
lgb_test_RMSE_list = []
lgb_test_MAE_list = []
ngb_test_RMSE_list = []
ngb_test_MAE_list = []
test_NLL_list = []

test_NLL_dict = {}

random.seed(seed)

for TEAM in TEAMS:
    pred_target_team_df = lineup_data2[lineup_data2["TEAM"] == TEAM]
    pred_target_team_df.reset_index(inplace=True, drop=True)
    X_test, y_test = pred_target_team_df.drop(columns=["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", "TEAM", "OFFRTG", "Season"]), pred_target_team_df["OFFRTG"]
    
    training_df = lineup_data2[lineup_data2["TEAM"] != TEAM]
    X_train, y_train = training_df.drop(columns=["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", "TEAM", "OFFRTG", "Season"]), training_df["OFFRTG"]
    
    train_val_TEAMS = [team for team in TEAMS if team != TEAM]
    
    val_TEAMS = random.sample(train_val_TEAMS, 5)
    
    X_train_, y_train_ = training_df[~training_df["TEAM"].isin(val_TEAMS)].drop(columns=["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", "TEAM", "OFFRTG", "Season"]), training_df[~training_df["TEAM"].isin(val_TEAMS)]["OFFRTG"]
    X_val, y_val = training_df[training_df["TEAM"].isin(val_TEAMS)].drop(columns=["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", "TEAM", "OFFRTG", "Season"]), training_df[training_df["TEAM"].isin(val_TEAMS)]["OFFRTG"]
    
    svr.fit(X_train, y_train)
    lgb.fit(X_train_, y_train_, 
            eval_set=[(X_val, y_val)]
            )
    ngb.fit(X_train, y_train,
            #X_val, y_val, 
            #early_stopping_rounds=10
            )
    
    svr_y_pred = svr.predict(X_test)
    lgb_y_pred = lgb.predict(X_test)
    ngb_y_pred = ngb.predict(X_test)
    svr_y_pred_list.append(svr_y_pred)
    lgb_y_pred_list.append(lgb_y_pred)
    ngb_y_pred_list.append(ngb_y_pred)
    y_dist = ngb.pred_dist(X_test)
    y_dist_list.append(y_dist)
    
    svr_test_RMSE = np.sqrt(mean_squared_error(y_test, svr_y_pred))
    svr_test_RMSE_list.append(svr_test_RMSE)
    lgb_test_RMSE = np.sqrt(mean_squared_error(y_test, lgb_y_pred))
    lgb_test_RMSE_list.append(lgb_test_RMSE)
    ngb_test_RMSE = np.sqrt(mean_squared_error(y_test, ngb_y_pred))
    ngb_test_RMSE_list.append(ngb_test_RMSE)
    
    svr_test_MAE = mean_absolute_error(y_test, svr_y_pred)
    svr_test_MAE_list.append(svr_test_MAE)
    lgb_test_MAE = mean_absolute_error(y_test, lgb_y_pred)
    lgb_test_MAE_list.append(lgb_test_MAE)
    ngb_test_MAE = mean_absolute_error(y_test, ngb_y_pred)
    ngb_test_MAE_list.append(ngb_test_MAE)
    
    test_NLL = -y_dist.logpdf(y_test).mean()
    test_NLL_list.append(test_NLL)
    
    test_NLL_by_team = []
    
    for i in range(len(ngb_y_pred)):
        test_NLL_by_example = -y_dist[i].logpdf(y_test[i])
        test_NLL_by_team.append(test_NLL_by_example)
        
    test_NLL_dict[TEAM] = test_NLL_by_team
    
    lineup_data_with_pred_by_team = pd.concat([pred_target_team_df, pd.DataFrame(ngb_y_pred)], axis=1)
    print(lineup_data_with_pred_by_team)
    lineup_data_with_pred_by_team.columns = ["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5", 
                                             "TEAM",
                                             "OFFRTG_true",
                                             "Season",
                                             "STB",
                                             "ISA",
                                             "PUB",
                                             "SBH",
                                             "TRA",
                                             "PBH",
                                             "SUS",
                                             "RCB",
                                             "OSS",
                                             "WWH",
                                             "OFFRTG_pred"
                                             ]
    lineup_data2_and_predictions = pd.concat([lineup_data2_and_predictions, lineup_data_with_pred_by_team])


         PLAYER1       PLAYER2         PLAYER3         PLAYER4  \
0   A. Johnson16  A. Bradley16     I. Thomas16  J. Sullinger16   
1   J. Jerebko16   E. Turner16    A. Bradley16     K. Olynyk16   
2   A. Johnson16   E. Turner16    A. Bradley16     I. Thomas16   
3   J. Jerebko16   E. Turner16    A. Bradley16     T. Zeller16   
4   A. Bradley16   I. Thomas16  J. Sullinger16    J. Crowder16   
..           ...           ...             ...             ...   
74  A. Horford23    M. Smart23      J. Brown23      J. Tatum23   
75  A. Horford23    M. Smart23      J. Brown23      D. White23   
76  A. Horford23    J. Brown23      J. Tatum23      D. White23   
77  M. Brogdon23    J. Tatum23     L. Kornet23   G. Williams23   
78  B. Griffin23    M. Smart23      J. Brown23      J. Tatum23   

              PLAYER5 TEAM      OFFRTG   Season       STB       ISA       PUB  \
0        J. Crowder16  BOS  106.400000  2015-16  0.534016  0.896970  0.471984   
1          M. Smart16  BOS  110.854000  2015-

In [6]:
print(np.mean(svr_test_RMSE_list))
print(np.mean(svr_test_MAE_list))
print(np.mean(lgb_test_RMSE_list))
print(np.mean(lgb_test_MAE_list))
print(np.mean(ngb_test_RMSE_list))
print(np.mean(ngb_test_MAE_list))
print(np.mean(test_NLL_list))

4.824713755712488
3.93199407010619
4.854136589187533
3.9552916849082482
4.785518775902191
3.8693907723562204
3.066455646419163


In [7]:
lineup_data2_and_predictions

Unnamed: 0,PLAYER1,PLAYER2,PLAYER3,PLAYER4,PLAYER5,TEAM,OFFRTG_true,STB,ISA,PUB,SBH,TRA,PBH,SUS,RCB,OSS,WWH,OFFRTG_pred,Season
0,A. Johnson16,A. Bradley16,I. Thomas16,J. Sullinger16,J. Crowder16,BOS,106.400000,0.534016,0.896970,0.471984,0.027941,0.999700,0.073625,0.000315,0.993749,0.999543,0.002156,108.722405,2015-16
1,J. Jerebko16,E. Turner16,A. Bradley16,K. Olynyk16,M. Smart16,BOS,110.854000,1.491938,0.680458,0.000148,0.300628,0.210847,0.018273,0.299194,0.000039,0.999552,0.998924,108.895659,2015-16
2,A. Johnson16,E. Turner16,A. Bradley16,I. Thomas16,J. Sullinger16,BOS,107.324000,0.534012,1.577122,0.471988,0.327893,0.000269,0.091895,0.000052,0.993750,0.999502,0.003516,110.450937,2015-16
3,J. Jerebko16,E. Turner16,A. Bradley16,T. Zeller16,M. Smart16,BOS,106.170333,0.541306,0.680512,0.038468,0.300650,0.210770,0.018277,0.299223,0.912213,0.999588,0.998993,110.452700,2015-16
4,A. Bradley16,I. Thomas16,J. Sullinger16,J. Crowder16,K. Olynyk16,BOS,108.613000,1.531605,0.896970,0.467465,0.027941,1.000013,0.073625,0.000356,0.000290,0.999555,0.002181,107.478885,2015-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,H. Barnes23,D. Sabonis23,D. Fox23,M. Monk23,K. Huerter23,SAC,121.471333,0.033505,0.772507,0.881309,0.113482,0.997871,0.875658,0.005206,0.080929,1.002234,0.237299,109.328384,2022-23
63,H. Barnes23,D. Sabonis23,M. Monk23,K. Huerter23,Davion Mitchell23,SAC,117.191667,0.033509,0.675183,0.881309,0.086793,0.998290,0.001818,0.005279,0.080929,1.002254,1.234637,109.973910,2022-23
64,H. Barnes23,D. Sabonis23,D. Fox23,T. Davis23,K. Murray23,SAC,118.478000,0.075611,0.099304,0.881263,0.029120,2.106094,0.873948,0.691599,0.080966,0.105379,0.056717,110.020199,2022-23
65,H. Barnes23,D. Sabonis23,M. Monk23,Davion Mitchell23,K. Murray23,SAC,115.875333,0.075800,0.675220,0.881424,0.086858,1.167100,0.001820,0.685468,0.080974,0.099829,1.245507,111.114297,2022-23


In [8]:
lineup_data2_and_predictions = lineup_data2_and_predictions.sort_values("OFFRTG_pred", ascending=False).reset_index(drop=True)
print("TOP10")
print(lineup_data2_and_predictions.iloc[:10, ])
print("BOTTOM10")
print(lineup_data2_and_predictions.sort_values("OFFRTG_pred", ascending=True).iloc[:10,])

TOP10
         PLAYER1          PLAYER2          PLAYER3        PLAYER4  \
0   A. Horford23       J. Brown23       J. Tatum23     D. White23   
1     S. Ibaka21       N. Batum21    P. Beverley21    P. George21   
2    K. Durant23      K. Irving23      J. Harris23   R. O'Neale23   
3    P. George23  Marcus Morris23     K. Leonard23     I. Zubac23   
4  P. Beverley21      P. George21  Marcus Morris21   K. Leonard21   
5    K. Durant22      K. Irving22    A. Drummond22   Seth Curry22   
6   A. Horford23       M. Smart23       J. Brown23     J. Tatum23   
7    P. Tucker22      J. Butler22      D. Dedmon22  D. Robinson22   
8  A. Iguodala19      K. Durant19  Stephen Curry19  K. Thompson19   
9   A. Horford23       J. Brown23       J. Tatum23     D. White23   

             PLAYER5 TEAM  OFFRTG_true       STB       ISA       PUB  \
0      G. Williams23  BOS   116.325000  0.000693  1.566902  0.000066   
1       K. Leonard21  LAC   118.108000  0.805531  1.991986  0.153518   
2       N. Claxton

In [9]:
with open('../pickle_files/kalman2020_lineup_data2.pkl', 'rb') as f:
    kalman2020_lineup_data2 = pickle.load(f)

In [10]:
svr = SVR(kernel = "rbf", gamma = 0.1, C=0.1, epsilon = 1)
params = {'objective': 'regression',  
          'metric': 'mae',
          'random_state': seed,
          'boosting_type': 'gbdt',  
          'n_estimators': 10000,
          'verbose': -1,  
          'early_stopping_round': 10,
          'feature_fraction': 0.5,
          'num_leaves': 16,
          'min_child_samples': 5
          }
lgb = LGBMRegressor(**params)
ngb = NGBoost(Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, random_state=seed),
              Dist=Normal, 
              natural_gradient=True, 
              verbose=False,
              random_state=seed,
              col_sample=0.6
              )

In [11]:
kalman2020_lineup_data2_and_predictions = pd.DataFrame(columns=["LINEUPS", 
                                                                "TEAM",
                                                                "OFFRTG_true", 
                                                                "Mid-range Big",
                                                                "Traditional Center_A",
                                                                "Flore General",
                                                                "High-usage Guard",
                                                                "Versatile Role-player",
                                                                "Rim Plotecter",
                                                                "Traditional Center_B",
                                                                "Stretch Forward",
                                                                "3-point Shooting-guard",
                                                                "OFFRTG_pred"
                                                                ])

svr_y_pred_list = []
lgb_y_pred_list = []
ngb_y_pred_list = []
y_dist_list = []

svr_test_RMSE_list = []
svr_test_MAE_list = []
lgb_test_RMSE_list = []
lgb_test_MAE_list = []
ngb_test_RMSE_list = []
ngb_test_MAE_list = []
test_NLL_list = []

test_NLL_dict = {}

random.seed(seed)

for TEAM in TEAMS:
    pred_target_team_df = kalman2020_lineup_data2[kalman2020_lineup_data2["TEAM"] == TEAM]
    pred_target_team_df.reset_index(inplace=True, drop=True)
    X_test, y_test = pred_target_team_df.drop(columns=["LINEUPS", "TEAM", "OFFRTG"]), pred_target_team_df["OFFRTG"]
    
    training_df = kalman2020_lineup_data2[kalman2020_lineup_data2["TEAM"] != TEAM]
    X_train, y_train = training_df.drop(columns=["LINEUPS", "TEAM", "OFFRTG"]), training_df["OFFRTG"]
    
    train_val_TEAMS = [team for team in TEAMS if team != TEAM]
    
    val_TEAMS = random.sample(train_val_TEAMS, 5)
    
    X_train_, y_train_ = training_df[~training_df["TEAM"].isin(val_TEAMS)].drop(columns=["LINEUPS", "TEAM", "OFFRTG"]), training_df[~training_df["TEAM"].isin(val_TEAMS)]["OFFRTG"]
    X_val, y_val = training_df[training_df["TEAM"].isin(val_TEAMS)].drop(columns=["LINEUPS", "TEAM", "OFFRTG"]), training_df[training_df["TEAM"].isin(val_TEAMS)]["OFFRTG"]
    
    svr.fit(X_train, y_train)
    lgb.fit(X_train_, y_train_, 
            eval_set=[(X_val, y_val)]
            )
    ngb.fit(X_train, y_train,
            #X_val, y_val, 
            #early_stopping_rounds=10
            )
    
    svr_y_pred = svr.predict(X_test)
    lgb_y_pred = lgb.predict(X_test)
    ngb_y_pred = ngb.predict(X_test)
    svr_y_pred_list.append(svr_y_pred)
    lgb_y_pred_list.append(lgb_y_pred)
    ngb_y_pred_list.append(ngb_y_pred)
    y_dist = ngb.pred_dist(X_test)
    y_dist_list.append(y_dist)
    
    svr_test_RMSE = np.sqrt(mean_squared_error(y_test, svr_y_pred))
    svr_test_RMSE_list.append(svr_test_RMSE)
    lgb_test_RMSE = np.sqrt(mean_squared_error(y_test, lgb_y_pred))
    lgb_test_RMSE_list.append(lgb_test_RMSE)
    ngb_test_RMSE = np.sqrt(mean_squared_error(y_test, ngb_y_pred))
    ngb_test_RMSE_list.append(ngb_test_RMSE)
    
    svr_test_MAE = mean_absolute_error(y_test, svr_y_pred)
    svr_test_MAE_list.append(svr_test_MAE)
    lgb_test_MAE = mean_absolute_error(y_test, lgb_y_pred)
    lgb_test_MAE_list.append(lgb_test_MAE)
    ngb_test_MAE = mean_absolute_error(y_test, ngb_y_pred)
    ngb_test_MAE_list.append(ngb_test_MAE)
    
    test_NLL = -y_dist.logpdf(y_test).mean()
    test_NLL_list.append(test_NLL)
    

In [12]:
print(np.mean(svr_test_RMSE_list))
print(np.mean(svr_test_MAE_list))
print(np.mean(lgb_test_RMSE_list))
print(np.mean(lgb_test_MAE_list))
print(np.mean(ngb_test_RMSE_list))
print(np.mean(ngb_test_MAE_list))
print(np.mean(test_NLL_list))

4.917167434157478
4.0231108427187685
4.840293970818328
3.945899908669839
4.848978309084878
3.947516457250598
3.0926093266193657
