In [33]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import pickle
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from ngboost.ngboost import NGBoost
from sklearn.tree import DecisionTreeRegressor
from ngboost.distns import Normal

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Single random seed shared by `random.seed(...)` and the `random_state`
# arguments of the models configured below.
seed = 16

In [34]:
# Read the pickled lineup table; later cells index it by column name and by
# boolean mask, i.e. they treat it as a pandas DataFrame.
# pickle.load can execute arbitrary code, so only point this at trusted files.
with open('../pickle_files/lineup_data1.pkl', 'rb') as pickle_file:
    lineup_data1 = pickle.load(pickle_file)

In [35]:
# Rewrite every "NOH" value to "NOP" so those rows match the "NOP" entry of the
# TEAMS list (presumably an older abbreviation of the same franchise; confirm
# against the data source).
# The result is rebound instead of using `inplace=True`, so the object loaded
# from the pickle is not mutated in place and the cell stays safe to re-run.
lineup_data1 = lineup_data1.replace("NOH", "NOP")

In [36]:
# The 30 team abbreviations found in the "TEAM" column. Each one is held out in
# turn as the test fold of the evaluation loops further down.
TEAMS = [
    "BOS", "BKN", "NYK", "PHI", "TOR",
    "CHI", "CLE", "DET", "IND", "MIL",
    "ATL", "CHA", "MIA", "ORL", "WAS",
    "DAL", "HOU", "MEM", "NOP", "SAS",
    "DEN", "MIN", "OKC", "POR", "UTA",
    "GSW", "LAC", "LAL", "PHX", "SAC",
]

In [37]:
# Model configuration used for the evaluation on `lineup_data1`.

# Support vector regressor with an RBF kernel.
svr = SVR(kernel="rbf", gamma=1, C=0.001, epsilon=0)

# LightGBM regressor. n_estimators is a generous upper bound: training is
# expected to stop early via `early_stopping_round` on the eval_set that the
# evaluation loop passes to fit().
params = dict(
    objective='regression',
    metric='mae',
    random_state=seed,
    boosting_type='gbdt',
    n_estimators=10000,
    verbose=-1,
    early_stopping_round=10,
    feature_fraction=0.2,
    min_child_samples=5,
)
lgb = LGBMRegressor(**params)

# NGBoost with a Normal output distribution on top of depth-2 regression trees.
ngb_base_learner = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=2,
    random_state=seed,
)
ngb = NGBoost(
    Base=ngb_base_learner,
    Dist=Normal,
    natural_gradient=True,
    verbose=False,
    random_state=seed,
    n_estimators=10000,
    col_sample=0.6,
)

In [38]:
# Leave-one-team-out evaluation of SVR, LightGBM and NGBoost on `lineup_data1`.
#
# For every team in TEAMS:
#   * the rows of that team form the test fold;
#   * SVR is fitted on all remaining rows;
#   * LightGBM and NGBoost are fitted on the remaining rows minus four randomly
#     drawn validation teams, which are used for early stopping;
#   * RMSE / MAE (plus negative log-likelihood for NGBoost) on the test fold
#     are appended to the per-model lists.
# NGBoost's point predictions are attached to the test rows and collected in
# `lineup_data1_and_predictions`.
#
# NOTE(review): the same `svr`, `lgb` and `ngb` objects are re-fitted on every
# fold. Confirm that the installed NGBoost version resets its state in fit().

# Model inputs and target, defined once instead of being repeated per split.
FEATURE_COLUMNS = ["CB", "MB", "MA", "MS", "OA", "PH", "DH", "S4", "CS", "PS", "OS", "DS", "SA"]
TARGET_COLUMN = "OFFRTG"
PLAYER_COLUMNS = ["PLAYER1", "PLAYER2", "PLAYER3", "PLAYER4", "PLAYER5"]

# Labels assigned BY POSITION to each per-team frame (test rows + prediction).
# This relies on the column order of `lineup_data1` being: five player columns,
# team, target, season, then the 13 features. TODO confirm if the upstream
# table ever changes.
RENAMED_COLUMNS = PLAYER_COLUMNS + ["TEAM", "OFFRTG_true", "Season"] + FEATURE_COLUMNS + ["OFFRTG_pred"]

# Column order of the final table (Season before OFFRTG_true).
PREDICTION_COLUMNS = PLAYER_COLUMNS + ["TEAM", "Season", "OFFRTG_true"] + FEATURE_COLUMNS + ["OFFRTG_pred"]

svr_y_pred_list = []
gpr_y_pred_list = []  # not filled in this cell; kept so the name stays defined
lgb_y_pred_list = []
ngb_y_pred_list = []
y_dist_list = []

svr_test_RMSE_list = []
svr_test_MAE_list = []
gpr_test_RMSE_list = []  # not filled in this cell
gpr_test_MAE_list = []   # not filled in this cell
gpr_test_NLL_list = []   # not filled in this cell
lgb_test_RMSE_list = []
lgb_test_MAE_list = []
ngb_test_RMSE_list = []
ngb_test_MAE_list = []
ngb_test_NLL_list = []

# Per-example NGBoost NLL on the test fold, keyed by held-out team.
ngb_test_NLL_dict = {}

# Per-team prediction frames; concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
prediction_frames = []

# Seed the validation-team draws so the splits are reproducible.
random.seed(seed)

for TEAM in TEAMS:
    # Test fold: the held-out team. The index is reset so the rows line up with
    # the positional prediction array in the axis=1 concat further down.
    pred_target_team_df = lineup_data1[lineup_data1["TEAM"] == TEAM].reset_index(drop=True)
    X_test, y_test = pred_target_team_df[FEATURE_COLUMNS], pred_target_team_df[TARGET_COLUMN]

    # Everything else is available for training.
    training_df = lineup_data1[lineup_data1["TEAM"] != TEAM]
    X_train, y_train = training_df[FEATURE_COLUMNS], training_df[TARGET_COLUMN]

    # Hold out four of the remaining teams as the early-stopping validation set.
    train_val_TEAMS = [team for team in TEAMS if team != TEAM]
    val_TEAMS = random.sample(train_val_TEAMS, 4)
    is_val_row = training_df["TEAM"].isin(val_TEAMS)

    X_train_, y_train_ = training_df.loc[~is_val_row, FEATURE_COLUMNS], training_df.loc[~is_val_row, TARGET_COLUMN]
    X_val, y_val = training_df.loc[is_val_row, FEATURE_COLUMNS], training_df.loc[is_val_row, TARGET_COLUMN]

    # --- SVR: no early stopping, so it trains on the full training fold.
    svr.fit(X_train, y_train)
    svr_y_pred = svr.predict(X_test)
    svr_y_pred_list.append(svr_y_pred)
    svr_test_RMSE = np.sqrt(mean_squared_error(y_test, svr_y_pred))
    svr_test_RMSE_list.append(svr_test_RMSE)
    svr_test_MAE = mean_absolute_error(y_test, svr_y_pred)
    svr_test_MAE_list.append(svr_test_MAE)

    # --- LightGBM: early stopping monitored on the validation teams.
    lgb.fit(X_train_, y_train_,
            eval_set=[(X_val, y_val)]
            )
    lgb_y_pred = lgb.predict(X_test)
    lgb_y_pred_list.append(lgb_y_pred)
    lgb_test_RMSE = np.sqrt(mean_squared_error(y_test, lgb_y_pred))
    lgb_test_RMSE_list.append(lgb_test_RMSE)
    lgb_test_MAE = mean_absolute_error(y_test, lgb_y_pred)
    lgb_test_MAE_list.append(lgb_test_MAE)

    # --- NGBoost: early stopping on the validation teams; also yields a
    # predictive distribution per test row.
    ngb.fit(X_train_, y_train_,
            X_val, y_val,
            early_stopping_rounds=10
            )
    ngb_y_pred = ngb.predict(X_test)
    ngb_y_pred_list.append(ngb_y_pred)
    y_dist = ngb.pred_dist(X_test)
    y_dist_list.append(y_dist)
    ngb_test_RMSE = np.sqrt(mean_squared_error(y_test, ngb_y_pred))
    ngb_test_RMSE_list.append(ngb_test_RMSE)
    ngb_test_MAE = mean_absolute_error(y_test, ngb_y_pred)
    ngb_test_MAE_list.append(ngb_test_MAE)

    # One vectorised logpdf call gives both the per-example NLLs and their mean.
    ngb_test_NLL_by_example = -np.asarray(y_dist.logpdf(y_test))
    ngb_test_NLL = ngb_test_NLL_by_example.mean()
    ngb_test_NLL_list.append(ngb_test_NLL)
    ngb_test_NLL_by_team = ngb_test_NLL_by_example.tolist()
    ngb_test_NLL_dict[TEAM] = ngb_test_NLL_by_team

    # Attach the NGBoost predictions to the test rows (aligned on the reset
    # 0..n-1 index), show the frame once, then relabel the columns by position.
    lineup_data_with_pred_by_team = pd.concat([pred_target_team_df, pd.DataFrame(ngb_y_pred)], axis=1)
    print(lineup_data_with_pred_by_team)
    lineup_data_with_pred_by_team.columns = RENAMED_COLUMNS
    prediction_frames.append(lineup_data_with_pred_by_team)

# Single concat; the per-team 0..n-1 index is kept and columns are put into the
# final order. Falls back to an empty, correctly-labelled frame if no team ran.
lineup_data1_and_predictions = (
    pd.concat(prediction_frames)[PREDICTION_COLUMNS]
    if prediction_frames
    else pd.DataFrame(columns=PREDICTION_COLUMNS)
)

           PLAYER1       PLAYER2         PLAYER3          PLAYER4  \
0     K. Garnett13   P. Pierce13       B. Bass13         C. Lee13   
1     K. Garnett13   P. Pierce13      J. Terry13        B. Bass13   
2     K. Garnett13   P. Pierce13       B. Bass13       R. Rondo13   
3      P. Pierce13     B. Bass13    Jeff Green13         C. Lee13   
4     K. Garnett13    J. Terry13    Jeff Green13         C. Lee13   
5     K. Garnett13   P. Pierce13       B. Bass13       R. Rondo13   
6     K. Garnett13  L. Barbosa13    Jeff Green13         C. Lee13   
7      P. Pierce13    J. Terry13       B. Bass13     Jeff Green13   
8     K. Garnett13   P. Pierce13      J. Terry13       R. Rondo13   
9     K. Garnett13   P. Pierce13      J. Terry13     Jeff Green13   
10    K. Garnett13   P. Pierce13      J. Terry13        B. Bass13   
11       B. Bass14  Jeff Green14    A. Bradley14    J. Crawford14   
12  K. Humphries14     B. Bass14      R. Rondo14     Jeff Green14   
13  K. Humphries14     B. Bass14  

In [39]:
# Mean of the per-team test metrics, one value per line, in this order:
# SVR RMSE, SVR MAE, LightGBM RMSE, LightGBM MAE,
# NGBoost RMSE, NGBoost MAE, NGBoost NLL.
for fold_scores in (
    svr_test_RMSE_list,
    svr_test_MAE_list,
    lgb_test_RMSE_list,
    lgb_test_MAE_list,
    ngb_test_RMSE_list,
    ngb_test_MAE_list,
    ngb_test_NLL_list,
):
    print(np.mean(fold_scores))

4.516329635114647
3.7057644076274747
4.583569712827142
3.7596603038169913
4.5745978980694515
3.770025514365634
2.979684599511333


In [8]:
# Order all lineups by predicted value (`OFFRTG_pred`), highest first, and
# renumber the rows so the index is the rank.
lineup_data1_and_predictions = (
    lineup_data1_and_predictions
    .sort_values("OFFRTG_pred", ascending=False)
    .reset_index(drop=True)
)

# Ten highest predictions.
print("TOP10")
print(lineup_data1_and_predictions.head(10))

# Ten lowest predictions, taken from an ascending re-sort so the worst comes first.
print("BOTTOM10")
print(lineup_data1_and_predictions.sort_values("OFFRTG_pred", ascending=True).head(10))

TOP10
           PLAYER1          PLAYER2          PLAYER3             PLAYER4  \
0       S. Blake16    A. Tolliver16  Marcus Morris16         A. Baynes16   
1  Marcus Morris16     R. Jackson16    A. Drummond16  K. Caldwell-Pope16   
2  Marcus Morris16      T. Harris16     R. Jackson16         A. Baynes16   
3    E. Ilyasova16  Marcus Morris16     R. Jackson16         A. Baynes16   
4       I. Smith17  Marcus Morris17      T. Harris17       A. Drummond17   
5  Marcus Morris17     R. Jackson17       J. Leuer17       A. Drummond17   
6  Marcus Morris17      T. Harris17     R. Jackson17       A. Drummond17   
7       I. Smith17  Marcus Morris17       J. Leuer17       A. Drummond17   
8       I. Smith17      T. Harris17       J. Leuer17       A. Drummond17   
9       I. Smith17  Marcus Morris17      T. Harris17         A. Baynes17   

              PLAYER5 TEAM   Season  OFFRTG_true CB MB  ... OA PH DH S4 CS PS  \
0        S. Johnson16  DET  2015-16   105.193333  1  0  ...  1  0  0  1  0  

In [9]:
# Read the second pickled lineup table; the evaluation loop below treats it as
# a DataFrame with "LINEUPS", "TEAM" and "OFFRTG" columns plus the model inputs.
# pickle.load can execute arbitrary code, so only point this at trusted files.
# NOTE(review): the path contains a doubled slash ("pickle_files//"); left
# untouched here.
with open('../pickle_files//kalman2020_lineup_data1.pkl', 'rb') as pickle_file:
    kalman2020_lineup_data1 = pickle.load(pickle_file)

In [10]:
# Model configuration for the `kalman2020_lineup_data1` evaluation, with
# depth-2 trees as the NGBoost base learner.
# NOTE(review): the cell that follows rebinds svr, params, lgb and ngb with the
# same settings except max_depth=3, so on a top-to-bottom run the objects built
# here are never used.

svr = SVR(kernel="rbf", gamma=1, C=0.1, epsilon=0)

# n_estimators is an upper bound; `early_stopping_round` is expected to end
# training earlier on the eval_set passed to fit().
params = dict(
    objective='regression',
    metric='mae',
    random_state=seed,
    boosting_type='gbdt',
    n_estimators=10000,
    verbose=-1,
    early_stopping_round=10,
    feature_fraction=0.3,
    min_child_samples=5,
)
lgb = LGBMRegressor(**params)

ngb_base_learner = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=2,
    random_state=seed,
)
ngb = NGBoost(
    Base=ngb_base_learner,
    Dist=Normal,
    natural_gradient=True,
    verbose=False,
    random_state=seed,
    col_sample=0.8,
)

In [27]:
# Model configuration for the `kalman2020_lineup_data1` evaluation, with
# depth-3 trees as the NGBoost base learner.

svr = SVR(kernel="rbf", gamma=1, C=0.1, epsilon=0)

# n_estimators is an upper bound; `early_stopping_round` is expected to end
# training earlier on the eval_set passed to fit().
params = dict(
    objective='regression',
    metric='mae',
    random_state=seed,
    boosting_type='gbdt',
    n_estimators=10000,
    verbose=-1,
    early_stopping_round=10,
    feature_fraction=0.3,
    min_child_samples=5,
)
lgb = LGBMRegressor(**params)

ngb_base_learner = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=3,
    random_state=seed,
)
ngb = NGBoost(
    Base=ngb_base_learner,
    Dist=Normal,
    natural_gradient=True,
    verbose=False,
    random_state=seed,
    col_sample=0.8,
)

In [28]:
# Leave-one-team-out evaluation of SVR, LightGBM and NGBoost on
# `kalman2020_lineup_data1`. Every column except LINEUPS, TEAM and OFFRTG is a
# model input; OFFRTG is the target.

# Per-fold predictions and predictive distributions.
svr_y_pred_list, lgb_y_pred_list, ngb_y_pred_list, y_dist_list = [], [], [], []

# Per-fold test metrics.
svr_test_RMSE_list, svr_test_MAE_list = [], []
lgb_test_RMSE_list, lgb_test_MAE_list = [], []
ngb_test_RMSE_list, ngb_test_MAE_list, ngb_test_NLL_list = [], [], []

NON_FEATURE_COLUMNS = ["LINEUPS", "TEAM", "OFFRTG"]

# Seed the validation-team draws so the splits are reproducible.
random.seed(seed)

for TEAM in TEAMS:
    # Test fold: rows of the held-out team, renumbered from 0.
    pred_target_team_df = kalman2020_lineup_data1[kalman2020_lineup_data1["TEAM"] == TEAM].reset_index(drop=True)
    X_test = pred_target_team_df.drop(columns=NON_FEATURE_COLUMNS)
    y_test = pred_target_team_df["OFFRTG"]

    # Training fold: every other team.
    training_df = kalman2020_lineup_data1[kalman2020_lineup_data1["TEAM"] != TEAM]
    X_train = training_df.drop(columns=NON_FEATURE_COLUMNS)
    y_train = training_df["OFFRTG"]

    # Four of the remaining teams become the early-stopping validation set.
    train_val_TEAMS = [team for team in TEAMS if team != TEAM]
    val_TEAMS = random.sample(train_val_TEAMS, 4)
    is_val_row = training_df["TEAM"].isin(val_TEAMS)
    fit_df = training_df[~is_val_row]
    val_df = training_df[is_val_row]

    X_train_ = fit_df.drop(columns=NON_FEATURE_COLUMNS)
    y_train_ = fit_df["OFFRTG"]
    X_val = val_df.drop(columns=NON_FEATURE_COLUMNS)
    y_val = val_df["OFFRTG"]

    # SVR: fitted on the full training fold.
    svr.fit(X_train, y_train)
    svr_y_pred = svr.predict(X_test)
    svr_test_RMSE = np.sqrt(mean_squared_error(y_test, svr_y_pred))
    svr_test_MAE = mean_absolute_error(y_test, svr_y_pred)
    svr_y_pred_list.append(svr_y_pred)
    svr_test_RMSE_list.append(svr_test_RMSE)
    svr_test_MAE_list.append(svr_test_MAE)

    # LightGBM: validation teams are passed as the eval_set.
    lgb.fit(X_train_, y_train_, eval_set=[(X_val, y_val)])
    lgb_y_pred = lgb.predict(X_test)
    lgb_test_RMSE = np.sqrt(mean_squared_error(y_test, lgb_y_pred))
    lgb_test_MAE = mean_absolute_error(y_test, lgb_y_pred)
    lgb_y_pred_list.append(lgb_y_pred)
    lgb_test_RMSE_list.append(lgb_test_RMSE)
    lgb_test_MAE_list.append(lgb_test_MAE)

    # NGBoost: validation teams drive early stopping; point predictions first,
    # then the predictive distribution used for the negative log-likelihood.
    ngb.fit(X_train_, y_train_, X_val, y_val, early_stopping_rounds=10)
    ngb_y_pred = ngb.predict(X_test)
    ngb_test_RMSE = np.sqrt(mean_squared_error(y_test, ngb_y_pred))
    ngb_test_MAE = mean_absolute_error(y_test, ngb_y_pred)
    ngb_y_pred_list.append(ngb_y_pred)
    ngb_test_RMSE_list.append(ngb_test_RMSE)
    ngb_test_MAE_list.append(ngb_test_MAE)

    y_dist = ngb.pred_dist(X_test)
    ngb_test_NLL = -y_dist.logpdf(y_test).mean()
    y_dist_list.append(y_dist)
    ngb_test_NLL_list.append(ngb_test_NLL)

In [29]:
# Mean of the per-team test metrics, one value per line, in this order:
# SVR RMSE, SVR MAE, LightGBM RMSE, LightGBM MAE,
# NGBoost RMSE, NGBoost MAE, NGBoost NLL.
for fold_scores in (
    svr_test_RMSE_list,
    svr_test_MAE_list,
    lgb_test_RMSE_list,
    lgb_test_MAE_list,
    ngb_test_RMSE_list,
    ngb_test_MAE_list,
    ngb_test_NLL_list,
):
    print(np.mean(fold_scores))

4.380665976427182
3.5753930244084886
4.420715536427772
3.6345871761407786
4.3450667062669615
3.551915917397111
2.91552621574462
