In [22]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_percentage_error
from scipy.stats import uniform, randint

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import RepeatedKFold, KFold


In [2]:
STATS_COLUMNS = ["player", "position", "age","team_id","g","gs","mp_per_g","fg_per_g",
"fga_per_g","fg_pct","fg3_per_g","fg3a_per_g","fg3_pct","fg2_per_g","fg2a_per_g",
"fg2_pct","efg_pct","ft_per_g","fta_per_g","ft_pct","orb_per_g","drb_per_g","trb_per_g",
"ast_per_g","stl_per_g","blk_per_g","tov_per_g","pf_per_g","pts_per_g"]
NUM_STATS_COLUMNS = [col for col in STATS_COLUMNS if col not in ('"player", "position","team_id"')]

ADVANCED_STATS_COLUMNS =["player","position","age","team_id","g","mp","per","ts_pct","fg3a_per_fga_pct",
"fta_per_fga_pct","orb_pct","drb_pct","trb_pct","ast_pct","stl_pct","blk_pct","tov_pct","usg_pct",
                         "ows","dws","ws","ws_per_48","obpm","dbpm","bpm","vorp"]

NUM_ADVANCED_STATS_COLUMNS = [col for col in ADVANCED_STATS_COLUMNS if col not in ('"player", "position","team_id"')]
INJURIES_COLUMNS = ['out_for_season', 'out_indefinitely','acum_out_for_season', 'acum_out_indefinitely']

In [3]:
df = pd.read_csv('datasets/preprocessed_dataset.csv').fillna(0).set_index('player')
results_df = pd.read_csv('datasets/results/performance_metrics.csv')

In [4]:
relevant_cols = list(set(NUM_STATS_COLUMNS + NUM_ADVANCED_STATS_COLUMNS + ['position'] + INJURIES_COLUMNS))

In [5]:
def evaluate_predictions(model, dataset_type, y_pred, y_truth):
    rmse = np.round(np.sqrt(mean_squared_error(y_truth, y_pred)), 0)
    mae = np.round(median_absolute_error(y_truth, y_pred), 0)
    mape = np.round(mean_absolute_percentage_error(y_truth, y_pred), 4)
    return {'model': [model], 'dataset_type': [dataset_type], 'rmse':[rmse], 'mae': [mae], 'mape':[mape]}

class PredictionEvaluation:
    
    def __init__(self, y_pred_train, y_truth_train, y_pred_test, y_truth_test, model_name, results_df):
        self.y_pred_train = y_pred_train
        self.y_truth_train = y_truth_train
        self.y_pred_test = y_pred_test
        self.y_truth_test = y_truth_test
        self.model_name = model_name
        self.results_df = results_df        
    
    def fill_results_df(self):
        train_results = pd.DataFrame(evaluate_predictions(self.model_name, 'train',
                                                          self.y_pred_train, self.y_truth_train))
        test_results = pd.DataFrame(evaluate_predictions(self.model_name, 'test',
                                                         self.y_pred_test, self.y_truth_test))
        return pd.concat([self.results_df, train_results, test_results])

    def diagnostics_plots(y_pred, y_truth):
    
        diag_plot = pd.DataFrame({'y_pred':y_pred,'y':y_truth, 'error': y_pred-y_truth})
        diag_plot.plot.scatter(x='y',y='y_pred')
        plt.plot([0,max(y_truth)], [0,max(y_truth)], c='black')
        plt.show()

In [6]:
X = df[relevant_cols+['free_agency_year']]
y = df[['mean_salary','free_agency_year']]

In [7]:
X['position'] = X.position.str.split('-').apply(lambda x: x[0])
X = pd.get_dummies(X, columns=["position"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
x_columns = X.columns.to_list()
x_columns.remove('free_agency_year')

In [9]:
train_X, test_X = X.query('free_agency_year < 2020').drop('free_agency_year', axis=1),\
                  X.query('free_agency_year == 2020').drop('free_agency_year', axis=1)#.values
train_y, test_y = y.query('free_agency_year < 2020').drop('free_agency_year', axis=1).mean_salary,\
                  y.query('free_agency_year == 2020').drop('free_agency_year', axis=1).mean_salary#.values

In [10]:
train_X.to_csv("datasets/xgboost/train_X.csv")
train_y.to_csv("datasets/xgboost/train_y.csv")
test_X.to_csv("datasets/xgboost/test_X.csv")
test_y.to_csv("datasets/xgboost/test_y.csv")

In [11]:
params = {
        'n_estimators': [50, 100, 150, 200, 300, 400, 500],
        'min_child_weight': [1, 5, 10, 20],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.85, 1.0],
        'max_depth': [3, 4, 5, 7],
        'reg_alpha': [0, 0.5, 0.75, 0.9, 0.95, 0.99, 1, 2.5, 5, 10],
        'reg_lambda': [0, 0.5, 0.75, 0.9, 1, 2.5, 10]
        }

In [27]:
params = {
        'n_estimators': randint(50, 400),
        'min_child_weight': randint(1, 21),
        'gamma': uniform(2.5, 2.5),
        'subsample': uniform(0.85, 0.15),
        'colsample_bytree': uniform(0.85, 0.15),
        'max_depth': randint(3, 10),
        'reg_alpha': uniform(5, 5),
        'reg_lambda': uniform(5, 5)
        }

In [28]:
bst = xgb.XGBRegressor(objective = 'reg:squarederror',nthread=4, random_state=1992)

In [29]:
folds = 5
param_comb = 1500
repeats = 3

kf = RepeatedKFold(n_splits=folds, n_repeats=repeats, random_state = 1992)

In [30]:
random_search = RandomizedSearchCV(bst, param_distributions=params,
                                   n_iter=param_comb, scoring='neg_median_absolute_error',
                                   n_jobs=6, cv=kf.split(train_X,train_y), verbose=2, random_state=1992 )

In [31]:
random_search.fit(train_X,train_y)

Fitting 15 folds for each of 1500 candidates, totalling 22500 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7f79b30bc840>,
                   estimator=XGBRegressor(nthread=4,
                                          objective='reg:squarederror',
                                          random_state=1992),
                   n_iter=1500, n_jobs=6,
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f79b4da6780>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7...
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f79b4db54a8>,
                                        'reg_alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f79b47bbdd8>,
                                        'reg_lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f79b47bb940>,
                                        'subsample': <scipy.stats._

In [32]:
print('Best estimator:')
print(random_search.best_estimator_)

Best estimator:
XGBRegressor(colsample_bytree=0.8581835126798332, gamma=2.5914770688791324,
             max_depth=8, min_child_weight=6, n_estimators=63, nthread=4,
             objective='reg:squarederror', random_state=1992,
             reg_alpha=6.874405831393419, reg_lambda=5.825639041358663,
             subsample=0.9245868225124521)


In [33]:
random_search.best_score_

-1697789.4249999998

In [17]:
random_search.best_score_

-1658431.25

In [34]:
train_y_pred = random_search.predict(train_X)
test_y_pred = random_search.predict(test_X)

xgboost_random_search_evaluation = PredictionEvaluation(train_y_pred, train_y, test_y_pred,
                                           test_y, 'xgboost best cv', results_df)

results_df = xgboost_random_search_evaluation.fill_results_df()
results_df

Unnamed: 0,model,dataset_type,rmse,mae,mape
0,baseline,train,7046669.0,4191739.0,1.7206
1,baseline,test,6136372.0,3988694.0,1.5356
2,modelo lineal,train,3924033.0,2111549.0,0.8302
3,modelo lineal,test,3559228.0,2088352.0,0.8496
4,modelo lineal no negativo,train,4343415.0,2166715.0,0.8812
5,modelo lineal no negativo,test,3798000.0,1817943.0,0.7398
6,modelo lineal logaritmo,train,4492837.0,1382346.0,0.5387
7,modelo lineal logaritmo,test,4095432.0,1059238.0,0.4941
8,lasso,train,3958471.0,2031632.0,0.8034
9,lasso,test,3501706.0,1819077.0,0.7776


In [35]:
cv_results = pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score')

In [36]:
rmse_train = []
mae_train = []
mape_train = []
rmse_test = []
mae_test = []
mape_test = []

In [37]:
for i in range(param_comb):
    
    params = cv_results.iloc[i, :].params
    subsample = params['subsample']
    reg_lambda = params['reg_lambda']
    reg_alpha = params['reg_alpha']
    n_estimators = params['n_estimators']
    min_child_weight = params['min_child_weight']
    max_depth = params['max_depth']
    colsample_bytree = params['colsample_bytree']
    gamma = params['gamma']
    
    bst = xgb.XGBRegressor( objective = 'reg:squarederror',
                           subsample = subsample,  
                            reg_lambda = reg_lambda,
                            reg_alpha = reg_alpha,
                            n_estimators = n_estimators,
                            min_child_weight = min_child_weight,
                            max_depth = max_depth, 
                            colsample_bytree = colsample_bytree,
                            gamma = gamma,
                           nthread=4, random_state=1992)
    
    bst.fit(train_X, train_y)
    
    train_y_pred = bst.predict(train_X)
    rmse_train.append(np.round(np.sqrt(mean_squared_error(train_y, train_y_pred)), 0))
    mae_train.append(np.round(median_absolute_error(train_y, train_y_pred), 0))
    mape_train.append(np.round(mean_absolute_percentage_error(train_y, train_y_pred), 4))
    
    test_y_pred = bst.predict(test_X)
    rmse_test.append(np.round(np.sqrt(mean_squared_error(test_y, test_y_pred)), 0))
    mae_test.append(np.round(median_absolute_error(test_y, test_y_pred), 0))
    mape_test.append(np.round(mean_absolute_percentage_error(test_y, test_y_pred), 4))

In [38]:
cv_results['mae_train'] = mae_train
cv_results['mae_test'] = mae_test
cv_results['rmse_train'] = rmse_train
cv_results['rmse_test'] = rmse_test
cv_results['mape_train'] = mape_train
cv_results['mape_test'] = mape_test

In [39]:
cv_results.to_csv('datasets/results/xgboost_cv_results.csv', index=False)

In [74]:
cv_results.sort_values('mae_test')[["mae_train", "mae_test", "rmse_train" ,"rmse_test", "mape_train", "mape_test", "rank_test_score"]].head(10)

Unnamed: 0,mae_train,mae_test,rmse_train,rmse_test,mape_train,mape_test,rank_test_score
242,602692.0,861846.0,1167816.0,3518551.0,0.255,0.4882,912
941,719550.0,887728.0,1472205.0,3611983.0,0.3109,0.4951,420
123,970121.0,895349.0,2318240.0,3478508.0,0.4106,0.4746,339
538,642831.0,895772.0,1270735.0,3464154.0,0.2794,0.5041,829
1415,1035496.0,926448.0,2037009.0,3547182.0,0.4359,0.4732,230
797,229706.0,932338.0,534845.0,3737345.0,0.1108,0.5216,355
1140,281510.0,937630.0,676148.0,3500915.0,0.1427,0.5112,254
531,372463.0,943918.0,707274.0,3722569.0,0.1701,0.5575,1165
396,682693.0,961383.0,1255846.0,3647075.0,0.2854,0.4905,808
697,28251.0,967896.0,92564.0,3676707.0,0.016,0.5202,813


In [41]:
cv_results.sort_values('rmse_test')[["mae_train", "mae_test", "rmse_train" ,"rmse_test","mape_train", "mape_test", "rank_test_score"]].head(10)

Unnamed: 0,mae_train,mae_test,rmse_train,rmse_test,mape_train,mape_test,rank_test_score
1289,5151.0,1175115.0,23611.0,3264885.0,0.0042,0.504,710
268,693918.0,1053684.0,1299240.0,3310465.0,0.3061,0.4955,1245
799,122444.0,1199680.0,340664.0,3323444.0,0.0717,0.5109,350
127,41331.0,1262929.0,138041.0,3334011.0,0.0261,0.5205,1184
305,981496.0,1167554.0,1877994.0,3335350.0,0.4197,0.5214,50
701,372983.0,1243226.0,768677.0,3351861.0,0.1816,0.5135,938
968,1125426.0,1180184.0,2182829.0,3352352.0,0.4697,0.5215,389
1230,685747.0,1289302.0,1255834.0,3359257.0,0.2997,0.5293,1209
333,861289.0,1306959.0,1705812.0,3367780.0,0.3827,0.5231,229
1090,167694.0,1173008.0,538357.0,3375204.0,0.0969,0.496,116


In [42]:
cv_results.sort_values('mape_test')[["mae_train", "mae_test", "rmse_train" ,"rmse_test", "mape_train", "mape_test", "rank_test_score"]].head(5)

Unnamed: 0,mae_train,mae_test,rmse_train,rmse_test,mape_train,mape_test,rank_test_score
546,832077.0,1176768.0,1668127.0,3452622.0,0.3471,0.4507,244
713,789988.0,990064.0,1539341.0,3544126.0,0.3384,0.4629,795
18,955142.0,1064501.0,2318667.0,3641693.0,0.4063,0.4725,201
1415,1035496.0,926448.0,2037009.0,3547182.0,0.4359,0.4732,230
443,555112.0,1168530.0,1107431.0,3594789.0,0.2345,0.4736,128


In [102]:
cv_results[["mae_train", "mae_test", "rmse_train" ,"rmse_test", "mape_train", "mape_test", "rank_test_score"]].head(20)

Unnamed: 0,mae_train,mae_test,rmse_train,rmse_test,mape_train,mape_test,rank_test_score
1037,538511.0,1185345.0,1228752.0,3604844.0,0.2422,0.5074,1
574,1244431.0,1089502.0,2368776.0,3687992.0,0.4992,0.5277,2
266,1459813.0,1143961.0,2958928.0,3704268.0,0.6076,0.5364,3
481,1115727.0,1182584.0,2220607.0,3533359.0,0.4871,0.52,4
751,858295.0,1109009.0,1894331.0,3671633.0,0.3564,0.5188,5
557,127758.0,1244570.0,452735.0,3621155.0,0.0719,0.5421,6
206,1332769.0,1146694.0,2645649.0,3498582.0,0.5612,0.5479,7
1228,1267862.0,1235564.0,2538132.0,3718927.0,0.5255,0.5148,8
1468,399340.0,1154658.0,1179821.0,3753784.0,0.1791,0.503,9
1040,1306861.0,976748.0,2581686.0,3611545.0,0.5344,0.5065,10


In [104]:
params = cv_results.iloc[1, :].params
subsample = params['subsample']
reg_lambda = params['reg_lambda']
reg_alpha = params['reg_alpha']
n_estimators = params['n_estimators']
min_child_weight = params['min_child_weight']
max_depth = params['max_depth']
colsample_bytree = params['colsample_bytree']
gamma = params['gamma']

In [105]:
booster = xgb.XGBRegressor(objective = 'reg:squarederror',
                           subsample = subsample,  
                            reg_lambda = reg_lambda,
                            reg_alpha = reg_alpha,
                            n_estimators = n_estimators,
                            min_child_weight = min_child_weight,
                            max_depth = max_depth, 
                            colsample_bytree = colsample_bytree,
                            gamma = gamma,
                           nthread=4, random_state=1992)

In [106]:
params

{'colsample_bytree': 0.9147464743698386,
 'gamma': 3.8558594825217174,
 'max_depth': 4,
 'min_child_weight': 6,
 'n_estimators': 67,
 'reg_alpha': 5.74710787412114,
 'reg_lambda': 8.84030873186032,
 'subsample': 0.9788269402165428}

In [107]:
booster.fit(train_X, train_y)

XGBRegressor(colsample_bytree=0.9147464743698386, gamma=3.8558594825217174,
             max_depth=4, min_child_weight=6, n_estimators=67, nthread=4,
             objective='reg:squarederror', random_state=1992,
             reg_alpha=5.74710787412114, reg_lambda=8.84030873186032,
             subsample=0.9788269402165428)

In [108]:
train_y_pred = booster.predict(train_X)

In [109]:
evaluate_predictions('xgboost', 'train', train_y, train_y_pred)

{'model': ['xgboost'],
 'dataset_type': ['train'],
 'rmse': [2368776.0],
 'mae': [1244431.0],
 'mape': [0.3459]}

In [99]:
test_y_pred = booster.predict(test_X)

In [100]:
evaluate_predictions('xgboost', 'test', test_y, test_y_pred)

{'model': ['xgboost'],
 'dataset_type': ['test'],
 'rmse': [3604844.0],
 'mae': [1185345.0],
 'mape': [0.4926]}

In [92]:
booster.save_model('models/best_mae_model_829.model')

https://stats.stackexchange.com/questions/364827/why-is-regression-with-gradient-tree-boosting-sometimes-impacted-by-normalizatio

In [73]:
results_df.to_csv('datasets/results/performance_metrics.csv', index=False)

In [103]:
results_df

Unnamed: 0,model,dataset_type,rmse,mae,mape
0,baseline,train,7046669.0,4191739.0,1.7206
1,baseline,test,6136372.0,3988694.0,1.5356
2,modelo lineal,train,3924033.0,2111549.0,0.8302
3,modelo lineal,test,3559228.0,2088352.0,0.8496
4,modelo lineal no negativo,train,4343415.0,2166715.0,0.8812
5,modelo lineal no negativo,test,3798000.0,1817943.0,0.7398
6,modelo lineal logaritmo,train,4492837.0,1382346.0,0.5387
7,modelo lineal logaritmo,test,4095432.0,1059238.0,0.4941
8,lasso,train,3958471.0,2031632.0,0.8034
9,lasso,test,3501706.0,1819077.0,0.7776
