In [37]:
import polars as pl
import polars.selectors as cs
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import os

In [38]:
(salaries_pitchers := pl.read_csv('./data/lahman_1871-2024_csv/final_ds.csv'))



playerID,yearID,W,L,G,GS,SV,IPouts,H,ER,HR,BB,SO,PO,A,E,G_field,salary,TrainingValidation
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""aardsda01""",2006,3,0,45,0,0,159,41,24,9,28,49,1,5,0,45,,"""Training"""
"""abreuwi01""",2006,0,0,7,0,0,24,10,9,1,6,6,1,0,0,7,,"""Training"""
"""accarje01""",2006,2,4,65,0,3,207,76,41,7,20,54,2,12,0,65,330000,"""Training"""
"""adamsmi03""",2006,0,0,2,0,0,7,4,3,1,2,1,0,0,0,2,,"""Training"""
"""adkinjo01""",2006,2,1,55,0,0,163,55,24,3,20,30,1,12,1,55,,"""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""youngma03""",2016,0,0,8,0,0,21,12,5,0,4,4,0,2,0,8,,"""Validation"""
"""zastrro01""",2016,1,0,8,1,0,48,12,2,0,5,17,1,2,0,8,,"""Validation"""
"""zieglbr01""",2016,4,7,69,0,22,204,67,17,2,26,58,2,11,1,69,5500000,"""Validation"""
"""zimmejo02""",2016,9,7,19,18,0,316,118,57,14,26,66,2,17,2,19,18000000,"""Validation"""


In [39]:
salaries_pitchers = salaries_pitchers.drop_nulls()

### Creating Validation/Training Columns

In [40]:
training_data = salaries_pitchers.filter(pl.col('TrainingValidation') == 'Training')
validation_data = salaries_pitchers.filter(pl.col('TrainingValidation') == 'Validation')

In [41]:
x = training_data.drop({'salary', 'TrainingValidation', 'yearID', 'playerID'}).to_pandas()
y = training_data['salary'].to_pandas()
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 50)

In [42]:
x_val = validation_data.drop({'salary', 'TrainingValidation', 'yearID', 'playerID'}).to_pandas()
y_val = validation_data['salary'].to_pandas()

In [43]:
from sklearn.model_selection import StratifiedKFold

param_grid = {
    "max_depth": [None, 1, 2, 3, 4, 5],
    "min_samples_split": [2, 3, 5, 10],
    "min_samples_leaf": [1, 5, 10]
}

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 3, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2"]
}

state_val = 5000

cv_fold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=state_val
)

cv_fold


StratifiedKFold(n_splits=10, random_state=5000, shuffle=True)

In [44]:
grid_search_1 = GridSearchCV(DecisionTreeRegressor(), param_grid, verbose=3, cv=cv_fold)
grid_search_rf = GridSearchCV(RandomForestRegressor(), rf_params, verbose=3, cv=cv_fold)

In [45]:
grid_search_1.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.428 total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.775 total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.531 total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.969 total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.770 total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.857 total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.701 total time=   0.0s




[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.593 total time=   0.0s
[CV 9/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.245 total time=   0.0s
[CV 10/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.524 total time=   0.0s
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.473 total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.720 total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.682 total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.998 total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.771 total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=3;, score=-0.746 total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_s

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [2, 3, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
grid_search_rf.fit(X_train, y_train)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits
[CV 1/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.218 total time=   0.1s




[CV 2/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.190 total time=   0.1s
[CV 3/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.215 total time=   0.1s
[CV 4/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.200 total time=   0.1s
[CV 5/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.262 total time=   0.1s
[CV 6/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.126 total time=   0.1s
[CV 7/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.176 total time=   0.1s
[CV 8/10] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.205 total time=   0.1s
[CV 9/10] END max_depth=5, max_features=s

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [None]:
grid_search_1.best_score_

np.float64(0.17370746553275268)

In [None]:
grid_search_1.best_params_

{'max_depth': 3, 'min_samples_leaf': 10, 'min_samples_split': 2}

In [None]:
(best_model_1 := grid_search_1.best_estimator_)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
y_val_pred_1 = best_model_1.predict(x_val)


In [None]:
mse_val = mean_squared_error(y_val, y_val_pred_1)
rmse_val = np.sqrt(mse_val)
r2_val = r2_score(y_val, y_val_pred_1)

pl.DataFrame({
    "Metric": ["MSE", "RMSE", "R²"],
    "Value": [mse_val, rmse_val, r2_val]
})

Metric,Value
str,f64
"""MSE""",26441000000000.0
"""RMSE""",5142100.0
"""R²""",0.149538


In [None]:
grid_search_rf.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [None]:
grid_search_rf.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
(best_model_rf := grid_search_rf.best_estimator_)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
y_pred_rf = best_model_rf.predict(x_val)

mse_val = mean_squared_error(y_val, y_pred_rf)
rmse_val = np.sqrt(mse_val)
r2_val = r2_score(y_val, y_pred_rf)

pl.DataFrame({
    "Metric": ["MSE", "RMSE", "R²"],
    "Value": [
        f"{mse_val:,.2f}",
        f"{rmse_val:,.2f}",
        f"{r2_val:.3f}"
    ]
})

NameError: name 'best_model_rf' is not defined