In [1]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [2]:
# Load the feature matrix (X) and the target (y) from ../data; the file names
# suggest both were scaled in an earlier step.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if a file is untrusted. It is presumably unnecessary for
# plain numeric arrays -- confirm and drop it if the files load without it.
X, y = (
    np.load(path, allow_pickle=True)
    for path in ("../data/X_train_scaled.npy", "../data/y_train_scaled.npy")
)

In [3]:
# Sanity check: show the dimensions of both arrays before modelling.
for label, array in (("X shape:", X), ("y shape:", y)):
    print(label, array.shape)

X shape: (3104579, 26)
y shape: (3104579, 1)


In [4]:
# Two-stage model: expand the inputs into polynomial terms, then fit a linear
# regression on the expanded features. The step names ("poly", "regressor") are
# the prefixes used to address step parameters, e.g. "poly__degree".
pipeline = Pipeline([
    ("poly", PolynomialFeatures()),
    ("regressor", LinearRegression()),
])

In [5]:
# Polynomial degrees to compare; the key targets the `degree` parameter of the
# pipeline step named "poly".
# NOTE(review): with 26 input columns, degree 3 expands to 3,654 features
# (C(29, 3), bias included). The recorded run shows every degree-3 fit failing,
# presumably from memory exhaustion on ~3.1M rows -- confirm before re-running.
param_grid = dict(poly__degree=[2, 3])

In [6]:
# Exhaustive search over `param_grid` using 10-fold cross-validation.
# Candidates are ranked by negated mean squared error, so scores closer to zero
# are better. verbose=10 prints a START/END line for every individual fit.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring="neg_mean_squared_error",
    verbose=10,
)

In [7]:
# Seed NumPy's global RNG before fitting.
# NOTE(review): nothing visible in this pipeline or search draws random numbers
# (integer cv, no shuffling configured), so the seed presumably has no effect --
# confirm.
np.random.seed(seed=2907)

# Run the full search; leaving the call as the last expression displays the
# fitted GridSearchCV repr as the cell output.
grid_search.fit(X, y=y)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV 1/10; 1/2] START poly__degree=2.............................................
[CV 1/10; 1/2] END poly__degree=2;, score=-32618528414623.781 total time= 1.2min
[CV 2/10; 1/2] START poly__degree=2.............................................
[CV 2/10; 1/2] END .............poly__degree=2;, score=-0.287 total time= 1.2min
[CV 3/10; 1/2] START poly__degree=2.............................................
[CV 3/10; 1/2] END .............poly__degree=2;, score=-0.237 total time= 1.4min
[CV 4/10; 1/2] START poly__degree=2.............................................
[CV 4/10; 1/2] END .............poly__degree=2;, score=-0.249 total time= 1.3min
[CV 5/10; 1/2] START poly__degree=2.............................................
[CV 5/10; 1/2] END poly__degree=2;, score=-97290310225102.812 total time= 1.2min
[CV 6/10; 1/2] START poly__degree=2.............................................
[CV 6/10; 1/2] END poly__degree=2;, score=-19538

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/home/henrique/github/coc361-ic/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/henrique/github/coc361-ic/.venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/henrique/github/coc361-ic/.venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/henrique/github/coc361-ic/.venv/lib/python3.10/site

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('regressor', LinearRegression())]),
             param_grid={'poly__degree': [2, 3]},
             scoring='neg_mean_squared_error', verbose=10)

In [None]:
# Show the per-candidate cross-validation results collected by the search.
print("GridSearch results:")
cv_results = grid_search.cv_results_
cv_results

In [None]:
# Persist the whole fitted search object so its results can be reloaded later
# without re-running the fits.
grid_search_path = "../grids/grid_search_linear_regressor_with_polynomial_features_01.pkl"
with open(grid_search_path, mode="wb") as grid_file:
    pickle.dump(grid_search, grid_file)

In [None]:
# Take the winning estimator from the search. Because the estimator handed to
# GridSearchCV was `pipeline`, this is a fitted Pipeline (poly -> regressor),
# not a bare LinearRegression.
best_regressor = grid_search.best_estimator_

In [None]:
# `best_regressor` is a fitted Pipeline, which does not expose `coef_` or
# `intercept_` itself; asking it for them raises AttributeError. The learned
# parameters live on the final step, registered under the name "regressor".
fitted_linear_model = best_regressor.named_steps["regressor"]

print("Best regressor")
print("Coefs.:", fitted_linear_model.coef_)
print("Intercept:", fitted_linear_model.intercept_)

In [None]:
# Persist only the winning estimator, separately from the full search object.
model_path = "../models/model_linear_regressor_with_polynomial_features_01.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(best_regressor, model_file)

In [None]:
# Predict on the same X that the search was fitted on, so these are in-sample
# (training-set) predictions rather than held-out ones.
y_hat = best_regressor.predict(X)

In [None]:
# Fit quality of the selected model, measured against the same y it was trained
# on (y_hat comes from predicting on the training X), so these are training
# scores, not estimates of generalisation error.
r2, mse = r2_score(y, y_hat), mean_squared_error(y, y_hat)

print(f"R2 Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")