In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [None]:
# Load the pre-scaled training features and targets from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code if the .npy files come from an untrusted source.
# If these arrays are plain numeric (not object dtype), the flag can be dropped.
features_path = "../data/X_train_scaled.npy"
targets_path = "../data/y_train_scaled.npy"

X = np.load(features_path, allow_pickle=True)
y = np.load(targets_path, allow_pickle=True)

In [None]:
# Sanity check: report the dimensions of both loaded arrays.
for label, array in (("X shape:", X), ("y shape:", y)):
    print(label, array.shape)

X shape: (3104579, 26)
y shape: (3104579, 1)


In [None]:
# Base estimator: ordinary least-squares linear regression with default settings.
regressor = LinearRegression()

In [None]:
# Empty grid: no hyperparameters are tuned, so the search evaluates a single
# candidate (the default estimator) and is used only for cross-validation.
param_grid = {}

In [None]:
# Cross-validated evaluation of `regressor` over `param_grid`.
# Two metrics are recorded per fold; "r2" is the one used to select and refit
# the final estimator. 10 folds are evaluated using up to 5 parallel jobs.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring=["r2", "neg_mean_squared_error"],
    refit="r2",
    cv=10,
    n_jobs=5,
    verbose=True,
)

In [None]:
# Seed NumPy's global RNG before fitting.
# NOTE(review): with LinearRegression and an integer `cv`, the fit looks
# deterministic, so this seed presumably has no effect — confirm before relying on it.
np.random.seed(2907)
# Run the cross-validated fit; as the cell's last expression, the returned
# GridSearchCV object is displayed as the cell output.
grid_search.fit(X, y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchCV(cv=10, estimator=LinearRegression(), n_jobs=5, param_grid={},
             refit='r2', scoring=['r2', 'neg_mean_squared_error'],
             verbose=True)

In [None]:
# Show the per-fold timings and scores collected during the search.
print("GridSearch results:")
# Bare last expression so the notebook renders the cv_results_ dict as output.
grid_search.cv_results_

GridSearch results:


{'mean_fit_time': array([7.98677826]),
 'std_fit_time': array([0.17703462]),
 'mean_score_time': array([0.03379705]),
 'std_score_time': array([0.00810202]),
 'params': [{}],
 'split0_test_r2': array([0.55734423]),
 'split1_test_r2': array([0.6234702]),
 'split2_test_r2': array([0.66434417]),
 'split3_test_r2': array([0.62344688]),
 'split4_test_r2': array([0.59631857]),
 'split5_test_r2': array([0.61960196]),
 'split6_test_r2': array([0.5793328]),
 'split7_test_r2': array([0.59254723]),
 'split8_test_r2': array([0.59526889]),
 'split9_test_r2': array([0.58366517]),
 'mean_test_r2': array([0.60353401]),
 'std_test_r2': array([0.02846942]),
 'rank_test_r2': array([1]),
 'split0_test_neg_mean_squared_error': array([-0.45157184]),
 'split1_test_neg_mean_squared_error': array([-0.34449367]),
 'split2_test_neg_mean_squared_error': array([-0.28011243]),
 'split3_test_neg_mean_squared_error': array([-0.30242592]),
 'split4_test_neg_mean_squared_error': array([-0.3432773]),
 'split5_test_neg_m

In [None]:
# Keep a handle on the estimator selected by the search (the search was
# configured with refit="r2", so a refitted best estimator is exposed).
best_regressor = grid_search.best_estimator_

In [None]:
# Report which candidate won the search and the fitted linear model parameters.
# The original call passed no value after the label; the recorded output
# ("Best regressor: 0") matches the index of the selected candidate, so that
# value is printed explicitly here.
print("Best regressor:", grid_search.best_index_)
# NOTE(review): the recorded coefficients mix O(1) values with magnitudes
# around 1e6-1e7, which may indicate strongly collinear features — worth checking.
print("Coefs.:", best_regressor.coef_)
print("Intercept:", best_regressor.intercept_)

Best regressor: 0
Coefs.: [[ 3.82853636e+06  1.25548606e+00  3.39402417e-01 -1.37816748e+06
   6.79869531e+00 -1.76473721e+07  1.67325823e+07 -1.41520074e-01
   5.39203402e+00 -1.83283594e-01  2.43321632e-01 -1.83969530e-01
   8.55868815e-01  8.99826459e-01 -9.48991738e-01  8.47317333e-01
  -6.16477212e-01 -2.11255026e+01 -1.02712088e+01 -9.72839839e+00
  -4.56980226e+00 -6.27713645e-01 -6.65224032e-01 -2.30389706e+00
  -1.78554706e+00  8.89182598e-02]]
Intercept: [-4.61491303e-08]


In [None]:
# Persist the fitted estimator to disk as a binary pickle.
with open("../models/linear_regressor_01.pkl", "wb") as model_file:
    pickle.dump(best_regressor, model_file)

In [None]:
# Predict on the same X that was passed to the fit (in-sample predictions).
y_hat = best_regressor.predict(X)

In [None]:
# Score the predictions against y. Because y_hat was produced from the same
# (X, y) used for fitting, these are training-set metrics rather than
# held-out estimates; the cross-validated scores are in grid_search.cv_results_.
r2, mse = r2_score(y, y_hat), mean_squared_error(y, y_hat)

for summary_line in (f"R2 Score: {r2:.3f}", f"MSE: {mse:.3f}"):
    print(summary_line)

R2 Score: 0.615
MSE: 0.385
