# 4 regression on a given dataset

## Imports

In [191]:
import numpy as np
import os
import sklearn.linear_model
import sklearn.metrics
import optuna
import sklearn.ensemble

## Load data

In [145]:
PATH_DATA = "data/"
X_train = np.load(os.path.join(PATH_DATA, "X_train.npy"))
X_test = np.load(os.path.join(PATH_DATA, "X_test.npy"))
y_train = np.load(os.path.join(PATH_DATA, "y_train.npy"))
y_test = np.load(os.path.join(PATH_DATA, "y_test.npy"))

X_train.shape, y_test.shape, X_test.shape, y_test.shape

((200, 200), (200, 1), (200, 200), (200, 1))

In [146]:
for name, dataset in {"X_train": X_train, "y_train": y_train,"X_test": X_test, "y_test": y_test}.items():
    print(name)
    print(f"min : {dataset.min()}, max : {dataset.max()}, mean : {dataset.mean()}, std : {dataset.std()}")

X_train
min : 8.740489740644009e-06, max : 0.9999919179207105, mean : 0.49838834357650513, std : 0.28928895293168067
y_train
min : 1.9300329881344744, max : 7.464791606974733, mean : 4.906834385571885, std : 0.9454213390267655
X_test
min : 1.3653102668209627e-05, max : 0.9999562829345615, mean : 0.49925180276418607, std : 0.2891216096023428
y_test
min : 2.2818440532901314, max : 7.482037402054659, mean : 5.064874886067615, std : 0.8580144222049355


## Load models

In [200]:
model = sklearn.linear_model.LinearRegression(copy_X=True, positive=True)
# model = sklearn.linear_model.Ridge(copy_X=True)
# model = sklearn.linear_model.Lasso(copy_X=True)
# model = sklearn.ensemble.RandomForestRegressor()

## Train model

In [201]:
model.fit(X_train, y_train)

## Evaluate

In [202]:
y_pred = model.predict(X_test)
mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
r2 = sklearn.metrics.r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.08482385362764326
R^2 Score: 0.8847797473459286


## Optimize parameters

### Create the objectives function

In [197]:
def objective_ridge(trial: optuna.trial.Trial):

    alpha = trial.suggest_float("alpha", 1e-12, 10)
    tol = trial.suggest_float("tol", 1e-12, 1e-2)
    solver = trial.suggest_categorical(
        "solver", ["auto", "svd", "cholesky", "lsqr", "sag", "saga", "lbfgs"]
    )
    positive = (
        True
        if solver == "lbfgs"
        else (
            False
            if solver != "auto"
            else trial.suggest_categorical("positive", [False, True])
        )
    )

    estimator = sklearn.linear_model.Ridge(
        alpha=alpha, solver=solver, positive=positive, tol=tol
    )
    estimator.fit(X_train, y_train)

    return estimator.score(X_test, y_test)

def objective_lasso(trial: optuna.trial.Trial):

    alpha = trial.suggest_float("alpha", 1e-12, 10)
    tol = trial.suggest_float("tol", 1e-12, 1e-2)
    positive = trial.suggest_categorical("positive", [False, True])

    estimator = sklearn.linear_model.Ridge(
        alpha=alpha, positive=positive, tol=tol
    )
    estimator.fit(X_train, y_train)

    return estimator.score(X_test, y_test)

def objective_forest(trial: optuna.trial.Trial):

    criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "friedman_mse", "poisson"])
    n_estimators = trial.suggest_int("n_estimators", low=1, high=1000)
    max_depth = trial.suggest_int("max_depth", low=1, high=1000)
    min_samples_split = trial.suggest_float("min_samples_split", low=0, high=1)

    estimator = sklearn.ensemble.RandomForestRegressor(
        n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split
    )
    estimator.fit(X_train, y_train)

    return estimator.score(X_test, y_test)


### Run the expirements

In [203]:
storage_name = "regression.db"
if os.path.exists(storage_name):
    os.remove(storage_name)

study = optuna.create_study(
        storage=f"sqlite:///{storage_name}",
        study_name="Regression",
        load_if_exists=False,
        direction="maximize",  # we want to maximize the R2 score
    )
study.optimize(func=objective_ridge, n_trials=100)

[I 2024-06-18 16:49:40,847] A new study created in RDB with name: Regression
[I 2024-06-18 16:49:41,036] Trial 0 finished with value: 0.5987772228334451 and parameters: {'alpha': 5.812037908777894, 'tol': 0.001356870001852476, 'solver': 'cholesky'}. Best is trial 0 with value: 0.5987772228334451.
[I 2024-06-18 16:49:41,450] Trial 1 finished with value: 0.5408998479764573 and parameters: {'alpha': 9.345589963800998, 'tol': 0.00913697728075423, 'solver': 'svd'}. Best is trial 0 with value: 0.5987772228334451.
[I 2024-06-18 16:49:41,576] Trial 2 finished with value: 0.6866177666956601 and parameters: {'alpha': 2.1083033129768824, 'tol': 0.0031620645121030414, 'solver': 'sag'}. Best is trial 2 with value: 0.6866177666956601.
[I 2024-06-18 16:49:41,636] Trial 3 finished with value: 0.5504063969691657 and parameters: {'alpha': 8.682503755905964, 'tol': 0.0006557086435155567, 'solver': 'lsqr'}. Best is trial 2 with value: 0.6866177666956601.
[I 2024-06-18 16:49:41,697] Trial 4 finished with v

### Display the best results and hyperparameters

In [188]:
print(f"Best value: {study.best_value:.4f} (params: {study.best_params})")
for key, value in study.best_trial.params.items():
    if type(value) == float:
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")

Best value: 0.8856 (params: {'alpha': 0.13303991140771043, 'tol': 0.00020648032855823814, 'positive': True})
alpha: 0.13
tol: 0.00
positive: True


### See all expirements results

In [187]:
!optuna-dashboard sqlite:///regression.db

[2024-06-18 16:34:01 +0200] [139800] [INFO] Starting gunicorn 22.0.0
[2024-06-18 16:34:01 +0200] [139800] [INFO] Listening at: http://127.0.0.1:8080 (139800)
[2024-06-18 16:34:01 +0200] [139800] [INFO] Using worker: gthread
[2024-06-18 16:34:01 +0200] [139817] [INFO] Booting worker with pid: 139817
  return get_param_importances(study, target=target, evaluator=PedAnovaImportanceEvaluator())
^C
[2024-06-18 16:34:32 +0200] [139800] [INFO] Handling signal: int
[2024-06-18 16:34:32 +0200] [139817] [INFO] Worker exiting (pid: 139817)


## Conclusion

Analysess :
- regularization does not seems to improve performances
- positive must be set to true, otherwise the model will overfit

TODO