In [1]:
from sklearn.datasets import make_regression
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from utils.storage import get_storage
import optuna

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [2]:
import os
os.environ.get("MYSQL_DB_USER")

'optuna'

In [3]:
print(os.environ.get("MYSQL_DB_USER"))

optuna


In [4]:
X, y = make_regression(n_samples=10**4, n_features=20, n_informative=16, noise=0.0, random_state=1234)

In [5]:
X = pd.DataFrame(X)
y = pd.Series(y)

In [6]:
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       10000 non-null  float64
 1   1       10000 non-null  float64
 2   2       10000 non-null  float64
 3   3       10000 non-null  float64
 4   4       10000 non-null  float64
 5   5       10000 non-null  float64
 6   6       10000 non-null  float64
 7   7       10000 non-null  float64
 8   8       10000 non-null  float64
 9   9       10000 non-null  float64
 10  10      10000 non-null  float64
 11  11      10000 non-null  float64
 12  12      10000 non-null  float64
 13  13      10000 non-null  float64
 14  14      10000 non-null  float64
 15  15      10000 non-null  float64
 16  16      10000 non-null  float64
 17  17      10000 non-null  float64
 18  18      10000 non-null  float64
 19  19      10000 non-null  float64
dtypes: float64(20)
memory usage: 1.5 MB
None


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [8]:
def objective(trial):

    # search better model
    regressor_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost', 'AdaBoost'])
    # For decision tree
    # search better max_depth from 2 to 24
    max_depth = trial.suggest_int('max_depth', 2, 24)
    # search better n_estimators from 50 to 4000
    n_estimators = trial.suggest_int('n_estimators', 50, 4000)
    # search better max_depth from 1e-4 to 1
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1)
    
    if regressor_name == 'RandomForest':
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1234)
    elif regressor_name == 'XGBoost':
        model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=1234)
    elif regressor_name == 'AdaBoost':
        model = AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=1234)
        
    error_list = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')

    return error_list.mean()  # An objective value linked with the Trial object.

In [9]:
# study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(), study_name='sample14', storage=get_storage(), load_if_exists=True)  # Create a new study.
# study.optimize(objective, n_trials=50)  # Invoke optimization of the objective function.

In [10]:
study = optuna.load_study(study_name='sample14', storage=get_storage())

In [11]:
default_model = RandomForestRegressor(random_state=1234)
default_model.fit(X_train, y_train)
default_predict = default_model.predict(X_test)
default_score = mean_squared_error(y_test, default_predict)

In [12]:
if study.best_trial.params["classifier"] == 'RandomForest':
    best_model = RandomForestRegressor(n_estimators=study.best_trial.params["n_estimators"], max_depth=study.best_trial.params["max_depth"], random_state=1234)
elif study.best_trial.params["classifier"] == 'XGBoost':
    best_model = XGBRegressor(n_estimators=study.best_trial.params["n_estimators"], max_depth=study.best_trial.params["max_depth"], learning_rate=study.best_trial.params["learning_rate"], random_state=1234)
elif study.best_trial.params["classifier"] == 'AdaBoost':
    best_model = AdaBoostRegressor(n_estimators=study.best_trial.params["n_estimators"], learning_rate=study.best_trial.params["learning_rate"], random_state=1234)
    
best_model.fit(X_train, y_train)
best_predict = best_model.predict(X_test)
best_score = mean_squared_error(y_test, best_predict)

In [13]:
print(f'Score of dafault parameters => {default_score}')
print(f'Score of best parameters => {best_score}')

Score of dafault parameters => 5708.327072384357
Score of best parameters => 405.66758301451966
