In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
df = fetch_california_housing(as_frame=True)["frame"]
print(df.shape)
df.head()

(20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
from sklearn.model_selection import train_test_split

trainX, testX, trainy, testy = train_test_split(df.drop(columns=['MedHouseVal']), df.MedHouseVal, test_size=0.15, shuffle=True, random_state=1337)

In [5]:
import optuna
import xgboost as xgb
#from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


def obj_xgb(trial, X,y):
    #x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.15)
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 0.001, 0.2),
        'objective': 'reg:squarederror', 
        'eval_metric': 'rmse',
    }

    #
    # custom_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    model = xgb.XGBRegressor(**params)
    #random_state_list = [X, X, X, X, X]
    #cv_score/=len(random_state_list)

    cv = KFold(n_splits=5, shuffle=True, random_state=11)
    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train,y_train)
        preds = model.predict(X_test)
        #scores = cross_val_score(model, x, y, cv=kf,scoring=custom_scorer)
        cv_scores[idx] = mean_squared_error(y_test, preds)
 
    avg = cv_scores.mean()
    #y_pred = model.predict_proba(x_valid)[:, 1]
    #roc_auc = roc_auc_score(y_valid, y_pred)
    return avg


study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: obj_xgb(trial, trainX, trainy), n_trials=20)  # Pass X and y to the obj_xgb function

display(study)

print('Best trial:', study.best_trial.params)
'''
param = {'n_estimators': 900, 'max_depth': 8, 'learning_rate': 0.04494131592092563, 
         'subsample': 0.7007281411384232, 'colsample_bytree': 0.5687261362861079, 'gamma': 0.02682824556180894}
'''
print('Best brier:', study.best_trial.value)

[I 2023-08-02 03:18:39,774] A new study created in memory with name: no-name-fd8724e1-fbfa-46cf-8344-d7e90fa1234c
[W 2023-08-02 03:18:43,649] Trial 0 failed with parameters: {'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.0014541453710486933, 'subsample': 0.631192118326085, 'colsample_bytree': 0.779251988617035, 'gamma': 0.0016083638449846391} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/yukaisun/opt/anaconda3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-5-7c4f009dbd55>", line 46, in <lambda>
    study.optimize(lambda trial: obj_xgb(trial, trainX, trainy), n_trials=20)  # Pass X and y to the obj_xgb function
  File "<ipython-input-5-7c4f009dbd55>", line 34, in obj_xgb
    model.fit(X_train,y_train)
  File "/Users/yukaisun/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 433, in inner_f
    return f(**kwargs)
  Fi

KeyboardInterrupt: 

In [8]:
model1 = xgb.XGBRegressor()
model1.fit(trainX, trainy)
model1.score(testX, testy)
mean_squared_error(testy, model1.predict(testX))



0.2124221487617203

In [6]:
#param = {'n_estimators': 900, 'max_depth': 8, 'learning_rate': 0.04494131592092563, 
#         'subsample': 0.7007281411384232, 'colsample_bytree': 0.5687261362861079, 'gamma': 0.02682824556180894}
param = {'n_estimators': 850, 'max_depth': 7, 'learning_rate': 0.035579334505517195, 
         'subsample': 0.7433060185168757, 'colsample_bytree': 0.9985782631581257, 'gamma': 0.014679853880810986}
import xgboost as xgb

#param = {'n_estimators': 700, 'max_depth': 7, 'learning_rate': 0.03332043654808083, 
#         'subsample': 0.8285660265147325, 'colsample_bytree': 0.7764008382448025, 'gamma': 0.004462565002330252}
model2 = xgb.XGBRegressor(**param)
model2.fit(trainX, trainy)
model2.score(testX, testy)



0.8549158707139253

In [7]:
from sklearn.metrics import mean_squared_error
mean_squared_error(testy, model2.predict(testX))



0.18976632289867668

In [10]:
pd.DataFrame({'predy': model2.predict(testX), 'testy':pd.Series(testy)}).sort_index().head(20)



Unnamed: 0,predy,testy
2,4.134664,3.521
15,1.338883,1.4
21,1.266553,1.598
30,1.393143,1.223
31,1.261677,1.152
37,1.462393,1.039
45,1.947667,1.823
47,1.402132,1.375
48,1.163201,1.875
52,1.056554,0.975
