In [None]:
#!pip install optuna

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np

# Machine learning models that we'll use
import xgboost as xgb
import lightgbm as lgbm
#from sklearn.linear_model import Ridge
import catboost as cat
# optuna
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# We set as_frame parameter to True and access the return object's "frame"
# attribute to get the dataset as pandas dataframe.

df = fetch_california_housing(as_frame=True)["frame"]
print(df.shape)
df.head()

(20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
from sklearn.model_selection import train_test_split

trainX, testX, trainy, testy = train_test_split(df.drop(columns=['MedHouseVal']), df.MedHouseVal, test_size=0.15, shuffle=True, random_state=1337)

In [5]:
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

n_folds = 10
def CV(model, X, y, loss_function):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    kfold = kf.split(X, y)
    losses = []
    for (train_idx, test_idx) in kfold:
        trainX, trainy, testX, testy = X.iloc[train_idx], y.iloc[test_idx], X.iloc[test_idx], y.iloc[test_idx]
        model.fit(trainX, trainy)
        predy = model.predict_proba(testX)
        loss = loss_function(testy, predy)
        losses.append(loss)
    return np.sum(losses) / n_folds

In [None]:
### Best parameter
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial, data=X, target=y, param, model):
    from sklearn.model_selection import train_test_split
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15, random_state=42)
    amodel = model(**param)
    amodel.fit(train_x, train_y, eval_set=[(test_x,test_y)], early_stopping_rounds=100, verbose=False)
    preds = amodel.predict(test_x)
    rmse = mean_squared_error(test_y, preds,squared=False)
    return rmse

def objective1(trial):
    param = {'n_estimators': trial.suggest_int('n_estimators', 50, 200, 10),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'num_leaves' : trial.suggest_int('num_leaves', 10, 50),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 50)}
        
    model = LGBMClassifier(**param, random_state = 42)
    score = CV(model, trainX, mean_squared_error)
    return score

study = optuna.create_study(direction='minimize', study_name="LightGBM")
study.optimize(objective, n_trials=20)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
#optuna.visualization.plot_optimization_history(study)
#optuna.visualization.plot_parallel_coordinate(study)
#optuna.visualization.plot_param_importances(study)


In [4]:
from sklearn.metrics import mean_squared_error

# instantiating the model
cat_model = cat.CatBoostRegressor(verbose=False)
cat_model.fit(trainX, trainy)
y_pred_cat = cat_model.predict(testX)

xgb_model = xgb.XGBRegressor()
xgb_model.fit(trainX, trainy, verbose=False)
y_pred_xgb = xgb_model.predict(testX)

lgbm_model = lgbm.LGBMRegressor()
lgbm_model.fit(trainX, trainy, verbose=-1)
y_pred_lgbm = lgbm_model.predict(testX)

# combining predictions by taking simple average using numpy
y_pred_final = np.mean([y_pred_cat, y_pred_xgb, y_pred_lgbm], axis=0)

# let's calculate mse
mse = mean_squared_error(testy, y_pred_lgbm)

print(f"Simple Average Ensemble's MSE: {mse}") #Simple Average Ensemble's MSE: 0.20581131648521198



Simple Average Ensemble's MSE: 0.20581131648521198


### Best parameter
```Python
from lightgbm import early_stopping
from lightgbm import log_evaluation
def objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        #'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = lgb.LGBMRegressor(**param)  
    callbacks = [lgb.early_stopping(10, verbose=0)]#, lgb.log_evaluation(period=0)]
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

import warnings
warnings.filterwarnings('ignore')

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
```

In [None]:
def objective(trial):
  STEP_SIZE = 1

  weights = []
  all_models_predictions = []

  # we'll use a variable for setting upper limit for suggested value
  # since we intend to update it after each weight suggestion
  upper_limit = 100

  w_cat = trial.suggest_int("w_cat", 0, upper_limit, step=STEP_SIZE)
  weights.append(w_cat)

  # Update upper limit to 100 - all the previous weights combined, which in this case is just w_ridge
  # WHY? well because we want to keep our sum of all weights equal to 100
  # and this is one way of ensuring that!
  upper_limit -= sum(weights)
  upper_limit = upper_limit

  w_xgb = trial.suggest_int("w_xgb", 0, upper_limit, step=STEP_SIZE)
  weights.append(w_xgb)

  # for the final weight we won't use optuna, rather we'll manually set it equal
  # to whatever value remains after subtracting the sum of suggested weight values from 100
  # This will also make sure that the sum of all weights remains equal to 100.
  w_lgbm = 100 - sum(weights)
  weights.append(w_lgbm)

  # Just as a sanity check, we'll check that the sum of all weights is equal to 100
  weights_sum = sum(weights)

  if weights_sum != 100:
    raise Exception(f"Weights sum must be equal to 100. Instead {weights_sum} was encountered!")
  
  # We'll use the default parameter values for all our models
  cat_model = cat.CatBoostRegressor()
  cat_model.fit(X_train, y_train)
  y_pred_ridge = cat_model.predict(X_val)
  all_models_predictions.append(y_pred_ridge)

  xgb_model = xgb.XGBRegressor()
  xgb_model.fit(X_train, y_train)
  y_pred_xgb = xgb_model.predict(X_val)
  all_models_predictions.append(y_pred_xgb)

  lgbm_model = lgbm.LGBMRegressor()
  lgbm_model.fit(X_train, y_train, verbose=-1)
  y_pred_lgbm = lgbm_model.predict(X_val)
  all_models_predictions.append(y_pred_lgbm)

  # let's take the weighted average of the predictions using numpy
  y_pred_final = np.average(all_models_predictions, weights=weights, axis=0)
  # computing our metric i.e. MSE
  mse = mean_squared_error(y_val, y_pred_final)

  return mse

In [None]:
study = optuna.create_study(study_name="optimizing weights", direction="minimize")
study.optimize(objective, n_trials=20)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

### Optuna best output
```Python
[I 2023-07-20 13:00:07,167] Trial 14 finished with value: 0.1882958572840078 and parameters: {'w_cat': 78, 'w_xgb': 17}. Best is trial 13 with value: 0.18818298170682904.
/Users/yukaisun/opt/anaconda3/lib/python3.8/site-packages/catboost/core.py:1133: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
  self._init_pool(data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
Learning rate set to 0.064294
```

In [None]:
#{'w_ridge': 10, 'w_xgb': 80}. Best is trial 14 with value: 0.2082563139755657.
cat_model = cat.CatBoostRegressor()
# training the model
cat_model.fit(X_train, y_train)
# predicting using the trained model
y_pred_cat = cat_model.predict(X_val)

xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
xgb_model.fit(X_train, y_train, verbose=False)
y_pred_xgb = xgb_model.predict(X_val)

lgbm_model = lgbm.LGBMRegressor()
lgbm_model.fit(X_train, y_train, verbose=-1)
y_pred_lgbm = lgbm_model.predict(X_val)

# combining predictions by taking simple average using numpy
y_pred_final = 0.78*y_pred_cat + 0.17*y_pred_xgb + 0.05*y_pred_lgbm

# let's calculate mse
mse = mean_squared_error(y_val, y_pred_final)

print(f"Simple Average Ensemble's MSE: {mse}")
# Simple Average Ensemble's MSE: 0.1882958570766543