In [1]:
from pathlib import Path

import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from functions import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Search space given to BayesianOptimization: parameter name -> (lower, upper) bound.
# 'lambda_' has a trailing underscore because `lambda` is a Python keyword;
# xgb_crossval maps it back to XGBoost's 'lambda' parameter.
HYPERPARAMETER_BOUNDS = dict(
	alpha=(0, 20),
	lambda_=(0, 20),
	colsample_bytree=(0.1, 1.0),
	learning_rate=(0.01, 0.75),
	max_depth=(3, 12),  # sampled as a float; xgb_crossval converts it with int()
	subsample=(0.25, 1.0),
	gamma=(0, 10),
	min_child_weight=(0, 10),
)

# Passed to xgb.cv as `early_stopping_rounds`.
EARLY_STOPPING_ROUNDS = 20

# Shared seed for xgb.cv, the Bayesian optimizer and the train/test split.
RANDOM_SEED = 123

In [3]:
def xgb_crossval(alpha, lambda_, colsample_bytree, learning_rate, max_depth, subsample, gamma, min_child_weight, dtrain,
		num_boost_round=100, nfold=5):
	"""Score one hyperparameter combination with cross-validated XGBoost regression.

	The first eight arguments are the hyperparameters being tuned; they are
	forwarded to XGBoost unchanged except that ``lambda_`` is passed as
	``'lambda'`` and ``max_depth`` is truncated with ``int()``.

	Args:
		alpha, lambda_, colsample_bytree, learning_rate, max_depth, subsample,
			gamma, min_child_weight: Candidate hyperparameter values.
		dtrain: Training data passed straight to ``xgb.cv``.
		num_boost_round: Maximum number of boosting rounds per CV run
			(default 100, the previously hard-coded value).
		nfold: Number of cross-validation folds (default 5, the previously
			hard-coded value).

	Returns:
		The negated mean test RMSE from the last row of the CV history, so that
		a maximizing optimizer effectively minimizes RMSE.
	"""
	params = {
		'alpha': alpha,
		'lambda': lambda_,
		'colsample_bytree': colsample_bytree,
		'learning_rate': learning_rate,
		# The optimizer proposes floats; the tree depth is truncated to an integer.
		'max_depth': int(max_depth),
		'subsample': subsample,
		'gamma': gamma,
		'min_child_weight': min_child_weight,
		'objective': 'reg:squarederror',
	}

	cv_results = xgb.cv(
		params,
		dtrain,
		num_boost_round=num_boost_round,
		nfold=nfold,
		early_stopping_rounds=EARLY_STOPPING_ROUNDS,
		metrics="rmse",
		seed=RANDOM_SEED,
	)
	# NOTE(review): the score is read from the last row of the CV history;
	# confirm that this is the best round when early stopping never triggers.
	return -1.0 * cv_results['test-rmse-mean'].iloc[-1]

In [4]:
def tune_hyperparameters(X_train, y_train, init_points=50, n_iter=125):
	"""Search HYPERPARAMETER_BOUNDS for the XGBoost settings with the best CV score.

	Args:
		X_train: Training features, wrapped in an ``xgb.DMatrix``.
		y_train: Training labels for the same rows.
		init_points: Number of initial exploration points passed to
			``BayesianOptimization.maximize`` (default 50, the previously
			hard-coded value).
		n_iter: Number of optimization iterations passed to
			``BayesianOptimization.maximize`` (default 125, the previously
			hard-coded value).

	Returns:
		``optimizer.max`` after the search has run. The caller reads its
		``'target'`` (the negated RMSE returned by ``xgb_crossval``) and
		``'params'`` entries.
	"""
	# Build the DMatrix once so every evaluation reuses the same training data.
	dtrain = xgb.DMatrix(X_train, label=y_train)

	def optimize(alpha, lambda_, colsample_bytree, learning_rate, max_depth, subsample, gamma, min_child_weight):
		# The optimizer only supplies the tuned hyperparameters; bind dtrain here.
		return xgb_crossval(alpha, lambda_, colsample_bytree, learning_rate, max_depth, subsample, gamma, min_child_weight, dtrain)

	optimizer = BayesianOptimization(f=optimize, pbounds=HYPERPARAMETER_BOUNDS, random_state=RANDOM_SEED)
	optimizer.maximize(init_points=init_points, n_iter=n_iter)

	return optimizer.max

In [5]:
# Accumulates one record per tuned state (state name plus its best hyperparameters).
results = list()

In [25]:
# Tune XGBoost hyperparameters state by state and collect the winners in `results`.
# Iterates over the keys of `states_subdiv_mapping`.
# NOTE: the trailing `break` stops after the first state (demo mode);
# remove it to run for all states.
for state in states_subdiv_mapping:
	print(state)

	# Load and preprocess the data for this state.
	subdiv = states_subdiv_mapping[state]
	df, df_energy = load_data(state)
	df = process_data(df, df_energy, subdiv)

	feature_names = feature_names_full

	# Keep only the model features plus the target, and drop incomplete rows.
	df = df[feature_names + ['energy_met_MU']].dropna()
	X, y = df[feature_names], df['energy_met_MU']

	# Hold out 20% of the rows; only the training part is used for tuning in this cell.
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=0.2, random_state=RANDOM_SEED
	)

	best_hyperparameters = tune_hyperparameters(X_train, y_train)
	# The optimizer maximizes the negated RMSE, so flip the sign back.
	best_score = -best_hyperparameters['target']

	# Report the raw optimizer result before the parameters are post-processed below.
	print(f"Best hyperparameters for {state}: {best_hyperparameters['params']}")
	print(f"Best score achieved (RMSE) for {state}: {best_score}")

	# Make the parameter dict usable as-is: integer depth and explicit objective.
	params = best_hyperparameters['params']
	params.update(max_depth=int(params['max_depth']), objective='reg:squarederror')

	results.append(dict({"state": state}, **params))

	break


Andhra_Pradesh
|   iter    |  target   |   alpha   | colsam... |   gamma   |  lambda_  | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m-17.76   [39m | [39m13.93    [39m | [39m0.3575   [39m | [39m2.269    [39m | [39m11.03    [39m | [39m0.5424   [39m | [39m6.808    [39m | [39m9.808    [39m | [39m0.7636   [39m |
| [39m2        [39m | [39m-18.29   [39m | [39m9.619    [39m | [39m0.4529   [39m | [39m3.432    [39m | [39m14.58    [39m | [39m0.3345   [39m | [39m3.537    [39m | [39m3.98     [39m | [39m0.8035   [39m |
| [39m3        [39m | [39m-18.09   [39m | [39m3.65     [39m | [39m0.2579   [39m | [39m5.316    [39m | [39m10.64    [39m | [39m0.4795   [39m | [39m10.64    [39m | [39m7.245    [39m | [39m0.7083   [39m |
| [35m4        [39m | [35m-16.95   [39m | [35m14.45    [39m | [35m0.

In [None]:
# Convert the collected per-state results to a DataFrame and save them as CSV.
df_results = pd.DataFrame(results)

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would discard the results of a long tuning run.
output_path = Path("outputs/best_hyperparameters.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_path, index=False)