In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import lightgbm as lgb

# Load the Boston dataset
housing = fetch_california_housing()
#df = pd.read_csv('', sep=',')
df = pd.DataFrame(data=housing['data'], columns=housing['feature_names'])
df['target'] = housing['target']

X, y = df.drop(['target'], axis=1), df[['target']]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LightGBM regressor
lgb_model = lgb.LGBMRegressor()

# Define the parameter grid for GridSearchCV
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}

# Define the KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform the GridSearchCV
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=kfold, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best estimator from the grid search
best_model = grid_search.best_estimator_

# Iterate over the folds and calculate the loss for each fold
fold_losses = []
for train_index, val_index in kfold.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
    y_fold_train, y_fold_val = y_train.iloc[train_index, :], y_train.iloc[val_index, :]
    
    best_model.fit(X_fold_train, y_fold_train)
    y_val_pred = best_model.predict(X_fold_val)
    fold_loss = mean_squared_error(y_fold_val, y_val_pred)
    fold_losses.append(fold_loss)

# Print the loss for each fold
for fold, loss in enumerate(fold_losses):
    print(f"Fold {fold+1} Loss: {loss}")

# Print the best parameters, best score, and mean loss across folds
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
print("Mean Loss across Folds:", np.mean(fold_losses))

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


Fold 1 Loss: 0.2293861053731641
Fold 2 Loss: 0.20960571134544026
Fold 3 Loss: 0.2053813230762813
Fold 4 Loss: 0.2220594079558242
Fold 5 Loss: 0.20087269256088575
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
Best Score: -0.21346104806231914
Mean Loss across Folds: 0.21346104806231914


In [2]:
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Print the best parameters, best score, and mean squared error
print("Mean Squared Error:", mse)

Mean Squared Error: 0.21537825655583825
