In [1]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb

from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error ,r2_score , mean_absolute_error , mean_squared_error
from sklearn.ensemble import RandomForestRegressor


In [2]:
df = pd.read_csv("../dataset/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [4]:
# droping 
#instant – It is just a sequence number of rows
#dteday – It is not required since columns for year & month already exists
#casual – This variable cannot be predicted.
#registered – This variable cannot be predicted.
#atemp - This feature and temp feature are same
df.drop(labels = ['instant', 'dteday','atemp','casual' ,'registered'] ,axis= 1 ,inplace= True)

In [5]:
df.columns

Index(['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    int64  
 1   yr          731 non-null    int64  
 2   mnth        731 non-null    int64  
 3   holiday     731 non-null    int64  
 4   weekday     731 non-null    int64  
 5   workingday  731 non-null    int64  
 6   weathersit  731 non-null    int64  
 7   temp        731 non-null    float64
 8   hum         731 non-null    float64
 9   windspeed   731 non-null    float64
 10  cnt         731 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 62.9 KB


In [14]:
scaler = StandardScaler()


In [15]:
## XGBOOST

# spliting dataset into X and y
features = df.drop(labels=['cnt'] ,axis= 1)
target = df[['cnt']]


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create an Optuna study
study = optuna.create_study(direction='minimize')

# Start the optimization process
study.optimize(objective, n_trials=100)

# Get the best set of hyperparameters
best_params = study.best_params

# Train the final model with the best parameters
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Evaluate the final model on the test set
y_pred_test = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print(f"Test MSE with best parameters: {test_mse}")
r2_score(y_test ,y_pred_test)

[I 2023-08-29 20:10:50,890] A new study created in memory with name: no-name-9996f69e-52c6-477a-823d-119daf5e6b60
[I 2023-08-29 20:10:51,016] Trial 0 finished with value: 409244.8766591702 and parameters: {'max_depth': 10, 'learning_rate': 0.050854342655070085, 'subsample': 0.7437359242763817}. Best is trial 0 with value: 409244.8766591702.
[I 2023-08-29 20:10:51,128] Trial 1 finished with value: 369851.7470070557 and parameters: {'max_depth': 10, 'learning_rate': 0.08161026930701737, 'subsample': 0.6701172288981808}. Best is trial 1 with value: 369851.7470070557.
[I 2023-08-29 20:10:51,223] Trial 2 finished with value: 372143.1674515806 and parameters: {'max_depth': 9, 'learning_rate': 0.05833067511595152, 'subsample': 0.570376967033911}. Best is trial 1 with value: 369851.7470070557.
[I 2023-08-29 20:10:51,286] Trial 3 finished with value: 4931532.416839215 and parameters: {'max_depth': 5, 'learning_rate': 0.008253808526259107, 'subsample': 0.6071557833316447}. Best is trial 1 with v

Test MSE with best parameters: 347086.87615512655


0.9134421284277302

In [16]:
# Random forest
def objective(trial):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['log2', 'sqrt']),
        
    }

    model = RandomForestRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Creating an Optuna study
study = optuna.create_study(direction='minimize')

# Start the optimization process
study.optimize(objective, n_trials=100)

# Get the best set of hyperparameters
best_params = study.best_params

# Train the final model with the best parameters
final_model = RandomForestRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Evaluate the final model on the test set
y_pred_test = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print(f"Test MSE with best parameters: {test_mse}")
r2_score(y_test ,y_pred_test)    

[I 2023-08-29 20:11:02,037] A new study created in memory with name: no-name-31dea1ff-91a6-402d-9381-09d437686e6b
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:02,144] Trial 0 finished with value: 4090452.943269747 and parameters: {'n_estimators': 78, 'max_depth': 7, 'min_samples_split': 0.8355900892701931, 'min_samples_leaf': 0.3617730366170292, 'max_features': 'sqrt'}. Best is trial 0 with value: 4090452.943269747.
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:02,337] Trial 1 finished with value: 4090260.7702540094 and parameters: {'n_estimators': 179, 'max_depth': 9, 'min_samples_split': 0.3601565685554071, 'min_samples_leaf': 0.3679940054859461, 'max_features': 'sqrt'}. Best is trial 1 with value: 4090260.7702540094.
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:02,413] Trial 2 finished with value: 2345560.0279192426 and parameters: {'n_estimators': 64, 'max_depth': 14, 'min_samples_split': 0.577784636885252, 'min_sa

Test MSE with best parameters: 1323228.825894434


0.6700080624157472

In [17]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [18]:

# spliting dataset into X and y
features = df.drop(labels=['cnt'] ,axis= 1)
target = df[['cnt']]


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [19]:
def objective(trial, model, X_train, y_train, X_test, y_test):
    params = {}

    if model == 'random_forest':
        params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['log2', 'sqrt']),
        
    }
        model = RandomForestRegressor(**params, random_state=42)
    elif model == 'xgboost':
        
        params ={
            'objective': 'reg:squarederror',
            'booster': 'gbtree',
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            
        }
        model = xgb.XGBRegressor(**params, random_state=42)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Load and preprocess your dataset
# ...

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create an Optuna study
study_rf = optuna.create_study(direction='minimize')
study_xgb = optuna.create_study(direction='minimize')

# Start the optimization process for Random Forest
study_rf.optimize(lambda trial: objective(trial, 'random_forest', X_train, y_train, X_test, y_test), n_trials=100)

# Start the optimization process for XGBoost
study_xgb.optimize(lambda trial: objective(trial, 'xgboost', X_train, y_train, X_test, y_test), n_trials=100)

# Get the best set of hyperparameters for Random Forest
best_params_rf = study_rf.best_params

# Get the best set of hyperparameters for XGBoost
best_params_xgb = study_xgb.best_params

# Train the final Random Forest model with the best parameters
final_model_rf = RandomForestRegressor(**best_params_rf, random_state=42)
final_model_rf.fit(X_train, y_train)

# Train the final XGBoost model with the best parameters
final_model_xgb = xgb.XGBRegressor(**best_params_xgb, random_state=42)
final_model_xgb.fit(X_train, y_train)

# Evaluate the final Random Forest model on the test set
y_pred_test_rf = final_model_rf.predict(X_test)
rf_mae, rf_rmse, rf_r2 = evaluate_model(y_test, y_pred_test_rf)
print(f"Random Forest Model Evaluation:")
print(f"MAE: {rf_mae:.2f}, RMSE: {rf_rmse:.2f}, R^2: {rf_r2:.2f}")

# Evaluate the final XGBoost model on the test set
y_pred_test_xgb = final_model_xgb.predict(X_test)
xgb_mae, xgb_rmse, xgb_r2 = evaluate_model(y_test, y_pred_test_xgb)
print(f"XGBoost Model Evaluation:")
print(f"MAE: {xgb_mae:.2f}, RMSE: {xgb_rmse:.2f}, R^2: {xgb_r2:.2f}")

[I 2023-08-29 20:11:19,432] A new study created in memory with name: no-name-a11d3c1e-5af6-4bc5-8edf-67e95efeaf95
[I 2023-08-29 20:11:19,435] A new study created in memory with name: no-name-24be153c-59ff-4871-82c8-3fc998bf7040
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:19,583] Trial 0 finished with value: 2577122.3973331368 and parameters: {'n_estimators': 140, 'max_depth': 9, 'min_samples_split': 0.2929550055093766, 'min_samples_leaf': 0.2780780360560094, 'max_features': 'log2'}. Best is trial 0 with value: 2577122.3973331368.
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:19,688] Trial 1 finished with value: 4088619.8964144103 and parameters: {'n_estimators': 104, 'max_depth': 5, 'min_samples_split': 0.686267610532765, 'min_samples_leaf': 0.39513718298970724, 'max_features': 'sqrt'}. Best is trial 0 with value: 2577122.3973331368.
  return fit_method(estimator, *args, **kwargs)
[I 2023-08-29 20:11:19,801] Trial 2 finished with value: 408

Random Forest Model Evaluation:
MAE: 937.41, RMSE: 1151.59, R^2: 0.67
XGBoost Model Evaluation:
MAE: 406.54, RMSE: 586.43, R^2: 0.91


In [20]:
# Split dataset into features (X) and target (y)
features = df.drop(labels=['cnt'], axis=1)
target = df['cnt']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Apply StandardScaler to the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Apply the same scaler to the test data
X_test_scaled = scaler.transform(X_test)

# Define objective function for Optuna
def objective(trial, model, X_train, y_train, X_test, y_test):
    params = {}

    if model == 'random_forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 5, 15),
            'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
            'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
            'max_features': trial.suggest_categorical('max_features', ['log2', 'sqrt']),
        }
        model = RandomForestRegressor(**params, random_state=42)
    elif model == 'xgboost':
        params = {
            'objective': 'reg:squarederror',
            'booster': 'gbtree',
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        }
        model = xgb.XGBRegressor(**params, random_state=42)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2= r2_score(y_test, y_pred)
    return r2

# Create an Optuna study for Random Forest
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(lambda trial: objective(trial, 'random_forest', X_train_scaled, y_train, X_test_scaled, y_test), n_trials=100)

# Get the best set of hyperparameters for Random Forest
best_params_rf = study_rf.best_params

# Train the final Random Forest model with the best parameters
final_model_rf = RandomForestRegressor(**best_params_rf, random_state=42)
final_model_rf.fit(X_train_scaled, y_train)

# Evaluate the final Random Forest model on the test set
y_pred_test_rf = final_model_rf.predict(X_test_scaled)
rf_mae, rf_rmse, rf_r2 = evaluate_model(y_test, y_pred_test_rf)
print(f"Random Forest Model Evaluation:")
print(f"MAE: {rf_mae:.2f}, RMSE: {rf_rmse:.2f}, R^2: {rf_r2:.2f}")

[I 2023-08-29 20:11:45,355] A new study created in memory with name: no-name-780d531b-bda6-4cb7-ad39-a6f26f705264
[I 2023-08-29 20:11:45,517] Trial 0 finished with value: -0.01990926973950491 and parameters: {'n_estimators': 155, 'max_depth': 13, 'min_samples_split': 0.3438671700627758, 'min_samples_leaf': 0.34958962937219196, 'max_features': 'log2'}. Best is trial 0 with value: -0.01990926973950491.
[I 2023-08-29 20:11:45,612] Trial 1 finished with value: -0.019834998843313967 and parameters: {'n_estimators': 99, 'max_depth': 15, 'min_samples_split': 0.11968026169771444, 'min_samples_leaf': 0.4393437047319978, 'max_features': 'log2'}. Best is trial 1 with value: -0.019834998843313967.
[I 2023-08-29 20:11:45,748] Trial 2 finished with value: -0.020034953540496447 and parameters: {'n_estimators': 135, 'max_depth': 6, 'min_samples_split': 0.41070178546782643, 'min_samples_leaf': 0.38299201670096084, 'max_features': 'sqrt'}. Best is trial 1 with value: -0.019834998843313967.
[I 2023-08-29

Random Forest Model Evaluation:
MAE: 930.79, RMSE: 1150.13, R^2: 0.67


Random Forest Model Evaluation:
MAE: 705.67, RMSE: 857.21, R^2: 0.82
