In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
data = pd.read_csv('new_data1.csv')

# Drop the 'date' column as it's not required
data = data.drop(['date(dd/mm/yy)'], axis=1)

# Feature Engineering: Adding Aggregated Features
data['total_delay'] = data[['VSKP', 'YLM', 'ANV', 'SLO', 'NDD', 'BZA', 'WL', 'KZJ', 'SC']].sum(axis=1)
data['mean_temperature'] = data[['Temperature_1', 'Temperature_2', 'Temperature_3', 
                                 'Temperature_4', 'Temperature_5', 'Temperature_6', 
                                 'Temperature_7', 'Temperature_8', 'Temperature_9', 
                                 'Temperature_10']].mean(axis=1)
data['mean_wind_speed'] = data[['Wind_Speed_1', 'Wind_Speed_2', 'Wind_Speed_3', 
                                'Wind_Speed_4', 'Wind_Speed_5', 'Wind_Speed_6', 
                                'Wind_Speed_7', 'Wind_Speed_8', 'Wind_Speed_9', 
                                'Wind_Speed_10']].mean(axis=1)

# Separate features and target
X = data.drop(['HYB'], axis=1)
y = data['HYB']

# Preprocessing for numerical features
numerical_features = [col for col in X.columns if 'Temperature' in col or 'Wind_Speed' in col or 'Distance_travelled' in col or col in ['total_delay', 'mean_temperature', 'mean_wind_speed']]

# Preprocessing for categorical features
categorical_features = [col for col in X.columns if 'weather_code' in col]

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scale numerical features
        ('cat', OneHotEncoder(), categorical_features)  # One-hot encode categorical features
    ])

# Define the model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X, y)

# Best model from GridSearch
best_model = grid_search.best_estimator_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the best model
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

# Cross-validation to get a more reliable estimate of model performance
cross_val_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print(f"Cross-validated R2 scores: {cross_val_scores}")
print(f"Average R2 Score from Cross-validation: {cross_val_scores.mean()}")
print(f"Best Parameters from Grid Search: {grid_search.best_params_}")


MAE: 21.972073794546393
MSE: 1053.2418443735155
R2 Score: -0.1382147323100591
Cross-validated R2 scores: [-1.11402633 -0.15900507 -0.07345229  0.04088935 -0.44211023]
Average R2 Score from Cross-validation: -0.3495409139914037
Best Parameters from Grid Search: {'regressor__max_depth': None, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 100}


In [4]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# XGBoost model
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Grid search to find the best hyperparameters
grid_search_xgb = GridSearchCV(xgb_model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search_xgb.fit(X, y)

# Best model from GridSearch
best_xgb_model = grid_search_xgb.best_estimator_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the best model
best_xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model
mae_xgb = mean_absolute_error(y_test, y_pred)
mse_xgb = mean_squared_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

print(f"XGBoost MAE: {mae_xgb}")
print(f"XGBoost MSE: {mse_xgb}")
print(f"XGBoost R2 Score: {r2_xgb}")

# Cross-validation scores
cross_val_scores_xgb = cross_val_score(best_xgb_model, X, y, cv=5, scoring='r2')
print(f"Cross-validated R2 scores (XGBoost): {cross_val_scores_xgb}")
print(f"Average R2 Score from Cross-validation (XGBoost): {cross_val_scores_xgb.mean()}")
print(f"Best Parameters from Grid Search (XGBoost): {grid_search_xgb.best_params_}")


1 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Shashank\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\Shashank\AppData\Roaming\Python\Python311\site-packages\xgboost\sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Shashank\AppData

XGBoost MAE: 15.85920616596529
XGBoost MSE: 509.56568751613787
XGBoost R2 Score: 0.4493238417093445
Cross-validated R2 scores (XGBoost): [-0.21450918  0.1410401   0.0203261   0.60529677  0.32435379]
Average R2 Score from Cross-validation (XGBoost): 0.17530151472051633
Best Parameters from Grid Search (XGBoost): {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
