In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Global seaborn theme for every figure in the notebook: 'darkgrid' style with a
# custom light axes background colour.
sns.set(rc={'axes.facecolor': '#f1faeb'}, style='darkgrid')


In [2]:
# Load the dataset from 'HR.csv' (path is relative to the notebook's working
# directory) so that `df` holds the columns with the dtypes pandas infers from the file.
df = pd.read_csv('HR.csv')
def prep_data(df):
    """
    Encode the categorical columns of ``df`` and split it into train/test sets.

    Steps:
      * one-hot encode the 'sales' column (first dummy level dropped),
      * replace 'salary' with the integer codes produced by ``LabelEncoder``,
      * use 'left' as the target and every other column as a feature,
      * hold out 20% of the rows, stratified on the target, with a fixed seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the 'sales', 'salary' and 'left' columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    encoded = pd.get_dummies(df, columns=['sales'], drop_first=True)

    # Overwrite the 'salary' column with its integer label codes
    encoded['salary'] = LabelEncoder().fit_transform(encoded['salary'])

    target = encoded['left']
    features = encoded.drop(columns='left')

    # train_test_split returns a list; callers unpack a 4-tuple
    return tuple(
        train_test_split(features, target, test_size=0.2, random_state=0, stratify=target)
    )
# Build the train/test feature matrices and targets used by every model below
X_train, X_test, y_train, y_test = prep_data(df)

In [3]:
def tune_regressor_hyperparameters(reg, param_grid, X_train, y_train, scoring='neg_mean_squared_error', n_splits=3):
    '''
    Run an exhaustive grid search over ``param_grid`` for the regressor ``reg``.

    Every hyperparameter combination is scored with shuffled K-fold
    cross-validation (``n_splits`` folds, fixed seed 0) using ``scoring``
    (negative mean squared error by default), with ``n_jobs=-1`` passed to
    GridSearchCV.

    Returns a 2-tuple: the search's ``best_estimator_`` and its ``best_params_``.
    '''
    search = GridSearchCV(
        reg,
        param_grid,
        cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
        scoring=scoring,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Hand back both the winning estimator and the settings that produced it
    return search.best_estimator_, search.best_params_
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Score a fitted model on the training and testing splits.

    MAE, MSE, RMSE and R2 are computed for both splits and printed with four
    decimals. Only the testing metrics are returned.

    Parameters
    ----------
    model : object exposing ``predict``
    X_train, y_train : training features and targets
    X_test, y_test : testing features and targets
    model_name : str
        Used in the printed headings and as the column name of the result.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name`` with rows 'MAE', 'MSE', 'RMSE',
        'R2 Score' holding the testing-split metrics.
    """
    metric_labels = ['MAE', 'MSE', 'RMSE', 'R2 Score']
    line_templates = ("MAE: {:.4f}", "MSE: {:.4f}", "RMSE: {:.4f}", "R2 Score: {:.4f}")

    def _score(y_true, y_pred):
        # Metrics are returned in the same order as metric_labels
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return [mae, mse, np.sqrt(mse), r2_score(y_true, y_pred)]

    def _report(heading, scores):
        print(heading)
        for template, value in zip(line_templates, scores):
            print(template.format(value))

    # Predict on both splits first, then score them
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_scores = _score(y_train, train_pred)
    test_scores = _score(y_test, test_pred)

    # The returned frame carries the testing metrics only
    metrics_df = pd.DataFrame(data=test_scores, index=metric_labels, columns=[model_name])

    _report(f"{model_name} Training Data Metrics:", train_scores)
    _report(f"\n{model_name} Testing Data Metrics:", test_scores)

    return metrics_df

def xgb_predict(X_train, y_train, X_test, y_test):
    """
    Grid-search an XGBoost regressor on the training data and evaluate it.

    Builds an ``XGBRegressor`` with the squared-error objective, tunes it with
    ``tune_regressor_hyperparameters`` over a fixed grid, then calls
    ``evaluate_model`` under the name 'XGBoost'.

    Returns
    -------
    tuple
        ``(metrics_frame, tuned_model)``: the value returned by
        ``evaluate_model`` and the tuned estimator.
    """
    base_model = xgb.XGBRegressor(objective='reg:squarederror')

    # Candidate values explored by the grid search
    search_space = {
        'max_depth': [4, 5],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'n_estimators': [200, 250, 300],
        'min_child_weight': [2, 3, 4]
    }

    # The winning hyperparameter dict is not used here, only the estimator
    tuned_model, _ = tune_regressor_hyperparameters(base_model, search_space, X_train, y_train)

    metrics = evaluate_model(tuned_model, X_train, y_train, X_test, y_test, 'XGBoost')
    return metrics, tuned_model

def ctb_predict(X_train, y_train, X_test, y_test):
    """
    Grid-search a CatBoost regressor on the training data and evaluate it.

    Builds a ``CatBoostRegressor(verbose=0)``, tunes it with
    ``tune_regressor_hyperparameters`` over a fixed grid, then calls
    ``evaluate_model`` under the name 'CatBoost'.

    Returns
    -------
    tuple
        ``(metrics_frame, tuned_model)``: the value returned by
        ``evaluate_model`` and the tuned estimator.
    """
    base_model = CatBoostRegressor(verbose=0)

    # Candidate values explored by the grid search
    search_space = {
        'iterations': [100, 300, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'depth': [4, 6, 8],
        'l2_leaf_reg': [1, 3, 5],
    }

    # The winning hyperparameter dict is not used here, only the estimator
    tuned_model, _ = tune_regressor_hyperparameters(base_model, search_space, X_train, y_train)

    metrics = evaluate_model(tuned_model, X_train, y_train, X_test, y_test, 'CatBoost')
    return metrics, tuned_model

In [4]:
# Tune and evaluate XGBoost: prints the train/test metrics and keeps both the
# test-metric frame and the tuned estimator for later cells.
xgb_result, best_xgb = xgb_predict(X_train, y_train, X_test, y_test)


XGBoost Training Data Metrics:
MAE: 0.0253
MSE: 0.0040
RMSE: 0.0633
R2 Score: 0.9779

XGBoost Testing Data Metrics:
MAE: 0.0436
MSE: 0.0126
RMSE: 0.1123
R2 Score: 0.9305


In [None]:

# Tune and evaluate CatBoost: prints the train/test metrics and keeps both the
# test-metric frame and the tuned estimator for later cells.
ctb_result, best_ctb = ctb_predict(X_train, y_train, X_test, y_test)


CatBoost Training Data Metrics:
MAE: 0.0234
MSE: 0.0037
RMSE: 0.0612
R2 Score: 0.9794

CatBoost Testing Data Metrics:
MAE: 0.0392
MSE: 0.0116
RMSE: 0.1078
R2 Score: 0.9359


In [12]:
def plot_diff(ctb_result, xgb_result):
    """
    Draw a grouped horizontal bar chart comparing two models' metrics.

    Parameters
    ----------
    ctb_result, xgb_result : pandas.DataFrame
        Single-column metric frames shaped like the ones ``evaluate_model``
        returns: metric names as the index, the model name as the column label.

    Notes
    -----
    The legend labels are taken from the column labels of the two frames
    rather than being hard-coded, so bars stay correctly labelled even if the
    arguments are passed in a different order or describe other models.
    For frames named 'CatBoost' and 'XGBoost' passed in that order, the plot
    is the same as with the previous hard-coded labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # After transposing, each frame is one row whose index label is the model name
    combined_df = pd.concat([ctb_result.T, xgb_result.T], axis=0)

    # Use the names carried by the data instead of assuming a fixed order
    combined_df['Model'] = combined_df.index

    # Long format: one row per (model, metric) pair
    melted_df = combined_df.melt(id_vars='Model', var_name='Metric', value_name='Score')

    # One colour per model, in row order
    custom_colors = ['#009c05', 'darkorange']

    plt.figure(figsize=(10,6))
    sns.barplot(x='Score', y='Metric', hue='Model', data=melted_df, palette=custom_colors)

    plt.title('Model Comparison')
    plt.show()

satisfaction_level        0.96
last_evaluation           0.93
number_project               6
average_montly_hours       240
time_spend_company           6
Work_accident                0
promotion_last_5years        0
salary                       1
sales_RandD              False
sales_accounting         False
sales_hr                 False
sales_management         False
sales_marketing          False
sales_product_mng        False
sales_sales              False
sales_support            False
sales_technical          False
Name: 7241, dtype: object

In [24]:
# Display the first row of the test features as a one-row frame
X_test[0:1]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical
7241,0.96,0.93,6,240,6,0,0,1,False,False,False,False,False,False,False,False,False


In [5]:
def predict(model, X_test):
    """
    Return the predictions of ``model`` for ``X_test``.

    Thin wrapper that forwards to ``model.predict(X_test)`` and returns its
    result unchanged.
    """
    return model.predict(X_test)
# Raw regression scores from the tuned XGBoost model for the test rows; the
# displayed values are unbounded floats (some are negative), not class labels.
predict(best_xgb, X_test)

array([ 0.3008172 , -0.00407761, -0.0088242 , ..., -0.01374841,
       -0.02062091,  0.00197378], dtype=float32)