# Comparing Random Forest and Gradient Boosting Classifiers Before and After Hyperparameter Tuning

In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt


## Load and Prepare Data

In [3]:

# Load the dataset; the file is parsed with ';' as the field delimiter.
df = pd.read_csv("Absenteeism_at_work.csv", delimiter=";")

# Data Preparation
# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling, and assigning the result avoids mutating
# the frame in place (which makes the cell non-idempotent on re-run).
df = df.ffill()
df = pd.get_dummies(df, drop_first=True)
df = df.drop("ID", axis=1)

# Separate the predictors from the target column.
features = df.drop('Absenteeism time in hours', axis=1)
y = df['Absenteeism time in hours']

# Train-Test Split
# Split BEFORE scaling so the scaler never sees the test rows; fitting it on
# the full dataset would leak test-set statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Keep the original names available: the full feature matrix, scaled with the
# training-set statistics.
scaled_features = scaler.transform(features)
X = scaled_features


  df.fillna(method='ffill', inplace=True)


## Train and Evaluate Random Forest Before Tuning

In [4]:

# Baseline Random Forest: fit with default hyperparameters (fixed seed) and
# score it on the held-out test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
# zero_division=0 states explicitly the value used when a class has an
# undefined precision/recall; it matches the default fallback value but
# does not emit the undefined-metric warnings.
rf_report = classification_report(y_test, rf_pred, output_dict=True, zero_division=0)
rf_cm = confusion_matrix(y_test, rf_pred)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train and Evaluate Gradient Boosting Before Tuning

In [5]:

# Baseline Gradient Boosting: fit with default hyperparameters (fixed seed)
# and score it on the held-out test split.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)
gbm_pred = gbm.predict(X_test)
gbm_accuracy = accuracy_score(y_test, gbm_pred)
# zero_division=0 states explicitly the value used when a class has an
# undefined precision/recall; it matches the default fallback value but
# does not emit the undefined-metric warnings.
gbm_report = classification_report(y_test, gbm_pred, output_dict=True, zero_division=0)
gbm_cm = confusion_matrix(y_test, gbm_pred)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Hyperparameter Tuning for Random Forest

In [6]:

# Hyperparameter tuning for Random Forest
# Candidate values sampled by the randomized search (3 * 4 * 3 = 36 combinations).
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# Try 10 sampled combinations with 5-fold cross-validation on the training
# split only; n_jobs=-1 runs the fits in parallel on all available cores.
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search_rf.fit(X_train, y_train)
best_rf_model = random_search_rf.best_estimator_

# Evaluate the estimator selected by the search on the held-out test split
best_rf_pred = best_rf_model.predict(X_test)
best_rf_accuracy = accuracy_score(y_test, best_rf_pred)
# zero_division=0 states explicitly the value used when a class has an
# undefined precision/recall; it matches the default fallback value but
# does not emit the undefined-metric warnings.
best_rf_report = classification_report(y_test, best_rf_pred, output_dict=True, zero_division=0)
best_rf_cm = confusion_matrix(y_test, best_rf_pred)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Hyperparameter Tuning for Gradient Boosting

In [7]:

# Hyperparameter tuning for Gradient Boosting
# Candidate values sampled by the randomized search (3 * 3 * 3 = 27 combinations).
param_grid_gbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
# Try 10 sampled combinations with 5-fold cross-validation on the training
# split only; n_jobs=-1 runs the fits in parallel on all available cores.
random_search_gbm = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=param_grid_gbm,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search_gbm.fit(X_train, y_train)
best_gbm_model = random_search_gbm.best_estimator_

# Evaluate the estimator selected by the search on the held-out test split
best_gbm_pred = best_gbm_model.predict(X_test)
best_gbm_accuracy = accuracy_score(y_test, best_gbm_pred)
# zero_division=0 states explicitly the value used when a class has an
# undefined precision/recall; it matches the default fallback value but
# does not emit the undefined-metric warnings.
best_gbm_report = classification_report(y_test, best_gbm_pred, output_dict=True, zero_division=0)
best_gbm_cm = confusion_matrix(y_test, best_gbm_pred)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Comparison of Results

In [9]:

# Summarise the four runs side by side: one row per model, with accuracy plus
# the support-weighted precision / recall / F1 taken from each report dict.
_model_runs = [
    ('Random Forest', rf_accuracy, rf_report),
    ('Random Forest (Tuned)', best_rf_accuracy, best_rf_report),
    ('Gradient Boosting', gbm_accuracy, gbm_report),
    ('Gradient Boosting (Tuned)', best_gbm_accuracy, best_gbm_report),
]

# Build the table row-wise; dict key order fixes the column order.
comparison_df = pd.DataFrame([
    {
        'Classifier': label,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score'],
    }
    for label, accuracy, report in _model_runs
])

# Bare last expression so the notebook renders the table.
comparison_df


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score
0,Random Forest,0.490991,0.466299,0.490991,0.467498
1,Random Forest (Tuned),0.486486,0.433033,0.486486,0.445469
2,Gradient Boosting,0.5,0.496854,0.5,0.489985
3,Gradient Boosting (Tuned),0.477477,0.448496,0.477477,0.433883
