# Logistic Regression Model for Human Resources Management 

In [None]:
import sys
sys.executable  # Display the path of the running Python executable to confirm the correct environment is in use

# Import Libraries and Read Data

In [None]:
import numpy as np  # For numerical operations and arrays.	
import pandas as pd  # For data manipulation and analysis.	
import matplotlib.pyplot as plt  # For basic plotting.	
import seaborn as sns  # For enhanced plotting.	
from sklearn.preprocessing import StandardScaler  # For creating scaler instances for standardization purposes.
from sklearn.model_selection import train_test_split  # For splitting the data into sets avoiding overfitting.
from sklearn.linear_model import LogisticRegression  # For creating LogisticRegression instances.
from sklearn import metrics  # For evaluating the model
from python_scripts import summary_metrics
from sklearn.model_selection import GridSearchCV  # For searching the best parameters over specified parameter values
import joblib  # For saving models

In [None]:
# Load the cleaned dataset from disk into a DataFrame:
df = pd.read_csv(filepath_or_buffer='cleaned.csv')

In [None]:
# Show every column when a DataFrame is displayed (no truncation):
pd.set_option('display.max_columns', None)

# Create Features and Targets for Two Scaling Versions

I'll try two different scaling options, so I'll create two separate feature DataFrames.

In [None]:
# Checkpoint: take one independent copy of the data per scaling version,
# so that neither version modifies the original DataFrame.
df_1, df_2 = df.copy(), df.copy()

In [None]:
# Define features and target.
# The target is excluded by name rather than by position (`iloc[:, :-1]`), so the
# feature set stays correct even if the target is not the last column of the CSV.
target_column = 'Extensive Absenteeism Time in Hours'
features_1 = df_1.drop(columns=[target_column])  # All features except the target
features_2 = df_2.drop(columns=[target_column])  # All features except the target
targets = df[target_column]  # Common target for both versions

# Shuffle and Split the Data for Two Scaling Versions

In [None]:
# Hold out 15% of the rows as a test set. Both versions use identical settings
# (including the random seed), so their splits are directly comparable:
split_settings = {'test_size': 0.15, 'random_state': 7}
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(features_1, targets, **split_settings)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(features_2, targets, **split_settings)

# Scale Features (2 Versions are Provided)

<div style="text-align: justify">
It is crucial to leave the dummy variables unscaled because they are already binary with values of 0 or 1. Additionally, leaving dummies unscaled enhances interpretability.
</div>

<div style="text-align: justify">
We will try two different versions of scaling. In the first version, we will scale all features except for the dummy variables (df_1). In the second version, we will leave the date-related features unscaled, as they are discrete and not continuous variables, and we will scale only the remaining columns (df_2).
</div>

In [None]:
# Columns to standardize in each version (dummy variables are never scaled).
# Version 2 scales only these columns and leaves the date-related ones untouched:
scale_not_all = ['Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index']

# Version 1 additionally scales the three date-related columns:
scale_all_except_dummies = ['Month Absence Occurred', 'Monthday Range Absence Occurred',
                            'Weekday Absence Occurred'] + scale_not_all

# Version 1: fit the scaler on the training set only, then apply the same
# fitted scaler to the test set.
scaler_1 = StandardScaler()
X_train_1[scale_all_except_dummies] = scaler_1.fit_transform(X_train_1[scale_all_except_dummies])
X_test_1[scale_all_except_dummies] = scaler_1.transform(X_test_1[scale_all_except_dummies])

# Version 2: same procedure with its own scaler and the reduced column list.
scaler_2 = StandardScaler()
X_train_2[scale_not_all] = scaler_2.fit_transform(X_train_2[scale_not_all])
X_test_2[scale_not_all] = scaler_2.transform(X_test_2[scale_not_all])

# Baseline Model for Both Scaling Versions

In [None]:
# Initialize one logistic regression model per scaling version.
# Both are created with scikit-learn's default hyper-parameters, so only the scaling differs:
model_1 = LogisticRegression()
model_2 = LogisticRegression()

In [None]:
# Train each baseline model on its own version of the scaled training data:
model_1.fit(X=X_train_1, y=y_train_1)
model_2.fit(X=X_train_2, y=y_train_2)

In [None]:
# Summarize the results of the first scaling version with the project helper:
summary_df_1 = summary_metrics(
    feature_df=features_1,
    model=model_1,
    x_tr=X_train_1, y_tr=y_train_1,
    x_te=X_test_1, y_te=y_test_1,
)
summary_df_1

In [None]:
# Summarize the results of the second scaling version with the project helper:
summary_df_2 = summary_metrics(
    feature_df=features_2,
    model=model_2,
    x_tr=X_train_2, y_tr=y_train_2,
    x_te=X_test_2, y_te=y_test_2,
)
summary_df_2

<div style="text-align: justify">
Based on the results observed, it appears that whether we scale all features except dummies or only some features does not significantly impact the model's performance. I manually tested the model with different random states, and the results consistently showed similar patterns. Additionally, in most cases, the weights of the date-related features are very close to zero. This suggests that we can safely drop these features to reduce dimensionality and choose any scaling option we prefer.
</div>

# Reducing Model Complexity (Model Version 3)

<div style="text-align: justify">
Knowing that scaling or not scaling the date-related features doesn't affect performance, we'll leave any date-related features we keep unscaled.
</div>

In [None]:
# Checkpoint: work on an independent (deep) copy of the data for model version 3.
df_3 = df.copy(deep=True)

In [None]:
# Backward elimination: remove the features whose weights were (close to) zero.
zero_weight_features = [
    'Month Absence Occurred',
    'Monthday Range Absence Occurred',
    'Has 1 Child',
    'Has 1 Pet',
    'Daily Work Load Average',
    'Distance to Work',
]
df_3 = df_3.drop(columns=zero_weight_features)

In [None]:
# Extract the features from the reduced DataFrame, excluding the target by name
# (not by position) so the result does not depend on the column order:
features_3 = df_3.drop(columns=['Extensive Absenteeism Time in Hours'])

In [None]:
# Re-split the data using the reduced feature set (15% test size, fixed seed):
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    features_3,
    targets,
    random_state=7,
    test_size=0.15,
)

In [None]:
# Features to scale for model version 3.
# A dedicated name is used so the version-2 list `scale_not_all` is not overwritten;
# rebinding it would silently change which columns get scaled if the earlier
# scaling cell were re-run afterwards.
scale_version_3 = ['Transportation Expense', 'Age', 'Body Mass Index']

# Create a single scaler for model version 3:
scaler_3 = StandardScaler()

# Fit the scaler on the training set only, and transform the training set:
X_train_3[scale_version_3] = scaler_3.fit_transform(X_train_3[scale_version_3])

# Transform X_test_3 using the same (already fitted) scaler:
X_test_3[scale_version_3] = scaler_3.transform(X_test_3[scale_version_3])

In [None]:
# Initialize a new logistic regression model (default hyper-parameters) for version 3:
model_3 = LogisticRegression()

In [None]:
# Fit the version-3 model on the reduced, scaled training set:
model_3.fit(X=X_train_3, y=y_train_3)

In [None]:
# Summarize the results of model version 3 with the project helper:
summary_df_3 = summary_metrics(
    feature_df=features_3,
    model=model_3,
    x_tr=X_train_3, y_tr=y_train_3,
    x_te=X_test_3, y_te=y_test_3,
)
summary_df_3

# Build a More Advanced Model (Model Version 4)

<div style="text-align: justify">
We'll use GridSearchCV to try other hyper-parameter combinations and see whether the model's performance can be improved. We'll build on the simplified feature set from version 3, so X_train_3, X_test_3, y_train_3, and y_test_3 remain unchanged.
</div>

In [None]:
# Initialize a new, unfitted logistic regression model; it is passed to GridSearchCV as the base estimator:
model_4 = LogisticRegression()

In [None]:
# Define one parameter grid per solver, listing only the options that solver
# supports so the search runs without raising errors or warnings.

# Search ranges shared by every grid:
c_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]  # Regularization strength
max_iter_values = [100, 200, 300, 400, 500]  # Maximum number of iterations for convergence

param_grid_liblinear = {
    'C': c_values,
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear'],  # Optimization algorithm
    'max_iter': max_iter_values,
}

param_grid_newton_cg = {
    'C': c_values,
    'penalty': ['l2'],
    'solver': ['newton-cg'],
    'max_iter': max_iter_values,
}

param_grid_lbfgs = {
    'C': c_values,
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': max_iter_values,
}

param_grid_sag = {
    'C': c_values,
    'penalty': ['l2'],
    'solver': ['sag'],
    'max_iter': max_iter_values,
}

# `l1_ratio` is only used by the 'elasticnet' penalty. Crossing it with 'l1'/'l2'
# makes scikit-learn warn on every fit and repeats identical fits once per ratio,
# so saga is searched as a list of two sub-grids (GridSearchCV accepts a list of dicts).
param_grid_saga = [
    {
        'C': c_values,
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
        'max_iter': max_iter_values,
    },
    {
        'C': c_values,
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'max_iter': max_iter_values,
        'l1_ratio': [0, 0.1, 0.5, 0.9, 1],  # Mix between L2 (0) and L1 (1)
    },
]

In [None]:
# Configure the grid search. To evaluate another solver, swap the second argument
# for one of the other parameter grids defined in the cell above:
grid_search = GridSearchCV(
    model_4,
    param_grid_liblinear,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Metric used to rank the parameter combinations
    n_jobs=-1,  # Run on all available cores
    verbose=1,  # Print progress messages
)

In [None]:
# Run the search on the preprocessed (version 3) training data:
grid_search.fit(X=X_train_3, y=y_train_3)

In [None]:
# Report the best parameter combination and its cross-validation score:
print("Best Parameters:", grid_search.best_params_, sep="\n")
print("\nBest Score:", grid_search.best_score_, sep="\n")

# Score the best estimator on the held-out test data:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test_3, y_test_3)
print("\nTest Accuracy:", test_accuracy, sep="\n")

<div style="text-align: justify">
<strong>The results did not improve with any of the solver-specific parameter grids. Therefore, we'll keep the simplest model, that is, version 3.</strong>
</div>

# Save the Model

In [None]:
# Persist the version-3 logistic regression model:
filename = 'model.joblib'
joblib.dump(model_3, filename)

# Persist the scaler that was fitted for version 3 alongside the model:
scalername = 'scaler.joblib'
joblib.dump(scaler_3, scalername)

# Provide Insights

<div style="text-align: justify">
Some conclusions are intuitive. For example, 'Other Factor Absence' has the highest positive impact on the probability of absenteeism, which is expected as it indicates a significant health issue. Similarly, factors like 'Has More than 2 Children' and 'Has 2 Children' also have a positive impact on absenteeism, aligning with common expectations.
</div>

<div style="text-align: justify">
Rather than focusing on these expected factors, it is more insightful to explore social variables. For instance, higher education levels are associated with a slight decrease in absenteeism odds. This may be due to increased job stability or motivation among more educated employees. Additionally, older employees show a lower probability of absenteeism, possibly due to greater experience and commitment to their jobs. An interesting finding is the negative impact of the 'Has More than 2 Pets' factor. While one might expect that owning multiple pets would lead to more absences due to veterinary visits or pet health issues, the data suggests otherwise. This could be because individuals with more than two pets likely have the support of a family to help care for them, reducing the impact on their own work attendance.
</div>