# Load Packages & Data

In [None]:
# Install any required packages that are missing, then attach them all.
# installed.packages() returns a matrix, so the check is made against its row
# names (the package names) rather than against every cell of the matrix.
# "cluster" is attached below and "factoextra" provides fviz_nbclust() and
# fviz_cluster() used in the clustering section, so both are listed here too.
packages <- c(
  "googledrive", "tidyverse", "skimr", "DataExplorer", "corrplot",
  "ggplot2", "dplyr", "cluster", "factoextra", "caret", "randomForest", "gbm"
)
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

# Load them
library(googledrive)
library(tidyverse)
library(skimr)
library(DataExplorer)
library(corrplot)
library(ggplot2)
library(dplyr)
library(cluster)
library(factoextra)
library(caret)
library(randomForest)
library(gbm)

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘shape’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘data.tree’, ‘igraph’, ‘proxy’, ‘iterators’, ‘clock’, ‘gower’, ‘hardhat’, ‘ipred’, ‘sparsevctrs’, ‘timeDate’, ‘reshape2’, ‘gridExtra’, ‘networkD3’, ‘e1071’, ‘foreach’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’




In [None]:
# drive_auth()

In [None]:
# # Adjust the file name to match exactly what’s in your Google Drive
# file <- drive_get(path = "WQD7004/HR-Employee-Attrition.csv")
# drive_download(file, path = "HR.csv", overwrite = TRUE)

In [None]:
# Read the HR attrition CSV from the working directory and preview the
# first six rows.
attrition <- read.csv(file = "HR.csv")
head(attrition, n = 6L)

# Data Cleaning, Understanding & Exploration

In [None]:
# Compact per-column overview: storage type plus the first few values
str(object = attrition)

In [None]:
# Per-column descriptive summary of the data frame
summary(object = attrition)

      Age         Attrition         BusinessTravel       DailyRate     
 Min.   :18.00   Length:1470        Length:1470        Min.   : 102.0  
 1st Qu.:30.00   Class :character   Class :character   1st Qu.: 465.0  
 Median :36.00   Mode  :character   Mode  :character   Median : 802.0  
 Mean   :36.92                                         Mean   : 802.5  
 3rd Qu.:43.00                                         3rd Qu.:1157.0  
 Max.   :60.00                                         Max.   :1499.0  
  Department        DistanceFromHome   Education     EducationField    
 Length:1470        Min.   : 1.000   Min.   :1.000   Length:1470       
 Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
 Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
                    Mean   : 9.193   Mean   :2.913                     
                    3rd Qu.:14.000   3rd Qu.:4.000                     
                    Max.   :29.000   Max.   :5.000              

In [None]:
# Print the skim summary explicitly instead of relying on the notebook's
# automatic display of the returned object. Auto-display fails in this kernel
# while it builds the text representation ("'length = 17' in coercion to
# 'logical(1)'"); print() writes the summary straight to the cell output.
# NOTE(review): assumes print() on a skim result returns invisibly, so the
# kernel does not try to render the object a second time - confirm.
print(skim(attrition))

── Data Summary ────────────────────────
                           Values   
Name                       attrition
Number of rows             1470     
Number of columns          35       
_______________________             
Column type frequency:              
  character                9        
  numeric                  26       
________________________            
Group variables            None     

── Variable type: character ────────────────────────────────────────────────────
  skim_variable  n_missing complete_rate min max empty n_unique whitespace
[90m1[39m Attrition              0             1   2   3     0        2          0
[90m2[39m BusinessTravel         0             1  10  17     0        3          0
[90m3[39m Department             0             1   5  22     0        3          0
[90m4[39m EducationField         0             1   5  16     0        6          0
[90m5[39m Gender                 0             1   4   6     0        2          0
[90m6

ERROR: Error in is.null(text_repr) || nchar(text_repr) == 0L: 'length = 17' in coercion to 'logical(1)'


**Check Missing Values & Duplicates**

In [None]:
# Number of NA entries in each column
attrition %>%
  is.na() %>%
  colSums()

# Plot the missing-value profile of every column
plot_missing(data = attrition)

# Count rows that exactly repeat an earlier row
attrition %>%
  duplicated() %>%
  sum()

**Encode Categorical Variables**

In [None]:
# Turn the listed categorical columns into factors in one pass; all other
# columns are left untouched and the column order is unchanged.
attrition <- attrition %>%
  mutate(
    across(
      all_of(c(
        "Attrition", "BusinessTravel", "Department", "EducationField",
        "Gender", "JobRole", "MaritalStatus", "OverTime", "Over18"
      )),
      as.factor
    )
  )

In [None]:
# Inspect column types again (categorical columns should now show as Factor)
str(object = attrition)

'data.frame':	1470 obs. of  35 variables:
 $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
 $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
 $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
 $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
 $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
 $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
 $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
 $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
 $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ EmployeeNumber          : int  1 2 4 5 7 8 10 11 12 13 ...
 $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
 $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
 $ HourlyRate              : int  94 6

**Summary Statistics Grouped by Attrition**

In [None]:
# Mean and median of every numeric column, computed separately for each
# Attrition level. Output columns are named "<column>_<statistic>".
attrition %>%
  group_by(Attrition) %>%
  summarise(
    across(
      .cols = where(is.numeric),
      .fns = list(mean = mean, median = median),
      .names = "{.col}_{.fn}"
    )
  )

**Attrition Distribution**

In [None]:
# Number of employees in each Attrition class
ggplot(data = attrition, mapping = aes(x = Attrition, fill = Attrition)) +
  geom_bar() +
  labs(title = "Attrition Distribution", y = "Count") +
  theme_minimal()

**Correlation Analysis (Numerical)**

In [None]:
# Select numeric columns, dropping any that never vary. A constant column
# (EmployeeCount is 1 in every row shown by str()) has zero standard
# deviation, so cor() returns NA for it with a warning and the plot has no
# value to draw for those cells.
numeric_vars <- attrition %>%
  select(where(is.numeric)) %>%
  select(where(function(col) isTRUE(sd(col, na.rm = TRUE) > 0)))

# Correlation matrix
cor_matrix <- cor(numeric_vars)

# Visualize correlation (upper triangle only, coloured tiles)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

**Univariate & Bivariate Analysis**

In [None]:
# Histogram of employee ages using 30 bins
ggplot(data = attrition, mapping = aes(x = Age)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Age") +
  theme_minimal()

In [None]:
# Box plot comparing MonthlyIncome between the Attrition classes
ggplot(
  data = attrition,
  mapping = aes(x = Attrition, y = MonthlyIncome, fill = Attrition)
) +
  geom_boxplot() +
  labs(title = "Monthly Income by Attrition") +
  theme_minimal()

In [None]:
# Share of each Attrition class within every JobRole; bars are stacked to
# full height (position = "fill") and flipped so role names read horizontally
ggplot(data = attrition, mapping = aes(x = JobRole, fill = Attrition)) +
  geom_bar(position = "fill") +
  coord_flip() +
  labs(title = "Attrition Rate by Job Role", y = "Proportion") +
  theme_minimal()

In [None]:
# Share of each Attrition class within each OverTime level
ggplot(data = attrition, mapping = aes(x = OverTime, fill = Attrition)) +
  geom_bar(position = "fill") +
  labs(title = "Attrition Rate by Overtime", y = "Proportion") +
  theme_minimal()

In [None]:
# Share of each Attrition class within each Gender
ggplot(data = attrition, mapping = aes(x = Gender, fill = Attrition)) +
  geom_bar(position = "fill") +
  labs(title = "Attrition Rate by Gender", y = "Proportion") +
  theme_minimal()


In [None]:
# Share of each Attrition class within each MaritalStatus
ggplot(data = attrition, mapping = aes(x = MaritalStatus, fill = Attrition)) +
  geom_bar(position = "fill") +
  labs(title = "Attrition Rate by Marital Status", y = "Proportion") +
  theme_minimal()


In [None]:
# Box plot comparing Age between the Attrition classes
ggplot(
  data = attrition,
  mapping = aes(x = Attrition, y = Age, fill = Attrition)
) +
  geom_boxplot() +
  labs(title = "Age Distribution by Attrition") +
  theme_minimal()


In [None]:
# Overlaid (not stacked) semi-transparent histograms of YearsAtCompany,
# one per Attrition class, using 20 bins
ggplot(data = attrition, mapping = aes(x = YearsAtCompany, fill = Attrition)) +
  geom_histogram(bins = 20, alpha = 0.6, position = "identity") +
  labs(title = "Years at Company by Attrition") +
  theme_minimal()


In [None]:
# Share of each Attrition class within each JobSatisfaction score.
# JobSatisfaction is wrapped in factor() so each score gets its own bar.
# The x-axis label previously contained mis-encoded text ("1â€“4"); it now
# uses the intended en dash.
ggplot(attrition, aes(x = factor(JobSatisfaction), fill = Attrition)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    title = "Attrition by Job Satisfaction",
    x = "Job Satisfaction (1–4)",
    y = "Proportion"
  )


In [None]:
# Share of each Attrition class within each BusinessTravel category
ggplot(data = attrition, mapping = aes(x = BusinessTravel, fill = Attrition)) +
  geom_bar(position = "fill") +
  labs(title = "Attrition Rate by Business Travel", y = "Proportion") +
  theme_minimal()


In [None]:
# Scatter plot of MonthlyIncome against TotalWorkingYears, coloured by
# Attrition class; points are semi-transparent to reduce overplotting
ggplot(
  data = attrition,
  mapping = aes(x = TotalWorkingYears, y = MonthlyIncome, color = Attrition)
) +
  geom_point(alpha = 0.6) +
  labs(title = "Total Working Years vs. Monthly Income by Attrition") +
  theme_minimal()


# Regression Data Analysis

In [None]:
# Fix the RNG so the train/test partition is reproducible
set.seed(123)

# Drop columns that are excluded from modelling (presumably identifier or
# constant columns - confirm). any_of() ignores names that are already
# absent, so re-running this cell after the columns were removed does not
# raise an error, unlike subset(select = -c(...)).
attrition <- attrition %>%
  select(-any_of(c(
    "EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"
  )))

# 80/20 split; createDataPartition() samples within groups of the outcome so
# both sets cover the MonthlyIncome range
split <- createDataPartition(attrition$MonthlyIncome, p = 0.8, list = FALSE)
train <- attrition[split, ]
test <- attrition[-split, ]

print(head(train))
print(head(test))

In [None]:
# Linear regression of MonthlyIncome on every other column of the training
# set. No trControl is supplied, so caret's default resampling is used.
model_lm <- train(
  MonthlyIncome ~ .,
  data = train,
  method = "lm"
)

In [None]:
# Random forest regression of MonthlyIncome on every other column of the
# training set, resampled with 5-fold cross-validation
model_rf <- train(
  MonthlyIncome ~ .,
  data = train,
  method = "rf",
  trControl = trainControl(method = "cv", number = 5)
)

In [None]:
# Gradient boosting regression of MonthlyIncome on every other column of the
# training set, resampled with 5-fold cross-validation. verbose = FALSE is
# passed through to the fitting routine.
model_gbm <- train(
  MonthlyIncome ~ .,
  data = train,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 5),
  verbose = FALSE
)

In [None]:
# Predict MonthlyIncome for the held-out test set with each fitted model
pred_lm <- predict(object = model_lm, newdata = test)
pred_rf <- predict(object = model_rf, newdata = test)
pred_gbm <- predict(object = model_gbm, newdata = test)

# Test-set performance metrics for each model, in the same order
postResample(pred = pred_lm, obs = test$MonthlyIncome)
postResample(pred = pred_rf, obs = test$MonthlyIncome)
postResample(pred = pred_gbm, obs = test$MonthlyIncome)

In [None]:
# Random forest predictions against actual income on the test set; the
# dashed red line is geom_abline()'s default y = x reference
ggplot(
  data = data.frame(Actual = test$MonthlyIncome, Predicted = pred_rf),
  mapping = aes(x = Actual, y = Predicted)
) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(linetype = "dashed", color = "red") +
  labs(
    title = "Predicted vs Actual Monthly Income",
    x = "Actual",
    y = "Predicted"
  ) +
  theme_minimal()

In [None]:
# Base-graphics version of the predicted-vs-actual scatter for the random
# forest, with a red y = x reference line (intercept 0, slope 1)
plot(
  x = test$MonthlyIncome,
  y = pred_rf,
  col = "blue",
  main = "Predicted vs Actual (Random Forest)",
  xlab = "Actual Monthly Income",
  ylab = "Predicted Monthly Income"
)
abline(a = 0, b = 1, col = "red")

In [None]:
# Variable importance reported by caret for the random forest model
varImp(object = model_rf)

In [None]:
# Compare actual income with the random forest prediction on the test set.
# Difference = Predicted - Actual, so a large positive value means the
# employee earns less than the model predicts.
test_results <- data.frame(Actual = test$MonthlyIncome, Predicted = pred_rf)
test_results$Difference <- test_results$Predicted - test_results$Actual

# Show top 10 underpaid employees: sort by Difference in descending order so
# the largest shortfalls come first. (An ascending sort lists the employees
# paid the most above their prediction instead.)
head(arrange(test_results, desc(Difference)), 10)

# Clustering

In [None]:
# Determine optimal number of clusters (Elbow Method)

# Build the clustering inputs, which the notebook otherwise never defines.
# `df` is the data that cluster labels are attached to later; `df_scaled` is
# the standardised numeric feature matrix passed to k-means.
# NOTE(review): the feature set (all varying numeric columns, minus the
# EmployeeCount / EmployeeNumber / StandardHours columns) is an assumption -
# confirm it matches the intended clustering design.
df <- attrition
df_scaled <- df %>%
  select(where(is.numeric)) %>%
  select(-any_of(c("EmployeeCount", "EmployeeNumber", "StandardHours"))) %>%
  # scale() divides by the standard deviation, so constant columns would
  # become NaN and break kmeans(); keep only columns that vary
  select(where(function(col) isTRUE(sd(col, na.rm = TRUE) > 0))) %>%
  scale()

set.seed(123)

# Total within-cluster sum of squares for a range of k; the dashed vertical
# line marks k = 3
fviz_nbclust(df_scaled, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

In [None]:
# Fit k-means on the scaled features with k = 3 (the value assumed from the
# elbow method), using 25 random starts
set.seed(123)
km_res <- kmeans(x = df_scaled, centers = 3, nstart = 25)

# Attach each row's cluster label to the unscaled data as a factor column
df_clustered <- mutate(df, Cluster = as.factor(km_res$cluster))

In [None]:
# Plot the k-means result for the scaled data (PCA for dimensionality
# reduction), drawing a Euclidean ellipse around each cluster
fviz_cluster(
  object = km_res,
  data = df_scaled,
  palette = "jco",
  ellipse.type = "euclid",
  ggtheme = theme_minimal()
)


In [None]:
# Profile Clusters
# See what each cluster looks like in terms of average values: the mean of
# every numeric column, per cluster, ignoring missing values.
# na.rm is set inside an explicit function because passing extra arguments
# through across(...) is deprecated as of dplyr 1.1.0.
df_clustered %>%
  group_by(Cluster) %>%
  summarise(
    across(where(is.numeric), function(col) mean(col, na.rm = TRUE))
  )