<a href="https://colab.research.google.com/github/keyBoredWarHero/mysentapp/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load packages
library(tidyverse)
library(broom)
library(GGally)
library(palmerpenguins)
library(mvrsquared)
library(ggridges)
library(caret)
library(car)
library(readxl)
library(pROC)
library(margins)

# Read the raw employee workbook.
EmployeeData <- read_excel("data-raw/Employee_Data_Project.xlsx")

# Inspect the column types exactly as they were imported.
str(EmployeeData)

# Coerce columns to the types used by the rest of the script: the first group
# is converted with factor(), the second with as.numeric(). Columns are
# overwritten in place, so column order is unchanged.
EmployeeData <- mutate(
  EmployeeData,
  across(c(Gender, MaritalStatus, Attrition, BusinessTravel), factor),
  across(
    c(
      TotalWorkingYears,
      NumCompaniesWorked,
      JobSatisfaction,
      EnvironmentSatisfaction
    ),
    as.numeric
  )
)

# caret's class-probability machinery (classProbs = TRUE, used by the decision
# tree further down) requires the outcome's factor levels to be valid R names,
# so sanitise them with make.names().
# This has to act on `EmployeeData`: the train/test partitions are only created
# later in the script, so referencing `EmployeeData_train` at this point fails
# with "object not found" on a fresh session.
# NOTE: make.names() leaves labels such as "Yes"/"No" untouched but rewrites
# numeric-looking labels (e.g. "1" becomes "X1"); any later comparison against
# a level label must use the sanitised name.
levels(EmployeeData$Attrition) <- make.names(levels(EmployeeData$Attrition))


#Attrition = factor(Attrition)
#Attrition = ifelse(Attrition == "Yes", 1, 0)
# Count the missing values in every column (one row, one count per column).
MissingValues <- summarize(
  EmployeeData,
  across(everything(), \(column) sum(is.na(column)))
)

MissingValues

# Re-inspect the structure now that the column types have been converted.
str(EmployeeData)

# Drop every row that contains at least one NA.
EmployeeData <- na.omit(EmployeeData)

# Fix the RNG state so the partitions below are reproducible.
set.seed(5514)

# Test split: createDataPartition() samples within each Attrition class, so
# the 30% hold-out keeps roughly the same class balance as the full data.
test_idx <- createDataPartition(
  EmployeeData$Attrition,
  p = 0.3
)

EmployeeData_test <- EmployeeData[test_idx[[1]], ]

EmployeeData_train <- EmployeeData[-test_idx[[1]], ]

# Validation split: 30% of the rows that remain after removing the test set.
# `validation_idx` holds row positions *within `EmployeeData_train`*, so both
# the validation rows and the final training rows must be taken from
# `EmployeeData_train`. Indexing the full `EmployeeData` with these positions
# would select unrelated rows (possibly test rows) and leave the validation
# set overlapping the other partitions.
validation_idx <- createDataPartition(
  EmployeeData_train$Attrition,
  p = 0.3
)

EmployeeData_validation <- EmployeeData_train[validation_idx[[1]], ]

EmployeeData_train <- EmployeeData_train[-validation_idx[[1]], ]

#step 2 data explorations

# Summary statistics and column types of the training partition.
summary(EmployeeData_train)
str(EmployeeData_train)

# Pairwise plots are expensive, so draw them from a random subset of at most
# 800 training rows. The size is capped at the number of available rows
# because sample() without replacement errors when asked for more elements
# than exist; seq_len() also behaves correctly for an empty data frame,
# where 1:nrow() would yield c(1, 0).
n_plot_rows <- min(800, nrow(EmployeeData_train))

EmployeeData_train[sample(seq_len(nrow(EmployeeData_train)), n_plot_rows), ] |>
  ggpairs(aes(color = Attrition, alpha = 0.4))




#step 3 data prep

# Balance the classes by down-sampling: keep every positive row and an equally
# sized random sample of the other rows (presumably the majority class --
# confirm against the printed count below).
#
# The positive class is taken to be the second factor level, which is the level
# glm(family = binomial) models as "success". Deriving it from the data keeps
# this step working whether the labels are "0"/"1", "Yes"/"No", or their
# make.names() equivalents, instead of silently matching nothing when the
# label is not literally "1".
positive_class <- levels(EmployeeData_train$Attrition)[2]
positive_idx <- which(EmployeeData_train$Attrition == positive_class)
negative_idx <- which(EmployeeData_train$Attrition != positive_class)

# Number of positive rows (this is also the size of the negative sample).
length(positive_idx)

# Sample positions with sample.int() rather than sample(negative_idx, ...):
# sample() treats a length-one numeric x as 1:x, which would return wrong
# indices if only a single negative row existed. The size is capped so the
# draw never asks for more negatives than are available.
n_negative_keep <- min(length(positive_idx), length(negative_idx))

keep_idx <- c(
  positive_idx, # indices of positive class
  negative_idx[sample.int(length(negative_idx), n_negative_keep)]
)

#step 4 feature engineering
# step 5 Model

# Logistic regression fitted on the class-balanced training rows (`keep_idx`).
# I(Age * TotalWorkingYears) adds the product of the two variables as an
# extra predictor alongside their main effects.
lr <- glm(
  Attrition ~
    JobSatisfaction +
    Age +
    NumCompaniesWorked +
    TotalWorkingYears +
    I(Age * TotalWorkingYears),
  data = EmployeeData_train[keep_idx, ],
  family = binomial("logit")
)

# initial checks
summary(lr)

# Put the coefficients on the odds scale: exp(estimate) is the odds ratio,
# and subtracting 1 expresses it as a proportional change in the odds.
coefs <- tidy(lr) |>
  mutate(
    odds_estimate = exp(estimate),
    odds_mfx = odds_estimate - 1
  )

coefs

# ROC curve from predicted probabilities on the whole training partition
# (this includes the rows the model was fitted on).
f_roc <- roc(
  tibble(
    actual = EmployeeData_train$Attrition,
    predicted = predict(lr, EmployeeData_train, type = "response")
  ),
  "actual",
  "predicted"
)

plot(f_roc)

f_roc$auc


  # caret requires the outcome's factor levels to be valid R names when
  # classProbs = TRUE (they become column names of the probability output),
  # so sanitise them before training. This is a no-op for levels that are
  # already valid names.
  levels(EmployeeData_train$Attrition) <- make.names(levels(EmployeeData_train$Attrition))


#creating a decision tree model
  # Classification tree on the class-balanced training rows, tuned over the
  # complexity parameter with 10-fold cross-validation and selected by AUC.
  decisiontree <- train(
    Attrition ~ JobSatisfaction + Age + NumCompaniesWorked + TotalWorkingYears,
    data = EmployeeData_train[keep_idx, ] |> drop_na(),
    method = "rpart",
    tuneGrid = expand.grid(cp = seq(0.001, 0.1, by = 0.01)),  # Tuning the complexity parameter (cp)
    trControl = trainControl(
      method = "cv", number = 10,  # 10-fold cross-validation
      classProbs = TRUE,  # Enable probability predictions
      summaryFunction = twoClassSummary  # Use twoClassSummary to compute AUC
    ),
    metric = "ROC"  # "ROC" gives us AUC & silences warning about Accuracy
  )

  decisiontree$results # average across CV results for each tuning parameter

  print(decisiontree$bestTune) # print chosen cp

  decisiontree$resample # cross validation results for the chosen cp

  # The rpart.plot package is not attached by the library() calls at the top
  # of the script, so the function is called through its namespace; a bare
  # rpart.plot() fails with "could not find function".
  rpart.plot::rpart.plot(decisiontree$finalModel) # print the tree

  # Variable importance of the tuned tree (stored, not printed here).
  var_imp_income <- varImp(decisiontree)
