In [3]:
library(broom)
library(MASS)
library(glmbb)
library(cowplot)
library(tidyverse)
library(glmnet)
library(knitr)
library(caret)
library(tidyverse)
library(data.table)
library(broom)
library(scales)


Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose



Attaching package: ‘scales’


The following object is masked from ‘package:purrr’:

    discard


The following object is masked from ‘package:readr’:

    col_factor




In [4]:
# Load the raw campaign export. The file is tab-delimited despite its .csv
# extension, which is why read_tsv() is used.
customer_personality <- read_tsv("marketing_campaign.csv")

customer_personality_clean <- customer_personality %>%
  # Remove the columns that are not used as predictors.
  select(!c(ID, Dt_Customer, Z_CostContact, Z_Revenue)) %>%
  # Derived features; both are appended after the existing columns.
  # NOTE(review): Year_Birth and the six Mnt* columns stay in the data, so
  # Age and Total_Spending are exact linear combinations of columns that are
  # still present.
  mutate(
    Age = 2025 - Year_Birth,
    Total_Spending = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds
  ) %>%
  # Treat text and 0/1 indicator columns as categorical.
  mutate(
    across(
      c(
        Education, Marital_Status, Complain,
        AcceptedCmp1, AcceptedCmp2, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5,
        Response
      ),
      as.factor
    )
  ) %>%
  # Keep complete rows only, so the model fits below never see NA values.
  drop_na()

[1mRows: [22m[34m2240[39m [1mColumns: [22m[34m29[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (3): Education, Marital_Status, Dt_Customer
[32mdbl[39m (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Ridge Model:

In [5]:
# Fix the RNG so the split and the cross-validation folds are reproducible.
set.seed(123)

# 70% of the rows go to training, the remaining 30% to testing.
n <- nrow(customer_personality_clean)
train_size <- floor(0.7 * n)
train_index <- sample.int(n, size = train_size)

train_data <- customer_personality_clean[train_index, ]
test_data <- customer_personality_clean[-train_index, ]

# glmnet needs a numeric design matrix: expand the factors into dummy columns
# and drop the intercept column that model.matrix() places first.
x_train <- model.matrix(Response ~ ., data = train_data)[, -1]
x_test <- model.matrix(Response ~ ., data = test_data)[, -1]

# Convert the response factor back to its numeric labels.
y_train <- as.numeric(as.character(train_data$Response))
y_test <- as.numeric(as.character(test_data$Response))

# 10-fold cross-validation over the ridge (alpha = 0) penalty path.
cv_ridge <- cv.glmnet(
  x = x_train,
  y = y_train,
  family = "binomial",
  alpha = 0,
  nfolds = 10
)

# Penalty that minimises the CV error, and the one-standard-error alternative.
best_lambda <- cv_ridge$lambda.min
se_lambda <- cv_ridge$lambda.1se

# Refit on the full training set at the CV-optimal penalty.
final_ridge_model <- glmnet(
  x = x_train,
  y = y_train,
  family = "binomial",
  alpha = 0,
  lambda = best_lambda
)

# Predicted probabilities for the held-out rows (one-column matrix).
pred_prob <- predict(final_ridge_model, newx = x_test, type = "response")


Logistic Regression Model

In [6]:
## 1. Reuse the split indices from the ridge cell ---------------------
## Both models are trained and evaluated on the same rows.

train_data2 <- customer_personality_clean[train_index, ]
test_data2  <- customer_personality_clean[-train_index, ]

## 2. Make sure Response is "No"/"Yes" in BOTH train and test --------

train_data2$Response <- factor(train_data2$Response,
                               levels = c("0", "1"),
                               labels = c("No", "Yes"))

test_data2$Response  <- factor(test_data2$Response,
                               levels = c("0", "1"),
                               labels = c("No", "Yes"))

## 3. Fit simple logistic regression on training data ----------------
## Age is defined as 2025 - Year_Birth and Total_Spending as the sum of the six
## Mnt* columns, so `Response ~ .` would contain two exact linear dependencies:
## glm() would report NA for the aliased coefficients and predict() would run
## on a rank-deficient fit. Dropping Year_Birth and Total_Spending removes
## these two dependencies while spanning the same column space, so the fitted
## probabilities are unchanged up to numerical precision.

logistic_model <- glm(
  Response ~ . - Year_Birth - Total_Spending,
  data   = train_data2,
  family = binomial
)

## 4. Predict probabilities on test data -----------------------------
## type = "response" returns P(Response == "Yes"), the second factor level.

pred_prob_simple <- predict(
  logistic_model,
  newdata = test_data2,
  type    = "response"
)

Function to compute classification metrics (sensitivity, specificity, precision, accuracy) at each probability threshold:

In [7]:
#' Classification metrics of a binary classifier over a grid of cut-offs
#'
#' For every threshold, observations whose probability is `>=` the threshold
#' are predicted as `positive_label` and all others as the remaining level of
#' `truth`; the predictions are then scored with `caret::confusionMatrix()`.
#'
#' @param probs Numeric vector (or one-column matrix) of predicted
#'   probabilities for the positive class, one per observation.
#' @param truth Observed classes. Coerced to a factor; must have exactly two
#'   levels and the same length as `probs`.
#' @param positive_label The level of `truth` treated as the positive class.
#'   Coerced to character so that e.g. `1` and `"1"` are equivalent.
#' @param thresholds Numeric vector of probability cut-offs to evaluate.
#' @return A data frame with one row per threshold and the columns
#'   `Threshold`, `Sensitivity`, `Specificity`, `Precision` and `Accuracy`,
#'   all rounded to 3 decimals. Zero rows if `thresholds` is empty.
compute_threshold_metrics <- function(probs, truth, positive_label,
                                      thresholds = seq(0.95, 0.05,
                                                       by = -0.05)) {

  # Normalise inputs: factor truth, plain numeric probabilities (drops matrix
  # dimensions), character label.
  truth <- as.factor(truth)
  probs <- as.numeric(probs)
  positive_label <- as.character(positive_label)

  # Fail early with a clear message instead of recycling silently or erroring
  # deep inside caret.
  if (length(probs) != length(truth)) {
    stop("`probs` and `truth` must have the same length.", call. = FALSE)
  }
  if (nlevels(truth) != 2L) {
    stop("`truth` must have exactly two levels.", call. = FALSE)
  }
  if (length(positive_label) != 1L || !positive_label %in% levels(truth)) {
    stop("`positive_label` must be a single level of `truth`.", call. = FALSE)
  }

  # With exactly two levels, the negative label is the single remaining one.
  negative_label <- setdiff(levels(truth), positive_label)

  metric_names <- c(
    "Threshold", "Sensitivity", "Specificity", "Precision", "Accuracy"
  )

  # Nothing to evaluate: return an empty frame that still has the right shape.
  if (length(thresholds) == 0L) {
    empty <- matrix(
      numeric(0),
      nrow = 0L,
      ncol = length(metric_names),
      dimnames = list(NULL, metric_names)
    )
    return(as.data.frame(empty))
  }

  # Compute one named metric vector per threshold.
  metrics_list <- lapply(thresholds, function(th) {

    # Convert probabilities to predicted classes, using the same level order
    # as `truth` so the confusion matrix lines up.
    pred_class <- ifelse(probs >= th, positive_label, negative_label)
    pred_factor <- factor(pred_class, levels = levels(truth))

    # Confusion matrix at this threshold.
    cm <- caret::confusionMatrix(
      data      = pred_factor,
      reference = truth,
      positive  = positive_label
    )

    # "Pos Pred Value" is caret's name for precision.
    c(
      Threshold   = th,
      Sensitivity = as.numeric(cm$byClass["Sensitivity"]),
      Specificity = as.numeric(cm$byClass["Specificity"]),
      Precision   = as.numeric(cm$byClass["Pos Pred Value"]),
      Accuracy    = as.numeric(cm$overall["Accuracy"])
    )
  })

  # Stack the per-threshold rows; every column is numeric, so the whole frame
  # can be rounded in one call.
  metrics_df <- as.data.frame(do.call(rbind, metrics_list))
  round(metrics_df, 3)
}

Results: 

In [9]:
# Seed retained from the original cell.
set.seed(123)

# Plain logistic regression: the response is labelled "No"/"Yes", so "Yes" is
# the positive class.
metrics_simple <- compute_threshold_metrics(
  pred_prob_simple,
  test_data2$Response,
  "Yes"
)

metrics_simple

# Ridge model: predict() returned a one-column matrix and the response is
# coded 0/1, so flatten the probabilities and score against level "1".
metrics_ridge <- compute_threshold_metrics(
  as.numeric(pred_prob),
  factor(y_test, levels = c(0, 1)),
  "1"
)

metrics_ridge

Threshold,Sensitivity,Specificity,Precision,Accuracy
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.95,0.063,0.996,0.75,0.863
0.9,0.074,0.996,0.778,0.865
0.85,0.084,0.996,0.8,0.866
0.8,0.095,0.995,0.75,0.866
0.75,0.189,0.993,0.818,0.878
0.7,0.221,0.989,0.778,0.88
0.65,0.263,0.986,0.758,0.883
0.6,0.284,0.984,0.75,0.884
0.55,0.326,0.982,0.756,0.889
0.5,0.411,0.981,0.78,0.899


Threshold,Sensitivity,Specificity,Precision,Accuracy
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.95,0.011,1.0,1.0,0.859
0.9,0.074,0.998,0.875,0.866
0.85,0.074,0.998,0.875,0.866
0.8,0.074,0.998,0.875,0.866
0.75,0.116,0.998,0.917,0.872
0.7,0.158,0.998,0.937,0.878
0.65,0.221,0.995,0.875,0.884
0.6,0.263,0.989,0.806,0.886
0.55,0.284,0.988,0.794,0.887
0.5,0.316,0.988,0.811,0.892
