In [54]:
library(ggplot2)
library(dplyr)
library(tidyverse)
library(pillar)
library(repr) 
library(magrittr)
library(rsample)
library(leaps)
library(broom)
# install.packages("car")
library(car)
# install.packages("caret")
library(caret)

In [55]:
# Load the cleaned BOLT transaction file, then preview it and report its size.
data_clean <- read_csv(file = "clean_BOLT_dataset.csv")
head(x = data_clean)
nrow(x = data_clean)

[1mRows: [22m[34m100000[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (11): card, month_date, trans_time, payment_method, merchant_country, ca...
[32mdbl[39m  (3): risk_score, trans_value, MCC

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


card,month_date,trans_time,risk_score,payment_method,trans_value,merchant_country,card_present,chip_usage,international_trans,acquirer,merchant,MCC,fraud_flagged
<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
card 1,05-11,16:22:14,362,Paypass - Contactless,13.98,USA,CP,Yes,No,acquirer 1,merchant 1,5812,No
card 2,06-05,15:16:35,602,Online,24.64,USA,CNP,No,No,acquirer 2,merchant 2,4121,No
card 3,06-05,11:57:40,482,Unknown,15.0,USA,CNP,No,No,acquirer 3,merchant 3,7211,No
card 4,07-20,18:26:30,947,Online,30.56,USA,CNP,No,No,acquirer 1,merchant 4,5814,No
card 5,03-03,19:03:11,1382,Magnetic Stripe,50.85,USA,CP,No,No,acquirer 4,merchant 1,5812,No
card 6,07-22,18:04:28,612,Paypass - Contactless,75.77,USA,CP,Yes,No,acquirer 5,merchant 1,5812,No


In [56]:
# Collect the distinct payment methods, in order of first appearance.
unique_types <- unique(data_clean[["payment_method"]])

# Show every payment method found in the data
print(unique_types)

 [1] "Paypass - Contactless" "Online"                "Unknown"              
 [4] "Magnetic Stripe"       "Subscription"          "Chip"                 
 [7] "Postal"                "PayPass - Wallet"      "Phone"                
[10] "Tap-to-Pay"            "Mobile Wallet"         "eCommerce"            


In [57]:
# Drop unusable rows, then recode the two-level text flags as 0/1 indicators.
data2 <- data_clean %>%
  # "na:n:" is presumably the placeholder written for a missing transaction
  # time; filter() also drops rows where trans_time itself is NA.
  filter(trans_time != "na:n:") %>%
  # na.omit() on a data frame removes every row with an NA in ANY column.
  # The `risk_score` argument previously passed here was silently ignored,
  # so it is removed to stop implying a column-specific filter.
  na.omit() %>%
  mutate(
    chip_usage = if_else(chip_usage == "Yes", 1, 0),
    international_trans = if_else(international_trans == "Yes", 1, 0),
    # "CP" becomes 1; any other code (e.g. "CNP") becomes 0.
    card_present = if_else(card_present == "CP", 1, 0),
    fraud_flagged = if_else(fraud_flagged == "Yes", 1, 0)
  )

In [58]:
# Replace the categorical payment_method column with one 0/1 indicator per
# payment method, then drop the columns that are not used as predictors.
model_data <- data2 %>%
  # mutate() is the dplyr verb for appending columns to an existing table.
  mutate(
    online = if_else(payment_method == "Online", 1, 0),
    contactless = if_else(payment_method == "Paypass - Contactless", 1, 0),
    tap = if_else(payment_method == "Tap-to-Pay", 1, 0),
    stripe = if_else(payment_method == "Magnetic Stripe", 1, 0),
    subscription = if_else(payment_method == "Subscription", 1, 0),
    chip = if_else(payment_method == "Chip", 1, 0),
    postal = if_else(payment_method == "Postal", 1, 0),
    paypass_wallet = if_else(payment_method == "PayPass - Wallet", 1, 0),
    phone = if_else(payment_method == "Phone", 1, 0),
    mobile_wallet = if_else(payment_method == "Mobile Wallet", 1, 0),
    # "eCommerce" gets no indicator, so rows with that method have all
    # indicators equal to 0 and act as the baseline level.
    # eCommerce = if_else(payment_method == "eCommerce", 1, 0),
    unknown = if_else(payment_method == "Unknown", 1, 0)
  ) %>%
  # Deselect identifier and raw categorical columns not used in modelling.
  select(
    -card, -month_date, -trans_time, -payment_method,
    -merchant_country, -acquirer, -merchant, -MCC
  )
head(model_data, 3)

risk_score,trans_value,card_present,chip_usage,international_trans,fraud_flagged,online,contactless,tap,stripe,subscription,chip,postal,paypass_wallet,phone,mobile_wallet,unknown
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
362,13.98,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
602,24.64,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
482,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [59]:
# Fix the RNG state so the random partition below is reproducible.
set.seed(100)

# Partition the modelling data: 70% for training, the remaining 30% for testing.
data_split <- model_data %>% initial_split(prop = 0.7)
data_train <- data_split %>% training() # rows used to fit the models
data_test <- data_split %>% testing()   # held-out rows used for evaluation

In [60]:

# Forward stepwise subset selection (leaps) with fraud_flagged as the response
# and every other column of data_train as a candidate predictor. At most 15
# predictors are allowed in a model (nvmax = 15).
# The stray trailing comma that passed an empty argument has been removed.
forward <- regsubsets(
  x = fraud_flagged ~ .,
  data = data_train,
  nvmax = 15,
  method = "forward"
)

# Per-size summary: which variables enter, plus RSS / R^2 / Cp / BIC.
forward_summary <- summary(forward)
forward_summary

Subset selection object
Call: regsubsets.formula(x = fraud_flagged ~ ., nvmax = 15, data = data_train, 
    method = "forward", )
16 Variables  (and intercept)
                    Forced in Forced out
risk_score              FALSE      FALSE
trans_value             FALSE      FALSE
card_present            FALSE      FALSE
chip_usage              FALSE      FALSE
international_trans     FALSE      FALSE
online                  FALSE      FALSE
contactless             FALSE      FALSE
tap                     FALSE      FALSE
stripe                  FALSE      FALSE
subscription            FALSE      FALSE
chip                    FALSE      FALSE
postal                  FALSE      FALSE
paypass_wallet          FALSE      FALSE
phone                   FALSE      FALSE
mobile_wallet           FALSE      FALSE
unknown                 FALSE      FALSE
1 subsets of each size up to 15
Selection Algorithm: forward
          risk_score trans_value card_present chip_usage international_trans
1  ( 

In [61]:
# Tabulate the fit statistics of each forward-selection model size so the
# sizes can be compared (e.g. by adjusted R^2, Cp or BIC).
forward_summary_df <- tibble(
  # One row per fitted model size. seq_along() keeps the index in step with
  # however many models regsubsets actually returned, instead of assuming 15.
  n_input_variables = seq_along(forward_summary$rsq),
  RSQ = forward_summary$rsq,
  RSS = forward_summary$rss,
  ADJ.R2 = forward_summary$adjr2,
  Cp = forward_summary$cp,
  BIC = forward_summary$bic
)
forward_summary_df

n_input_variables,RSQ,RSS,ADJ.R2,Cp,BIC
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.01429797,190.6798,0.01428345,23.547408,-955.4726
2,0.01442789,190.6546,0.01439885,16.596236,-953.2961
3,0.01456603,190.6279,0.01452248,9.078851,-951.687
4,0.01464228,190.6132,0.01458422,5.825347,-945.815
5,0.01471574,190.599,0.01464317,2.763941,-939.7512
6,0.01475591,190.5912,0.01466883,1.996822,-931.3931
7,0.01479183,190.5842,0.01469024,1.521582,-922.7431
8,0.01479979,190.5827,0.01468369,2.973203,-912.166
9,0.01480508,190.5817,0.01467446,4.609129,-901.4044
10,0.01480967,190.5808,0.01466453,6.292793,-890.5952


In [103]:
# Predictor names of the 7-variable forward-selection model; [-1] drops the
# "(Intercept)" entry. Size 7 has the highest adjusted R^2 among the sizes
# printed in the summary table.
selected_var_1 <- names(coef(forward, 7))[-1]
# Notes (previously typed after the expression, which made the cell unparseable):
# risk, pay method, trans_value, country, card present, chip usage,
# international_trans
selected_var_1

In [148]:
# Keep the forward-selected predictors plus the response. card_present and
# chip_usage are always included, whether or not selection picked them.
training <- select(
  data_train,
  all_of(selected_var_1), fraud_flagged, card_present, chip_usage
)
testing <- select(
  data_test,
  all_of(selected_var_1), fraud_flagged, card_present, chip_usage
)

# Alternative kept for reference: drop two predictors instead of selecting.
# training <- data_train %>% select(-trans_value, -international_trans)
# testing <- data_test %>% select(-trans_value, -international_trans)

In [149]:
# Logistic regression of fraud_flagged on every remaining column of `training`.
model <- glm(fraud_flagged ~ ., family = binomial, data = training)
summary(model)

# Variance inflation factors (car::vif) as a multicollinearity check.
vif(model)


Call:
glm(formula = fraud_flagged ~ ., family = binomial, data = training)

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -7.637e+00  1.902e-01 -40.149  < 2e-16 ***
risk_score           1.159e-03  6.136e-05  18.896  < 2e-16 ***
trans_value         -2.476e-04  2.726e-04  -0.908  0.36367    
international_trans  4.031e-01  1.630e-01   2.473  0.01339 *  
subscription         5.195e-01  1.826e-01   2.845  0.00444 ** 
postal              -1.260e+01  2.928e+02  -0.043  0.96569    
phone                9.049e-01  5.216e-01   1.735  0.08277 .  
unknown              1.134e+00  4.697e-01   2.414  0.01580 *  
card_present        -8.660e-02  4.646e-01  -0.186  0.85212    
chip_usage          -4.243e-01  4.898e-01  -0.866  0.38631    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2660.3  on 67891  degrees of freedom
Residual deviance: 2142.3  on 6788

In [158]:
# Predicted fraud probabilities for the held-out rows.
predicted <- predict(object = model, newdata = testing, type = "response")

# Label a row 1 when its probability exceeds the 0.00275 cut-off, else 0.
classified_predictions <- ifelse(predicted > 0.00275, 1, 0)
head(classified_predictions)

In [162]:
# Re-derive the 0/1 labels from the probabilities (1 when above the cut-off).
classified_predictions <- if_else(predicted > 0.00275, 1, 0)

# Share of held-out rows whose label matches the recorded fraud flag.
accuracy <- mean(testing$fraud_flagged == classified_predictions)

# Display the accuracy
print(accuracy)

[1] 0.8512561


In [163]:

# Misclassification rate of thresholded probability predictions.
#
# Arguments:
#   y          numeric vector of true 0/1 labels.
#   predicted  numeric vector of predicted probabilities, aligned with `y`.
#   threshold  cut-off: predictions at or below it are classed 0, predictions
#              above it are classed 1. Defaults to 0.00275, the value that
#              used to be hard-coded, so existing two-argument calls behave
#              exactly as before.
# Returns:
#   The proportion of observations whose predicted class differs from `y`
#   (NA if any input is NA, as before).
misclassification_rate <- function(y, predicted, threshold = 0.00275) {
  # TRUE/FALSE -> 1/0; NA predictions stay NA.
  y_hat <- as.numeric(predicted > threshold)
  mean(abs(y - y_hat))
}

# Misclassification rate of the thresholded predictions.
# NOTE(review): despite the "_train" suffix, the inputs are the held-out
# `testing` labels and their predictions, so this is a test-set error rate.
error_rate_train <- misclassification_rate(
  y = testing$fraud_flagged,
  predicted = predicted
)

error_rate_train

In [164]:
# Confusion matrix of thresholded predictions against the true test labels,
# treating fraud (1) as the positive class.
# Both factors get explicit levels 0 and 1: as.factor() would drop a class
# that is never predicted, and confusionMatrix() would then fail on the
# mismatched levels.
confusion_matrix <- confusionMatrix(
  data = factor(classified_predictions, levels = c(0, 1)),
  reference = factor(testing$fraud_flagged, levels = c(0, 1)),
  positive = "1"
)
confusion_matrix

Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 24720    21
         1  4307    49
                                          
               Accuracy : 0.8513          
                 95% CI : (0.8471, 0.8553)
    No Information Rate : 0.9976          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.0175          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.700000        
            Specificity : 0.851621        
         Pos Pred Value : 0.011249        
         Neg Pred Value : 0.999151        
             Prevalence : 0.002406        
         Detection Rate : 0.001684        
   Detection Prevalence : 0.149706        
      Balanced Accuracy : 0.775810        
                                          
       'Positive' Class : 1               
                  

# USA 

In [241]:
# Same cleaning and 0/1 recoding as the full-data analysis, restricted to
# transactions whose merchant country is the USA.
data3 <- data_clean %>%
  # "na:n:" is presumably the placeholder written for a missing transaction
  # time; filter() also drops rows where trans_time itself is NA.
  filter(trans_time != "na:n:") %>%
  # na.omit() on a data frame removes every row with an NA in ANY column.
  # The `risk_score` argument previously passed here was silently ignored,
  # so it is removed to stop implying a column-specific filter.
  na.omit() %>%
  mutate(
    chip_usage = if_else(chip_usage == "Yes", 1, 0),
    international_trans = if_else(international_trans == "Yes", 1, 0),
    # "CP" becomes 1; any other code (e.g. "CNP") becomes 0.
    card_present = if_else(card_present == "CP", 1, 0),
    fraud_flagged = if_else(fraud_flagged == "Yes", 1, 0)
  ) %>%
  filter(merchant_country == "USA")
head(data3)

card,month_date,trans_time,risk_score,payment_method,trans_value,merchant_country,card_present,chip_usage,international_trans,acquirer,merchant,MCC,fraud_flagged
<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
card 1,05-11,16:22:14,362,Paypass - Contactless,13.98,USA,1,1,0,acquirer 1,merchant 1,5812,0
card 2,06-05,15:16:35,602,Online,24.64,USA,0,0,0,acquirer 2,merchant 2,4121,0
card 3,06-05,11:57:40,482,Unknown,15.0,USA,0,0,0,acquirer 3,merchant 3,7211,0
card 4,07-20,18:26:30,947,Online,30.56,USA,0,0,0,acquirer 1,merchant 4,5814,0
card 5,03-03,19:03:11,1382,Magnetic Stripe,50.85,USA,1,0,0,acquirer 4,merchant 1,5812,0
card 6,07-22,18:04:28,612,Paypass - Contactless,75.77,USA,1,1,0,acquirer 5,merchant 1,5812,0


In [242]:
# Replace the categorical payment_method column with one 0/1 indicator per
# payment method, then drop the columns that are not used as predictors.
# NOTE(review): international_trans is kept here, but the later regsubsets
# warning and the NA glm coefficient suggest it may be constant in the
# USA-only data. Confirm, and consider dropping it.
model_data2 <- data3 %>%
  # mutate() is the dplyr verb for appending columns to an existing table.
  mutate(
    online = if_else(payment_method == "Online", 1, 0),
    contactless = if_else(payment_method == "Paypass - Contactless", 1, 0),
    tap = if_else(payment_method == "Tap-to-Pay", 1, 0),
    stripe = if_else(payment_method == "Magnetic Stripe", 1, 0),
    subscription = if_else(payment_method == "Subscription", 1, 0),
    chip = if_else(payment_method == "Chip", 1, 0),
    postal = if_else(payment_method == "Postal", 1, 0),
    paypass_wallet = if_else(payment_method == "PayPass - Wallet", 1, 0),
    phone = if_else(payment_method == "Phone", 1, 0),
    mobile_wallet = if_else(payment_method == "Mobile Wallet", 1, 0),
    # "eCommerce" gets no indicator, so rows with that method have all
    # indicators equal to 0 and act as the baseline level.
    # eCommerce = if_else(payment_method == "eCommerce", 1, 0),
    unknown = if_else(payment_method == "Unknown", 1, 0)
  ) %>%
  # Deselect identifier and raw categorical columns not used in modelling.
  select(
    -card, -month_date, -trans_time, -payment_method,
    -merchant_country, -acquirer, -merchant, -MCC
  )
head(model_data2, 3)

risk_score,trans_value,card_present,chip_usage,international_trans,fraud_flagged,online,contactless,tap,stripe,subscription,chip,postal,paypass_wallet,phone,mobile_wallet,unknown
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
362,13.98,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
602,24.64,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
482,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [243]:
# Fix the RNG state so the random partition below is reproducible.
set.seed(101)

# Partition the USA-only data: 70% for training, the remaining 30% for testing.
data_split2 <- model_data2 %>% initial_split(prop = 0.7)
data_train2 <- data_split2 %>% training() # rows used to fit the models
data_test2 <- data_split2 %>% testing()   # held-out rows used for evaluation

In [244]:
# Forward stepwise subset selection (leaps) on the USA-only training data,
# with fraud_flagged as the response and at most 15 predictors per model.
# The stray trailing comma that passed an empty argument has been removed.
# NOTE(review): regsubsets reports one linear dependency on this data,
# presumably because international_trans is constant after the USA filter.
# Confirm, and consider dropping that column.
forward2 <- regsubsets(
  x = fraud_flagged ~ .,
  data = data_train2,
  nvmax = 15,
  method = "forward"
)

# Per-size summary: which variables enter, plus RSS / R^2 / Cp / BIC.
forward_summary2 <- summary(forward2)
forward_summary2

“1  linear dependencies found”


Reordering variables and trying again:


“number of items to replace is not a multiple of replacement length”


Subset selection object
Call: regsubsets.formula(x = fraud_flagged ~ ., nvmax = 15, data = data_train2, 
    method = "forward", )
16 Variables  (and intercept)
                    Forced in Forced out
risk_score              FALSE      FALSE
trans_value             FALSE      FALSE
card_present            FALSE      FALSE
chip_usage              FALSE      FALSE
online                  FALSE      FALSE
contactless             FALSE      FALSE
tap                     FALSE      FALSE
stripe                  FALSE      FALSE
subscription            FALSE      FALSE
chip                    FALSE      FALSE
postal                  FALSE      FALSE
paypass_wallet          FALSE      FALSE
phone                   FALSE      FALSE
mobile_wallet           FALSE      FALSE
unknown                 FALSE      FALSE
international_trans     FALSE      FALSE
1 subsets of each size up to 15
Selection Algorithm: forward
          risk_score trans_value card_present chip_usage international_trans
1  (

In [245]:
# Tabulate the fit statistics of each forward-selection model size so the
# sizes can be compared (e.g. by adjusted R^2, Cp or BIC).
forward_summary_df2 <- tibble(
  # One row per fitted model size. seq_along() keeps the index in step with
  # however many models regsubsets actually returned, instead of assuming 15.
  n_input_variables = seq_along(forward_summary2$rsq),
  RSQ = forward_summary2$rsq,
  RSS = forward_summary2$rss,
  ADJ.R2 = forward_summary2$adjr2,
  Cp = forward_summary2$cp,
  BIC = forward_summary2$bic
)
forward_summary_df2

n_input_variables,RSQ,RSS,ADJ.R2,Cp,BIC
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.01021962,117.5429,0.0102026,13.118979767,-575.5914
2,0.01038259,117.5236,0.01034857,5.539408691,-574.1992
3,0.0105109,117.5083,0.01045986,-0.002384294,-570.7704
4,0.01055905,117.5026,0.010491,-0.832538653,-562.6299
5,0.0105938,117.4985,0.01050875,-0.875414164,-553.702
6,0.01061535,117.4959,0.01051328,-0.14181901,-543.9976
7,0.01062747,117.4945,0.01050839,1.145724397,-533.7391
8,0.01063677,117.4934,0.01050068,2.598970044,-523.3149
9,0.01064165,117.4928,0.01048855,4.311852144,-512.631
10,0.01064476,117.4924,0.01047465,6.129219788,-501.8425


In [246]:
# Predictor names of the 4-variable forward-selection model; the leading
# "(Intercept)" entry is dropped.
# NOTE(review): in the printed summary table adjusted R^2 peaks at size 6,
# not 4. Confirm which criterion motivated this size.
selected_var2 <- names(coef(forward2, id = 4))[-1]
selected_var2

In [247]:
# Keep the forward-selected predictors plus the response. card_present and
# chip_usage are always included, whether or not selection picked them.
training2 <- select(
  data_train2,
  all_of(selected_var2), fraud_flagged, card_present, chip_usage
)
testing2 <- select(
  data_test2,
  all_of(selected_var2), fraud_flagged, card_present, chip_usage
)

In [255]:
# Logistic regression of fraud_flagged on every remaining column of `training2`.
# NOTE(review): the printed summary shows an NA coefficient for
# international_trans ("not defined because of singularities").
model2 <- glm(fraud_flagged ~ ., family = binomial, data = training2)
summary(model2)


Call:
glm(formula = fraud_flagged ~ ., family = binomial, data = training2)

Coefficients: (1 not defined because of singularities)
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -7.8236447  0.2193005 -35.675  < 2e-16 ***
risk_score           0.0012236  0.0000722  16.946  < 2e-16 ***
subscription         0.6256379  0.2239059   2.794  0.00520 ** 
chip                 0.2953575  0.5014438   0.589  0.55585    
international_trans         NA         NA      NA       NA    
card_present         0.5978865  0.3657347   1.635  0.10210    
chip_usage          -1.5375154  0.4765620  -3.226  0.00125 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1711.5  on 58169  degrees of freedom
Residual deviance: 1404.0  on 58164  degrees of freedom
AIC: 1416

Number of Fisher Scoring iterations: 10


In [259]:
# Predicted fraud probabilities for the held-out USA rows.
predicted2 <- predict(object = model2, newdata = testing2, type = "response")

# Label a row 1 when its probability exceeds the 0.0014 cut-off, else 0.
classified_predictions2 <- ifelse(predicted2 > 0.0014, 1, 0)
head(classified_predictions2)

In [260]:


# Share of held-out USA rows whose label matches the recorded fraud flag.
accuracy2 <- mean(testing2$fraud_flagged == classified_predictions2)

# Display the accuracy
print(accuracy2)

[1] 0.7425294


In [261]:
# Confusion matrix of thresholded predictions against the true USA test
# labels, treating fraud (1) as the positive class.
# Both factors get explicit levels 0 and 1: as.factor() would drop a class
# that is never predicted, and confusionMatrix() would then fail on the
# mismatched levels.
confusion_matrix2 <- confusionMatrix(
  data = factor(classified_predictions2, levels = c(0, 1)),
  reference = factor(testing2$fraud_flagged, levels = c(0, 1)),
  positive = "1"
)
confusion_matrix2

Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 18473    15
         1  6404    39
                                          
               Accuracy : 0.7425          
                 95% CI : (0.7371, 0.7479)
    No Information Rate : 0.9978          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.0077          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.722222        
            Specificity : 0.742573        
         Pos Pred Value : 0.006053        
         Neg Pred Value : 0.999189        
             Prevalence : 0.002166        
         Detection Rate : 0.001564        
   Detection Prevalence : 0.258433        
      Balanced Accuracy : 0.732398        
                                          
       'Positive' Class : 1               
                  