# UpSample/ DownSample 
https://topepo.github.io/caret/subsampling-for-class-imbalances.html

In [23]:
install.packages(c("DMwR","ROSE"))

Installing packages into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘TTR’, ‘xts’, ‘quantmod’




In [24]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(rPython)
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)
library(tidyverse)

# for smote/rose sampling
library(DMwR)
library(ROSE)
require("plyr")


getwd()

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Loaded ROSE 0.0-3




In [6]:

source('../orig/functions.R')

source('../orig/functions_models.R')

source('functions_eval.R')

source('load_prep_data_expiry.R')

# Create Subsample of Training data for Expirimentation

In [7]:
set.seed(123)
tr_idxs <- sample(seq_len(nrow(expiry_train_df_sub)), size = floor(0.05 * nrow(expiry_train_df_sub)))
ts_idxs <- sample(seq_len(nrow(expiry_test_df_sub)), size = floor(0.05 * nrow(expiry_test_df_sub)))

tr <- expiry_train_df_sub[tr_idxs, ] 
ts <- expiry_test_df_sub[ts_idxs, ] 

dim(tr)
dim(ts)

In [20]:
# initial training proportion 
dim(tr)
prop.table(table(tr$renewal_status))


Not Renewd    Renewed 
 0.9094964  0.0905036 

In [None]:
# downsamples
tr_d <- downSample(x = tr %>% select(-renewal_status),
                   y = tr$renewal_status)
dim(tr_d)
prop.table(table(tr_d$Class))

In [None]:
# upsamples
tr_u <- upSample(x = tr %>% select(-renewal_status),
                   y = tr$renewal_status)
dim(tr_u)
prop.table(table(tr_u$Class))

In [26]:
# check that imbalanced training doesn't have missing values
tr %>%
  select(everything()) %>%  # replace to your needs
  summarise_all(funs(sum(is.na(.))))

“`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


renewal_status,tld,registrar,reseller_country,region,reg_period,sld_type,sld_length,day_domains,gibb_score,pattern_domain_count,reg_arpt
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# convert any character vars to factors
tr <- tr %>%
  mutate_if(sapply(tr, is.character), as.factor)

In [59]:
# create valid factor names for registrar 
# tr$registrar <- mapvalues(tr$registrar, levels(tr$registrar), make.names(levels(tr$registrar), unique=TRUE))
# tr <- tr %>% 
#   mutate(registrar = factor(registrar, 
#                         labels = make.names(levels(registrar))))
tr <- tr %>% 
  mutate(renewal_status = factor(renewal_status, 
                        labels = make.names(levels(renewal_status))))
tr <- tr %>% 
  mutate(tld = factor(tld, 
                        labels = make.names(levels(tld))))
tr <- tr %>% 
  mutate(reseller_country = factor(reseller_country, 
                        labels = make.names(levels(reseller_country))))
tr <- tr %>% 
  mutate(region = factor(region, 
                        labels = make.names(levels(region))))
tr <- tr %>% 
  mutate(sld_type = factor(sld_type, 
                        labels = make.names(levels(sld_type))))

In [60]:
str(tr)

Classes ‘data.table’ and 'data.frame':	59412 obs. of  12 variables:
 $ renewal_status      : Factor w/ 2 levels "Not.Renewd","Renewed": 1 1 1 1 1 1 1 1 1 1 ...
 $ tld                 : Factor w/ 11 levels "fun","host","in.net",..: 1 11 7 6 4 7 7 7 8 7 ...
 $ registrar           : Factor w/ 127 levels "X1.1.internet",..: 94 111 49 73 104 43 94 49 49 47 ...
 $ reseller_country    : Factor w/ 44 levels "Argentina","Australia",..: 33 8 20 43 9 40 33 20 20 13 ...
 $ region              : Factor w/ 2 levels "China","Non.China": 2 2 2 2 1 2 2 2 2 2 ...
 $ reg_period          : int  1 1 1 1 1 1 1 1 1 1 ...
 $ sld_type            : Factor w/ 6 levels "hyphen.l","l",..: 3 2 2 2 2 2 1 2 3 4 ...
 $ sld_length          : int  9 10 6 9 8 9 20 6 10 21 ...
 $ day_domains         : int  78 77 3965 571 1269 16 166 2377 10 5675 ...
 $ gibb_score          : num  4.71 3.88 0.14 1.42 7.63 2.62 8.69 0.12 9.83 5.08 ...
 $ pattern_domain_count: int  1 1 2 2 3 1 3 2 1 204 ...
 $ reg_arpt            : num  0.5 1

In [None]:
# SMOTE
tr_smote <- SMOTE(renewal_status ~ ., data  = tr)                         

dim(tr_smote)
prop.table(table(tr_smote$renewal_status))

In [None]:
# ROSE
tr_rose <- ROSE(renewal_status ~ ., data  = tr)$data                         

dim(tr_rose)
prop.table(table(tr_rose$renewal_status))

In [34]:
# For these data, we’ll use a bagged classification and estimate the area under the ROC curve using five repeats of 10-fold CV.

ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)

In [None]:
orig_fit <- train(renewal_status ~ ., data = tr, 
                  method = "treebag",
                  nbagg = 50,
                  metric = "ROC",
                  trControl = ctrl)

In [None]:
down_outside <- train(Class ~ ., data = down_train, 
                      method = "treebag",
                      nbagg = 50,
                      metric = "ROC",
                      trControl = ctrl)

In [None]:
up_outside <- train(Class ~ ., data = up_train, 
                    method = "treebag",
                    nbagg = 50,
                    metric = "ROC",
                    trControl = ctrl)

In [None]:
rose_outside <- train(Class ~ ., data = rose_train, 
                      method = "treebag",
                      nbagg = 50,
                      metric = "ROC",
                      trControl = ctrl)

In [None]:
smote_outside <- train(Class ~ ., data = smote_train, 
                       method = "treebag",
                       nbagg = 50,
                       metric = "ROC",
                       trControl = ctrl)

In [None]:
# We will collate the resampling results and create a wrapper to estimate the test set performance:

outside_models <- list(original = orig_fit,
                       down = down_outside,
                       up = up_outside,
                       SMOTE = smote_outside,
                       ROSE = rose_outside)

outside_resampling <- resamples(outside_models)

test_roc <- function(model, data) {
  library(pROC)
  roc_obj <- roc(data$Class, 
                 predict(model, data, type = "prob")[, "Class1"],
                 levels = c("Class2", "Class1"))
  ci(roc_obj)
  }

outside_test <- lapply(outside_models, test_roc, data = imbal_test)
outside_test <- lapply(outside_test, as.vector)
outside_test <- do.call("rbind", outside_test)
colnames(outside_test) <- c("lower", "ROC", "upper")
outside_test <- as.data.frame(outside_test)

summary(outside_resampling, metric = "ROC")