# Retrain RF on phase1 data but include reg_arpt as predictor

In [1]:
# install.packages("stringdist")

In [1]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(rPython)
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)

getwd()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: grid

Loading required package: libcoin

Loading required package: mvtnorm

Loading required package: lattice

Loading required package: ggplot2

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:sta

In [2]:

source('../orig/functions.R')

source('../orig/functions_models.R')

source('functions_eval.R')


# Load data

In [3]:
# ranger_03 trained on train_df_f_sub, a subset (sans leaked vars) of train_df_f which was defined in dtree_play_3.R
# i'm bypassing the creation of date related variables as those are ultimately leaks

train <- readRDS("../../data/train")
train_df <- rbindlist(train)

train_df_f_sub <- subset(train_df, select = c(renewal_status, tld, registrar, reseller, 
                                              reseller_country, region, reg_period, 
                                              sld_type, sld_length, day_domains, gibb_score, 
                                              pattern_domain_count,reg_arpt) ) 
train_df_f_sub <- train_df_f_sub %>%
  mutate_if(sapply(train_df_f_sub, is.character), as.factor)

In [4]:
head(train_df_f_sub)

renewal_status,tld,registrar,reseller,reseller_country,region,reg_period,sld_type,sld_length,day_domains,gibb_score,pattern_domain_count,reg_arpt
<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<int>,<fct>,<int>,<int>,<dbl>,<int>,<dbl>
Not Renewd,fun,gmo,gmo,Japan,Non China,1,l,7,32,7.41,1,8
Renewed,fun,gmo,gmo,Japan,Non China,1,l,4,32,5.29,1,8
Not Renewd,fun,gmo,gmo,Japan,Non China,1,l,8,32,9.94,1,8
Not Renewd,fun,gmo,gmo,Japan,Non China,1,l,6,32,9.38,1,8
Not Renewd,fun,gmo,gmo,Japan,Non China,1,l,6,32,1.99,1,8
Renewed,fun,gmo,gmo,Japan,Non China,1,l,3,32,0.31,1,8


In [5]:
dim(train_df_f_sub)

## count by group to assess data diversity

In [6]:
train_df_f_sub %>% 
    group_by(registrar) %>% 
    tally() %>% 
    arrange(desc(n)) 

registrar,n
<fct>,<int>
namecheap,967254
go daddy,340111
gmo,251900


In [7]:
train_df_f_sub %>% 
    group_by(tld, registrar) %>% 
    tally() %>% 
    arrange(desc(n)) 

tld,registrar,n
<fct>,<fct>,<int>
site,namecheap,209076
site,gmo,177102
online,go daddy,155211
online,namecheap,154637
pw,namecheap,144846
website,namecheap,120162
space,namecheap,90966
host,namecheap,74200
fun,namecheap,59852
press,namecheap,58724


In [8]:
train_df_f_sub %>% 
    group_by(tld, registrar, reg_arpt) %>% 
    tally() %>% 
    arrange(desc(n)) 

tld,registrar,reg_arpt,n
<fct>,<fct>,<dbl>,<int>
site,gmo,0.1000,172611
online,go daddy,0.5000,70812
site,namecheap,0.4800,64470
pw,namecheap,0.4800,60162
site,namecheap,0.4400,55059
online,namecheap,0.4800,49726
site,namecheap,0.5900,46029
pw,namecheap,0.5900,45274
online,go daddy,2.5000,37656
website,namecheap,0.0001,35087


# Retrain Ranger_03

## per the same definitions as in dtree_xl.ipynb, but added reg_arpt as a predictor

In [9]:
names(train_df_f_sub)

In [10]:
na_df <- train_df_f_sub %>%
  summarise_all(funs(sum(is.na(.))))
t(na_df)

“`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


0,1
renewal_status,0
tld,0
registrar,0
reseller,0
reseller_country,0
region,0
reg_period,0
sld_type,0
sld_length,0
day_domains,0


In [11]:
# model fitting: 1088.275 sec elapsed
# model saving: 146.864 sec elapsed


tic("model fitting")

ranger_03_ <- ranger(
    formula         = renewal_status ~ ., 
    data            = train_df_f_sub, 
    importance = 'impurity', 
    num.trees       = 500,
    class.weights = c(3,4),
    probability = TRUE,
    mtry            = 5,
    min.node.size   = 10,
    replace = FALSE,
    sample.fraction = .725,
    seed            = 123
)

toc()

tic("model saving")
save(ranger_03_, file="../../data/output/ranger_03_")
toc()

Growing trees.. Progress: 3%. Estimated remaining time: 16 minutes, 6 seconds.
Growing trees.. Progress: 7%. Estimated remaining time: 13 minutes, 58 seconds.
Growing trees.. Progress: 11%. Estimated remaining time: 13 minutes, 12 seconds.
Growing trees.. Progress: 15%. Estimated remaining time: 12 minutes, 34 seconds.
Growing trees.. Progress: 18%. Estimated remaining time: 12 minutes, 8 seconds.
Growing trees.. Progress: 22%. Estimated remaining time: 11 minutes, 31 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 11 minutes, 3 seconds.
Growing trees.. Progress: 29%. Estimated remaining time: 10 minutes, 27 seconds.
Growing trees.. Progress: 33%. Estimated remaining time: 9 minutes, 49 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 9 minutes, 18 seconds.
Growing trees.. Progress: 40%. Estimated remaining time: 8 minutes, 48 seconds.
Growing trees.. Progress: 44%. Estimated remaining time: 8 minutes, 19 seconds.
Growing trees.. Progress: 47%. Estima

## Predict & Get lift on Phase 1 data

In [12]:

predict_ranger_03_ <- predict(ranger_03_, 
                          data = train_df_f_sub,
                          type="response")$predictions

predict_ranger_03_ <- as.data.frame(predict_ranger_03_)$Renewed

save(predict_ranger_03_, 
     file="../../data/output/predict_ranger_03_")


Predicting.. Progress: 41%. Estimated remaining time: 43 seconds.
Predicting.. Progress: 83%. Estimated remaining time: 12 seconds.
Aggregating predictions.. Progress: 34%. Estimated remaining time: 1 minute, 1 seconds.
Aggregating predictions.. Progress: 67%. Estimated remaining time: 30 seconds.
Aggregating predictions.. Progress: 99%. Estimated remaining time: 0 seconds.


In [13]:
predict_ranger_03_df = data.frame("actual" = train_df_f_sub$renewal_status,
                           "predicted" = predict_ranger_03_)

lift_df_ranger_03_ <- chart_lift(pred_df = predict_ranger_03_df,
                        dep_var = "actual",
                        pred_var = "predicted")

saveRDS(lift_df_ranger_03_, file.path('../../data/output/',"lift_df_ranger_03_"))

gain_ranger_03_ <- plot_gains (lift_df_ranger_03_) 


auc_ranger_03_ <- calc_auc(lift_df_ranger_03_)


In [15]:
lift_df_ranger_03_

P,actu_renwd2,gain,lift
<dbl>,<int>,<dbl>,<dbl>
0.1,112766,0.6441011,6.441011
0.2,164849,0.9415907,4.707954
0.3,174691,0.9978067,3.326022
0.4,175013,0.9996459,2.499115
0.5,175038,0.9997887,1.999577
0.6,175063,0.9999315,1.666552
0.7,175075,1.0,1.428571
0.8,175075,1.0,1.25
0.9,175075,1.0,1.111111
1.0,175075,1.0,1.0


In [14]:
auc_ranger_03_

## Predict & Get Lift on Expiry Data

In [17]:

expiry_test_prepped_2 <- readRDS("/home/jupyter/local/Domains_202003/data/expiry_test_prepped_2")

expiry_test_df <- rbindlist(expiry_test_prepped_2, fill=TRUE)

In [19]:

ranger_predict_03_expiry <- predict(ranger_03_, 
                          data = expiry_test_df,
                          type="response")$predictions

ranger_predict_03_expiry <- as.data.frame(ranger_predict_03_expiry)$Renewed

save(ranger_predict_03_expiry, 
     file="../../data/output/ranger_predict_03_expiry")


In [20]:
ranger_03_pred_df_expiry = data.frame("actual" = expiry_test_df$renewal_status,
          "predicted" = ranger_predict_03_expiry)

lift_df_ranger_03_expiry <- chart_lift(pred_df = ranger_03_pred_df_expiry,
                        dep_var = "actual",
                        pred_var = "predicted")


gains_plot_ranger_03_expiry <- plot_gains (lift_df_ranger_03_expiry) 

auc_ranger_03_expiry <- calc_auc(lift_df_ranger_03_expiry)

saveRDS(lift_df_ranger_03_expiry, file.path('../../data/output/',"lift_df_ranger_03_expiry"))


In [22]:
lift_df_ranger_03_expiry

P,actu_renwd2,gain,lift
<dbl>,<int>,<dbl>,<dbl>
0.1,15115,0.2475921,2.475921
0.2,28120,0.4606212,2.303106
0.3,39709,0.6504554,2.168185
0.4,48024,0.7866597,1.966649
0.5,52859,0.8658597,1.731719
0.6,55795,0.913953,1.523255
0.7,57949,0.9492367,1.356052
0.8,59455,0.9739058,1.217382
0.9,60600,0.9926615,1.102957
1.0,61048,1.0,1.0


In [23]:
system("gsutil mv /home/jupyter/local/Domains_202003/data/output/* gs://data_outputt/output/")