# INVESTIGATE NAs as outputs in tld_reseller model performance comparison 

In [1]:
options(repr.matrix.max.cols=50, repr.matrix.max.rows=100)

In [2]:
# install.packages("pkgcond")

In [3]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(rPython)
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)
library(dominanceanalysis)


getwd()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: grid

Loading required package: libcoin

Loading required package: mvtnorm

Loading required package: lattice

Loading required package: ggplot2

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:sta

In [4]:

source('../orig/functions.R')

source('../orig/functions_models.R')

source('../phaseII_03_forest/functions_eval.R')


source('../phaseII_03_forest/load_prep_data_expiry.R')

# expiry_train_prepped_2_1 (list, less df's w/ 0 obs)
# expiry_test_prepped_2_1  (list, less df's w/ 0 obs)
# expiry_train_df_1 (above, rbound)
# expiry_test_df_1 (above, rbound)
# expiry_train_df_sub (subset vars)                                        
# expiry_test_df_sub (subset vars)

## List tld_reseller labels

In [5]:
tld_reseller_lookup_df = expiry_train_df_1 %>% group_by(tld_registrar_index, reseller) %>% tally() %>% arrange(desc(n))
tld_reseller_lookup_df %>% head()

tld_registrar_index,reseller,n
<chr>,<chr>,<int>
sitegmo,gmo,106569
funalibaba,alibaba,72690
sitenamecheap,namecheap,53536
pwnamecheap,namecheap,50470
onlinego daddy,go daddy,42162
sitegandi sas,gandi sas,35309


## Load PREPed tld_reseller compare

In [6]:
predictions_df <- read.csv("../../data/tld_reseller_compare_predictions.csv")
metrics_df <- read.csv("../../data/tld_reseller_compare_metrics.csv")

dim(predictions_df)
head(predictions_df)
dim(metrics_df)
head(metrics_df)

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>,<int>,<fct>
1,Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569,44295183,kagen.site
2,Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569,44573611,designlab.site
3,Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569,45304858,hokatu-blog.site
4,Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569,46235129,suzuya.site
5,Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569,46276970,wins-company.site
6,Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569,47809960,yamatoku-company.site


Unnamed: 0_level_0,tld_registrar_index,reseller,n,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,sitegmo,gmo,106569,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721
2,funalibaba,alibaba,72690,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023
3,sitenamecheap,namecheap,53536,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178
4,pwnamecheap,namecheap,50470,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905
5,onlinego daddy,go daddy,42162,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328
6,sitegandi sas,gandi sas,35309,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108


In [7]:
head(expiry_train_df_1)

renewal_type,renewed_count,expiry_date,domain_id,domain,creation_date,status,tld,registrar,reseller,reseller_country,region,reg_period,registrant_country,renewal_status,renew_mbg,renew_type,autorenew_type,renew_date,renew_registrar,renew_reseller,reg_revenue,reg_arpt,renew_period,renew_domain_revenue,renew_arpt,reg_arpt_org,tld_registrar_index,sld,sld_type,sld_length,sld_type2,day_domains,log_reg_arpt,gibb_score,pattern,cluster,pattern_score,pattern_domain_count
<chr>,<int>,<date>,<int>,<chr>,<date>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<fct>,<int>,<chr>,<chr>,<date>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<fct>,<int>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>
FirstTime,1,2020-01-21,91584433,racefor.fun,2019-01-21,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-21,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,racefor,l,7,7l,1,2.70805,6.77,racefor,1,1,1
FirstTime,1,2020-01-30,92254793,united4.fun,2019-01-30,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-30,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,united4,ln,7,7ln,1,2.70805,8.65,united4,1,1,1
FirstTime,1,2020-02-04,92827233,bereal.fun,2019-02-04,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-04,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,bereal,l,6,6l,1,2.70805,13.09,bereal,1,1,1
FirstTime,1,2020-02-12,93490823,dogsoutdoors.fun,2019-02-12,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Not Renewd,0,auto-renewal,unrealized,2020-02-12,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,dogsoutdoors,l,12,12l,1,2.70805,2.44,dogsoutdoors,1,1,1
FirstTime,1,2020-02-15,93767978,rosalux.fun,2019-02-15,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-15,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,rosalux,l,7,7l,1,2.70805,2.34,rosalux,1,1,1
FirstTime,1,2020-02-16,93823708,verygood.fun,2019-02-16,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-16,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,verygood,l,8,8l,1,2.70805,3.64,verygood,1,1,1


In [8]:
system("gsutil cp gs://data_input/PredictiveModelAnalysis_AnomalyRegistrars.csv /home/jupyter/local/Domains_202003/data/PredictiveModelAnalysis_AnomalyRegistrars.csv")
anomalyDF <- read.csv("/home/jupyter/local/Domains_202003/data/PredictiveModelAnalysis_AnomalyRegistrars.csv")
anomalyDF <- apply(anomalyDF, 2, function(x) trimws(x,which = c("both")) )
geoLookupDF <- unique(anomalyDF[,c('reseller','reseller_country','reseller_geo')])
geoLookupDF <- as.data.frame(geoLookupDF)
head(geoLookupDF)

Unnamed: 0_level_0,reseller,reseller_country,reseller_geo
Unnamed: 0_level_1,<fct>,<fct>,<fct>
1,007names,United States,United States
2,0101 internet,Hong Kong,China
3,1&1 internet,Germany,EU
4,101 domain,United States,United States
5,101domain discovery,Germany,EU
6,10dencehispahard,Spain,EU


In [9]:
# number of unique geos per reseller
geoLookupDF %>%
  group_by(reseller) %>%
  summarise(u_geo = n_distinct(reseller_geo))  %>%
  filter(u_geo>1)

`summarise()` ungrouping output (override with `.groups` argument)



reseller,u_geo
<fct>,<int>
hostgator,2
moniker,2
psi-japan,2
registrarsec,2
uk2,2
,2


In [10]:
# number of unique geos per reseller-country
geoLookupDF %>%
  group_by(reseller, reseller_country) %>%
  summarise(u_geo = n_distinct(reseller_geo))  %>%
  filter(u_geo>1)

`summarise()` regrouping output by 'reseller' (override with `.groups` argument)



reseller,reseller_country,u_geo
<fct>,<fct>,<int>
,,2


In [11]:
geoLookupDF %>% filter(is.na(reseller))

reseller,reseller_country,reseller_geo
<fct>,<fct>,<fct>
,,Others
,,


In [12]:
# ewnsure correct number of dimm
dim(expiry_train_df_1)
expiry_train_df_2 <- merge(expiry_train_df_1,geoLookupDF,on=c('reseller','reseller_country'), all.x = TRUE)
dim(expiry_train_df_2)
head(expiry_train_df_2)


reseller,reseller_country,renewal_type,renewed_count,expiry_date,domain_id,domain,creation_date,status,tld,registrar,region,reg_period,registrant_country,renewal_status,renew_mbg,renew_type,autorenew_type,renew_date,renew_registrar,renew_reseller,reg_revenue,reg_arpt,renew_period,renew_domain_revenue,renew_arpt,reg_arpt_org,tld_registrar_index,sld,sld_type,sld_length,sld_type2,day_domains,log_reg_arpt,gibb_score,pattern,cluster,pattern_score,pattern_domain_count,reseller_geo
<chr>,<chr>,<chr>,<int>,<date>,<int>,<chr>,<date>,<chr>,<fct>,<chr>,<chr>,<int>,<chr>,<fct>,<int>,<chr>,<chr>,<date>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<fct>,<int>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<fct>
1&1 internet,Germany,FirstTime,1,2020-01-21,91584433,racefor.fun,2019-01-21,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-21,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,racefor,l,7,7l,1,2.70805,6.77,racefor,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-01-30,92254793,united4.fun,2019-01-30,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-30,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,united4,ln,7,7ln,1,2.70805,8.65,united4,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-04,92827233,bereal.fun,2019-02-04,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-04,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,bereal,l,6,6l,1,2.70805,13.09,bereal,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-12,93490823,dogsoutdoors.fun,2019-02-12,Active,fun,1&1 internet,Non China,1,,Not Renewd,0,auto-renewal,unrealized,2020-02-12,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,dogsoutdoors,l,12,12l,1,2.70805,2.44,dogsoutdoors,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-15,93767978,rosalux.fun,2019-02-15,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-15,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,rosalux,l,7,7l,1,2.70805,2.34,rosalux,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-16,93823708,verygood.fun,2019-02-16,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-16,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,verygood,l,8,8l,1,2.70805,3.64,verygood,1,1,1,EU


In [13]:
# replace original
expiry_train_df_1 <- expiry_train_df_2

In [14]:
expiry_train_df_1 %>% filter(reseller_country=='') %>% select(reseller)

reseller
<chr>
anygaming
electron networks
innovadeus
innovadeus
netclues
ownregistrar
ownregistrar
ownregistrar
ownregistrar
tucows


In [15]:
# number of unique reseller_country and reseller_geo for eachtld_registrar_index
expiry_train_df_1 %>%
  group_by(tld_registrar_index) %>%
  summarise(u_cntry = n_distinct(reseller_country),
         u_geo = n_distinct(reseller_geo))  %>%
  filter(u_cntry>1)

`summarise()` ungrouping output (override with `.groups` argument)



tld_registrar_index,u_cntry,u_geo
<chr>,<int>,<int>
funkey-systems,2,2
funtucows,2,2
onlineinnovadeus,2,1
onlinekey-systems,2,2
onlineownregistrar,2,2
onlineshopify,2,2
onlinetucows,2,2
onlineuk2,2,2
presskey-systems,2,2
presspdr ltd,2,2


## Feature Engineering for tld-reseller level 

In [16]:
# new geo level feature eng
geo_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  mutate(geo_maj = reseller_geo[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,geo_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(geo_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [17]:
# new geo level feature eng
geo_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  summarise(geo_cnt = n_distinct(reseller_geo)) %>%
  pull(geo_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [18]:
count_pct <- function(df) {
  return(
    df %>%
      tally %>% 
      mutate(n_prp = n/sum(n))
  )
}

In [19]:
tld_reseller_names = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    count_pct %>%
    arrange(desc(n)) %>% 
   pull(tld_registrar_index)

In [20]:
count_obs = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    count_pct %>%
    arrange(desc(n)) %>% 
    pull(n)

In [21]:
ren_prp = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), ren_prp = sum(renewal_status=='Renewed')/sum(n))  %>% 
    arrange(desc(n)) %>% 
    pull(ren_prp)

`summarise()` ungrouping output (override with `.groups` argument)



In [22]:
# not informative -- tld_cnt always = 1 so tld_rat is ismply mult.inverse of count_obs
tld_rat = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), tld_cnt = n_distinct(tld), tld_rat = tld_cnt/n) %>% 
    arrange(desc(n)) %>%
    pull(tld_rat)

`summarise()` ungrouping output (override with `.groups` argument)



In [23]:
country_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  mutate(reseller_country_maj = reseller_country[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,reseller_country_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(reseller_country_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [24]:
country_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  summarise(reseller_country_cnt = n_distinct(reseller_country)) %>%
  pull(reseller_country_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [25]:
region_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  mutate(region_maj = region[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,region_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(region_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [26]:
region_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  summarise(reseller_region_cnt = n_distinct(reseller_country)) %>%
  pull(reseller_region_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [27]:
daydom_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              daydom_min = min(day_domains), 
              daydom_max = max(day_domains), 
              daydom_mean = mean(day_domains, na.rm = TRUE), 
              daydom_rng = daydom_max - daydom_min, 
              daydom_std = sd(day_domains, na.rm = TRUE), 
              daydom_skew = skewness(day_domains, na.rm = TRUE), 
              daydom_kurt = kurtosis(day_domains, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

daydom_min <- daydom_stats %>% pull(daydom_min)
daydom_max <- daydom_stats %>% pull(daydom_max)
daydom_mean <- daydom_stats %>% pull(daydom_mean)
daydom_rng <- daydom_stats %>% pull(daydom_rng)
daydom_std <- daydom_stats %>% pull(daydom_std)
daydom_skew <- daydom_stats %>% pull(daydom_skew)
daydom_kurt <- daydom_stats %>% pull(daydom_kurt)



`summarise()` ungrouping output (override with `.groups` argument)



In [28]:
sldlen_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(sld_length), 
              max = max(sld_length), 
              mean = mean(sld_length, na.rm = TRUE), 
              rng = max - min, 
              std = sd(sld_length, na.rm = TRUE), 
              skew = skewness(sld_length, na.rm = TRUE), 
              kurt = kurtosis(sld_length, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

sldlen_min <- sldlen_stats %>% pull(min)
sldlen_max <- sldlen_stats %>% pull(max)
sldlen_mean <- sldlen_stats %>% pull(mean)
sldlen_rng <- sldlen_stats %>% pull(rng)
sldlen_std <- sldlen_stats %>% pull(std)
sldlen_skew <- sldlen_stats %>% pull(skew)
sldlen_kurt <- sldlen_stats %>% pull(kurt)



`summarise()` ungrouping output (override with `.groups` argument)



In [29]:
gibbs_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(gibb_score), 
              max = max(gibb_score), 
              mean = mean(gibb_score, na.rm = TRUE), 
              rng = max - min, 
              std = sd(gibb_score, na.rm = TRUE), 
              skew = skewness(gibb_score, na.rm = TRUE), 
              kurt = kurtosis(gibb_score, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

gibbs_min <- gibbs_stats %>% pull(min)
gibbs_max <- gibbs_stats %>% pull(max)
gibbs_mean <- gibbs_stats %>% pull(mean)
gibbs_rng <- gibbs_stats %>% pull(rng)
gibbs_std <- gibbs_stats %>% pull(std)
gibbs_skew <- gibbs_stats %>% pull(skew)
gibbs_kurt <- gibbs_stats %>% pull(kurt)


`summarise()` ungrouping output (override with `.groups` argument)



In [30]:
pdcnt_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(pattern_domain_count), 
              max = max(pattern_domain_count), 
              mean = mean(pattern_domain_count, na.rm = TRUE), 
              rng = max - min, 
              std = sd(pattern_domain_count, na.rm = TRUE), 
              skew = skewness(pattern_domain_count, na.rm = TRUE), 
              kurt = kurtosis(pattern_domain_count, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

pdcnt_min <- pdcnt_stats %>% pull(min)
pdcnt_max <- pdcnt_stats %>% pull(max)
pdcnt_mean <- pdcnt_stats %>% pull(mean)
pdcnt_rng <- pdcnt_stats %>% pull(rng)
pdcnt_std <- pdcnt_stats %>% pull(std)
pdcnt_skew <- pdcnt_stats %>% pull(skew)
pdcnt_kurt <- pdcnt_stats %>% pull(kurt)

`summarise()` ungrouping output (override with `.groups` argument)



In [31]:
rarpt_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(reg_arpt), 
              max = max(reg_arpt), 
              mean = mean(reg_arpt, na.rm = TRUE), 
              rng = max - min, 
              std = sd(reg_arpt, na.rm = TRUE), 
              skew = skewness(reg_arpt, na.rm = TRUE), 
              kurt = kurtosis(reg_arpt, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

rarpt_min <- rarpt_stats %>% pull(min)
rarpt_max <- rarpt_stats %>% pull(max)
rarpt_mean <- rarpt_stats %>% pull(mean)
rarpt_rng <- rarpt_stats %>% pull(rng)
rarpt_std <- rarpt_stats %>% pull(std)
rarpt_skew <- rarpt_stats %>% pull(skew)
rarpt_kurt <- rarpt_stats %>% pull(kurt)

`summarise()` ungrouping output (override with `.groups` argument)



## Add tld_reseller level feature engineering

In [32]:
metrics_df = metrics_df %>% 
    mutate(tld_reseller = tld_reseller_names,
            count = count_obs,
            ren_prp = ren_prp,
            tld_rat = tld_rat,
            country_maj = country_maj,
            region_maj = region_maj,
            country_cnt = country_cnt,
            region_cnt = region_cnt,
            geo_cnt = geo_cnt,
            geo_maj = geo_maj,

            daydom_min = daydom_min,
            daydom_max = daydom_max,
            daydom_mean = daydom_mean,
            daydom_rng = daydom_rng,
            daydom_std = daydom_std,
            daydom_skew = daydom_skew,
            daydom_kurt = daydom_kurt,

            sldlen_min = sldlen_min,
            sldlen_max = sldlen_max,
            sldlen_mean = sldlen_mean,
            sldlen_rng = sldlen_rng,
            sldlen_std = sldlen_std,
            sldlen_skew = sldlen_skew,
            sldlen_kurt = sldlen_kurt,

            gibbs_min = gibbs_min,
            gibbs_max = gibbs_max,
            gibbs_mean = gibbs_mean,
            gibbs_rng = gibbs_rng,
            gibbs_std = gibbs_std,
            gibbs_skew = gibbs_skew,
            gibbs_kurt = gibbs_kurt,

            pdcnt_min = pdcnt_min,
            pdcnt_max = pdcnt_max,
            pdcnt_mean = pdcnt_mean,
            pdcnt_rng = pdcnt_rng,
            pdcnt_std = pdcnt_std,
            pdcnt_skew = pdcnt_skew,
            pdcnt_kurt = pdcnt_kurt,

            rarpt_min = rarpt_min,
            rarpt_max = rarpt_max,
            rarpt_mean = rarpt_mean,
            rarpt_rng = rarpt_rng,
            rarpt_std = rarpt_std,
            rarpt_skew = rarpt_skew,
            rarpt_kurt = rarpt_kurt)

In [33]:
dim(metrics_df)
head(metrics_df)

Unnamed: 0_level_0,tld_registrar_index,reseller,n,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf,tld_reseller,count,ren_prp,tld_rat,country_maj,region_maj,country_cnt,region_cnt,geo_cnt,geo_maj,⋯,sldlen_rng,sldlen_std,sldlen_skew,sldlen_kurt,gibbs_min,gibbs_max,gibbs_mean,gibbs_rng,gibbs_std,gibbs_skew,gibbs_kurt,pdcnt_min,pdcnt_max,pdcnt_mean,pdcnt_rng,pdcnt_std,pdcnt_skew,pdcnt_kurt,rarpt_min,rarpt_max,rarpt_mean,rarpt_rng,rarpt_std,rarpt_skew,rarpt_kurt
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<fct>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,sitegmo,gmo,106569,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721,sitegmo,106569,0.02441611,9.383592e-06,Japan,Non China,1,1,1,Japan,⋯,50,2.508508,6.1455276,55.010118,0,100,1.672944,100,4.083236,15.334299,348.12171,1,51,3.43249,50,5.173298,3.6994369,16.3930266,0.1,13.37,0.103735,13.27,0.20216296,55.1543,3102.462949
2,funalibaba,alibaba,72690,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023,funalibaba,72690,0.01223002,1.375705e-05,China,China,1,1,1,China,⋯,60,3.289407,2.9497402,14.097156,0,100,6.42792,100,21.265643,4.112206,15.16265,1,326,15.295667,325,37.775267,4.8276608,27.7586974,0.75,4.75,0.7521482,4.0,0.08239843,43.767804,1951.657326
3,sitenamecheap,namecheap,53536,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178,sitenamecheap,53536,0.03973027,1.867902e-05,United States,Non China,1,1,1,United States,⋯,57,4.457749,1.8173451,7.306364,0,100,4.29292,100,4.790116,13.813471,271.44212,1,102,5.683839,101,11.925165,4.8845405,28.4273959,0.58,8.33,0.8639192,7.75,0.46643401,2.149515,15.078455
4,pwnamecheap,namecheap,50470,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905,pwnamecheap,50470,0.022766,1.981375e-05,United States,Non China,1,1,1,United States,⋯,53,3.816024,0.9727996,3.074438,0,100,5.021115,100,5.889335,13.133157,208.93433,1,133,3.80634,132,11.591963,7.9464521,72.1673024,0.59,5.4,0.8950147,4.81,0.37727284,1.158011,3.878673
5,onlinego daddy,go daddy,42162,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328,onlinego daddy,42162,0.17413785,2.371804e-05,United States,Non China,1,1,1,United States,⋯,57,4.627825,1.2478782,3.883563,0,100,5.01993,100,5.451631,13.738913,235.73852,1,38,1.947346,37,2.554309,8.0693509,82.8231996,0.5,19.38,1.4417611,18.88,2.84260467,3.856059,13.984498
6,sitegandi sas,gandi sas,35309,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108,sitegandi sas,35309,0.01741766,2.832139e-05,France,Non China,1,1,1,EU,⋯,51,4.052579,-0.9342607,1.096902,0,100,3.680121,100,7.475273,11.932197,150.65015,1,332,99.953015,331,114.009691,0.8053729,-0.9005846,0.5,7.0,0.5011045,6.5,0.08472577,76.689854,5879.500288


In [34]:
sum(metrics_df$tld_registrar_index != metrics_df$tld_reseller)

In [35]:
sum(metrics_df$n != metrics_df$count)

In [36]:
metrics_df <- subset(metrics_df, select=-c(tld_registrar_index,reseller,n))

In [37]:
head(metrics_df)

Unnamed: 0_level_0,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf,tld_reseller,count,ren_prp,tld_rat,country_maj,region_maj,country_cnt,region_cnt,geo_cnt,geo_maj,daydom_min,daydom_max,daydom_mean,⋯,sldlen_rng,sldlen_std,sldlen_skew,sldlen_kurt,gibbs_min,gibbs_max,gibbs_mean,gibbs_rng,gibbs_std,gibbs_skew,gibbs_kurt,pdcnt_min,pdcnt_max,pdcnt_mean,pdcnt_rng,pdcnt_std,pdcnt_skew,pdcnt_kurt,rarpt_min,rarpt_max,rarpt_mean,rarpt_rng,rarpt_std,rarpt_skew,rarpt_kurt
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<fct>,<int>,<int>,<dbl>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721,sitegmo,106569,0.02441611,9.383592e-06,Japan,Non China,1,1,1,Japan,1,16752,6351.106,⋯,50,2.508508,6.1455276,55.010118,0,100,1.672944,100,4.083236,15.334299,348.12171,1,51,3.43249,50,5.173298,3.6994369,16.3930266,0.1,13.37,0.103735,13.27,0.20216296,55.1543,3102.462949
2,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023,funalibaba,72690,0.01223002,1.375705e-05,China,China,1,1,1,China,1,2779,861.4138,⋯,60,3.289407,2.9497402,14.097156,0,100,6.42792,100,21.265643,4.112206,15.16265,1,326,15.295667,325,37.775267,4.8276608,27.7586974,0.75,4.75,0.7521482,4.0,0.08239843,43.767804,1951.657326
3,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178,sitenamecheap,53536,0.03973027,1.867902e-05,United States,Non China,1,1,1,United States,1,2200,641.6495,⋯,57,4.457749,1.8173451,7.306364,0,100,4.29292,100,4.790116,13.813471,271.44212,1,102,5.683839,101,11.925165,4.8845405,28.4273959,0.58,8.33,0.8639192,7.75,0.46643401,2.149515,15.078455
4,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905,pwnamecheap,50470,0.022766,1.981375e-05,United States,Non China,1,1,1,United States,1,2277,708.5591,⋯,53,3.816024,0.9727996,3.074438,0,100,5.021115,100,5.889335,13.133157,208.93433,1,133,3.80634,132,11.591963,7.9464521,72.1673024,0.59,5.4,0.8950147,4.81,0.37727284,1.158011,3.878673
5,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328,onlinego daddy,42162,0.17413785,2.371804e-05,United States,Non China,1,1,1,United States,1,2642,444.951,⋯,57,4.627825,1.2478782,3.883563,0,100,5.01993,100,5.451631,13.738913,235.73852,1,38,1.947346,37,2.554309,8.0693509,82.8231996,0.5,19.38,1.4417611,18.88,2.84260467,3.856059,13.984498
6,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108,sitegandi sas,35309,0.01741766,2.832139e-05,France,Non China,1,1,1,EU,1,8032,5529.4799,⋯,51,4.052579,-0.9342607,1.096902,0,100,3.680121,100,7.475273,11.932197,150.65015,1,332,99.953015,331,114.009691,0.8053729,-0.9005846,0.5,7.0,0.5011045,6.5,0.08472577,76.689854,5879.500288


In [38]:
names(metrics_df)

# CREATE new multi-class dependent variables

In [39]:
(auc_vars = grep('auc', names(metrics_df), value=TRUE))
(l10_vars = grep('l10', names(metrics_df), value=TRUE))

In [40]:
# auc_vars = c('seg2_glm_auc','seg_glm_auc','agg_glm_auc','agg_rf_auc')
# l10_vars = c('seg2_glm_lift10','seg_glm_lift10','agg_glm_lift10','agg_rf_lift10')

In [41]:
metrics_df <- metrics_df %>%
    mutate (auc_win=sapply(apply(.[,c(auc_vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) ,
            l10_win=sapply(apply(.[,c(l10_vars)], 
                          1, function(x) names(x)[which.max(x)]), function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 

# proportion of wins by each class

In [42]:
metrics_df %>% 
    group_by(auc_win) %>%
    tally() %>%
    arrange(desc(n))

auc_win,n
<chr>,<int>
,665
auc_seg2_glm,433
auc_seg_glm,176
auc_agg_glm,144
auc_seg_rf,117
auc_agg_rf,94
auc_seg2_rf,94


In [43]:
metrics_df %>% 
    group_by(l10_win) %>%
    tally() %>%
    arrange(desc(n))

l10_win,n
<chr>,<int>
,665
l10_seg2_glm,603
l10_seg_glm,158
l10_agg_glm,104
l10_seg2_rf,100
l10_seg_rf,51
l10_agg_rf,42


# Note 665 tld-resellers have no wins as a result of NAs across all models

In [44]:
# examine where neither of models returned auc (or lift) values.. ssume this is where test data had no observatins
na_df <- metrics_df %>% filter(is.na(auc_win)) 
dim(na_df)

In [45]:
# auc_win is na IFF all metrics are na
na_df %>%
  select(everything()) %>%  # replace to your needs
  summarise_all(funs(sum(is.na(.)))) %>% t()

“`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


0,1
l10_seg2_glm,665
l10_seg_glm,665
l10_agg_glm,665
l10_seg2_rf,665
l10_seg_rf,665
l10_agg_rf,665
auc_seg2_glm,665
auc_seg_glm,665
auc_agg_glm,665
auc_seg2_rf,665


# NAs for Ranger

In [46]:
# auc_win is na IFF all metrics are na
metrics_df %>%
  select(everything()) %>%  # replace to your needs
  summarise_all(funs(sum(is.na(.)))) %>% t()

0,1
l10_seg2_glm,665
l10_seg_glm,665
l10_agg_glm,665
l10_seg2_rf,665
l10_seg_rf,665
l10_agg_rf,665
auc_seg2_glm,665
auc_seg_glm,665
auc_agg_glm,665
auc_seg2_rf,665


# Examine indiivdual tld-resellers

In [47]:
na_tridxs <- metrics_df %>% filter(is.na(l10_agg_rf)) %>% pull(tld_reseller)
length(na_tridxs)
head(na_tridxs)

In [48]:
i=1
na_tridxs[[i]]

In [49]:
metrics_df %>% filter(tld_reseller==na_tridxs[[i]]) %>% t()

0,1
l10_seg2_glm,
l10_seg_glm,
l10_agg_glm,
l10_seg2_rf,
l10_seg_rf,
l10_agg_rf,
auc_seg2_glm,
auc_seg_glm,
auc_agg_glm,
auc_seg2_rf,


In [50]:
tld_reseller_str <- na_tridxs[[i]]
cat('tld_reseller_str:\n')
print(tld_reseller_str)
cat('\n')

reseller_lookup = expiry_train_df_1 %>% group_by(tld_registrar_index, reseller) %>% tally() %>% arrange(desc(n))

cat('reseller_str')
(reseller_str = reseller_lookup %>% filter(tld_registrar_index==tld_reseller_str) %>% pull(reseller))
cat('\n')
cat('tld_registrars')
(tld_registrars = names(expiry_train_prepped_2_1)[endsWith(names(expiry_train_prepped_2_1),reseller_str)])

tld_reseller_str:
[1] "funzhengzhou century connect"

reseller_str


tld_registrars

## Verify that test and train have sufficient number of observations

In [51]:
train_list = expiry_train_prepped_2_1
test_list = expiry_test_prepped_2_1

In [52]:
# subset data for seg2 models
train_list_tld_reseller = train_list[tld_reseller_str] # all the dfs associates with this reseller
test_list_tld_reseller = test_list[tld_reseller_str]
train_df_tld_reseller =  rbindlist(train_list_tld_reseller,use.names=TRUE)
test_df_tld_reseller =  rbindlist(test_list_tld_reseller,use.names=TRUE)

In [53]:
dim(test_df_tld_reseller)
dim(train_df_tld_reseller)

In [54]:
# subset data for seg models
train_list_reseller = train_list[tld_registrars]
test_list_reseller = test_list[tld_registrars]
train_df_reseller =  rbindlist(train_list_reseller,use.names=TRUE)
test_df_reseller =  rbindlist(test_list_reseller,use.names=TRUE)




In [55]:
dim(test_df_reseller)
dim(train_df_reseller)

## Verify that test and train haren't missing predictors

In [56]:
test_df_tld_reseller %>% 
  select(pattern_domain_count,log_reg_arpt,sld_length,gibb_score,sld_type,day_domains,reg_period) %>%
  summarise_all(funs(sum(is.na(.)))) %>% t()

train_df_tld_reseller %>%
  select(pattern_domain_count,log_reg_arpt,sld_length,gibb_score,sld_type,day_domains,reg_period) %>%
  summarise_all(funs(sum(is.na(.)))) %>% t()

0,1
pattern_domain_count,0
log_reg_arpt,0
sld_length,0
gibb_score,0
sld_type,0
day_domains,0
reg_period,0


0,1
pattern_domain_count,0
log_reg_arpt,0
sld_length,0
gibb_score,0
sld_type,0
day_domains,0
reg_period,0


In [57]:
test_df_reseller %>% 
  select(pattern_domain_count,log_reg_arpt,sld_length,gibb_score,sld_type,day_domains,reg_period,tld) %>%
  summarise_all(funs(sum(is.na(.)))) %>% t()

train_df_reseller %>%
  select(pattern_domain_count,log_reg_arpt,sld_length,gibb_score,sld_type,day_domains,reg_period,tld) %>%
  summarise_all(funs(sum(is.na(.)))) %>% t()

0,1
pattern_domain_count,0
log_reg_arpt,0
sld_length,0
gibb_score,0
sld_type,0
day_domains,0
reg_period,0
tld,0


0,1
pattern_domain_count,0
log_reg_arpt,0
sld_length,0
gibb_score,0
sld_type,0
day_domains,0
reg_period,0
tld,0


## Verify that model preds get generated

In [58]:
#####################################################################################    
# agg rf (aggregarted rf (including tld and reseller as predictors))
#####################################################################################

cat("\n agg rf")
load("../../data/ranger_03_expiry2_f")
# if test data contains no observations, skip!
 if (dim(test_df_tld_reseller)[1]==0){
    pred_df_agg_rf = NA
}  else {
    model = ranger_03_expiry2_f
    pred <- predict(model, 
                    data = test_df_tld_reseller,
                    type="response")$predictions

    # if all Renewed col doesn't exist in predictions, create it with value 0
    if(is.null(as.data.frame(pred)$Renewed)){
        pred <- as.data.frame(pred)
        pred$Renewed <- 0
    }

    pred_df_agg_rf = data.frame("actual" = test_df_tld_reseller$renewal_status,
                      "predicted" = as.data.frame(pred)$Renewed)
}

rm(ranger_03_expiry2_f)
gc()

#####################################################################################    
# agg glm (aggregarted glm (including tld and reseller as predictors))
#####################################################################################

cat("\n agg glm")
load("../../data/agg_glm_basic_model")

# if test data contains no observations, skip!
if (dim(test_df_tld_reseller)[1]==0){
    pred_df_agg_glm = NA
} else {
    model = agg_glm_basic_model
    pred = predict_first_renewal_agg(test_df_tld_reseller, model)

    pred_df_agg_glm = data.frame("actual" = pred$renewal_status,
                                  "predicted" = pred$first_renewal_prediction)
}


rm(agg_glm_basic_model)
gc()


 agg rf

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,66716968,3563.1,137648491,7351.3,66766598,3565.8
Vcells,501941978,3829.6,803294565,6128.7,502129201,3831.0



 agg glm

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,5558249,296.9,110118793,5881,66895035,3572.6
Vcells,1003266935,7654.4,1562896541,11924,1374134041,10483.9


In [59]:
#####################################################################################
# seg2 glm (tld-reseller-segmented glm)
#####################################################################################

cat("\n seg2 glm")

# if test data contains no observations, skip!
if (dim(test_df_tld_reseller)[1]==0){
    pred_df_seg2_glm = NA
} else {
    model = mass_build_model_first_renewal(train_list_tld_reseller)
    pred = mass_predict_first_renewal(test_list_tld_reseller, model)

    pred_df_seg2_glm = data.frame("actual" = pred$renewal_status,
                                  "predicted" = pred$first_renewal_prediction)
}

#####################################################################################    
# seg glm (reseller-segmented glm (including tld as predictor))
#####################################################################################

cat("\n seg glm")

# if test data contains no observations, skip!
if (dim(test_df_tld_reseller)[1]==0){
    pred_df_seg_glm = NA
} else {

    if((nlevels(train_df_reseller$tld) < 2)){
        model = build_model_first_renewal(train_df_reseller)
    }else{
        model = build_model_first_renewal_reg(train_df_reseller)
    }



    pred = predict_first_renewal_reg(test_df_tld_reseller, model)

    pred_df_seg_glm = data.frame("actual" = pred$renewal_status,
                                  "predicted" = pred$first_renewal_prediction)
}




#####################################################################################
# seg2 rf (tld-reseller-segmented rf)
#####################################################################################

cat("\n seg2 rf")

# if test data contains no observations, skip!
if (dim(test_df_tld_reseller)[1]==0){
    pred_df_seg2_rf = NA
} else {
#     if train data only has one observation, sample_fraction must be 1 (cant sample fraction of 1 observation)
    if(dim(train_df_tld_reseller)[1]==1){
        sample_fraction=1
    }else{
        sample_fraction=.8
    }

    model <- ranger(formula         = renewal_status ~ pattern_domain_count+log_reg_arpt+sld_length+gibb_score+sld_type+day_domains+reg_period, 
                    data            = train_df_tld_reseller, 
                    importance = 'impurity', 
                    num.trees       = 1000,
                    probability = TRUE,
                    replace = FALSE,
                    sample.fraction = sample_fraction,
                    seed            = 123,
                    respect.unordered.factors=TRUE)

    pred <- predict(model, 
                    data = test_df_tld_reseller,
                    type="response")$predictions

    # if all Renewed col doesn't exist in predictions, create it with value 0
    if(is.null(as.data.frame(pred)$Renewed)){
        pred <- as.data.frame(pred)
        pred$Renewed <- 0
    }

    pred_df_seg2_rf = data.frame("actual" = test_df_tld_reseller$renewal_status,
                      "predicted" = as.data.frame(pred)$Renewed)
}


#####################################################################################
# seg rf (reseller-segmented rf)
#####################################################################################

cat("\n seg rf")

# if test data contains no observations, skip!
if (dim(test_df_tld_reseller)[1]==0){
    pred_df_seg_rf = NA
} else {
#     if train data only has one observation, sample_fraction must be 1 (cant sample fraction of 1 observation)
    if(dim(train_df_reseller)[1]==1){
        sample_fraction=1
    }else{
        sample_fraction=.8
    }

    model <- ranger(formula         = renewal_status ~ pattern_domain_count+log_reg_arpt+sld_length+gibb_score+sld_type+day_domains+reg_period+tld, 
                    data            = train_df_reseller, 
                    importance = 'impurity', 
                    num.trees       = 1000,
                    probability = TRUE,
                    replace = FALSE,
                    sample.fraction = sample_fraction,
                    seed            = 123,
                    respect.unordered.factors=TRUE)

    pred <- predict(model, 
                    data = test_df_tld_reseller,
                    type="response")$predictions

    # if all Renewed col doesn't exist in predictions, create it with value 0
    if(is.null(as.data.frame(pred)$Renewed)){
        pred <- as.data.frame(pred)
        pred$Renewed <- 0
    }

    pred_df_seg_rf = data.frame("actual" = test_df_tld_reseller$renewal_status,
                      "predicted" = as.data.frame(pred)$Renewed)
}




 seg2 glm

“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“the condition has length > 1 and only the first element will be used”
“prediction from a rank-deficient fit may be misleading”



 seg glm

“the condition has length > 1 and only the first element will be used”
“prediction from a rank-deficient fit may be misleading”



 seg2 rf
 seg rf

## compare with results from compare_df

In [60]:
compare_DF <- read.csv("../../data/tld_reseller_compare_predictions.csv")

In [61]:
compare_DF_sub <- compare_DF %>% filter(tld_registrar_index==na_tridxs[[i]])
dim(compare_DF_sub)
head(compare_DF_sub)

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>,<int>,<fct>
1,Not Renewd,0.99685973,0.001,0.008,0.3239427,0.14889914,0.11534867,funzhengzhou century connect,zhengzhou century connect,4249,89694757,japana.fun
2,Not Renewd,0.02230039,0.013,0.003,0.1631052,0.0170751,0.03484726,funzhengzhou century connect,zhengzhou century connect,4249,91954643,msc8844.fun
3,Not Renewd,0.02230039,0.013,0.003,0.1631052,0.0170751,0.03484726,funzhengzhou century connect,zhengzhou century connect,4249,91954663,msc8855.fun
4,Not Renewd,0.02230039,0.013,0.003,0.1631052,0.0170751,0.03484726,funzhengzhou century connect,zhengzhou century connect,4249,91954648,msc8866.fun
5,Not Renewd,0.02230039,0.013,0.003,0.1631052,0.0170751,0.03484726,funzhengzhou century connect,zhengzhou century connect,4249,91954933,msc7333.fun
6,Not Renewd,0.01943478,0.013,0.004,0.3321312,0.07657796,0.07859592,funzhengzhou century connect,zhengzhou century connect,4249,91954708,msc9922.fun


In [80]:
# IF all 0's THEN all values equal with output here vs output in compare DF 
sum(round(pred_df_seg2_glm$predicted/compare_DF_sub$pred_df_seg2_glm,5)!=1)
sum(round(ifelse(compare_DF_sub$pred_df_seg_glm==0, pred_df_seg_glm$predicted==compare_DF_sub$pred_df_seg_glm, 
                 pred_df_seg_glm$predicted/compare_DF_sub$pred_df_seg_glm),5)!=1)
sum(round(ifelse(compare_DF_sub$pred_df_agg_glm==0, pred_df_agg_glm$predicted==compare_DF_sub$pred_df_agg_glm, 
                 pred_df_agg_glm$predicted/compare_DF_sub$pred_df_agg_glm),5)!=1)
sum(round(pred_df_seg2_rf$predicted/compare_DF_sub$pred_df_seg2_rf,5)!=1)
sum(round(pred_df_seg_rf$predicted/compare_DF_sub$pred_df_seg_rf,5)!=1)
sum(round(pred_df_agg_rf$predicted/compare_DF_sub$pred_df_agg_rf,5)!=1)

In [83]:
# check l10 & auc computation from tld_reseller_compare_metrics.csv -- NO
df1 <- read.csv("../../data/tld_reseller_compare_metrics.csv")
df1_sub <- df1 %>% filter(tld_registrar_index==na_tridxs[[i]])
dim(df1_sub)
df1_sub %>% t()

0,1
tld_registrar_index,funzhengzhou century connect
reseller,zhengzhou century connect
n,4249
l10_seg2_glm,
l10_seg_glm,
l10_agg_glm,
l10_seg2_rf,
l10_seg_rf,
l10_agg_rf,
auc_seg2_glm,


## Recalculate l10 & AUC from DF (originally done from list of df's)

In [85]:
chart_lift <- function (pred_df=first_renewal_model_test_predict,
                        dep_var = "renewal_status",
                        pred_var = "first_renewal_prediction") {
  N <- 10  # total number of rows to preallocate--possibly an overestimate
  lift_df <- data.frame(P =rep(NA, N), 
                        actu_renwd2=rep(NA, N), 
                        gain=rep(NA, N), 
                        lift=rep(NA, N), 
                        stringsAsFactors=FALSE)          # you don't know levels yet
  actu_renwd <- sum(pred_df[[dep_var]]=='Renewed')
  
  i = 1
  for(P in seq(.1,1,length=10)){
    temp_df <- data.frame(pred_df)[c(dep_var,pred_var)]
    ttmp_df <- temp_df[order(temp_df[pred_var],decreasing = TRUE),][1:round(dim(temp_df)[1]*P),]
    actu_renwd2 <-  sum(ttmp_df[[dep_var]] == 'Renewed')
    gain = actu_renwd2/actu_renwd
    lift = gain/(P)
    
    lift_df[i, ] <- list(P, actu_renwd2, gain, lift)
    i = i+1
  }
  return(lift_df)
}

In [86]:
chart_lift(pred_df=compare_DF_sub,
           dep_var = "actual",
           pred_var = "pred_df_seg2_glm"
          )

P,actu_renwd2,gain,lift
<dbl>,<int>,<dbl>,<dbl>
0.1,0,,
0.2,0,,
0.3,0,,
0.4,0,,
0.5,0,,
0.6,0,,
0.7,0,,
0.8,0,,
0.9,0,,
1.0,0,,


In [91]:
# ISSUE: all actual values are not renewed
table(compare_DF_sub$actual)
table(compare_DF_sub$actual)[[2]]


Not Renewd    Renewed 
      1151          0 

In [101]:
i
na_tridxs[[i]]

In [106]:
df <- compare_DF %>% filter(tld_registrar_index==na_tridxs[[i]] )
df
!is.na(df$actual)

actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>,<int>,<fct>
,,,,,,,hostbaidu,baidu,23,,


In [99]:
compare_DF %>% filter(tld_registrar_index==na_tridxs[[i]]) %>% pull(actual) == 'Renewed'

# Verify that this is the underlying issue for all 665 tld-registrars w/ NA's


In [110]:
for (i in 1:length(na_tridxs)){
    df <- compare_DF %>% filter(tld_registrar_index==na_tridxs[[i]] )
    if (suppressWarnings(!is.na(df$actual))){
        if (sum(df %>% pull(actual) == 'Renewed')>0){
            print(na_tridxs[[i]])
            print('\n')
        }
    }
    
}

“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the condition has length > 1 and only the first element will be used”
“the c