# RF: ANALYZE tld_reseller model performance comparison (w/ reseller_geo)

In [1]:
options(repr.matrix.max.cols=50, repr.matrix.max.rows=100)

In [2]:
# install.packages("pkgcond")

In [3]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(rPython)
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)
library(dominanceanalysis)


getwd()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: grid

Loading required package: libcoin

Loading required package: mvtnorm

Loading required package: lattice

Loading required package: ggplot2

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:sta

## List tld_reseller labels

In [5]:
tld_reseller_lookup_df = expiry_train_df_1 %>% group_by(tld_registrar_index, reseller) %>% tally() %>% arrange(desc(n))
tld_reseller_lookup_df %>% head()

tld_registrar_index,reseller,n
<chr>,<chr>,<int>
sitegmo,gmo,106569
funalibaba,alibaba,72690
sitenamecheap,namecheap,53536
pwnamecheap,namecheap,50470
onlinego daddy,go daddy,42162
sitegandi sas,gandi sas,35309


## Load PREPed tld_reseller compare

In [6]:
predictions_df <- read.csv("../../data/tld_reseller_compare_predictions.csv")
metrics_df <- read.csv("../../data/tld_reseller_compare_metrics.csv")

dim(predictions_df)
head(predictions_df)
dim(metrics_df)
head(metrics_df)

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>,<int>,<fct>
1,Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569,44295183,kagen.site
2,Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569,44573611,designlab.site
3,Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569,45304858,hokatu-blog.site
4,Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569,46235129,suzuya.site
5,Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569,46276970,wins-company.site
6,Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569,47809960,yamatoku-company.site


Unnamed: 0_level_0,tld_registrar_index,reseller,n,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,sitegmo,gmo,106569,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721
2,funalibaba,alibaba,72690,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023
3,sitenamecheap,namecheap,53536,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178
4,pwnamecheap,namecheap,50470,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905
5,onlinego daddy,go daddy,42162,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328
6,sitegandi sas,gandi sas,35309,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108


## Add reseller_geo to train_df before feature eng

In [7]:
head(expiry_train_df_1)

renewal_type,renewed_count,expiry_date,domain_id,domain,creation_date,status,tld,registrar,reseller,reseller_country,region,reg_period,registrant_country,renewal_status,renew_mbg,renew_type,autorenew_type,renew_date,renew_registrar,renew_reseller,reg_revenue,reg_arpt,renew_period,renew_domain_revenue,renew_arpt,reg_arpt_org,tld_registrar_index,sld,sld_type,sld_length,sld_type2,day_domains,log_reg_arpt,gibb_score,pattern,cluster,pattern_score,pattern_domain_count
<chr>,<int>,<date>,<int>,<chr>,<date>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<fct>,<int>,<chr>,<chr>,<date>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<fct>,<int>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>
FirstTime,1,2020-01-21,91584433,racefor.fun,2019-01-21,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-21,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,racefor,l,7,7l,1,2.70805,6.77,racefor,1,1,1
FirstTime,1,2020-01-30,92254793,united4.fun,2019-01-30,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-30,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,united4,ln,7,7ln,1,2.70805,8.65,united4,1,1,1
FirstTime,1,2020-02-04,92827233,bereal.fun,2019-02-04,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-04,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,bereal,l,6,6l,1,2.70805,13.09,bereal,1,1,1
FirstTime,1,2020-02-12,93490823,dogsoutdoors.fun,2019-02-12,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Not Renewd,0,auto-renewal,unrealized,2020-02-12,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,dogsoutdoors,l,12,12l,1,2.70805,2.44,dogsoutdoors,1,1,1
FirstTime,1,2020-02-15,93767978,rosalux.fun,2019-02-15,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-15,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,rosalux,l,7,7l,1,2.70805,2.34,rosalux,1,1,1
FirstTime,1,2020-02-16,93823708,verygood.fun,2019-02-16,Active,fun,1&1 internet,1&1 internet,Germany,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-16,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,verygood,l,8,8l,1,2.70805,3.64,verygood,1,1,1


In [8]:
system("gsutil cp gs://data_input/PredictiveModelAnalysis_AnomalyRegistrars.csv /home/jupyter/local/Domains_202003/data/PredictiveModelAnalysis_AnomalyRegistrars.csv")
anomalyDF <- read.csv("/home/jupyter/local/Domains_202003/data/PredictiveModelAnalysis_AnomalyRegistrars.csv")
anomalyDF <- apply(anomalyDF, 2, function(x) trimws(x,which = c("both")) )
geoLookupDF <- unique(anomalyDF[,c('reseller','reseller_country','reseller_geo')])
geoLookupDF <- as.data.frame(geoLookupDF)
head(geoLookupDF)

Unnamed: 0_level_0,reseller,reseller_country,reseller_geo
Unnamed: 0_level_1,<fct>,<fct>,<fct>
1,007names,United States,United States
2,0101 internet,Hong Kong,China
3,1&1 internet,Germany,EU
4,101 domain,United States,United States
5,101domain discovery,Germany,EU
6,10dencehispahard,Spain,EU


In [9]:
# number of unique geos per reseller
geoLookupDF %>%
  group_by(reseller) %>%
  summarise(u_geo = n_distinct(reseller_geo))  %>%
  filter(u_geo>1)

`summarise()` ungrouping output (override with `.groups` argument)



reseller,u_geo
<fct>,<int>
hostgator,2
moniker,2
psi-japan,2
registrarsec,2
uk2,2
,2


In [10]:
# number of unique geos per reseller-country
geoLookupDF %>%
  group_by(reseller, reseller_country) %>%
  summarise(u_geo = n_distinct(reseller_geo))  %>%
  filter(u_geo>1)

`summarise()` regrouping output by 'reseller' (override with `.groups` argument)



reseller,reseller_country,u_geo
<fct>,<fct>,<int>
,,2


In [11]:
geoLookupDF %>% filter(is.na(reseller))

reseller,reseller_country,reseller_geo
<fct>,<fct>,<fct>
,,Others
,,


In [12]:
# ensure correct number of dimm
dim(expiry_train_df_1)
expiry_train_df_2 <- merge(expiry_train_df_1,geoLookupDF,on=c('reseller','reseller_country'), all.x = TRUE)
dim(expiry_train_df_2)
head(expiry_train_df_2)


reseller,reseller_country,renewal_type,renewed_count,expiry_date,domain_id,domain,creation_date,status,tld,registrar,region,reg_period,registrant_country,renewal_status,renew_mbg,renew_type,autorenew_type,renew_date,renew_registrar,renew_reseller,reg_revenue,reg_arpt,renew_period,renew_domain_revenue,renew_arpt,reg_arpt_org,tld_registrar_index,sld,sld_type,sld_length,sld_type2,day_domains,log_reg_arpt,gibb_score,pattern,cluster,pattern_score,pattern_domain_count,reseller_geo
<chr>,<chr>,<chr>,<int>,<date>,<int>,<chr>,<date>,<chr>,<fct>,<chr>,<chr>,<int>,<chr>,<fct>,<int>,<chr>,<chr>,<date>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<fct>,<int>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<fct>
1&1 internet,Germany,FirstTime,1,2020-01-21,91584433,racefor.fun,2019-01-21,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-21,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,racefor,l,7,7l,1,2.70805,6.77,racefor,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-01-30,92254793,united4.fun,2019-01-30,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-01-30,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,united4,ln,7,7ln,1,2.70805,8.65,united4,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-04,92827233,bereal.fun,2019-02-04,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-04,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,bereal,l,6,6l,1,2.70805,13.09,bereal,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-12,93490823,dogsoutdoors.fun,2019-02-12,Active,fun,1&1 internet,Non China,1,,Not Renewd,0,auto-renewal,unrealized,2020-02-12,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,dogsoutdoors,l,12,12l,1,2.70805,2.44,dogsoutdoors,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-15,93767978,rosalux.fun,2019-02-15,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-15,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,rosalux,l,7,7l,1,2.70805,2.34,rosalux,1,1,1,EU
1&1 internet,Germany,FirstTime,1,2020-02-16,93823708,verygood.fun,2019-02-16,Active,fun,1&1 internet,Non China,1,,Renewed,0,auto-renewal,realized,2020-02-16,1&1 Internet,1&1 Internet,15,15,1,15,15,15,fun1&1 internet,verygood,l,8,8l,1,2.70805,3.64,verygood,1,1,1,EU


In [13]:
# replace original
expiry_train_df_1 <- expiry_train_df_2

In [14]:
expiry_train_df_1 %>% filter(reseller_country=='') %>% select(reseller)

reseller
<chr>
anygaming
electron networks
innovadeus
innovadeus
netclues
ownregistrar
ownregistrar
ownregistrar
ownregistrar
tucows


In [15]:
# number of unique reseller_country and reseller_geo for eachtld_registrar_index
expiry_train_df_1 %>%
  group_by(tld_registrar_index) %>%
  summarise(u_cntry = n_distinct(reseller_country),
         u_geo = n_distinct(reseller_geo))  %>%
  filter(u_cntry>1)

`summarise()` ungrouping output (override with `.groups` argument)



tld_registrar_index,u_cntry,u_geo
<chr>,<int>,<int>
funkey-systems,2,2
funtucows,2,2
onlineinnovadeus,2,1
onlinekey-systems,2,2
onlineownregistrar,2,2
onlineshopify,2,2
onlinetucows,2,2
onlineuk2,2,2
presskey-systems,2,2
presspdr ltd,2,2


## Feature Engineering for tld-reseller level 

In [16]:
# new geo level feature eng
geo_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  mutate(geo_maj = reseller_geo[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,geo_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(geo_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [17]:
# new geo level feature eng
geo_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  summarise(geo_cnt = n_distinct(reseller_geo)) %>%
  pull(geo_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [18]:
count_pct <- function(df) {
  return(
    df %>%
      tally %>% 
      mutate(n_prp = n/sum(n))
  )
}

In [19]:
tld_reseller_names = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    count_pct %>%
    arrange(desc(n)) %>% 
   pull(tld_registrar_index)

In [20]:
count_obs = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    count_pct %>%
    arrange(desc(n)) %>% 
    pull(n)

In [21]:
ren_prp = expiry_train_df_1 %>% 
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), ren_prp = sum(renewal_status=='Renewed')/sum(n))  %>% 
    arrange(desc(n)) %>% 
    pull(ren_prp)

`summarise()` ungrouping output (override with `.groups` argument)



In [22]:
# not informative -- tld_cnt always = 1 so tld_rat is ismply mult.inverse of count_obs
tld_rat = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), tld_cnt = n_distinct(tld), tld_rat = tld_cnt/n) %>% 
    arrange(desc(n)) %>%
    pull(tld_rat)

`summarise()` ungrouping output (override with `.groups` argument)



In [23]:
country_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  mutate(reseller_country_maj = reseller_country[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,reseller_country_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(reseller_country_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [24]:
country_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  summarise(reseller_country_cnt = n_distinct(reseller_country)) %>%
  pull(reseller_country_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [25]:
region_maj = expiry_train_df_1 %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  mutate(region_maj = region[n == max(n)][1]) %>%
  select(-n) %>% 
  group_by(tld_registrar_index,region_maj) %>%
    summarise(n = n()) %>% 
    arrange(desc(n)) %>%
    pull(region_maj)

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



In [26]:
region_cnt = expiry_train_df_1 %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  summarise(reseller_region_cnt = n_distinct(reseller_country)) %>%
  pull(reseller_region_cnt)  

`summarise()` ungrouping output (override with `.groups` argument)



In [27]:
daydom_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              daydom_min = min(day_domains), 
              daydom_max = max(day_domains), 
              daydom_mean = mean(day_domains, na.rm = TRUE), 
              daydom_rng = daydom_max - daydom_min, 
              daydom_std = sd(day_domains, na.rm = TRUE), 
              daydom_skew = skewness(day_domains, na.rm = TRUE), 
              daydom_kurt = kurtosis(day_domains, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

daydom_min <- daydom_stats %>% pull(daydom_min)
daydom_max <- daydom_stats %>% pull(daydom_max)
daydom_mean <- daydom_stats %>% pull(daydom_mean)
daydom_rng <- daydom_stats %>% pull(daydom_rng)
daydom_std <- daydom_stats %>% pull(daydom_std)
daydom_skew <- daydom_stats %>% pull(daydom_skew)
daydom_kurt <- daydom_stats %>% pull(daydom_kurt)



`summarise()` ungrouping output (override with `.groups` argument)



In [28]:
sldlen_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(sld_length), 
              max = max(sld_length), 
              mean = mean(sld_length, na.rm = TRUE), 
              rng = max - min, 
              std = sd(sld_length, na.rm = TRUE), 
              skew = skewness(sld_length, na.rm = TRUE), 
              kurt = kurtosis(sld_length, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

sldlen_min <- sldlen_stats %>% pull(min)
sldlen_max <- sldlen_stats %>% pull(max)
sldlen_mean <- sldlen_stats %>% pull(mean)
sldlen_rng <- sldlen_stats %>% pull(rng)
sldlen_std <- sldlen_stats %>% pull(std)
sldlen_skew <- sldlen_stats %>% pull(skew)
sldlen_kurt <- sldlen_stats %>% pull(kurt)



`summarise()` ungrouping output (override with `.groups` argument)



In [29]:
gibbs_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(gibb_score), 
              max = max(gibb_score), 
              mean = mean(gibb_score, na.rm = TRUE), 
              rng = max - min, 
              std = sd(gibb_score, na.rm = TRUE), 
              skew = skewness(gibb_score, na.rm = TRUE), 
              kurt = kurtosis(gibb_score, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

gibbs_min <- gibbs_stats %>% pull(min)
gibbs_max <- gibbs_stats %>% pull(max)
gibbs_mean <- gibbs_stats %>% pull(mean)
gibbs_rng <- gibbs_stats %>% pull(rng)
gibbs_std <- gibbs_stats %>% pull(std)
gibbs_skew <- gibbs_stats %>% pull(skew)
gibbs_kurt <- gibbs_stats %>% pull(kurt)


`summarise()` ungrouping output (override with `.groups` argument)



In [30]:
pdcnt_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(pattern_domain_count), 
              max = max(pattern_domain_count), 
              mean = mean(pattern_domain_count, na.rm = TRUE), 
              rng = max - min, 
              std = sd(pattern_domain_count, na.rm = TRUE), 
              skew = skewness(pattern_domain_count, na.rm = TRUE), 
              kurt = kurtosis(pattern_domain_count, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

pdcnt_min <- pdcnt_stats %>% pull(min)
pdcnt_max <- pdcnt_stats %>% pull(max)
pdcnt_mean <- pdcnt_stats %>% pull(mean)
pdcnt_rng <- pdcnt_stats %>% pull(rng)
pdcnt_std <- pdcnt_stats %>% pull(std)
pdcnt_skew <- pdcnt_stats %>% pull(skew)
pdcnt_kurt <- pdcnt_stats %>% pull(kurt)

`summarise()` ungrouping output (override with `.groups` argument)



In [31]:
rarpt_stats = expiry_train_df_1 %>%
    group_by(tld_registrar_index) %>% 
    summarise(n = n(), 
              min = min(reg_arpt), 
              max = max(reg_arpt), 
              mean = mean(reg_arpt, na.rm = TRUE), 
              rng = max - min, 
              std = sd(reg_arpt, na.rm = TRUE), 
              skew = skewness(reg_arpt, na.rm = TRUE), 
              kurt = kurtosis(reg_arpt, na.rm = TRUE)) %>% 
    arrange(desc(n)) 

rarpt_min <- rarpt_stats %>% pull(min)
rarpt_max <- rarpt_stats %>% pull(max)
rarpt_mean <- rarpt_stats %>% pull(mean)
rarpt_rng <- rarpt_stats %>% pull(rng)
rarpt_std <- rarpt_stats %>% pull(std)
rarpt_skew <- rarpt_stats %>% pull(skew)
rarpt_kurt <- rarpt_stats %>% pull(kurt)

`summarise()` ungrouping output (override with `.groups` argument)



## Add tld_reseller level feature engineering

In [32]:
metrics_df = metrics_df %>% 
    mutate(tld_reseller = tld_reseller_names,
            count = count_obs,
            ren_prp = ren_prp,
            tld_rat = tld_rat,
            country_maj = country_maj,
            region_maj = region_maj,
            country_cnt = country_cnt,
            region_cnt = region_cnt,
            geo_cnt = geo_cnt,
            geo_maj = geo_maj,

            daydom_min = daydom_min,
            daydom_max = daydom_max,
            daydom_mean = daydom_mean,
            daydom_rng = daydom_rng,
            daydom_std = daydom_std,
            daydom_skew = daydom_skew,
            daydom_kurt = daydom_kurt,

            sldlen_min = sldlen_min,
            sldlen_max = sldlen_max,
            sldlen_mean = sldlen_mean,
            sldlen_rng = sldlen_rng,
            sldlen_std = sldlen_std,
            sldlen_skew = sldlen_skew,
            sldlen_kurt = sldlen_kurt,

            gibbs_min = gibbs_min,
            gibbs_max = gibbs_max,
            gibbs_mean = gibbs_mean,
            gibbs_rng = gibbs_rng,
            gibbs_std = gibbs_std,
            gibbs_skew = gibbs_skew,
            gibbs_kurt = gibbs_kurt,

            pdcnt_min = pdcnt_min,
            pdcnt_max = pdcnt_max,
            pdcnt_mean = pdcnt_mean,
            pdcnt_rng = pdcnt_rng,
            pdcnt_std = pdcnt_std,
            pdcnt_skew = pdcnt_skew,
            pdcnt_kurt = pdcnt_kurt,

            rarpt_min = rarpt_min,
            rarpt_max = rarpt_max,
            rarpt_mean = rarpt_mean,
            rarpt_rng = rarpt_rng,
            rarpt_std = rarpt_std,
            rarpt_skew = rarpt_skew,
            rarpt_kurt = rarpt_kurt)

In [33]:
dim(metrics_df)
head(metrics_df)

Unnamed: 0_level_0,tld_registrar_index,reseller,n,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf,tld_reseller,count,ren_prp,tld_rat,country_maj,region_maj,country_cnt,region_cnt,geo_cnt,geo_maj,⋯,sldlen_rng,sldlen_std,sldlen_skew,sldlen_kurt,gibbs_min,gibbs_max,gibbs_mean,gibbs_rng,gibbs_std,gibbs_skew,gibbs_kurt,pdcnt_min,pdcnt_max,pdcnt_mean,pdcnt_rng,pdcnt_std,pdcnt_skew,pdcnt_kurt,rarpt_min,rarpt_max,rarpt_mean,rarpt_rng,rarpt_std,rarpt_skew,rarpt_kurt
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<fct>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,sitegmo,gmo,106569,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721,sitegmo,106569,0.02441611,9.383592e-06,Japan,Non China,1,1,1,Japan,⋯,50,2.508508,6.1455276,55.010118,0,100,1.672944,100,4.083236,15.334299,348.12171,1,51,3.43249,50,5.173298,3.6994369,16.3930266,0.1,13.37,0.103735,13.27,0.20216296,55.1543,3102.462949
2,funalibaba,alibaba,72690,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023,funalibaba,72690,0.01223002,1.375705e-05,China,China,1,1,1,China,⋯,60,3.289407,2.9497402,14.097156,0,100,6.42792,100,21.265643,4.112206,15.16265,1,326,15.295667,325,37.775267,4.8276608,27.7586974,0.75,4.75,0.7521482,4.0,0.08239843,43.767804,1951.657326
3,sitenamecheap,namecheap,53536,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178,sitenamecheap,53536,0.03973027,1.867902e-05,United States,Non China,1,1,1,United States,⋯,57,4.457749,1.8173451,7.306364,0,100,4.29292,100,4.790116,13.813471,271.44212,1,102,5.683839,101,11.925165,4.8845405,28.4273959,0.58,8.33,0.8639192,7.75,0.46643401,2.149515,15.078455
4,pwnamecheap,namecheap,50470,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905,pwnamecheap,50470,0.022766,1.981375e-05,United States,Non China,1,1,1,United States,⋯,53,3.816024,0.9727996,3.074438,0,100,5.021115,100,5.889335,13.133157,208.93433,1,133,3.80634,132,11.591963,7.9464521,72.1673024,0.59,5.4,0.8950147,4.81,0.37727284,1.158011,3.878673
5,onlinego daddy,go daddy,42162,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328,onlinego daddy,42162,0.17413785,2.371804e-05,United States,Non China,1,1,1,United States,⋯,57,4.627825,1.2478782,3.883563,0,100,5.01993,100,5.451631,13.738913,235.73852,1,38,1.947346,37,2.554309,8.0693509,82.8231996,0.5,19.38,1.4417611,18.88,2.84260467,3.856059,13.984498
6,sitegandi sas,gandi sas,35309,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108,sitegandi sas,35309,0.01741766,2.832139e-05,France,Non China,1,1,1,EU,⋯,51,4.052579,-0.9342607,1.096902,0,100,3.680121,100,7.475273,11.932197,150.65015,1,332,99.953015,331,114.009691,0.8053729,-0.9005846,0.5,7.0,0.5011045,6.5,0.08472577,76.689854,5879.500288


In [34]:
sum(metrics_df$tld_registrar_index != metrics_df$tld_reseller)

In [35]:
sum(metrics_df$n != metrics_df$count)

In [36]:
metrics_df <- subset(metrics_df, select=-c(tld_registrar_index,reseller,n))

In [37]:
head(metrics_df)

Unnamed: 0_level_0,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,auc_seg2_glm,auc_seg_glm,auc_agg_glm,auc_seg2_rf,auc_seg_rf,auc_agg_rf,tld_reseller,count,ren_prp,tld_rat,country_maj,region_maj,country_cnt,region_cnt,geo_cnt,geo_maj,daydom_min,daydom_max,daydom_mean,⋯,sldlen_rng,sldlen_std,sldlen_skew,sldlen_kurt,gibbs_min,gibbs_max,gibbs_mean,gibbs_rng,gibbs_std,gibbs_skew,gibbs_kurt,pdcnt_min,pdcnt_max,pdcnt_mean,pdcnt_rng,pdcnt_std,pdcnt_skew,pdcnt_kurt,rarpt_min,rarpt_max,rarpt_mean,rarpt_rng,rarpt_std,rarpt_skew,rarpt_kurt
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<fct>,<int>,<int>,<dbl>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331,0.8958204,0.8825077,0.8729102,0.9345201,0.9348297,0.9329721,sitegmo,106569,0.02441611,9.383592e-06,Japan,Non China,1,1,1,Japan,1,16752,6351.106,⋯,50,2.508508,6.1455276,55.010118,0,100,1.672944,100,4.083236,15.334299,348.12171,1,51,3.43249,50,5.173298,3.6994369,16.3930266,0.1,13.37,0.103735,13.27,0.20216296,55.1543,3102.462949
2,4.883721,5.255814,5.023256,5.813953,6.0,6.186047,0.8262791,0.8430233,0.8518605,0.8769767,0.877907,0.8793023,funalibaba,72690,0.01223002,1.375705e-05,China,China,1,1,1,China,1,2779,861.4138,⋯,60,3.289407,2.9497402,14.097156,0,100,6.42792,100,21.265643,4.112206,15.16265,1,326,15.295667,325,37.775267,4.8276608,27.7586974,0.75,4.75,0.7521482,4.0,0.08239843,43.767804,1951.657326
3,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679,0.6862657,0.6196589,0.633842,0.6654399,0.6792639,0.6893178,sitenamecheap,53536,0.03973027,1.867902e-05,United States,Non China,1,1,1,United States,1,2200,641.6495,⋯,57,4.457749,1.8173451,7.306364,0,100,4.29292,100,4.790116,13.813471,271.44212,1,102,5.683839,101,11.925165,4.8845405,28.4273959,0.58,8.33,0.8639192,7.75,0.46643401,2.149515,15.078455
4,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083,0.7274704,0.6551383,0.636166,0.7167984,0.7405138,0.7507905,pwnamecheap,50470,0.022766,1.981375e-05,United States,Non China,1,1,1,United States,1,2277,708.5591,⋯,53,3.816024,0.9727996,3.074438,0,100,5.021115,100,5.889335,13.133157,208.93433,1,133,3.80634,132,11.591963,7.9464521,72.1673024,0.59,5.4,0.8950147,4.81,0.37727284,1.158011,3.878673
5,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584,0.6506455,0.6493007,0.6381119,0.6095481,0.6266541,0.6249328,onlinego daddy,42162,0.17413785,2.371804e-05,United States,Non China,1,1,1,United States,1,2642,444.951,⋯,57,4.627825,1.2478782,3.883563,0,100,5.01993,100,5.451631,13.738913,235.73852,1,38,1.947346,37,2.554309,8.0693509,82.8231996,0.5,19.38,1.4417611,18.88,2.84260467,3.856059,13.984498
6,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081,0.9202703,0.9195946,0.9189189,0.7162162,0.9344595,0.9358108,sitegandi sas,35309,0.01741766,2.832139e-05,France,Non China,1,1,1,EU,1,8032,5529.4799,⋯,51,4.052579,-0.9342607,1.096902,0,100,3.680121,100,7.475273,11.932197,150.65015,1,332,99.953015,331,114.009691,0.8053729,-0.9005846,0.5,7.0,0.5011045,6.5,0.08472577,76.689854,5879.500288


In [38]:
names(metrics_df)

# CREATE new multi-class dependent variables

In [39]:
(auc_vars = grep('auc', names(metrics_df), value=TRUE))
(l10_vars = grep('l10', names(metrics_df), value=TRUE))

In [40]:
# auc_vars = c('seg2_glm_auc','seg_glm_auc','agg_glm_auc','agg_rf_auc')
# l10_vars = c('seg2_glm_lift10','seg_glm_lift10','agg_glm_lift10','agg_rf_lift10')

In [41]:
metrics_df <- metrics_df %>%
    mutate (auc_win=sapply(apply(.[,c(auc_vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) ,
            l10_win=sapply(apply(.[,c(l10_vars)], 
                          1, function(x) names(x)[which.max(x)]), function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 

# proportion of wins by each class

In [42]:
metrics_df %>% 
    group_by(auc_win) %>%
    tally() %>%
    arrange(desc(n))

auc_win,n
<chr>,<int>
,665
auc_seg2_glm,433
auc_seg_glm,176
auc_agg_glm,144
auc_seg_rf,117
auc_agg_rf,94
auc_seg2_rf,94


In [43]:
metrics_df %>% 
    group_by(l10_win) %>%
    tally() %>%
    arrange(desc(n))

l10_win,n
<chr>,<int>
,665
l10_seg2_glm,603
l10_seg_glm,158
l10_agg_glm,104
l10_seg2_rf,100
l10_seg_rf,51
l10_agg_rf,42


# Note some resellers have no wins -- these are all where no renewals exist

In [44]:
(tld_reseller_NAs <- metrics_df %>% filter(is.na(auc_win)) %>% pull(tld_reseller))

In [45]:
expiry_test_df_1 %>% 
  filter(tld_registrar_index %in% tld_reseller_NAs) %>% 
  group_by(tld_registrar_index,renewal_status) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n)) %>%
  arrange(desc(n))

`summarise()` regrouping output by 'tld_registrar_index' (override with `.groups` argument)



tld_registrar_index,renewal_status,n,freq
<chr>,<fct>,<int>,<dbl>
funzhengzhou century connect,Not Renewd,1151,1
sitepdr china,Not Renewd,561,1
onlinepdr china,Not Renewd,278,1
spaceeranet,Not Renewd,210,1
spaceglobal domains,Not Renewd,103,1
funmoniker,Not Renewd,97,1
siteeranet,Not Renewd,94,1
siteidwebhost,Not Renewd,90,1
sitenethouse,Not Renewd,89,1
spaceniaga hoster,Not Renewd,71,1


# proportion of wins by each class - NA removed

In [46]:
metrics_df <- metrics_df %>% filter(!is.na(auc_win)) 
dim(metrics_df)

In [47]:
metrics_df %>% 
    group_by(auc_win) %>%
    tally() %>%
    arrange(desc(n))

auc_win,n
<chr>,<int>
auc_seg2_glm,433
auc_seg_glm,176
auc_agg_glm,144
auc_seg_rf,117
auc_agg_rf,94
auc_seg2_rf,94


In [48]:
metrics_df %>% 
    group_by(l10_win) %>%
    tally() %>%
    arrange(desc(n))

l10_win,n
<chr>,<int>
l10_seg2_glm,603
l10_seg_glm,158
l10_agg_glm,104
l10_seg2_rf,100
l10_seg_rf,51
l10_agg_rf,42


# ***MULTICLASS RF***

# Data Prep

In [49]:
metrics_df <- metrics_df %>%
  mutate_if(sapply(metrics_df, is.character), as.factor)

In [50]:
metrics_df %>%
  select(everything()) %>%  
  summarise_all(funs(sum(is.na(.)))) %>%  
  t() 

“`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


0,1
l10_seg2_glm,0
l10_seg_glm,0
l10_agg_glm,0
l10_seg2_rf,0
l10_seg_rf,0
l10_agg_rf,0
auc_seg2_glm,0
auc_seg_glm,0
auc_agg_glm,0
auc_seg2_rf,0


## Train/Test Split

In [51]:
names(metrics_df)

In [52]:
metrics_df_sub = metrics_df %>% 
  select('auc_win', 'l10_win',
         'count','ren_prp','tld_rat',
         #'country_maj','region_maj','country_cnt','region_cnt',
         'geo_cnt', 'geo_maj',
         'daydom_min','daydom_max','daydom_mean','daydom_rng','daydom_std','daydom_skew','daydom_kurt',
         'sldlen_min','sldlen_max','sldlen_mean','sldlen_rng','sldlen_std','sldlen_skew','sldlen_kurt',
         'gibbs_min','gibbs_max','gibbs_mean','gibbs_rng','gibbs_std','gibbs_skew','gibbs_kurt',
         'pdcnt_min','pdcnt_max','pdcnt_mean','pdcnt_rng','pdcnt_std','pdcnt_skew','pdcnt_kurt',
         'rarpt_min','rarpt_max','rarpt_mean','rarpt_rng','rarpt_std','rarpt_skew','rarpt_kurt')
head(metrics_df_sub)

Unnamed: 0_level_0,auc_win,l10_win,count,ren_prp,tld_rat,geo_cnt,geo_maj,daydom_min,daydom_max,daydom_mean,daydom_rng,daydom_std,daydom_skew,daydom_kurt,sldlen_min,sldlen_max,sldlen_mean,sldlen_rng,sldlen_std,sldlen_skew,sldlen_kurt,gibbs_min,gibbs_max,gibbs_mean,gibbs_rng,gibbs_std,gibbs_skew,gibbs_kurt,pdcnt_min,pdcnt_max,pdcnt_mean,pdcnt_rng,pdcnt_std,pdcnt_skew,pdcnt_kurt,rarpt_min,rarpt_max,rarpt_mean,rarpt_rng,rarpt_std,rarpt_skew,rarpt_kurt
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<dbl>,<int>,<fct>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,auc_seg_rf,l10_seg2_rf,106569,0.02441611,9.383592e-06,1,Japan,1,16752,6351.106,16751,5949.609,0.7065131,-1.0294486,3,53,6.642335,50,2.508508,6.1455276,55.010118,0,100,1.672944,100,4.083236,15.334299,348.12171,1,51,3.43249,50,5.173298,3.6994369,16.3930266,0.1,13.37,0.103735,13.27,0.20216296,55.1543,3102.462949
2,auc_agg_rf,l10_agg_rf,72690,0.01223002,1.375705e-05,1,China,1,2779,861.4138,2778,598.0098,1.5584245,2.6494681,3,63,6.947572,60,3.289407,2.9497402,14.097156,0,100,6.42792,100,21.265643,4.112206,15.16265,1,326,15.295667,325,37.775267,4.8276608,27.7586974,0.75,4.75,0.7521482,4.0,0.08239843,43.767804,1951.657326
3,auc_agg_rf,l10_seg2_glm,53536,0.03973027,1.867902e-05,1,United States,1,2200,641.6495,2199,552.7995,1.4770911,1.2445628,3,60,10.040702,57,4.457749,1.8173451,7.306364,0,100,4.29292,100,4.790116,13.813471,271.44212,1,102,5.683839,101,11.925165,4.8845405,28.4273959,0.58,8.33,0.8639192,7.75,0.46643401,2.149515,15.078455
4,auc_agg_rf,l10_agg_rf,50470,0.022766,1.981375e-05,1,United States,1,2277,708.5591,2276,495.1889,1.4025666,2.4231262,3,56,10.612245,53,3.816024,0.9727996,3.074438,0,100,5.021115,100,5.889335,13.133157,208.93433,1,133,3.80634,132,11.591963,7.9464521,72.1673024,0.59,5.4,0.8950147,4.81,0.37727284,1.158011,3.878673
5,auc_seg2_glm,l10_seg_glm,42162,0.17413785,2.371804e-05,1,United States,1,2642,444.951,2641,590.7182,3.1800012,9.0282709,3,60,10.46096,57,4.627825,1.2478782,3.883563,0,100,5.01993,100,5.451631,13.738913,235.73852,1,38,1.947346,37,2.554309,8.0693509,82.8231996,0.5,19.38,1.4417611,18.88,2.84260467,3.856059,13.984498
6,auc_agg_rf,l10_agg_rf,35309,0.01741766,2.832139e-05,1,EU,1,8032,5529.4799,8031,2800.913,-0.8344171,-0.4973638,3,54,15.473874,51,4.052579,-0.9342607,1.096902,0,100,3.680121,100,7.475273,11.932197,150.65015,1,332,99.953015,331,114.009691,0.8053729,-0.9005846,0.5,7.0,0.5011045,6.5,0.08472577,76.689854,5879.500288


In [53]:
set.seed(123) 
smp_siz = floor(0.8*nrow(metrics_df_sub))
train_ind = sample(seq_len(nrow(metrics_df_sub)),size = smp_siz) 
train = metrics_df_sub[train_ind,] 
test = metrics_df_sub[-train_ind,]  

In [54]:
dim(train)

In [55]:
dim(test)

In [56]:
str(train)

'data.frame':	846 obs. of  42 variables:
 $ auc_win    : Factor w/ 6 levels "auc_agg_glm",..: 3 1 2 1 5 5 5 2 1 5 ...
  ..- attr(*, "names")= chr  "434" "489" "183" "560" ...
 $ l10_win    : Factor w/ 6 levels "l10_agg_glm",..: 5 5 5 5 2 5 5 5 3 5 ...
  ..- attr(*, "names")= chr  "434" "489" "183" "560" ...
 $ count      : int  127 98 689 74 604 5 1 34 51 26 ...
 $ ren_prp    : num  0.646 0.388 0.196 0.23 0.106 ...
 $ tld_rat    : num  0.00787 0.0102 0.00145 0.01351 0.00166 ...
 $ geo_cnt    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ geo_maj    : Factor w/ 15 levels "Africa","Australia & New Zealand",..: 5 14 5 11 6 14 11 5 NA 5 ...
 $ daydom_min : int  1 1 1 1 1 1 1 1 1 1 ...
 $ daydom_max : int  3 3 66 2 14 1 1 14 4 2 ...
 $ daydom_mean: num  1.43 1.45 15.44 1.32 6.16 ...
 $ daydom_rng : int  2 2 65 1 13 0 0 13 3 1 ...
 $ daydom_std : num  0.696 0.558 19.095 0.471 3.293 ...
 $ daydom_skew: num  1.317 0.726 1.823 0.735 0.485 ...
 $ daydom_kurt: num  0.28 -0.567 2.012 -1.479 -0.516 ...
 $ sldle

In [57]:
train_auc <- subset(train, select = -c(l10_win) )
levels(train_auc$auc_win)
train_auc$auc_win <- relevel(train_auc$auc_win, ref=5)
str(train_auc)

'data.frame':	846 obs. of  41 variables:
 $ auc_win    : Factor w/ 6 levels "auc_seg2_glm",..: 4 2 3 2 1 1 1 3 2 1 ...
 $ count      : int  127 98 689 74 604 5 1 34 51 26 ...
 $ ren_prp    : num  0.646 0.388 0.196 0.23 0.106 ...
 $ tld_rat    : num  0.00787 0.0102 0.00145 0.01351 0.00166 ...
 $ geo_cnt    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ geo_maj    : Factor w/ 15 levels "Africa","Australia & New Zealand",..: 5 14 5 11 6 14 11 5 NA 5 ...
 $ daydom_min : int  1 1 1 1 1 1 1 1 1 1 ...
 $ daydom_max : int  3 3 66 2 14 1 1 14 4 2 ...
 $ daydom_mean: num  1.43 1.45 15.44 1.32 6.16 ...
 $ daydom_rng : int  2 2 65 1 13 0 0 13 3 1 ...
 $ daydom_std : num  0.696 0.558 19.095 0.471 3.293 ...
 $ daydom_skew: num  1.317 0.726 1.823 0.735 0.485 ...
 $ daydom_kurt: num  0.28 -0.567 2.012 -1.479 -0.516 ...
 $ sldlen_min : int  3 4 4 4 4 4 9 5 4 4 ...
 $ sldlen_max : int  18 22 52 15 29 15 9 20 20 15 ...
 $ sldlen_mean: num  9.01 11.66 11.74 8.05 11.14 ...
 $ sldlen_rng : int  15 18 48 11 25 11 0 15 16

In [58]:
train_l10 <- subset(train, select = -c(auc_win) )
levels(train_l10$l10_win)
train_l10$l10_win <- relevel(train_l10$l10_win, ref=5)
str(train_l10)

'data.frame':	846 obs. of  41 variables:
 $ l10_win    : Factor w/ 6 levels "l10_seg2_glm",..: 1 1 1 1 3 1 1 1 4 1 ...
 $ count      : int  127 98 689 74 604 5 1 34 51 26 ...
 $ ren_prp    : num  0.646 0.388 0.196 0.23 0.106 ...
 $ tld_rat    : num  0.00787 0.0102 0.00145 0.01351 0.00166 ...
 $ geo_cnt    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ geo_maj    : Factor w/ 15 levels "Africa","Australia & New Zealand",..: 5 14 5 11 6 14 11 5 NA 5 ...
 $ daydom_min : int  1 1 1 1 1 1 1 1 1 1 ...
 $ daydom_max : int  3 3 66 2 14 1 1 14 4 2 ...
 $ daydom_mean: num  1.43 1.45 15.44 1.32 6.16 ...
 $ daydom_rng : int  2 2 65 1 13 0 0 13 3 1 ...
 $ daydom_std : num  0.696 0.558 19.095 0.471 3.293 ...
 $ daydom_skew: num  1.317 0.726 1.823 0.735 0.485 ...
 $ daydom_kurt: num  0.28 -0.567 2.012 -1.479 -0.516 ...
 $ sldlen_min : int  3 4 4 4 4 4 9 5 4 4 ...
 $ sldlen_max : int  18 22 52 15 29 15 9 20 20 15 ...
 $ sldlen_mean: num  9.01 11.66 11.74 8.05 11.14 ...
 $ sldlen_rng : int  15 18 48 11 25 11 0 15 16

## RF missing values addressing

In [59]:
train_auc %>%
  select(everything()) %>%  
  summarise_all(funs(sum(is.na(.)))) %>%  
  t() 

0,1
auc_win,0
count,0
ren_prp,0
tld_rat,0
geo_cnt,0
geo_maj,24
daydom_min,0
daydom_max,0
daydom_mean,0
daydom_rng,0


In [60]:
# install.packages("missRanger")
library(missRanger)

In [61]:
train_auc_imp <- missRanger(train_auc, num.trees = 100)


Missing value imputation by random forests

  Variables to impute:		geo_maj, daydom_std, daydom_skew, daydom_kurt, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_std, rarpt_skew, rarpt_kurt
  Variables used to impute:	auc_win, count, ren_prp, tld_rat, geo_cnt, geo_maj, daydom_min, daydom_max, daydom_mean, daydom_rng, daydom_std, daydom_skew, daydom_kurt, sldlen_min, sldlen_max, sldlen_mean, sldlen_rng, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_min, gibbs_max, gibbs_mean, gibbs_rng, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_min, pdcnt_max, pdcnt_mean, pdcnt_rng, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_min, rarpt_max, rarpt_mean, rarpt_rng, rarpt_std, rarpt_skew, rarpt_kurt
iter 1:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 2:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 3:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 4:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 5:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........


In [62]:
train_l10_imp <- missRanger(train_l10, num.trees = 100)


Missing value imputation by random forests

  Variables to impute:		geo_maj, daydom_std, daydom_skew, daydom_kurt, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_std, rarpt_skew, rarpt_kurt
  Variables used to impute:	l10_win, count, ren_prp, tld_rat, geo_cnt, geo_maj, daydom_min, daydom_max, daydom_mean, daydom_rng, daydom_std, daydom_skew, daydom_kurt, sldlen_min, sldlen_max, sldlen_mean, sldlen_rng, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_min, gibbs_max, gibbs_mean, gibbs_rng, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_min, pdcnt_max, pdcnt_mean, pdcnt_rng, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_min, rarpt_max, rarpt_mean, rarpt_rng, rarpt_std, rarpt_skew, rarpt_kurt
iter 1:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 2:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........
iter 3:	.......

“Dropped unused factor level(s) in dependent variable: Middle East.”


.........


In [63]:
test_imp <- missRanger(test, num.trees = 100)


Missing value imputation by random forests

  Variables to impute:		geo_maj, daydom_std, daydom_skew, daydom_kurt, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_std, rarpt_skew, rarpt_kurt
  Variables used to impute:	auc_win, l10_win, count, ren_prp, tld_rat, geo_cnt, geo_maj, daydom_min, daydom_max, daydom_mean, daydom_rng, daydom_std, daydom_skew, daydom_kurt, sldlen_min, sldlen_max, sldlen_mean, sldlen_rng, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_min, gibbs_max, gibbs_mean, gibbs_rng, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_min, pdcnt_max, pdcnt_mean, pdcnt_rng, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_min, rarpt_max, rarpt_mean, rarpt_rng, rarpt_std, rarpt_skew, rarpt_kurt
iter 1:	.........

“Dropped unused factor level(s) in dependent variable: Middle East.”


.......
iter 2:	.........

“Dropped unused factor level(s) in dependent variable: Middle East.”


.......
iter 3:	.........

“Dropped unused factor level(s) in dependent variable: Middle East.”


.......


## RF multiclass auc

In [64]:
library(ranger) 

In [65]:
model <- ranger(formula         = auc_win ~ ., 
                data            = train_auc_imp, 
                importance = 'impurity', 
                num.trees       = 500,
                probability = TRUE,
                replace = FALSE,
                sample.fraction = .8,
                seed            = 123,
                respect.unordered.factors=TRUE)

In [66]:
pred <- predict(model, 
                data = test_imp,
                type="response")$predictions

In [67]:
head(pred)

auc_seg2_glm,auc_agg_glm,auc_agg_rf,auc_seg_glm,auc_seg_rf,auc_seg2_rf
0.1291548,0.2509706,0.2703452,0.07572143,0.1462754,0.12753254
0.1993786,0.1938516,0.3238452,0.08795952,0.13624921,0.05871587
0.1615857,0.1081619,0.3577563,0.09724048,0.07991825,0.1953373
0.1649397,0.1513889,0.2968254,0.09179524,0.12959921,0.16545159
0.0996881,0.1129746,0.476296,0.06365,0.17222619,0.07516508
0.1784881,0.1235659,0.3946738,0.04305238,0.14948889,0.11073095


In [68]:
pred <- as.data.frame(pred)
auc_vars <- names(pred)
pred <- pred %>%
    mutate (auc_win=sapply(apply(.[,c(auc_vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 
head(pred)

Unnamed: 0_level_0,auc_seg2_glm,auc_agg_glm,auc_agg_rf,auc_seg_glm,auc_seg_rf,auc_seg2_rf,auc_win
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,0.1291548,0.2509706,0.2703452,0.07572143,0.1462754,0.12753254,auc_agg_rf
2,0.1993786,0.1938516,0.3238452,0.08795952,0.13624921,0.05871587,auc_agg_rf
3,0.1615857,0.1081619,0.3577563,0.09724048,0.07991825,0.1953373,auc_agg_rf
4,0.1649397,0.1513889,0.2968254,0.09179524,0.12959921,0.16545159,auc_agg_rf
5,0.0996881,0.1129746,0.476296,0.06365,0.17222619,0.07516508,auc_agg_rf
6,0.1784881,0.1235659,0.3946738,0.04305238,0.14948889,0.11073095,auc_agg_rf


In [69]:
xtab <- table(test_imp$auc_win, pred$auc_win)
xtab

              
               auc_agg_glm auc_agg_rf auc_seg_glm auc_seg_rf auc_seg2_glm
  auc_agg_glm            8          0           2          0           12
  auc_agg_rf             4          1           6          1            9
  auc_seg_glm            6          0           4          0           23
  auc_seg_rf             3          2           5          0           13
  auc_seg2_glm          11          4           8          0           67
  auc_seg2_rf            2          1           4          0           13
              
               auc_seg2_rf
  auc_agg_glm            0
  auc_agg_rf             0
  auc_seg_glm            0
  auc_seg_rf             0
  auc_seg2_glm           1
  auc_seg2_rf            2

In [70]:
confusionMatrix(data=factor(test_imp$auc_win), reference=factor(pred$auc_win))

Confusion Matrix and Statistics

              Reference
Prediction     auc_agg_glm auc_agg_rf auc_seg_glm auc_seg_rf auc_seg2_glm
  auc_agg_glm            8          0           2          0           12
  auc_agg_rf             4          1           6          1            9
  auc_seg_glm            6          0           4          0           23
  auc_seg_rf             3          2           5          0           13
  auc_seg2_glm          11          4           8          0           67
  auc_seg2_rf            2          1           4          0           13
              Reference
Prediction     auc_seg2_rf
  auc_agg_glm            0
  auc_agg_rf             0
  auc_seg_glm            0
  auc_seg_rf             0
  auc_seg2_glm           1
  auc_seg2_rf            2

Overall Statistics
                                          
               Accuracy : 0.3868          
                 95% CI : (0.3209, 0.4559)
    No Information Rate : 0.6462          
    P-Value [Acc > N

## RF multiclass l10

In [71]:
library(ranger) 

In [72]:
model_l10 <- ranger(formula         = l10_win ~ ., 
                data            = train_l10_imp, 
                importance = 'impurity', 
                num.trees       = 500,
                probability = TRUE,
                replace = FALSE,
                sample.fraction = .8,
                seed            = 123,
                respect.unordered.factors=TRUE)

In [73]:
pred <- predict(model_l10, 
                data = test_imp,
                type="response")$predictions

In [74]:
head(pred)

l10_seg2_glm,l10_agg_glm,l10_agg_rf,l10_seg_glm,l10_seg_rf,l10_seg2_rf
0.18515159,0.2418857,0.2593643,0.1233373,0.11780714,0.07245397
0.09553492,0.1807754,0.4270405,0.1471857,0.07892698,0.07053651
0.43676587,0.1182675,0.1314913,0.1023103,0.04190079,0.16926429
0.3052127,0.1837349,0.2030857,0.1266516,0.09498968,0.0863254
0.16655397,0.2340754,0.2525183,0.1434706,0.08654603,0.11683571
0.2935254,0.1516532,0.1819857,0.1104183,0.10993333,0.15248413


In [75]:
pred <- as.data.frame(pred)
vars <- names(pred)
pred <- pred %>%
    mutate (win=sapply(apply(.[,c(vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 
head(pred)

Unnamed: 0_level_0,l10_seg2_glm,l10_agg_glm,l10_agg_rf,l10_seg_glm,l10_seg_rf,l10_seg2_rf,win
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,0.18515159,0.2418857,0.2593643,0.1233373,0.11780714,0.07245397,l10_agg_rf
2,0.09553492,0.1807754,0.4270405,0.1471857,0.07892698,0.07053651,l10_agg_rf
3,0.43676587,0.1182675,0.1314913,0.1023103,0.04190079,0.16926429,l10_seg2_glm
4,0.3052127,0.1837349,0.2030857,0.1266516,0.09498968,0.0863254,l10_seg2_glm
5,0.16655397,0.2340754,0.2525183,0.1434706,0.08654603,0.11683571,l10_agg_rf
6,0.2935254,0.1516532,0.1819857,0.1104183,0.10993333,0.15248413,l10_seg2_glm


In [76]:
xtab <- table(test_imp$l10_win, pred$win)
xtab

              
               l10_agg_glm l10_agg_rf l10_seg_glm l10_seg2_glm l10_seg2_rf
  l10_agg_glm            0          0           1           18           1
  l10_agg_rf             0          0           4            6           0
  l10_seg_glm            0          0           0           33           0
  l10_seg_rf             0          1           0            2           0
  l10_seg2_glm           0          1           0          127           0
  l10_seg2_rf            1          1           1           14           1

In [77]:
confusionMatrix(data=factor(test_imp$l10_win), reference=factor(pred$win, levels=levels(factor(test_imp$l10_win))))

Confusion Matrix and Statistics

              Reference
Prediction     l10_agg_glm l10_agg_rf l10_seg_glm l10_seg_rf l10_seg2_glm
  l10_agg_glm            0          0           1          0           18
  l10_agg_rf             0          0           4          0            6
  l10_seg_glm            0          0           0          0           33
  l10_seg_rf             0          1           0          0            2
  l10_seg2_glm           0          1           0          0          127
  l10_seg2_rf            1          1           1          0           14
              Reference
Prediction     l10_seg2_rf
  l10_agg_glm            1
  l10_agg_rf             0
  l10_seg_glm            0
  l10_seg_rf             0
  l10_seg2_glm           0
  l10_seg2_rf            1

Overall Statistics
                                          
               Accuracy : 0.6038          
                 95% CI : (0.5345, 0.6701)
    No Information Rate : 0.9434          
    P-Value [Acc > N

In [78]:
str(factor(test_imp$l10_win))
str(factor(pred$win))

 Factor w/ 6 levels "l10_agg_glm",..: 6 5 5 6 4 5 5 1 5 5 ...
 - attr(*, "names")= chr [1:212] "1" "3" "7" "12" ...
 Factor w/ 5 levels "l10_agg_glm",..: 2 2 4 4 2 4 4 4 4 4 ...
 - attr(*, "names")= chr [1:212] "l10_agg_rf" "l10_agg_rf" "l10_seg2_glm" "l10_seg2_glm" ...


In [79]:
levels(factor(test_imp$l10_win))
levels(factor(pred$win))

In [80]:
table(test_imp$l10_win)


 l10_agg_glm   l10_agg_rf  l10_seg_glm   l10_seg_rf l10_seg2_glm  l10_seg2_rf 
          20           10           33            3          128           18 

In [81]:
table(pred$win)


 l10_agg_glm   l10_agg_rf  l10_seg_glm l10_seg2_glm  l10_seg2_rf 
           1            3            6          200            2 

## Variable importance across auc & l10

In [82]:
varimp_df <- as.data.frame(sort(importance(model),decreasing=TRUE))
varimp_df <- merge(varimp_df, as.data.frame(sort(importance(model_l10),decreasing=TRUE)), by=0, all=TRUE)
names(varimp_df) <- c("var","auc_imp","l10_imp")
varimp_df <- varimp_df[order(varimp_df$l10_imp, decreasing=TRUE),]
varimp_df$rank <- seq(1:nrow(varimp_df))
varimp_df

Unnamed: 0_level_0,var,auc_imp,l10_imp,rank
Unnamed: 0_level_1,<I<chr>>,<dbl>,<dbl>,<int>
40,tld_rat,16.207191595,12.7484377,1
1,count,15.889443255,12.3682971,2
30,rarpt_skew,7.056740301,9.2596278,3
11,gibbs_kurt,13.161256415,9.11388,4
32,ren_prp,9.122650732,8.6384074,5
16,gibbs_skew,8.712807112,8.4799362,6
14,gibbs_min,7.867619677,8.2239824,7
18,pdcnt_kurt,9.043655376,7.9437501,8
23,pdcnt_skew,9.050061937,7.4374853,9
33,sldlen_kurt,11.618180029,7.3317912,10


## RF multiclass w/ country, region variables

In [83]:
metrics_df_sub2 = metrics_df %>% 
  select('auc_win', 'l10_win',
         'count','ren_prp','tld_rat',
         'country_maj','region_maj','country_cnt','region_cnt',
#          'geo_cnt', 'geo_maj',
         'daydom_min','daydom_max','daydom_mean','daydom_rng','daydom_std','daydom_skew','daydom_kurt',
         'sldlen_min','sldlen_max','sldlen_mean','sldlen_rng','sldlen_std','sldlen_skew','sldlen_kurt',
         'gibbs_min','gibbs_max','gibbs_mean','gibbs_rng','gibbs_std','gibbs_skew','gibbs_kurt',
         'pdcnt_min','pdcnt_max','pdcnt_mean','pdcnt_rng','pdcnt_std','pdcnt_skew','pdcnt_kurt',
         'rarpt_min','rarpt_max','rarpt_mean','rarpt_rng','rarpt_std','rarpt_skew','rarpt_kurt')

In [84]:
set.seed(123) 
smp_siz = floor(0.8*nrow(metrics_df_sub2))
train_ind = sample(seq_len(nrow(metrics_df_sub2)),size = smp_siz) 
train2 = metrics_df_sub2[train_ind,] 
test2 = metrics_df_sub2[-train_ind,]  

In [85]:
dim(train2)
dim(test2)

In [86]:
train2_auc <- subset(train2, select = -c(l10_win) )
levels(train2_auc$auc_win)
train2_auc$auc_win <- relevel(train2_auc$auc_win, ref=5)
str(train2_auc$auc_win)

 Factor w/ 6 levels "auc_seg2_glm",..: 4 2 3 2 1 1 1 3 2 1 ...


In [87]:
train2_l10 <- subset(train2, select = -c(auc_win) )
train2_l10$l10_win <- relevel(train2_l10$l10_win, ref=5)
str(train2_l10$l10_win )

 Factor w/ 6 levels "l10_seg2_glm",..: 1 1 1 1 3 1 1 1 4 1 ...


In [88]:
train2_auc_imp <- missRanger(train2_auc, num.trees = 100)
train2_l10_imp <- missRanger(train2_l10, num.trees = 100)
test2_imp <- missRanger(test2, num.trees = 100)


Missing value imputation by random forests

  Variables to impute:		daydom_std, daydom_skew, daydom_kurt, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_std, rarpt_skew, rarpt_kurt
  Variables used to impute:	auc_win, count, ren_prp, tld_rat, country_maj, region_maj, country_cnt, region_cnt, daydom_min, daydom_max, daydom_mean, daydom_rng, daydom_std, daydom_skew, daydom_kurt, sldlen_min, sldlen_max, sldlen_mean, sldlen_rng, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_min, gibbs_max, gibbs_mean, gibbs_rng, gibbs_std, gibbs_skew, gibbs_kurt, pdcnt_min, pdcnt_max, pdcnt_mean, pdcnt_rng, pdcnt_std, pdcnt_skew, pdcnt_kurt, rarpt_min, rarpt_max, rarpt_mean, rarpt_rng, rarpt_std, rarpt_skew, rarpt_kurt
iter 1:	...............
iter 2:	...............
iter 3:	...............

Missing value imputation by random forests

  Variables to impute:		daydom_std, daydom_skew, daydom_kurt, sldlen_std, sldlen_skew, sldlen_kurt, gibbs_std

## RF multiclass auc - country & region

In [89]:
model <- ranger(formula         = auc_win ~ ., 
                data            = train2_auc_imp, 
                importance = 'impurity', 
                num.trees       = 500,
                probability = TRUE,
                replace = FALSE,
                sample.fraction = .8,
                seed            = 123,
                respect.unordered.factors=TRUE)

pred <- predict(model, 
                data = test2_imp,
                type="response")$predictions

pred <- as.data.frame(pred)
vars <- names(pred)
pred <- pred %>%
    mutate (win=sapply(apply(.[,c(vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 

confusionMatrix(data=factor(test2_imp$auc_win), reference=factor(pred$win, levels=levels(factor(test2_imp$auc_win))))

Confusion Matrix and Statistics

              Reference
Prediction     auc_agg_glm auc_agg_rf auc_seg_glm auc_seg_rf auc_seg2_glm
  auc_agg_glm            6          0           2          0           14
  auc_agg_rf             2          1           7          1           10
  auc_seg_glm            5          0           4          0           24
  auc_seg_rf             1          3           4          1           14
  auc_seg2_glm           9          3           6          0           72
  auc_seg2_rf            3          2           6          0           11
              Reference
Prediction     auc_seg2_rf
  auc_agg_glm            0
  auc_agg_rf             0
  auc_seg_glm            0
  auc_seg_rf             0
  auc_seg2_glm           1
  auc_seg2_rf            0

Overall Statistics
                                          
               Accuracy : 0.3962          
                 95% CI : (0.3299, 0.4655)
    No Information Rate : 0.684           
    P-Value [Acc > N

## RF multiclass l10

In [90]:
model_l10 <- ranger(formula         = l10_win ~ ., 
                data            = train2_l10_imp, 
                importance = 'impurity', 
                num.trees       = 500,
                probability = TRUE,
                replace = FALSE,
                sample.fraction = .8,
                seed            = 123,
                respect.unordered.factors=TRUE)

pred <- predict(model_l10, 
                data = test2_imp,
                type="response")$predictions

pred <- as.data.frame(pred)
vars <- names(pred)
pred <- pred %>%
    mutate (win=sapply(apply(.[,c(vars)], 
                          1, function(x) names(x)[which.max(x)]) , function(s) if (length(s) == 0) NA else paste(s, collapse = " ")) 
            ) 

confusionMatrix(data=factor(test2_imp$l10_win), reference=factor(pred$win, levels=levels(factor(test2_imp$l10_win))))

Confusion Matrix and Statistics

              Reference
Prediction     l10_agg_glm l10_agg_rf l10_seg_glm l10_seg_rf l10_seg2_glm
  l10_agg_glm            0          0           1          0           18
  l10_agg_rf             0          0           4          0            6
  l10_seg_glm            0          0           1          0           32
  l10_seg_rf             0          1           0          0            2
  l10_seg2_glm           0          1           1          1          124
  l10_seg2_rf            2          0           1          0           14
              Reference
Prediction     l10_seg2_rf
  l10_agg_glm            1
  l10_agg_rf             0
  l10_seg_glm            0
  l10_seg_rf             0
  l10_seg2_glm           1
  l10_seg2_rf            1

Overall Statistics
                                          
               Accuracy : 0.5943          
                 95% CI : (0.5249, 0.6611)
    No Information Rate : 0.9245          
    P-Value [Acc > N

## Variable importance across auc & l10 -- country & region

In [91]:
varimp_df <- as.data.frame(sort(importance(model),decreasing=TRUE))
varimp_df <- merge(varimp_df, as.data.frame(sort(importance(model_l10),decreasing=TRUE)), by=0, all=TRUE)
names(varimp_df) <- c("var","auc_imp","l10_imp")
varimp_df <- varimp_df[order(varimp_df$l10_imp, decreasing=TRUE),]
varimp_df$rank <- seq(1:nrow(varimp_df))
varimp_df

Unnamed: 0_level_0,var,auc_imp,l10_imp,rank
Unnamed: 0_level_1,<I<chr>>,<dbl>,<dbl>,<int>
1,count,15.311693,12.2144782,1
42,tld_rat,15.23831185,11.994571,2
3,country_maj,11.07141571,9.8014162,3
11,gibbs_kurt,13.30721958,9.7556244,4
30,rarpt_skew,6.78713983,8.6828113,5
16,gibbs_skew,8.16672373,8.4766116,6
34,ren_prp,8.88075429,8.4745175,7
14,gibbs_min,7.90966643,7.5154632,8
18,pdcnt_kurt,8.36394211,7.248391,9
35,sldlen_kurt,11.40667832,7.1475892,10


# Leveraging l10 geo multiclass assigns

In [92]:
model_l10

Ranger result

Call:
 ranger(formula = l10_win ~ ., data = train2_l10_imp, importance = "impurity",      num.trees = 500, probability = TRUE, replace = FALSE, sample.fraction = 0.8,      seed = 123, respect.unordered.factors = TRUE) 

Type:                             Probability estimation 
Number of trees:                  500 
Sample size:                      846 
Number of independent variables:  42 
Mtry:                             6 
Target node size:                 10 
Variable importance mode:         impurity 
Splitrule:                        gini 
OOB prediction error (Brier s.):  0.4460021 

In [94]:
head(pred)

Unnamed: 0_level_0,l10_seg2_glm,l10_agg_glm,l10_agg_rf,l10_seg_glm,l10_seg_rf,l10_seg2_rf,win
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,0.1647619,0.2533397,0.244927,0.1208151,0.1433746,0.07278175,l10_agg_glm
2,0.09413095,0.1814429,0.4197706,0.1442079,0.08977302,0.0706746,l10_agg_rf
3,0.46560238,0.1062786,0.1017913,0.1114437,0.0390373,0.17584683,l10_seg2_glm
4,0.30808095,0.1571103,0.1971937,0.140146,0.11175159,0.08571746,l10_seg2_glm
5,0.16270635,0.2351746,0.25565,0.1483667,0.08074365,0.11735873,l10_agg_rf
6,0.25851508,0.1724857,0.1747921,0.1126397,0.12087619,0.16069127,l10_seg2_glm


In [102]:
pred_df <- metrics_df[-train_ind,] 
pred_df$pred_l10win <- pred$win 
# dim(pred_df)
# names(pred_df)
pred_df_lookup <- pred_df %>% select(auc_win, l10_win, pred_l10win, tld_reseller)
head(pred_df_lookup)

Unnamed: 0_level_0,auc_win,l10_win,pred_l10win,tld_reseller
Unnamed: 0_level_1,<fct>,<fct>,<chr>,<fct>
1,auc_seg_rf,l10_seg2_rf,l10_agg_glm,sitegmo
3,auc_agg_rf,l10_seg2_glm,l10_agg_rf,sitenamecheap
7,auc_seg2_glm,l10_seg2_glm,l10_seg2_glm,sitereg.ru
12,auc_seg2_rf,l10_seg2_rf,l10_seg2_glm,funchengdu west
14,auc_seg_rf,l10_seg_rf,l10_agg_rf,spacego daddy
15,auc_seg2_glm,l10_seg2_glm,l10_seg2_glm,funnamecheap


In [100]:
names(test)

### add assigned model to pred_df

In [107]:
head(predictions_df)

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>,<int>,<fct>
1,Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569,44295183,kagen.site
2,Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569,44573611,designlab.site
3,Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569,45304858,hokatu-blog.site
4,Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569,46235129,suzuya.site
5,Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569,46276970,wins-company.site
6,Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569,47809960,yamatoku-company.site


In [110]:
pred_assigned_df <- merge(predictions_df,pred_df_lookup, by.x='tld_registrar_index', by.y='tld_reseller', all=TRUE)
dim(predictions_df)
dim(pred_assigned_df)
head(pred_assigned_df %>% filter(tld_registrar_index=='sitegmo'))

Unnamed: 0_level_0,tld_registrar_index,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,reseller,n,domain_id,domain,auc_win,l10_win,pred_l10win
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<int>,<fct>,<fct>,<fct>,<chr>
1,sitegmo,Not Renewd,0.007010644,0.006,0.0,0.012973132,0.005583303,0.0076133783,gmo,106569,105125506,igguub.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm
2,sitegmo,Not Renewd,0.017977875,0.02,0.016,0.0012636,0.001205006,0.0002808805,gmo,106569,103854676,iuofhn.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm
3,sitegmo,Not Renewd,0.004407615,0.004,0.0,0.007525737,0.00457536,0.0064641536,gmo,106569,105192206,zanozz.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm
4,sitegmo,Not Renewd,0.00699838,0.006,0.0,0.011422635,0.004519164,0.0056738281,gmo,106569,105102351,fxynor.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm
5,sitegmo,Not Renewd,0.006769366,0.006,0.0,0.008389888,0.004467357,0.0042599138,gmo,106569,105114161,tqtszm.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm
6,sitegmo,Not Renewd,0.063116514,0.073,0.089,0.022211532,0.019650927,0.0496194658,gmo,106569,97429969,avbenc.site,auc_seg_rf,l10_seg2_rf,l10_agg_glm


### calculate overall metrics based on assigned model

In [113]:
dim(pred_df_lookup)
dim(predictions_df)
dim(pred_assigned_df)
dim(expiry_test_df_1)

In [137]:
# rename columns to correspond with pred_l10win values
names(pred_assigned_df) <- c('tld_registrar_index','actual',
                             'l10_seg2_glm',    'l10_seg_glm',    'l10_agg_glm',    'l10_seg2_rf',    'l10_seg_rf',    'l10_agg_rf',
#                              'pred_df_seg2_glm','pred_df_seg_glm','pred_df_agg_glm','pred_df_seg2_rf','pred_df_seg_rf','pred_df_agg_rf',
                             'reseller','n','domain_id','domain','auc_win','l10_win','pred_l10win','pred_assigned')
# replace na values in pred_10win, create a fill column for those missing/"none" values
pred_assigned_df <- pred_assigned_df %>%
  mutate_at(vars(pred_l10win), ~replace_na(., "none"))
pred_assigned_df$none <- NA
head(pred_assigned_df)

Unnamed: 0_level_0,tld_registrar_index,actual,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf,reseller,n,domain_id,domain,auc_win,l10_win,pred_l10win,pred_assigned,none
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<int>,<fct>,<fct>,<fct>,<chr>,<dbl>,<lgl>
1,fun1&1 internet,Renewed,0.8459141,0.802,0.371,0.7125206,0.6801492,0.6402113,1&1 internet,19,91639803,segeln.fun,,,none,0.8459141,
2,fun1&1 internet,Renewed,0.4607455,0.785,0.36,0.7248957,0.6428311,0.5817404,1&1 internet,19,95819759,harzhotel.fun,,,none,0.4607455,
3,fun10dencehispahard,Not Renewd,,0.0,0.123,0.0,0.3462084,0.4382183,10dencehispahard,1,91044973,movistar.fun,,,none,,
4,fun10dencehispahard,Not Renewd,,0.0,0.12,0.0,0.3169892,0.2664795,10dencehispahard,1,92472913,kuinik.fun,,,none,,
5,fun1api,Renewed,0.9999971,0.842,0.326,0.832,0.8486598,0.7788035,1api,6,91532433,bulgari.fun,,,none,0.9999971,
6,fun1api,Not Renewd,0.9999992,0.843,0.33,0.832,0.8421671,0.8345259,1api,6,101717674,gotsome.fun,,,none,0.9999992,


In [142]:
# pull value from pred_assigned from column specified in pred_l10win
# pred_assigned_df$pred_assigned <- 
x <- pred_assigned_df[[,pred_assigned_df[['pred_l10win']]]]
head(x)

ERROR: Error in .subset2(x, ..2, exact = exact): recursive indexing failed at level 2



In [140]:
dim(x)

In [141]:
length(x)

In [125]:
# need to select one of pred_df_seg2_glm	pred_df_seg_glm	pred_df_agg_glm	pred_df_seg2_rf	pred_df_seg_rf	pred_df_agg_rf 
# based on entry in pred_l10win
lapply(pred_assigned_df, function(x) x)

In [117]:

head(pred_assigned_df)

ERROR: Error in `[.data.frame`(pred_assigned_df, , pred_assigned_df[["pred_l10win"]]): undefined columns selected


In [None]:
sub_df <- 
chart_lift (pred_df=sub_df,
                        dep_var = "actual",
                        pred_var = "first_renewal_prediction") 

# RF Binary -- seg2_glm vs agg_rf