# Fallback Table Metalearning Scripting
- Takes as input preds_df output from predictions_metalearning
- Supplements with fallback
- Computes overall model performance
- Engineers features at tld-reseller level
- Trains metalearning model(s) to assign model based on features
- Assigns a model to each tld-reseller in the new data pull using the trained metalearning model(s)

In [1]:
options(repr.matrix.max.cols=50, repr.matrix.max.rows=100)

In [2]:
# install.packages("pkgcond")

In [3]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)
library(dominanceanalysis)


getwd()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: grid

Loading required package: libcoin

Loading required package: mvtnorm

Loading required package: lattice

Loading required package: ggplot2

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:sta

In [4]:

source('../orig/functions.R')

source('../orig/functions_models.R')

source('../phaseII_03_forest/functions_eval.R')



Attaching package: ‘tidyr’


The following object is masked from ‘package:stringdist’:

    extract




In [5]:
source('functions_metalearning.R')

In [6]:
source('functions_fallback.R')

In [7]:
# Directory locations used throughout the notebook: the 2020-11-16 data pull,
# the saved models, and the 2020-11-27 data pull (also used for outputs).
dataDir <- "/home/jupyter/Domains_202003/data/output/datapull_20201116"
modelDir <- "/home/jupyter/Domains_202003/data/output/models_20201104"
outputDir <- "/home/jupyter/Domains_202003/data/output/datapull_20201127"

In [8]:
# Reload artefacts that this notebook writes further down, plus the
# reseller geo lookup table.
metametrics_df <- read.csv(file = "../../data/output/metametrics_df.csv")
new_metametrics_imp_pred_df <- read.csv(
  file = "../../data/output/new_metametrics_imp_pred_df.csv"
)
geoLookupDF <- read.csv(
  file = "/home/jupyter/Domains_202003/data/input/PredictiveModelAnalysis_ResellerGeoMap.csv"
)

In [9]:
# New data pull, prepared in one pipeline:
#   * keep expiries before 2020-10-08 that have a gibb_score
#   * floor non-positive reg_arpt at 0.0001 so its log is defined
#   * build the lower-cased tld + reseller key
#   * supplement with geo information via geo_suppl()
expiry_new_df <- readRDS(
  file.path(outputDir, "expiry_20200902_20201102_20201127")
) %>%
  filter(expiry_date < "2020-10-08") %>%
  filter(!is.na(gibb_score)) %>%
  mutate(
    reg_arpt = ifelse(reg_arpt <= 0, 0.0001, reg_arpt),
    log_reg_arpt = log(reg_arpt),
    tld_registrar_index = tolower(paste0(tld, reseller))
  ) %>%
  geo_suppl(geoLookupDF = geoLookupDF)

Expiry data originally has 476586 rows and 476586 missing geo's.
... after intial merge on reseller & _country, expiry has 476586 rows and 476408 missing geo's.
... after secondary fill with _country, expiry has 476586 rows and 496 missing geo's.
... after manual tweaks with _country, expiry has 476586 rows and 71 missing geo's.


# LOAD DATA

In [10]:

# Inputs for this notebook:
#   * test-set predictions written by predictions_metalearning.R
#   * the training data used there (needed to build the fallback tables)
#   * the reseller geo lookup used to supplement both
expiry_df_test_preds <- read.csv(
  file = "../../data/output/datapull_20201116/expiry_df_test_preds.csv"
)
expiry_df_train <- read.csv(
  file = "../../data/output/datapull_20201116/expiry_df_train.csv"
)
geoLookupDF <- read.csv(
  file = "/home/jupyter/Domains_202003/data/input/PredictiveModelAnalysis_ResellerGeoMap.csv"
)


In [11]:
# Add geo information to the training data and to the test predictions.
expiry_df_train_g <- expiry_df_train %>%
  geo_suppl(geoLookupDF = geoLookupDF)
expiry_df_test_preds_g <- expiry_df_test_preds %>%
  geo_suppl(geoLookupDF = geoLookupDF)

Expiry data originally has 3729384 rows and 3729384 missing geo's.
... after intial merge on reseller & _country, expiry has 3729384 rows and 3726455 missing geo's.
... after secondary fill with _country, expiry has 3729384 rows and 6983 missing geo's.
... after manual tweaks with _country, expiry has 3729384 rows and 29 missing geo's.
Expiry data originally has 932347 rows and 932347 missing geo's.
... after intial merge on reseller & _country, expiry has 932347 rows and 931606 missing geo's.
... after secondary fill with _country, expiry has 932347 rows and 1741 missing geo's.
... after manual tweaks with _country, expiry has 932347 rows and 8 missing geo's.


# SUPPL FALLBACK

In [None]:
# Generate the list of fallback tables from the geo-supplemented training data.
npv_fallback_list <- fallback_gen(
  npv_historic_renewal_data = expiry_df_train_g,
  reseller_am_geo_map = geoLookupDF
)

# Show which tables were produced.
names(npv_fallback_list)

# Expose every list member as an in-memory object of the same name.
# seq_along() keeps the loop a no-op for an empty list, where 1:length()
# would iterate over c(1, 0) and fail inside assign().
for (i in seq_along(npv_fallback_list)) {
  assign(names(npv_fallback_list)[i], npv_fallback_list[[i]])
}

In [None]:
# Low-volume tld-registrars, derived from the training data.
tld_registrar_excl_list <- tld_registrar_excl_df(expiry_df_train_g)

# For every prediction column add a "<col>_fb" copy in which rows belonging
# to an excluded tld-registrar are blanked out (NA); these placeholders are
# the input columns of the fallback step.
expiry_df_test_preds_g <- expiry_df_test_preds_g %>%
  mutate(
    across(
      contains('pred_'),
      .fns = list(
        fb = ~ ifelse(tld_registrar_index %in% tld_registrar_excl_list, NA, .x)
      )
    )
  )

In [None]:
# Apply the fallback tables to each "<stem>_fb" placeholder column, writing
# the result to "<stem>_fb2". The stems are processed in the same order as
# the original eight explicit calls.
fb_stems <- c(
  "pred_seg2_rf_ALL",
  "pred_seg2_glm_ALL",
  "pred_seg_rf_ALL",
  "pred_seg_glm_ALL",
  "pred_agg_rf",
  "pred_agg_glm",
  "pred_agg_rf_ALL",
  "pred_agg_glm_ALL"
)

for (stem in fb_stems) {
  expiry_df_test_preds_g <- fallback_app_1(
    test_data_op = expiry_df_test_preds_g,
    in_col = paste0(stem, "_fb"),
    out_col = paste0(stem, "_fb2")
  )
}

# GENERATE Tld-reseller level Performance Metrics from preds DF

In [None]:
# Per-tld-reseller performance metrics for every candidate model.
# For each tld_registrar_index group, l10_dplyr() and auc_dplyr() (defined in
# the sourced helper files, not in this notebook) are evaluated on one
# prediction column each. NOTE(review): presumably these are lift-at-10% and
# AUC -- confirm against functions_eval.R.
# Results from do() come back as list columns; they are converted to numeric
# further down before being written out.
# Naming: metric columns without "_fb" read the raw prediction columns, the
# "_fb" metric columns read the fallback-filled "*_fb2" prediction columns.
metrics_df <- expiry_df_test_preds_g %>%
  group_by(tld_registrar_index) %>%
  # l10 on the raw model predictions
  do( l10_seg2_glm = l10_dplyr(., pred_var = "pred_seg2_glm_ALL"),
      l10_seg_glm = l10_dplyr(., pred_var = "pred_seg_glm_ALL"),
       l10_agg_glm_ALL = l10_dplyr(., pred_var = "pred_agg_glm_ALL"),
       l10_agg_glm = l10_dplyr(., pred_var = "pred_agg_glm"),
       l10_seg2_rf = l10_dplyr(., pred_var = "pred_seg2_rf_ALL"),
       l10_seg_rf = l10_dplyr(., pred_var = "pred_seg_rf_ALL"),
       l10_agg_rf = l10_dplyr(., pred_var = "pred_agg_rf"),
       l10_agg_rf_ALL = l10_dplyr(., pred_var = "pred_agg_rf_ALL"),
     
      # auc on the raw model predictions
      auc_seg2_glm = auc_dplyr(., pred_var = "pred_seg2_glm_ALL"),
      auc_seg_glm = auc_dplyr(., pred_var = "pred_seg_glm_ALL"),
       auc_agg_glm_ALL = auc_dplyr(., pred_var = "pred_agg_glm_ALL"),
       auc_agg_glm = auc_dplyr(., pred_var = "pred_agg_glm"),
       auc_seg2_rf = auc_dplyr(., pred_var = "pred_seg2_rf_ALL"),
       auc_seg_rf = auc_dplyr(., pred_var = "pred_seg_rf_ALL"),
       auc_agg_rf_ALL = auc_dplyr(., pred_var = "pred_agg_rf_ALL"),
       auc_agg_rf = auc_dplyr(., pred_var = "pred_agg_rf"),
      
     # l10 on the fallback-filled predictions
     l10_seg2_glm_fb = l10_dplyr(., pred_var = "pred_seg2_glm_ALL_fb2"),
      l10_seg_glm_fb = l10_dplyr(., pred_var = "pred_seg_glm_ALL_fb2"),
       l10_agg_glm_fb = l10_dplyr(., pred_var = "pred_agg_glm_fb2"),
       l10_agg_glm_ALL_fb = l10_dplyr(., pred_var = "pred_agg_glm_ALL_fb2"),
       l10_seg2_rf_fb = l10_dplyr(., pred_var = "pred_seg2_rf_ALL_fb2"),
       l10_seg_rf_fb = l10_dplyr(., pred_var = "pred_seg_rf_ALL_fb2"),
       l10_agg_rf_fb = l10_dplyr(., pred_var = "pred_agg_rf_fb2"),
       l10_agg_rf_ALL_fb = l10_dplyr(., pred_var = "pred_agg_rf_ALL_fb2"),
     
      # auc on the fallback-filled predictions
      auc_seg2_glm_fb = auc_dplyr(., pred_var = "pred_seg2_glm_ALL_fb2"),
      auc_seg_glm_fb = auc_dplyr(., pred_var = "pred_seg_glm_ALL_fb2"),
       auc_agg_glm_fb = auc_dplyr(., pred_var = "pred_agg_glm_fb2"),
       auc_agg_glm_ALL_fb = auc_dplyr(., pred_var = "pred_agg_glm_ALL_fb2"),
       auc_seg2_rf_fb = auc_dplyr(., pred_var = "pred_seg2_rf_ALL_fb2"),
       auc_seg_rf_fb = auc_dplyr(., pred_var = "pred_seg_rf_ALL_fb2"),
       auc_agg_rf_fb = auc_dplyr(., pred_var = "pred_agg_rf_fb2"),
       auc_agg_rf_ALL_fb = auc_dplyr(., pred_var = "pred_agg_rf_ALL_fb2"),
      )


In [None]:
head(metrics_df)

# FEATURE ENGINEERING at tld-reseller level 

In [None]:
# Tld-reseller level meta-features engineered from the scored test set.
#
# Fixes relative to the previous version of this cell:
#   * country_maj / region_maj used to be pulled after arrange(desc(n)) and
#     then attached positionally, so they were ordered by group size rather
#     than by tld_registrar_index and did not line up with meta_df's rows.
#     They are now joined on the key.
#   * region_cnt counted distinct reseller_country; it now counts region.

# Seven summary statistics of one numeric column, returned as a one-row data
# frame named "<prefix>_min", "<prefix>_max", "<prefix>_mean", "<prefix>_rng",
# "<prefix>_std", "<prefix>_skew", "<prefix>_kurt".
# min/max deliberately keep the original behaviour of NOT dropping NAs, so a
# group containing a missing value yields NA for min, max and rng.
summarise_distribution <- function(x, prefix) {
  lo <- min(x)
  hi <- max(x)
  stats <- data.frame(
    min = lo,
    max = hi,
    mean = mean(x, na.rm = TRUE),
    rng = hi - lo,
    std = sd(x, na.rm = TRUE),
    skew = skewness(x, na.rm = TRUE),
    kurt = kurtosis(x, na.rm = TRUE)
  )
  setNames(stats, paste(prefix, names(stats), sep = "_"))
}

meta_df <- expiry_df_test_preds_g %>%
  # n = rows per (tld-reseller, geo); used to find the majority geo
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    geo_maj = reseller_geo[n == max(n)][1],
    geo_cnt = n_distinct(reseller_geo),
    # from here on n is the number of rows of the tld-reseller
    n = n(),
    ren_prp = sum(renewal_status == "Renewed") / n,
    tld_cnt = n_distinct(tld),
    tld_rat = tld_cnt / n,
    # unnamed data-frame results are spliced in as separate columns
    summarise_distribution(day_domains, "daydom"),
    summarise_distribution(sld_length, "sldlen"),
    summarise_distribution(gibb_score, "gibbs"),
    summarise_distribution(pattern_domain_count, "pdcnt"),
    summarise_distribution(reg_arpt, "regarpt")
  )

# Majority value and distinct count of reseller_country per tld-reseller.
country_df <- expiry_df_test_preds_g %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    country_maj = reseller_country[n == max(n)][1],
    country_cnt = n_distinct(reseller_country)
  )

# Majority value and distinct count of region per tld-reseller.
region_df <- expiry_df_test_preds_g %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    region_maj = region[n == max(n)][1],
    region_cnt = n_distinct(region)
  )

# Attach by key (not by position), keeping the original column order.
geo_extra_df <- country_df %>%
  inner_join(region_df, by = "tld_registrar_index") %>%
  select(tld_registrar_index, country_maj, region_maj, country_cnt, region_cnt)

meta_df <- meta_df %>%
  left_join(geo_extra_df, by = "tld_registrar_index")

# Keep the stand-alone vectors this cell used to define, now row-aligned
# with meta_df.
country_maj <- meta_df$country_maj
country_cnt <- meta_df$country_cnt
region_maj <- meta_df$region_maj
region_cnt <- meta_df$region_cnt

# JOIN preds metrics with pred meta

In [None]:
# Full outer join of meta-features and performance metrics on the tld-reseller
# key. merge() has no `on` argument (it was silently swallowed by `...`, and
# the join fell back to "all shared column names"); `by` states the key
# explicitly.
metametrics_df <- merge(meta_df, metrics_df, by = 'tld_registrar_index', all = TRUE)

In [None]:
dim(metametrics_df)


# ADD win flags for seg2_glm vs. seg2_glm_fb vs. agg_rf_ALL


In [None]:
# Metric columns competing for the "win" flag (printed for reference).
(auc_vars <- c('auc_seg2_glm','auc_agg_rf_ALL','auc_seg2_glm_fb'))
(l10_vars <- c('l10_seg2_glm','l10_agg_rf_ALL','l10_seg2_glm_fb'))

# For each row of `score_cols`, the name of the column holding the largest
# value; NA when which.max() finds no maximum for the row.
name_of_row_max <- function(score_cols) {
  winners <- apply(score_cols, 1, function(row) names(row)[which.max(row)])
  sapply(winners, function(s) if (length(s) == 0) NA else paste(s, collapse = " "))
}

# Flag, per tld-reseller, which of the three candidates wins on auc and l10.
metametrics_df <- metametrics_df %>%
  mutate(
    auc_win_04 = name_of_row_max(.[, c(auc_vars)]),
    l10_win_04 = name_of_row_max(.[, c(l10_vars)])
  )

In [None]:
# Convert every list column to a plain numeric column.
list_col_names <- names(metametrics_df)[vapply(metametrics_df, is.list, logical(1))]
for (col_name in list_col_names) {
  metametrics_df[[col_name]] <- as.numeric(metametrics_df[[col_name]])
}
dim(metametrics_df)

In [None]:
write.csv(metametrics_df, "../../data/output/metametrics_df.csv", row.names=FALSE)

In [None]:
metametrics_df <- read.csv("../../data/output/metametrics_df.csv")

# IMPUTE missing values

In [None]:
# remove observations with missing wins -- we don't want to impute these dependent variables 
metametrics_df <- metametrics_df %>% filter(!is.na(auc_win_04))

In [None]:
# install.packages("missRanger")
library(missRanger)

In [None]:
metametrics_imp_df <- missRanger(metametrics_df, num.trees = 100)

# Train Models 

In [None]:
# Predictor table shared by both meta-models: everything except the metric
# columns (auc* / l10*), the tld-reseller key and tld_rat.
meta_predictors <- metametrics_imp_df %>%
  select(-contains('auc'), -contains('l10'), -'tld_registrar_index', -'tld_rat')

# ---- l10 winner model ----

# Inverse-frequency class weights, normalised to sum to one across classes,
# then expanded to one weight per observation.
Y <- metametrics_imp_df$l10_win_04
w <- 1 / table(Y)
w <- w / sum(w)

weights <- numeric(length(Y))
for (cls in names(w)) {
  weights[Y == cls] <- w[[cls]]
}

model_l10 <- ranger(
  formula = l10_win_04 ~ .,
  data = bind_cols(
    metametrics_imp_df %>% select('l10_win_04'),
    meta_predictors
  ),
  importance = 'impurity',
  num.trees = 500,
  probability = TRUE,
  replace = FALSE,
  sample.fraction = .8,
  seed = 123,
  respect.unordered.factors = TRUE,
  case.weights = weights
)

# ---- auc winner model ----

# Same weighting scheme, recomputed for the auc target.
Y <- metametrics_imp_df$auc_win_04
w <- 1 / table(Y)
w <- w / sum(w)

weights <- numeric(length(Y))
for (cls in names(w)) {
  weights[Y == cls] <- w[[cls]]
}

model_auc <- ranger(
  formula = auc_win_04 ~ .,
  data = bind_cols(
    metametrics_imp_df %>% select('auc_win_04'),
    meta_predictors
  ),
  importance = 'impurity',
  num.trees = 500,
  probability = TRUE,
  replace = FALSE,
  sample.fraction = .8,
  seed = 123,
  respect.unordered.factors = TRUE,
  case.weights = weights
)
                              

# Load new data pull, create meta-features

In [None]:
# Load the new data pull. The path is built from outputDir (as in the load at
# the top of the notebook) instead of repeating the directory as a literal,
# so both loads always point at the same data pull.
expiry_new_df <- readRDS(file.path(outputDir, "expiry_20200902_20201102_20201127"))

In [None]:
# Prepare the new data pull in a single pipeline.
expiry_new_df <- expiry_new_df %>%
  # strip out data that is within 50 days of data pull -- incomplete
  # from Parag: "... are stil in agp so we don’t know their final renewal status
  #             So you cannot use those domains to check the actual renewal status"
  filter(expiry_date < "2020-10-08") %>%
  # remove missing gibb_score, etc.
  filter(!is.na(gibb_score)) %>%
  # add necessary columns; non-positive reg_arpt is floored at 0.0001 so the
  # log is defined
  mutate(
    reg_arpt = ifelse(reg_arpt <= 0, 0.0001, reg_arpt),
    log_reg_arpt = log(reg_arpt),
    tld_registrar_index = tolower(paste0(tld, reseller))
  ) %>%
  # supplement with geo information
  geo_suppl(geoLookupDF = geoLookupDF)

In [None]:
# engineer metadata
#
# Tld-reseller level meta-features for the new data pull, built the same way
# as the training meta-features.
#
# Fixes relative to the previous version of this cell:
#   * country_maj / region_maj used to be pulled after arrange(desc(n)) and
#     then attached positionally, so they were ordered by group size rather
#     than by tld_registrar_index and did not line up with new_meta_df's rows.
#     They are now joined on the key.
#   * region_cnt counted distinct reseller_country; it now counts region.

# Seven summary statistics of one numeric column, returned as a one-row data
# frame named "<prefix>_min", "<prefix>_max", "<prefix>_mean", "<prefix>_rng",
# "<prefix>_std", "<prefix>_skew", "<prefix>_kurt".
# min/max deliberately keep the original behaviour of NOT dropping NAs, so a
# group containing a missing value yields NA for min, max and rng.
summarise_distribution <- function(x, prefix) {
  lo <- min(x)
  hi <- max(x)
  stats <- data.frame(
    min = lo,
    max = hi,
    mean = mean(x, na.rm = TRUE),
    rng = hi - lo,
    std = sd(x, na.rm = TRUE),
    skew = skewness(x, na.rm = TRUE),
    kurt = kurtosis(x, na.rm = TRUE)
  )
  setNames(stats, paste(prefix, names(stats), sep = "_"))
}

new_meta_df <- expiry_new_df %>%
  # n = rows per (tld-reseller, geo); used to find the majority geo
  add_count(tld_registrar_index, reseller_geo) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    geo_maj = reseller_geo[n == max(n)][1],
    geo_cnt = n_distinct(reseller_geo),
    # from here on n is the number of rows of the tld-reseller
    n = n(),
    ren_prp = sum(renewal_status == "Renewed") / n,
    tld_cnt = n_distinct(tld),
    tld_rat = tld_cnt / n,
    # unnamed data-frame results are spliced in as separate columns
    summarise_distribution(day_domains, "daydom"),
    summarise_distribution(sld_length, "sldlen"),
    summarise_distribution(gibb_score, "gibbs"),
    summarise_distribution(pattern_domain_count, "pdcnt"),
    summarise_distribution(reg_arpt, "regarpt")
  )

# Majority value and distinct count of reseller_country per tld-reseller.
country_df <- expiry_new_df %>%
  add_count(tld_registrar_index, reseller_country) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    country_maj = reseller_country[n == max(n)][1],
    country_cnt = n_distinct(reseller_country)
  )

# Majority value and distinct count of region per tld-reseller.
region_df <- expiry_new_df %>%
  add_count(tld_registrar_index, region) %>%
  group_by(tld_registrar_index) %>%
  summarise(
    region_maj = region[n == max(n)][1],
    region_cnt = n_distinct(region)
  )

# Attach by key (not by position), keeping the original column order.
geo_extra_df <- country_df %>%
  inner_join(region_df, by = "tld_registrar_index") %>%
  select(tld_registrar_index, country_maj, region_maj, country_cnt, region_cnt)

new_meta_df <- new_meta_df %>%
  left_join(geo_extra_df, by = "tld_registrar_index")

# Keep the stand-alone vectors this cell used to define, now row-aligned
# with new_meta_df.
country_maj <- new_meta_df$country_maj
country_cnt <- new_meta_df$country_cnt
region_maj <- new_meta_df$region_maj
region_cnt <- new_meta_df$region_cnt

In [None]:
# Share of missing values in each column of new_meta_df, as a one-column
# matrix. The share is taken over new_meta_df's own rows (one per
# tld-reseller); dividing by nrow(expiry_new_df), the domain-level table,
# understated every proportion.
new_meta_df %>%
  summarise(across(everything(), ~ round(mean(is.na(.x)), 3))) %>%
  t()

In [None]:
new_meta_imp_df <- missRanger(new_meta_df, num.trees = 100)

# Predict modeltype for new_meta_df

In [None]:
# For each row of a class-probability table, the name of the column with the
# highest probability; NA when which.max() finds no maximum for the row.
top_class_per_row <- function(prob_df) {
  winners <- apply(prob_df, 1, function(p) names(p)[which.max(p)])
  sapply(winners, function(s) if (length(s) == 0) NA else paste(s, collapse = " "))
}

# l10: class probabilities from the l10 meta-model plus the winning class
l10_prob_df <- as.data.frame(
  predict(model_l10, data = new_meta_imp_df, type = "response")$predictions
)
new_pred_l10 <- l10_prob_df %>%
  mutate(l10_win_04_pred_model = top_class_per_row(l10_prob_df))

# auc: class probabilities from the auc meta-model plus the winning class
auc_prob_df <- as.data.frame(
  predict(model_auc, data = new_meta_imp_df, type = "response")$predictions
)
new_pred_auc <- auc_prob_df %>%
  mutate(auc_win_04_pred_model = top_class_per_row(auc_prob_df))

    

In [None]:
# Append the two predicted model assignments to the (un-imputed) meta-features,
# naming the new columns directly instead of renaming them by position.
new_metametrics_imp_pred_df <- data.frame(
  new_meta_df,
  l10_win_04_pred_model = new_pred_l10$l10_win_04_pred_model,
  auc_win_04_pred_model = new_pred_auc$auc_win_04_pred_model,
  check.names = FALSE
)

In [None]:
# Drop the metric prefix so both columns hold plain model names
# (e.g. "l10_seg2_glm" -> "seg2_glm").
new_metametrics_imp_pred_df <- new_metametrics_imp_pred_df %>%
  mutate(
    l10_win_04_pred_model = gsub("l10_", "", l10_win_04_pred_model),
    auc_win_04_pred_model = gsub("auc_", "", auc_win_04_pred_model)
  )

In [None]:
dim(new_metametrics_imp_pred_df)
head(new_metametrics_imp_pred_df)

In [None]:
table(new_metametrics_imp_pred_df$l10_win_04_pred_model)
table(new_metametrics_imp_pred_df$auc_win_04_pred_model)

In [None]:
# write.csv(new_metametrics_imp_pred_df, "../../data/output/new_metametrics_imp_pred_df.csv", row.names=FALSE)

new_metametrics_imp_pred_df <- read.csv( "../../data/output/new_metametrics_imp_pred_df.csv")

# Compute predictions based on assignment

In [None]:
# Build the tld-reseller -> model lookup.
# The l10-based and auc-based assignments are stacked into one long `value`
# column and de-duplicated, so a tld-reseller appears once when both criteria
# pick the same model and twice when they disagree.
# NOTE(review): downstream cells iterate over unique(value) and index by
# tld_registrar_index; a tld-reseller listed under two models is therefore
# scored by both -- confirm this is intended.
tld_re_model_lookup <- new_metametrics_imp_pred_df %>% 
   select(tld_registrar_index, l10_win_04_pred_model, auc_win_04_pred_model) %>%
   melt(id.vars = c("tld_registrar_index"), variable.name = "model") %>%
   select (tld_registrar_index, value) %>% distinct() 

In [None]:
dim(tld_re_model_lookup)
head(tld_re_model_lookup)

In [None]:
head(expiry_new_df)

In [None]:
test_list = split(expiry_new_df, expiry_new_df$tld_registrar_index)
length(test_list)
test_list[[1]] %>% head()

In [None]:
# Trial run of "predict with the assigned model" on a random sample of
# tld-reseller/model assignments from the new data pull.
#
# Changes relative to the previous version of this cell:
#   * the fallback branch now scores the NEW-pull rows of its tld-resellers;
#     it used to run the fallback on expiry_df_test_preds_g (the old test set)
#     and cbind() the result onto new-pull rows, which neither matched in row
#     count nor in row order. As a consequence this cell no longer adds
#     pred_seg2_glm_fb* columns to expiry_df_test_preds_g.
#   * preds_meta is built only from the model frames produced in this run
#     (a sampled lookup need not contain all three models).
#   * the sample size is capped at the lookup size, the output directory is
#     created if missing, and seq_along() replaces 1:length().

# Sample at most 20 lookup rows (sample_n() errors when asked for more rows
# than exist).
tld_re_model_lookup_sub <- tld_re_model_lookup %>%
  sample_n(size = min(20, nrow(tld_re_model_lookup)))
# Character keys are required: indexing test_list with a factor would select
# by integer level code rather than by name.
tld_re_model_lookup_sub$tld_registrar_index <- as.character(tld_re_model_lookup_sub$tld_registrar_index)
tld_re_list <- tld_re_model_lookup_sub %>% pull(tld_registrar_index)
expiry_new_df_sub <- expiry_new_df %>% filter(tld_registrar_index %in% tld_re_list)
test_list <- split(expiry_new_df_sub, expiry_new_df_sub$tld_registrar_index)

# Prediction files are written below outputDir/meta_preds.
meta_preds_dir <- file.path(outputDir, 'meta_preds')
dir.create(meta_preds_dir, recursive = TRUE, showWarnings = FALSE)

# Reset the per-model result frames so stale objects from an earlier run of
# this cell cannot leak into preds_meta.
preds_agg_rf_ALL_df <- NULL
preds_seg2_glm_ALL_df <- NULL
preds_seg2_glm_fb_df <- NULL

for (model in unique(as.character(tld_re_model_lookup_sub$value))) {
  # an NA model name cannot be dispatched on
  if (is.na(model)) next

  tld_registrar_list <- tld_re_model_lookup_sub %>%
    filter(value == model) %>%
    pull(tld_registrar_index)

  if (model == 'agg_rf_ALL') {
    cat("\n\nPredicting model_agg_rf_ALL for",length(tld_registrar_list),"tld-re's \n")
    load(file.path(modelDir, 'model_agg_rf_ALL.Rdata'))
    preds_agg_rf_ALL <- lapply(
      tld_registrar_list,
      function(tld_reseller_str) pred_agg_rf(model_agg_rf_ALL,
                                             test_list,
                                             tld_reseller_str)
    )
    # free the model before moving on
    rm(model_agg_rf_ALL)
    gc()

    save(preds_agg_rf_ALL, file = file.path(meta_preds_dir, 'preds_agg_rf_ALL.RData'))
    # NOTE(review): assumes pred_agg_rf() returns one row per input row, in
    # input order -- confirm in functions_metalearning.R.
    preds_agg_rf_ALL_df <- cbind(rbindlist(test_list[tld_registrar_list], use.names = TRUE),
                                 rbindlist(preds_agg_rf_ALL, use.names = TRUE))
    preds_agg_rf_ALL_df$model <- 'preds_agg_rf_ALL'
  }

  if (model == 'seg2_glm') {
    cat("\n\nPredicting model_seg2_glm_ALL for",length(tld_registrar_list),"tld-re's\n")
    # load every per-tld-reseller glm into the global environment, where
    # pred_seg2_glm() is expected to look them up
    lapply(Sys.glob(file.path(modelDir, 'model_seg2_glm_*')), load, .GlobalEnv)
    preds_seg2_glm_ALL <- lapply(
      tld_registrar_list,
      function(tld_reseller_str) pred_seg2_glm(
        test_list,
        tld_reseller_str)
    )
    rm(list = ls(pattern = '^model_seg2_glm_'))
    save(preds_seg2_glm_ALL, file = file.path(meta_preds_dir, 'preds_seg2_glm_ALL.RData'))
    preds_seg2_glm_ALL_df <- cbind(rbindlist(test_list[tld_registrar_list], use.names = TRUE),
                                   rbindlist(preds_seg2_glm_ALL, use.names = TRUE))
    preds_seg2_glm_ALL_df$model <- 'preds_seg2_glm_ALL'
  }

  if (model == 'seg2_glm_fb') {
    cat("\n\nPredicting model_seg2_glm_fb for",length(tld_registrar_list),"tld-re's\n")
    # generate list of fallback tables
    npv_fallback_list <- fallback_gen(npv_historic_renewal_data = expiry_df_train_g,
                                      reseller_am_geo_map = geoLookupDF)

    # return list members to in-memory objects of the same name
    for (i in seq_along(npv_fallback_list)) {
      assign(names(npv_fallback_list)[i], npv_fallback_list[[i]])
    }

    # kept in case fallback_app_1() reads this global -- TODO confirm
    tld_registrar_excl_list <- tld_registrar_list

    # New-pull rows of the tld-resellers assigned to the fallback, with an
    # all-NA placeholder column for fallback_app_1() to fill.
    fb_rows <- as.data.frame(rbindlist(test_list[tld_registrar_list], use.names = TRUE))
    fb_base_cols <- names(fb_rows)
    fb_rows$pred_seg2_glm_fb <- NA

    # apply fallback tables (creating col pred_seg2_glm_fb2)
    fb_rows <- as.data.frame(
      fallback_app_1(test_data_op = fb_rows,
                     in_col = 'pred_seg2_glm_fb',
                     out_col = 'pred_seg2_glm_fb2')
    )

    # Actual/predicted are read from the very frame fallback_app_1() returned,
    # so they stay aligned even if that helper reorders rows.
    preds_seg2_glm_fb <- data.frame(actual = fb_rows$renewal_status,
                                    predicted = fb_rows$pred_seg2_glm_fb2,
                                    stringsAsFactors = FALSE)

    save(preds_seg2_glm_fb, file = file.path(meta_preds_dir, 'preds_seg2_glm_fb.RData'))
    # restrict to the original data columns so the frame lines up with the
    # other models' frames
    preds_seg2_glm_fb_df <- cbind(fb_rows[, fb_base_cols, drop = FALSE],
                                  preds_seg2_glm_fb)
    preds_seg2_glm_fb_df$model <- 'preds_seg2_glm_fb'
  }
}

# Stack whichever model frames were produced in this run.
preds_meta <- rbindlist(
  Filter(Negate(is.null),
         list(preds_agg_rf_ALL_df, preds_seg2_glm_ALL_df, preds_seg2_glm_fb_df)),
  use.names = TRUE
)

In [None]:
preds_meta

In [None]:
tld_registrar_list
tld_registrar_excl_list

In [None]:
dim(preds_seg2_glm_fb)

In [None]:
cbind(rbindlist(test_list[tld_registrar_list], use.names=TRUE), 
                                      preds_seg2_glm_fb)

In [None]:
preds_meta

In [None]:
pred_seg2_glm(
                   test_list, 
                   tld_reseller_str)
               

In [None]:
preds_seg2_glm_ALL[[2]]

In [None]:
tld_registrar_list[[6]]

In [None]:
test_list[tld_registrar_list[[6]]]

In [None]:
test_list %>% filter(tld_registrar_index == 'spacecloudflare, inc.')

In [None]:
preds_seg2_glm_ALL[[6]]

In [None]:
# lapply(Sys.glob(file.path(modelDir,'model_seg2_glm_*')),load,.GlobalEnv)

In [None]:
# ls()

In [None]:
tld_registrar_list

In [None]:
tld_registrar_list[[6]]

In [None]:
pred_seg2_glm(
                   test_list, 
                   'spacecloudflare, inc.'
               )

In [None]:
tld_reseller_str='spacecloudflare, inc.'

In [None]:
# pred_seg2_glm <- function(test_list, tld_reseller_str) {
    
    print(tld_reseller_str)
    # seg2 glm (tld-reseller-segmented glm)

    test_list_tld_reseller = test_list[tld_reseller_str]
    test_df_tld_reseller =  rbindlist(test_list_tld_reseller,use.names=TRUE)
    
    # if test data contains no observations, skip!
    if ((dim(test_df_tld_reseller)[1]==0)){
        pred_df_seg2_glm = NA
    } else {
        
        model_name <- paste0('model_seg2_glm_',str_replace_all(tld_reseller_str, "[^[:alnum:]]", ""))
        
        if ((!exists(model_name))){
            pred_df_seg2_glm = data.frame("actual" = rep(NA, nrow(test_df_tld_reseller)),
                              "predicted" = rep(NA, nrow(test_df_tld_reseller)))
        } else{
            model <- get(model_name)
            pred = mass_predict_first_renewal(test_list_tld_reseller, model)
    
            pred_df_seg2_glm = data.frame("actual" = pred$renewal_status,
                                      "predicted" = pred$first_renewal_prediction)
        }

        
    }
    
#     return(pred_df_seg2_glm)
    
# }

In [None]:
pred_df_seg2_glm = data.frame("actual" = rep(NA, nrow(test_df_tld_reseller)),
                              "predicted" = rep(NA, nrow(test_df_tld_reseller)))

In [None]:
pred_df_seg2_glm

In [None]:
tld_reseller_str = 'storeonlinenic'
test = pred_seg2_glm(
                   test_list, 
                   tld_reseller_str)
test

In [None]:
Sys.glob(file.path(modelDir,'model_seg2_glm_*'))

In [None]:
lapply(Sys.glob(file.path(modelDir,'model_seg2_glm_*')),load,.GlobalEnv)

In [None]:
# write.csv(preds_agg_rf_ALL_df, "../../data/output/preds_agg_rf_ALL_df.csv", row.names=FALSE)
# write.csv(preds_seg2_glm_ALL_df, "../../data/output/preds_seg2_glm_ALL_df.csv", row.names=FALSE)
# write.csv(preds_seg2_glm_fb_df, "../../data/output/preds_seg2_glm_fb_df.csv", row.names=FALSE)
# write.csv(tld_re_model_lookup_sub, "../../data/output/tld_re_model_lookup_sub.csv", row.names=FALSE)


In [None]:
# preds_agg_rf_ALL_df <- read.csv("../../data/output/preds_agg_rf_ALL_df.csv")
# preds_seg2_glm_ALL_df <- read.csv( "../../data/output/preds_seg2_glm_ALL_df.csv")
# preds_seg2_glm_fb_df <- read.csv("../../data/output/preds_seg2_glm_fb_df.csv")
# tld_re_model_lookup_sub <- read.csv("../../data/output/tld_re_model_lookup_sub.csv")

In [None]:
head(preds_meta)

# Troubleshoot script

In [None]:
# pred_select: score the tld-registrars that were assigned the fallback model.
#
# Builds a tld-registrar -> model lookup from the two predicted-model columns
# of `new_metametrics_imp_pred_df`, then, for the tld-registrars assigned to
# 'seg2_glm_fb', applies the fallback tables to the matching rows of
# `expiry_new_df`. The branches for 'agg_rf_ALL' and 'seg2_glm' are currently
# disabled (kept below as commented-out code).
#
# Arguments:
#   expiry_new_df               rows to score; must contain `tld_registrar_index`.
#                               `renewal_status` is selected from the
#                               geo-supplemented copy, so it is presumably
#                               expected here too (confirm against geo_suppl).
#   new_metametrics_imp_pred_df must contain `tld_registrar_index`,
#                               `l10_win_04_pred_model`, `auc_win_04_pred_model`.
#   dataDir, modelDir, outputDir
#                               directories; only referenced by the disabled
#                               branches / commented-out writes.
#
# Returns: the fallback subset of `expiry_new_df` with columns `actual`,
#   `predicted` and `model` ('preds_seg2_glm_fb') appended.
#
# Errors: stops if no tld-registrar is assigned to 'seg2_glm_fb'.
#
# Side effects / hidden inputs:
#   - `expiry_df_train_g` and `geoLookupDF` are NOT parameters; they are
#     resolved through normal scoping from outside this function.
#   - every element of the list returned by fallback_gen() is copied into the
#     global environment via list2env().
pred_select <- function (expiry_new_df,
                         new_metametrics_imp_pred_df,
                         dataDir='/home/jupyter/Domains_202003/data/output/datapull_20201116',
                         modelDir='/home/jupyter/Domains_202003/data/output/models_20201104',
                         outputDir='/home/jupyter/Domains_202003/data/output/datapull_20201127'
                      ){

    # Only the disabled branches below consume test_list, so the split is
    # disabled with them; re-enable it together with those branches.
    # test_list = split(expiry_new_df, expiry_new_df$tld_registrar_index)

    # Long-format lookup: one row per distinct (tld_registrar_index, model) pair
    # taken from both predicted-model columns.
    tld_re_model_lookup <- new_metametrics_imp_pred_df %>%
        select(tld_registrar_index, l10_win_04_pred_model, auc_win_04_pred_model) %>%
        melt(id.vars = c("tld_registrar_index"), variable.name = "model") %>%
        select(tld_registrar_index, value) %>%
        distinct()
    tld_re_model_lookup$tld_registrar_index <- as.character(tld_re_model_lookup$tld_registrar_index)

    # Start from NULL so that a lookup without any 'seg2_glm_fb' assignment is
    # detected below instead of failing with "object not found" or silently
    # picking up a same-named object left in the global environment.
    preds_seg2_glm_fb_df <- NULL

    for (model in unique(tld_re_model_lookup$value)){
        # An NA assignment would make the `if (model == ...)` tests below error.
        if (is.na(model)) {
            next
        }
        tld_registrar_list <- tld_re_model_lookup %>% filter(value==model) %>% pull(tld_registrar_index)

#         if (model == 'agg_rf_ALL'){
#             cat("\n\nPredicting model_agg_rf_ALL for",length(tld_registrar_list),"tld-re's \n")
#             load(file.path(modelDir, 'model_agg_rf_ALL.Rdata'))
#             preds_agg_rf_ALL = lapply(tld_registrar_list, 
#                    function(tld_reseller_str) pred_agg_rf(model_agg_rf_ALL, 
#                                                           test_list, 
#                                                           tld_reseller_str)
#                    )
#             rm(model_agg_rf_ALL)
#             gc() 

#             save(preds_agg_rf_ALL, file=file.path(outputDir, 'meta_preds', 'preds_agg_rf_ALL.RData'))
#             preds_agg_rf_ALL_df <- cbind(rbindlist(test_list[tld_registrar_list], use.names=TRUE), 
#                                          rbindlist(preds_agg_rf_ALL, use.names=TRUE))
#             preds_agg_rf_ALL_df$model <- 'preds_agg_rf_ALL'


#         }

#         if (model == 'seg2_glm'){
#             cat("\n\nPredicting model_seg2_glm_ALL for",length(tld_registrar_list),"tld-re's\n")
#             lapply(Sys.glob(file.path(modelDir,'model_seg2_glm_*')),load,.GlobalEnv)
#             preds_seg2_glm_ALL = lapply(tld_registrar_list, 
#                    function(tld_reseller_str) pred_seg2_glm(
#                        test_list, 
#                        tld_reseller_str)
#                    )
#             rm(list=ls(pattern='^model_seg2_glm_'))    
#             save(preds_seg2_glm_ALL, file=file.path(outputDir, 'meta_preds', 'preds_seg2_glm_ALL.RData'))          
#             preds_seg2_glm_ALL_df <- cbind(rbindlist(test_list[tld_registrar_list], use.names=TRUE), 
#                                          rbindlist(preds_seg2_glm_ALL, use.names=TRUE))
#             preds_seg2_glm_ALL_df$model <- 'preds_seg2_glm_ALL'
#         }

        if (model == 'seg2_glm_fb'){

            cat("\n\nPredicting model_seg2_glm_fb for",length(tld_registrar_list),"tld-re's\n")
            # generate list of fallback tables
            npv_fallback_list <- fallback_gen(npv_historic_renewal_data = expiry_df_train_g,
                                              reseller_am_geo_map = geoLookupDF)

            # Publish each fallback table as a global object of the same name.
            # NOTE(review): presumably fallback_app_1 looks the tables up there,
            # since they are not passed to it explicitly -- confirm.
            list2env(npv_fallback_list, envir = .GlobalEnv)

            # subset expiry_new_df to only include tld-re's for fallback
            expiry_new_df_sample <- expiry_new_df %>% filter(tld_registrar_index %in% tld_registrar_list)

            # geo suppl for fallback app
            preds_seg2_glm_fb <- geo_suppl(expiry_new_df_sample, geoLookupDF = geoLookupDF)

            # placeholder (*_fb) column, all NA, that the fallback step reads as in_col
            preds_seg2_glm_fb <- preds_seg2_glm_fb %>% mutate(pred_seg2_glm_fb = NA)

            # apply fallback tables (creating col *_fb2)
            preds_seg2_glm_fb <- fallback_app_1(test_data_op=preds_seg2_glm_fb,
                                                in_col='pred_seg2_glm_fb',
                                                out_col='pred_seg2_glm_fb2')

            preds_seg2_glm_fb <- preds_seg2_glm_fb %>% select(renewal_status, pred_seg2_glm_fb2)
            names(preds_seg2_glm_fb) <- c('actual','predicted')

            # cbind pairs rows by position, so the helpers must not add or drop rows.
            # NOTE(review): this does not detect a re-ordering inside geo_suppl /
            # fallback_app_1 -- confirm they preserve row order.
            stopifnot(nrow(preds_seg2_glm_fb) == nrow(expiry_new_df_sample))

            # save(preds_seg2_glm_fb, file=file.path(outputDir, 'meta_preds', 'preds_seg2_glm_fb.RData'))
            preds_seg2_glm_fb_df <- cbind(expiry_new_df_sample,
                                          preds_seg2_glm_fb)

            preds_seg2_glm_fb_df$model <- 'preds_seg2_glm_fb'

        }
    }

    if (is.null(preds_seg2_glm_fb_df)) {
        stop("pred_select: no tld-registrar is assigned to model 'seg2_glm_fb'; nothing to predict",
             call. = FALSE)
    }

#     preds_meta <- rbind(preds_agg_rf_ALL_df, preds_seg2_glm_ALL_df, preds_seg2_glm_fb_df)
    preds_meta <- preds_seg2_glm_fb_df
#     write.csv(preds_meta, file.path(outputDir, 'preds_select','preds.csv'))
    return(preds_meta)

}

In [None]:
# load(file.path(outputDir, 'meta_preds','expiry_new_df.RData'))
# load(file.path(outputDir, 'meta_preds','new_metametrics_imp_pred_df.RData'))

In [None]:


# Run pred_select on the sampled expiry data with the session's directories.
# Note: expiry_new_df_sample is assigned in a later cell of this notebook, so
# that cell has to be run first.
preds_meta <- pred_select(
  expiry_new_df_sample,
  new_metametrics_imp_pred_df,
  dataDir = dataDir,
  modelDir = modelDir,
  outputDir = outputDir
)

In [None]:
# Preview the first rows of the pred_select output.
head(preds_meta)

In [None]:
# Manual, top-level walk-through of the 'seg2_glm_fb' branch of pred_select on
# a 1% random sample of expiry_new_df. Every intermediate stays in the global
# environment so it can be inspected from other cells.

# 1% random sample of the rows to score
expiry_new_df_sample <- sample_n(expiry_new_df, 0.01 * nrow(expiry_new_df))

# distinct (tld_registrar_index, model) pairs from both predicted-model columns
tld_re_model_lookup <- new_metametrics_imp_pred_df %>%
  select(tld_registrar_index, l10_win_04_pred_model, auc_win_04_pred_model) %>%
  melt(id.vars = c("tld_registrar_index"), variable.name = "model") %>%
  select(tld_registrar_index, value) %>%
  distinct()
tld_re_model_lookup$tld_registrar_index <-
  as.character(tld_re_model_lookup$tld_registrar_index)

# tld-registrars assigned to the fallback model
model <- "seg2_glm_fb"
tld_registrar_list <- tld_re_model_lookup %>%
  filter(value == model) %>%
  pull(tld_registrar_index)

cat("\n\nPredicting model_seg2_glm_fb for", length(tld_registrar_list), "tld-re's\n")

# generate the fallback tables ...
npv_fallback_list <- fallback_gen(
  npv_historic_renewal_data = expiry_df_train_g,
  reseller_am_geo_map = geoLookupDF
)

# ... and publish each list member as a global object of the same name
# (a for/assign loop over the list did not work when run as a script)
list2env(npv_fallback_list, envir = .GlobalEnv)

# keep only the sampled rows whose tld-re is handled by the fallback
tld_registrar_excl_list <- tld_registrar_list
expiry_new_df_sample <- expiry_new_df_sample %>%
  filter(tld_registrar_index %in% tld_registrar_excl_list)

# geo supplement, plus an all-NA placeholder column (*_fb) for the fallback step
preds_seg2_glm_fb <- geo_suppl(expiry_new_df_sample, geoLookupDF = geoLookupDF) %>%
  mutate(pred_seg2_glm_fb = NA)

# apply the fallback tables (creates *_fb2) and keep actual vs. predicted only
preds_seg2_glm_fb <- fallback_app_1(
  test_data_op = preds_seg2_glm_fb,
  in_col = "pred_seg2_glm_fb",
  out_col = "pred_seg2_glm_fb2"
) %>%
  select(renewal_status, pred_seg2_glm_fb2)
names(preds_seg2_glm_fb) <- c("actual", "predicted")

# save(preds_seg2_glm_fb, file=file.path(outputDir, 'meta_preds', 'preds_seg2_glm_fb.RData'))

# attach the predictions to the sampled rows (paired by position) and tag them
preds_seg2_glm_fb_df <- cbind(expiry_new_df_sample, preds_seg2_glm_fb)
preds_seg2_glm_fb_df$model <- "preds_seg2_glm_fb"

In [None]:
# Preview the first rows of the fallback predictions.
head(preds_seg2_glm_fb_df)

In [None]:
# tld_registrar_str = 'funafrihost'

In [None]:
# Restore the saved seg2_glm predictions; load() returns the names of the
# objects it restored, kept here in objname.
objname <- load(
  file.path(outputDir, "meta_preds", "preds_seg2_glm_ALL.RData")
)

In [None]:
# Show the object name(s) returned by the load() call.
objname

In [None]:
# Number of elements in preds_seg2_glm_ALL.
length(preds_seg2_glm_ALL)
# Total number of NA values found when is.na() is applied to each element.
# NOTE(review): presumably intended to count elements whose prediction failed;
# if the elements are data frames this counts NA cells instead -- confirm.
sum(sapply(preds_seg2_glm_ALL, is.na))

# Verify output

In [None]:
# Inspect meta_preds: dimensions, then the first rows. The read.csv reload is
# disabled, so meta_preds must already exist in the session.
# meta_preds <- read.csv(file.path(outputDir, 'meta_preds', 'meta_preds.csv'))
dim(meta_preds)
head(meta_preds)

In [None]:
# Number of tld-registrars per model in meta_preds: reduce to the unique
# (tld_registrar_index, model) pairs, then tabulate the model column.
# distinct() on the two columns replaces group_by() + select(model), which
# relied on dplyr implicitly re-adding the grouping column (with a message)
# and left the data grouped.
meta_preds %>%
  distinct(tld_registrar_index, model) %>%
  pull(model) %>%
  table()

In [None]:
# Count the lookup rows per assigned model (the `value` column).
tld_re_model_lookup %>% pull(value) %>% table()

In [None]:
# Count the rows of meta_preds per value of its `model` column.
table(meta_preds$model)

In [None]:
# Preview the first rows of the tld-registrar -> model lookup.
head(tld_re_model_lookup)

# QA script

In [16]:
# QA inputs: the model name, one tld-reseller key, and the same key with every
# non-alphanumeric character stripped (displayed for inspection).
model <- "model_seg2_rf_ALL"
tld_reseller_str <- "storezhuhai naisinike"
tld_registrar_str2 <- str_replace_all(
  tld_reseller_str,
  "[^[:alnum:]]",
  ""
)
tld_registrar_str2

In [17]:
# lapply(Sys.glob(file.path(modelDir,'model_seg2_rf_*')),load,.GlobalEnv)

In [None]:
# The original cell was an assignment with no right-hand side (a parse error).
# Completed with the same construction pred_select uses for test_list: one
# data frame per tld_registrar_index.
# NOTE(review): assumes expiry_new_df is the intended source here -- confirm.
test_list <- split(expiry_new_df, expiry_new_df$tld_registrar_index)

In [None]:
# Run pred_seg2_rf for the QA tld-reseller key.
# (The original cell had an unmatched extra ")" and did not parse.)
pred_seg2_rf(
  test_list,
  tld_reseller_str
)

In [None]:
    # Remove every object whose name starts with 'model_seg2_rf_' from the
    # current environment.
    rm(list=ls(pattern='^model_seg2_rf_'))