# Load & Analyze tld_reseller model performance comparison

In [3]:
# Raise the notebook display limits for tables: up to 50 columns and 100 rows.
options(repr.matrix.max.cols=50, repr.matrix.max.rows=100)

In [4]:
# install.packages("pkgcond")

In [5]:
library(dplyr)
library(data.table)
library(partykit)
library(tictoc)
library(caret)
library(e1071)
library(randomForest)
library(ranger)

#for 3d plotting
library(akima)
library(plotly)

# for prep data
library(rPython)
library(stringr)
library(pbapply)
library(stringdist)
library(data.table)
library(dominanceanalysis)

library(pkgcond)

getwd()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: grid

Loading required package: libcoin

Loading required package: mvtnorm

Loading required package: lattice

Loading required package: ggplot2

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:sta

In [6]:

# Project helper scripts, sourced one after another in this order.
helper_scripts <- c(
  "../orig/functions.R",
  "../orig/functions_models.R",
  "../phaseII_03_forest/functions_eval.R",
  "../phaseII_03_forest/load_prep_data_expiry.R"
)
for (script_path in helper_scripts) {
  source(script_path)
}

# Objects noted as available once the scripts above have run:
#   expiry_train_prepped_2_1  list, less df's w/ 0 obs
#   expiry_test_prepped_2_1   list, less df's w/ 0 obs
#   expiry_train_df_1         the train list above, rbound
#   expiry_test_df_1          the test list above, rbound
#   expiry_train_df_sub       subset vars
#   expiry_test_df_sub        subset vars

## List tld_reseller labels

In [7]:
# Count the rows of expiry_train_df_1 per (tld_registrar_index, reseller) pair
# and sort by that count, largest first.
# tally() peels off only the last grouping level, so the result is still
# grouped by tld_registrar_index.
tld_reseller_lookup_df <- expiry_train_df_1 %>%
  group_by(tld_registrar_index, reseller) %>%
  tally() %>%
  arrange(desc(n))
head(tld_reseller_lookup_df)

tld_registrar_index,reseller,n
<chr>,<chr>,<int>
sitegmo,gmo,106569
funalibaba,alibaba,72690
sitenamecheap,namecheap,53536
pwnamecheap,namecheap,50470
onlinego daddy,go daddy,42162
sitegandi sas,gandi sas,35309


## Load script results

In [8]:
# List the objects currently defined in the workspace before loading results.
ls()

In [9]:
# Load one saved results file and return the `return_list` object it contains.
#
# The file is still loaded into the global environment, exactly as a bare
# load() call would do. The difference is that the names reported by load()
# are checked: a file that does not contain `return_list` now stops with an
# error instead of silently reusing the `return_list` left over from the
# previously loaded file.
#
# path: path to a file written by save().
# Returns the freshly loaded `return_list`.
load_return_list <- function(path) {
  loaded_names <- load(path, envir = globalenv())
  if (!"return_list" %in% loaded_names) {
    stop("No object named 'return_list' found in ", path, call. = FALSE)
  }
  get("return_list", envir = globalenv())
}

tld_reseller_compare_list_0001_0817 <-
  load_return_list("../../data/tld_reseller_compare_list_0001_0817")
tld_reseller_compare_list_0001_0817_seg_rf <-
  load_return_list("../../data/tld_reseller_compare_list_0001_0817_seg_rf")
# The variable name (including its `.RData` suffix) is kept as-is because
# later cells refer to it by this name.
tld_reseller_compare_list_0818_1723.RData <-
  load_return_list("../../data/tld_reseller_compare_list_0818_1723.RData")

In [10]:
# Inspect the 0001-0817 results: number of entries, number of elements in the
# first entry, and the names of those elements.
length(tld_reseller_compare_list_0001_0817)
length(tld_reseller_compare_list_0001_0817[[1]])
names(tld_reseller_compare_list_0001_0817[[1]])

In [11]:
# Same inspection for the separately saved seg_rf results: number of entries,
# number of elements in the first entry, and the names of those elements.
length(tld_reseller_compare_list_0001_0817_seg_rf)
length(tld_reseller_compare_list_0001_0817_seg_rf[[1]])
names(tld_reseller_compare_list_0001_0817_seg_rf[[1]])

In [12]:
# Number of entries in the 0818-1723 results.
length(tld_reseller_compare_list_0818_1723.RData)

## Munge List of Lists to List of Dataframes

In [13]:
# Collapse one results entry (a list) into a single data frame.
#
# The elements of `list_elem` are column-bound with bind_cols().
#   * More than 6 resulting columns: keep columns 1, 2, 4, 6, 8, 10 and 12 and
#     name them "actual" followed by names(list_elem).
#     (Presumably each element contributes an actual/prediction column pair
#     and only the first `actual` is kept — TODO confirm against the script
#     that produced the lists.)
#   * Otherwise: add an all-NA `actual` column and move it to the front.
#
# list_elem: a list whose elements bind_cols() can combine column-wise.
# Returns the combined data frame with `actual` as its first column.
test_funct <- function(list_elem) {
  combined <- bind_cols(list_elem)

  if (ncol(combined) <= 6) {
    # No column to use as `actual` in this layout: fill with NA.
    combined$actual <- NA
    return(select(combined, actual, everything()))
  }

  kept <- combined[c(1, 2, 4, 6, 8, 10, 12)]
  names(kept) <- c("actual", names(list_elem))
  kept
}

In [14]:
# Apply test_funct to every entry of the 0001-0817 results, giving a list of
# data frames. suppress_messages() hides messages raised along the way
# (presumably bind_cols() column-renaming notices — confirm).
suppress_messages(compare_df_list_0001_0817 <- lapply(tld_reseller_compare_list_0001_0817, test_funct))

In [15]:
# Attach the separately saved seg_rf results to each 0001-0817 data frame and
# normalise the column layout.
#
# After cbind(), columns 1:5 are kept as actual / seg2_glm / seg_glm /
# agg_glm / seg2_rf, column 7 is kept as agg_rf and column 9 (from the seg_rf
# results) is kept as seg_rf; columns 6 and 8 are dropped. Selecting
# c(1:5, 9, 7) yields the final column order directly.
#
# seq_along() is used so an empty list runs zero iterations (1:length(x)
# would iterate over c(1, 0)).
# NOTE: not idempotent — re-running this cell appends the seg_rf columns again.
final_cols <- c("actual",
                "pred_df_seg2_glm", "pred_df_seg_glm", "pred_df_agg_glm",
                "pred_df_seg2_rf", "pred_df_seg_rf", "pred_df_agg_rf")
for (i in seq_along(tld_reseller_compare_list_0001_0817)) {
  merged <- cbind(compare_df_list_0001_0817[[i]],
                  tld_reseller_compare_list_0001_0817_seg_rf[[i]])
  merged <- merged[c(1:5, 9, 7)]
  names(merged) <- final_cols
  compare_df_list_0001_0817[[i]] <- merged
}

In [16]:
# Apply test_funct to every entry of the 0818-1723 results, giving a list of
# data frames. suppress_messages() hides messages raised along the way
# (presumably bind_cols() column-renaming notices — confirm).
suppress_messages(compare_df_list_0818_1723 <- lapply(tld_reseller_compare_list_0818_1723.RData, test_funct))

In [17]:
# Show the size of each partial list, then concatenate them (0001-0817 first,
# 0818-1723 second) and show the combined size.
length(compare_df_list_0001_0817)
length(compare_df_list_0818_1723)

compare_df_list_0001_1723 <- c(compare_df_list_0001_0817,
                               compare_df_list_0818_1723)
length(compare_df_list_0001_1723)


## Munge List of Dataframes to a single Dataframe

### Add lookup values to each dataframe in list

In [27]:
# Stamp each data frame in the list with the segment labels taken from the
# lookup table row at the same position: list element i receives the
# tld_registrar_index, reseller and n values of lookup row i.
# NOTE(review): this relies on the list order matching the row order of
# tld_reseller_lookup_df — confirm against the script that produced the lists.

# Fail early with a clear message if the lookup table is shorter than the list.
stopifnot(length(compare_df_list_0001_1723) <= nrow(tld_reseller_lookup_df))

# seq_along() runs zero iterations on an empty list (1:length(x) would not).
for (i in seq_along(compare_df_list_0001_1723)) {
    compare_df_list_0001_1723[[i]]$tld_registrar_index <- tld_reseller_lookup_df[[i, "tld_registrar_index"]]
    compare_df_list_0001_1723[[i]]$reseller <- tld_reseller_lookup_df[[i, "reseller"]]
    compare_df_list_0001_1723[[i]]$n <- tld_reseller_lookup_df[[i, "n"]]
}

In [28]:
# Preview the first data frame after the lookup columns have been added.
head(compare_df_list_0001_1723[[1]])

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<int>
1,Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569
2,Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569
3,Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569
4,Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569
5,Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569
6,Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569


### Add domain info

In [49]:
# Attach domain_id and domain to every per-segment data frame.
#
# The rows of expiry_test_df_1 that share the segment's tld_registrar_index
# are column-bound to the segment's predictions purely by position.
# cbind() silently recycles the shorter input when the row counts differ by a
# whole multiple, which would pair predictions with the wrong domains, so the
# row counts are checked explicitly first.
# NOTE(review): positional binding also assumes both sides are in the same
# row order — confirm against the script that produced the predictions.
# NOTE: not idempotent — re-running this cell appends the columns again.
for (i in seq_along(compare_df_list_0001_1723)) {
  segment_df <- compare_df_list_0001_1723[[i]]
  segment_key <- segment_df$tld_registrar_index[[1]]

  lookup_df <- expiry_test_df_1 %>%
    filter(tld_registrar_index == segment_key) %>%
    select(domain_id, domain)

  if (nrow(lookup_df) != nrow(segment_df)) {
    stop("Segment ", i, " (", segment_key, "): ", nrow(segment_df),
         " prediction rows but ", nrow(lookup_df),
         " matching rows in expiry_test_df_1; refusing to bind misaligned data",
         call. = FALSE)
  }

  compare_df_list_0001_1723[[i]] <- cbind(segment_df, lookup_df)
}

In [48]:
# Preview the first data frame after domain_id and domain have been attached.
head(compare_df_list_0001_1723[[1]])

Unnamed: 0_level_0,actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<chr>
1,Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569,44295183,kagen.site
2,Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569,44573611,designlab.site
3,Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569,45304858,hokatu-blog.site
4,Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569,46235129,suzuya.site
5,Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569,46276970,wins-company.site
6,Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569,47809960,yamatoku-company.site


### rbindlist

In [52]:
# Stack all per-segment data frames into one table with data.table::rbindlist().
compare_DF <- rbindlist(compare_df_list_0001_1723)

In [59]:
# Sanity check: the total number of rows across the list should equal the
# first element of dim(compare_DF).
# vapply() always returns an integer vector, including for an empty list,
# where sapply() would return list() and make sum() fail.
sum(vapply(compare_df_list_0001_1723, nrow, integer(1)))
dim(compare_DF)

In [60]:
# Preview the combined table.
head(compare_DF)

actual,pred_df_seg2_glm,pred_df_seg_glm,pred_df_agg_glm,pred_df_seg2_rf,pred_df_seg_rf,pred_df_agg_rf,tld_registrar_index,reseller,n,domain_id,domain
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<chr>
Not Renewd,0.6470992,0.558,0.419,0.4765771,0.4545657,0.5386137,sitegmo,gmo,106569,44295183,kagen.site
Renewed,0.6762222,0.567,0.397,0.5410605,0.5769973,0.6091513,sitegmo,gmo,106569,44573611,designlab.site
Renewed,0.9098495,0.835,0.41,0.7018108,0.6343404,0.688668,sitegmo,gmo,106569,45304858,hokatu-blog.site
Renewed,0.602935,0.515,0.402,0.2414726,0.3999603,0.3464197,sitegmo,gmo,106569,46235129,suzuya.site
Not Renewd,0.9311321,0.863,0.417,0.7749462,0.7039504,0.671174,sitegmo,gmo,106569,46276970,wins-company.site
Renewed,0.9347869,0.861,0.391,0.6376848,0.656674,0.6717237,sitegmo,gmo,106569,47809960,yamatoku-company.site


In [62]:
# Save the combined predictions table as CSV, without a row-name column.
write.csv(
  compare_DF,
  file = "../../data/output/tld_reseller_compare_predictions.csv",
  row.names = FALSE
)

In [95]:
# Show the first distinct (tld_registrar_index, n) pairs in the combined table.
head(unique(select(compare_DF, tld_registrar_index, n)))

tld_registrar_index,n
<chr>,<int>
sitegmo,106569
funalibaba,72690
sitenamecheap,53536
pwnamecheap,50470
onlinego daddy,42162
sitegandi sas,35309


## Output performance metrics
for each model:
- lift10
- auc

In [117]:
# lift10 per segment for each of the six models.
#
# For one prediction column, build a lift table for every data frame in
# compare_df_list_0001_1723 with chart_lift() and take the "lift" value from
# the first row of each table.
# (Presumably the first row is the top decile, per the "lift10" naming —
# confirm against chart_lift().)
first_row_lift <- function(pred_var) {
  lift_tables <- lapply(compare_df_list_0001_1723, chart_lift,
                        dep_var = "actual", pred_var = pred_var)
  unlist(lapply(lift_tables, function(lift_tbl) lift_tbl[[1, "lift"]]))
}

l10_seg2_glm <- first_row_lift("pred_df_seg2_glm")
l10_seg_glm  <- first_row_lift("pred_df_seg_glm")
l10_agg_glm  <- first_row_lift("pred_df_agg_glm")
l10_seg2_rf  <- first_row_lift("pred_df_seg2_rf")
l10_seg_rf   <- first_row_lift("pred_df_seg_rf")
l10_agg_rf   <- first_row_lift("pred_df_agg_rf")

In [120]:
# AUC per segment for each of the six models.
#
# For one prediction column, build a lift table for every data frame in
# compare_df_list_0001_1723 with chart_lift() and pass each table to
# calc_auc().
segment_auc <- function(pred_var) {
  lift_tables <- lapply(compare_df_list_0001_1723, chart_lift,
                        dep_var = "actual", pred_var = pred_var)
  unlist(lapply(lift_tables, calc_auc))
}

auc_seg2_glm <- segment_auc("pred_df_seg2_glm")
auc_seg_glm  <- segment_auc("pred_df_seg_glm")
auc_agg_glm  <- segment_auc("pred_df_agg_glm")
auc_seg2_rf  <- segment_auc("pred_df_seg2_rf")
auc_seg_rf   <- segment_auc("pred_df_seg_rf")
auc_agg_rf   <- segment_auc("pred_df_agg_rf")

In [136]:
# Metrics table: one row per lookup entry, with the segment labels on the left
# and the six lift10 vectors on the right (bound by position).
# NOTE(review): the auc_* vectors computed in the preceding cell are not
# included here, so they never reach the exported metrics — confirm whether
# that is intended.
df1 <- cbind(
  data.frame(tld_reseller_lookup_df),
  data.frame(cbind(l10_seg2_glm, l10_seg_glm, l10_agg_glm,
                   l10_seg2_rf, l10_seg_rf, l10_agg_rf))
)
dim(df1)
head(df1)

Unnamed: 0_level_0,tld_registrar_index,reseller,n,l10_seg2_glm,l10_seg_glm,l10_agg_glm,l10_seg2_rf,l10_seg_rf,l10_agg_rf
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,sitegmo,gmo,106569,8.204334,8.126935,6.408669,8.77709,8.76161,8.591331
2,funalibaba,alibaba,72690,4.883721,5.255814,5.023256,5.813953,6.0,6.186047
3,sitenamecheap,namecheap,53536,2.298025,1.921005,1.885099,2.280072,2.028725,2.046679
4,pwnamecheap,namecheap,50470,3.478261,3.675889,3.675889,3.438735,4.071146,4.3083
5,onlinego daddy,go daddy,42162,2.850995,2.883271,2.474449,2.598171,2.727273,2.646584
6,sitegandi sas,gandi sas,35309,8.243243,8.445946,7.094595,4.932432,8.445946,8.581081


In [139]:
# Save the metrics table as CSV, without a row-name column.
write.csv(
  df1,
  file = "../../data/output/tld_reseller_compare_metrics.csv",
  row.names = FALSE
)

In [140]:
# Upload everything in data/output/ to the bucket, then move the files up into
# data/.
# system() returns the command's exit status. The move now runs only when the
# upload succeeded, so a failed upload leaves the files in data/output/ where
# the upload can simply be retried.
upload_status <- system("gsutil cp /home/jupyter/local/Domains_202003/data/output/* gs://data_outputt/output/")
if (upload_status == 0) {
  move_status <- system("mv /home/jupyter/local/Domains_202003/data/output/* /home/jupyter/local/Domains_202003/data/")
  if (move_status != 0) {
    warning("mv exited with status ", move_status,
            "; some files may still be in data/output/", call. = FALSE)
  }
} else {
  warning("gsutil cp exited with status ", upload_status,
          "; files were left in data/output/ and not moved", call. = FALSE)
}