## imputing surrogate index from historical data

In [1]:
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(caret)))

In [2]:
# Print the R version, platform and attached package versions used for this run.
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux

Matrix products: default
BLAS/LAPACK: /usr/local/app/rcs_bin/grid3/envs/rcs_2022.01/lib/libmkl_rt.so.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] caret_6.0-90    lattice_0.20-45 forcats_0.5.1   stringr_1.4.0  
 [5] dplyr_1.0.7     purrr_0.3.4     readr_2.1.2     tidyr_1.2.0    
 [9] tibble_3.1.6    ggplot2_3.3.5   tidyverse_1.3.1

loaded via a namespace (and not attached):
 [1] nlme_3.1-155         fs_1.5.2             lubridate_1.8.0

In [3]:
# All input CSVs below are read relative to this project log directory.
dir_path <- "/export/projects2/jeryang_narrative_project/globe/log"
setwd(dir = dir_path)

In [4]:
# Fix the RNG seed once, before any model fitting, so repeated runs of the
# notebook draw the same random numbers.
set.seed(101)

### use historical data from 2015 - 2018 to estimate surrogate index for 3-year revenue

In [5]:
# Historical data used to fit the surrogate-index models.
history2015 <- read.csv(file = "./history2015.csv")

In [6]:
# Fit the surrogate index on surrogates from the first 6 months:
# regress `rev` on every column whose name carries a month suffix
# `_1` ... `_6` or `_m1` ... `_m6`.
#
# The month number must not be followed by another digit. An unanchored
# pattern such as '_1' would also select `_10`-`_19`, `_m12`, `_2015`, ...,
# i.e. columns that lie outside the 6-month window.
#
# tuneGrid has a single row, so no hyperparameter search takes place; the
# values below are used as-is. trControl is intentionally left at caret's
# default so the random-number stream consumed by train() is unchanged.
xgb_surrogate_6m <- caret::train(
  rev ~ .,
  data = select(
    history2015,
    rev,
    matches("_m?[1-6]([^0-9]|$)")
  ),
  method = "xgbTree",
  tuneGrid = data.frame(
    eta = 0.3,
    max_depth = 2,
    gamma = 0,
    colsample_bytree = 0.8,
    min_child_weight = 1,
    subsample = 1,
    nrounds = 150
  )
)

In [7]:
# Fit the surrogate index on surrogates from the first 4 months:
# regress `rev` on every column whose name carries a month suffix
# `_1` ... `_4` or `_m1` ... `_m4`.
#
# The month number must not be followed by another digit. An unanchored
# pattern such as '_1' would also select `_10`-`_19`, `_m12`, `_2015`, ...,
# i.e. columns that lie outside the 4-month window.
#
# tuneGrid has a single row, so no hyperparameter search takes place; the
# values below are used as-is. trControl is intentionally left at caret's
# default so the random-number stream consumed by train() is unchanged.
xgb_surrogate_4m <- caret::train(
  rev ~ .,
  data = select(
    history2015,
    rev,
    matches("_m?[1-4]([^0-9]|$)")
  ),
  method = "xgbTree",
  tuneGrid = data.frame(
    eta = 0.3,
    max_depth = 2,
    gamma = 0,
    colsample_bytree = 0.8,
    min_child_weight = 1,
    subsample = 1,
    nrounds = 150
  )
)

### impute surrogate index for 3-year revenue in the two experiments

In [8]:
# Surrogate data for the two experiments.
df1_s <- read.csv(file = "./df_surrogate_exp1.csv") # experiment 1
df2_s <- read.csv(file = "./df_surrogate_exp2.csv") # experiment 2

In [9]:
# Experiment 1: surrogates are observed through month 6, so score it with
# the 6-month model and store the imputed index as `rev3`.
df1_s$rev3 <- df1_s %>%
  select(matches("_1|_m1|_2|_m2|_3|_m3|_4|_m4|_5|_m5|_6|_m6")) %>%
  predict(xgb_surrogate_6m, newdata = .)

In [10]:
# Peek at the first few imputed values for experiment 1.
head(df1_s[["rev3"]])

In [11]:
# Experiment 2: surrogates are observed through month 4, so score it with
# the 4-month model and store the imputed index as `rev3`.
df2_s$rev3 <- df2_s %>%
  select(matches("_1|_m1|_2|_m2|_3|_m3|_4|_m4")) %>%
  predict(xgb_surrogate_4m, newdata = .)

In [12]:
# Peek at the first few imputed values for experiment 2.
head(df2_s[["rev3"]])