## Create synthetic data based on the original data from the first experiment

In [1]:
suppressWarnings(suppressMessages(library(synthpop)))
suppressWarnings(suppressMessages(library(tidyverse)))

In [2]:
# Print the R version, platform, locale, and attached package versions used for this run.
sessionInfo()

R version 4.2.1 (2022-06-23)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux

Matrix products: default
BLAS/LAPACK: /usr/local/app/rcs_bin/grid3/envs/rcs_2022.11/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.2   stringr_1.4.1   dplyr_1.0.10    purrr_0.3.5    
 [5] readr_2.1.3     tidyr_1.2.1     tibble_3.1.8    ggplot2_3.3.6  
 [9] tidyverse_1.3.2 synthpop_1.8-0 

loaded via a namespace (and not attached):
 [1] matrixStats_0.62.0   fs_1.5.2             lubridate_1.8.0     
 

In [3]:
# Set the working directory; the relative './' paths used in the following cells resolve against it.
# NOTE(review): this absolute path is machine-specific -- adjust it before running elsewhere.
setwd('/export/home/faculty/jeryang/Code/log')

In [4]:
# Data with pre-treatment covariates, ordered by subscriber_id
df1 <- read.csv('./df_exp1.csv') %>%
  arrange(subscriber_id)

# Data with surrogates and outcomes, ordered the same way
df1_s <- read.csv('./df_surrogate_exp1.csv') %>%
  arrange(subscriber_id)

# Attach the six rev_* columns to the covariates, matching rows on subscriber_id
df1 <- df1 %>%
  left_join(
    select(df1_s, subscriber_id, rev_1, rev_2, rev_3, rev_4, rev_5, rev_6),
    by = 'subscriber_id'
  )

In [5]:
# Predicted outcomes using surrogates from the first k months (k = 6, ..., 1).
# Each table's y0 / y1 columns get a _k suffix so the six tables can later be
# joined side by side without name clashes.
mu_6 <- read.csv('./mu_18m_6.csv') %>% rename(y0_6 = y0, y1_6 = y1)
mu_5 <- read.csv('./mu_18m_5.csv') %>% rename(y0_5 = y0, y1_5 = y1)
mu_4 <- read.csv('./mu_18m_4.csv') %>% rename(y0_4 = y0, y1_4 = y1)
mu_3 <- read.csv('./mu_18m_3.csv') %>% rename(y0_3 = y0, y1_3 = y1)
mu_2 <- read.csv('./mu_18m_2.csv') %>% rename(y0_2 = y0, y1_2 = y1)
mu_1 <- read.csv('./mu_18m_1.csv') %>% rename(y0_1 = y0, y1_1 = y1)

# Fix the RNG state so the random subset below is reproducible
set.seed(100)
n <- 1000

# Keep a subset of features, draw n users at random, then attach the predicted
# outcomes for horizons 6 down to 1 (in that order) by subscriber_id.
df1_sample <- df1 %>%
  select(
    rev_1, rev_2, rev_3, rev_4, rev_5, rev_6,
    subscriber_id, risk_score, p_treated, treated, contains('month')
  ) %>%
  sample_n(n) %>%
  left_join(mu_6, by = 'subscriber_id') %>%
  left_join(mu_5, by = 'subscriber_id') %>%
  left_join(mu_4, by = 'subscriber_id') %>%
  left_join(mu_3, by = 'subscriber_id') %>%
  left_join(mu_2, by = 'subscriber_id') %>%
  left_join(mu_1, by = 'subscriber_id')

In [6]:
# The synthetic surrogate indices are generated directly from the surrogate
# indices estimated on the original data, not from the synthetic raw data.
#
# syn() creates the synthetic data that preserves the correlation structure in
# the original dataset; the identifier and the two treatment columns are
# excluded from synthesis. Its console output is captured and discarded:
# nullfile() is the portable name of the null device (instead of a hard-coded
# Unix-only '/dev/null').
capture.output(
  df1_syn <- syn(
    df1_sample %>% select(-subscriber_id, -treated, -p_treated),
    seed = 100
  )$syn,
  file = nullfile()
)

In [7]:
# Split the synthetic table by column name: columns whose names contain
# 'rev', 'y0' or 'y1' go to df1_syn_rev, every other column to df1_syn_other.
df1_syn_rev <- df1_syn %>% select(contains(c('rev', 'y0', 'y1')))
df1_syn_other <- df1_syn %>% select(-contains(c('rev', 'y0', 'y1')))

In [8]:
# Mask the revenue-related columns by multiplying every value by a constant.
# The value of `c` is deliberately withheld from this notebook, so `c` must be
# defined as a number before this cell is run (otherwise `c` resolves to the
# base function c() and the multiplication fails).
# NOTE(review): apply() over columns (MARGIN = 2) typically returns a matrix,
# not a data frame -- confirm downstream code expects that.
df1_syn_rev = apply(df1_syn_rev, 2, function(x) x*c)

In [9]:
# Recombine column-wise: the masked revenue/outcome columns first, then the remaining columns.
df1_syn = cbind(df1_syn_rev, df1_syn_other)

In [10]:
# Copy the treatment indicator and treatment probability over from the sampled
# original rows, position by position.
# NOTE(review): assumes df1_syn has as many rows as df1_sample -- confirm.
df1_syn$treated <- df1_sample$treated
df1_syn$p_treated <- df1_sample$p_treated

# Assign a fresh running index as subscriber_id. seq_len() is used instead of
# 1:nrow() so that a zero-row table yields an empty index rather than c(1, 0).
df1_syn$subscriber_id <- seq_len(nrow(df1_syn))

In [11]:
# Write the synthetic data set to CSV without row names.
# FALSE is spelled out because the shorthand `F` is an ordinary variable that can be reassigned.
write.csv(df1_syn, './github/synthetic.csv', row.names = FALSE)