In [1]:
# Work from the raw-data directory; the per-day feed folders are read with
# paths relative to it.
setwd("/srv/Globe-raw-data")

# Show what is present in that directory.
list.files(path = ".")

In [68]:
library(readr)
library(vroom)
library(tidyverse)
library(foreach)

### prepare raw data into a cleaned data frame

In [69]:
# Daily sequence covering 2020-07-01 through 2020-07-19 (inclusive).
dates = seq(from = as.Date("2020-07-01"), to = as.Date("2020-07-19"), by = "day")

# 2020-07-14 is excluded from the set of days to process.
dates = dates[!(dates == as.Date("2020-07-14"))]

In [70]:
# Number of days that remain in `dates` after the exclusion above.
length(dates)

In [97]:
# Build the cleaned visitor-level table for one day of raw hit data.
#
# Reads `nytbostonglobecom_<date>/column_headers.tsv` and
# `nytbostonglobecom_<date>/hit_data.tsv`, both relative to the current
# working directory.
#
# Args:
#   date: the day to load; it is pasted into the feed directory name, so a
#         Date (or a string in the same format as the directory suffix) works.
#
# Returns:
#   An ungrouped tibble with one row per
#   (post_visid_high, post_visid_low, paywall_count) and the columns
#   post_visid_high, post_visid_low, paywall_seq, paywall_count,
#   subscription (0/1) and date_time (earliest value in the group).
data_prep = function(date) {

    print(date)

    feed_dir = paste0('nytbostonglobecom_', date)

    # The hit file has no header row of its own; column names come from the
    # header row of column_headers.tsv.
    header = names(vroom(file.path(feed_dir, 'column_headers.tsv')))

    data = vroom(file.path(feed_dir, 'hit_data.tsv'), delim = '\t', col_names = header)

    # Event id 210 marks a subscription. event_list is a comma-separated list
    # of event ids (optionally written as `id=value`), so the id has to be
    # matched as a whole token: a plain substring search for "210" would also
    # flag ids such as 1210 or 2100, or a value such as `201=210`.
    subscription_pattern = '(^|,)[[:space:]]*210[[:space:]]*(=|,|$)'

    # Treatment arms of the experiment: suffixes A through N.
    treatment_groups = paste0('Paywall sale_Jun_22_20_', LETTERS[seq(from = 1, to = 14)])

    # NOTE(review): the visitor ids are coerced to double here. Values above
    # 2^53 cannot be represented exactly as doubles, so the exact ids are not
    # recoverable from the output -- confirm this is acceptable downstream.
    hits = data %>%
        select(date_time, evar68, evar3, prop28, event_list, post_visid_high, post_visid_low) %>%
        mutate(post_visid_high = as.numeric(post_visid_high),
               post_visid_low = as.numeric(post_visid_low)) %>%
        filter(!is.na(post_visid_high) & !is.na(post_visid_low)) %>%
        mutate(subscription = as.numeric(grepl(subscription_pattern, event_list)))

    # Within each visitor, carry the experiment variables to the hits where
    # they are missing (down first, then up).
    hits = hits %>%
        group_by(post_visid_high, post_visid_low) %>%
        fill(evar68, evar3, prop28, .direction = "downup") %>%
        ungroup() %>%
        # Rename by name rather than by position so a change in the select()
        # order above cannot silently mislabel columns.
        rename(id = evar68, paywall_seq = evar3, paywall_count = prop28)

    # Keep only hits from a treatment group with a usable paywall count, and
    # cap the count at 4. The %in% filter already drops missing paywall_seq.
    hits = hits %>%
        filter(paywall_seq %in% treatment_groups) %>%
        mutate(paywall_count = as.numeric(paywall_count)) %>%
        filter(!is.na(paywall_count)) %>%
        mutate(paywall_count = pmin(paywall_count, 4))

    # Collapse to one row per visitor and paywall count: a subscription on any
    # hit counts, and the earliest timestamp is kept.
    df = hits %>%
        group_by(post_visid_high, post_visid_low, paywall_count) %>%
        summarize(paywall_seq = first(paywall_seq),
                  subscription = max(subscription),
                  date_time = min(date_time)) %>%
        ungroup() %>%
        select(post_visid_high, post_visid_low, paywall_seq, paywall_count, subscription, date_time)

    return(df)

}

In [98]:
# Load doParallel and register it as the backend used by %dopar%,
# requesting 10 cores.
library(doParallel)
registerDoParallel(cores=10)

In [99]:
# Run data_prep() for every day in `dates` on the registered parallel backend
# and stack the per-day results row-wise into one data frame.
# seq_along() yields an empty index set when `dates` is empty, whereas
# 1:length(dates) would produce c(1, 0) and call data_prep() on NA.
df = foreach(i = seq_along(dates), .combine = 'rbind') %dopar% {

    # dates[i] keeps the Date class of the element.
    data_prep(dates[i])

}

In [109]:
# Preview the first rows of the combined result.
head(df)

post_visid_high,post_visid_low,paywall_seq,paywall_count,subscription,date_time
<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
6.664699e+17,1.668778e+18,Paywall sale_Jun_22_20_A,1,1,2020-07-01 00:35:33
1.840745e+18,3.156594e+18,Paywall sale_Jun_22_20_A,3,0,2020-07-01 00:50:09
1.840745e+18,3.156594e+18,Paywall sale_Jun_22_20_A,4,0,2020-07-01 00:50:27
2.89522e+18,4.611687e+18,Paywall sale_Jun_22_20_A,4,0,2020-07-01 00:05:29
2.934205e+18,4.611687e+18,Paywall sale_Jun_22_20_A,4,0,2020-07-01 00:34:34
3.069214e+18,4.611687e+18,Paywall sale_Jun_22_20_A,2,0,2020-07-01 00:22:19


In [110]:
# Row and column counts of the combined result.
dim(df)

In [111]:
# Tabulate rows by the 0/1 subscription flag.
table(df$subscription)


    0     1 
78668   480 

In [112]:
# Save the cleaned data frame as CSV next to the raw data.
write_csv(df, '/srv/Globe-raw-data/data_clean.csv')