-
Notifications
You must be signed in to change notification settings - Fork 37
/
functions.R
1310 lines (1188 loc) · 58.1 KB
/
functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
lagp <- function(x, p){
  # Lag a numeric vector by p positions, padding the front with zeros.
  # Returns a vector of the same length as x.
  n <- length(x)
  if (p <= 0) {
    # No lag requested: return the input unchanged (matches p = 0 behavior)
    return(x)
  }
  if (p >= n) {
    # Entire series shifted out of view. The previous 1:(n-p) indexing
    # counted downwards here (1:0 == c(1, 0)) and returned a wrong-length
    # result; all-zero padding is the correct limit.
    return(rep(0, n))
  }
  c(rep(0, p), x[seq_len(n - p)])
}
CMean <- function(b) {
  # Mean of the non-zero entries of b; returns 0 when every entry is zero
  # (or when b is empty), so callers never receive NaN from mean().
  nonzero <- b[b != 0]
  if (length(nonzero) == 0) {
    return(0)
  }
  mean(nonzero)
}
logplus <- function(x){
  # Elementwise natural log with non-positive values mapped to 0, so the
  # result never contains -Inf/NaN from log() of zero or negative volumes.
  # vapply (rather than sapply) guarantees a numeric(0) result for empty
  # input; sapply would silently return list() there.
  vapply(x, function(v) {
    if (v <= 0) {
      0
    } else {
      log(v)
    }
  }, numeric(1))
}
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
#' @importFrom stats cor var na.omit
#' @importFrom tidyr replace_na
# Score every market in `data` as a candidate control for one focal market
# (markets_to_be_matched[i]). Each candidate gets a DTW distance (scaled by
# the focal market's total volume) and a correlation; the two rankings are
# blended with weight `dtw_emphasis` (1 = distance only, 0 = correlation
# only) and the top `matches` candidates are returned as a data.frame.
# The sentinel -1000000000 marks "could not be computed" and is converted
# to NA at the end of the pipeline.
calculate_distances <- function(markets_to_be_matched, data, id, i, warping_limit, matches, dtw_emphasis){
  ## Nulling to avoid CRAN notes
  Skip <- NULL
  RelativeDistance <- NULL
  Correlation <- NULL
  w <- NULL
  dist_rank <- NULL
  corr_rank <- NULL
  combined_rank <- NULL
  messages <- NULL
  NORMDIST <- NULL
  RAWDIST <- NULL
  populated <- NULL
  SUMTEST <- NULL
  SUMCNTL <- NULL
  # No DTW emphasis means DTW is never computed, so warping is irrelevant
  if (dtw_emphasis==0){
    warping_limit <- 0
  }
  row <- 1
  ThisMarket <- markets_to_be_matched[i]
  # One row per candidate pairing; "populated" flags rows actually filled in
  distances <- data.frame(matrix(nrow=length(data$id_var)-1, ncol=10))
  names(distances) <- c(id, "BestControl", "RelativeDistance", "Correlation", "Length", "SUMTEST", "SUMCNTL", "RAWDIST", "Correlation_of_logs", "populated")
  distances[ ,"populated"] <- 0
  messages <- 0
  # For each market
  for (j in 1:length(unique(data$id_var))){
    isValidTest <- TRUE
    ThatMarket <- unique(data$id_var)[j]
    distances[row, id] <- ThisMarket
    distances[row, "BestControl"] <- ThatMarket
    # Align the two series on common dates
    mkts <- create_market_vectors(data, ThisMarket, ThatMarket)
    test <- mkts[[1]]
    ref <- mkts[[2]]
    dates <- mkts[[3]]
    sum_test <- NA
    sum_cntl <- NA
    dist <- 0
    # If insufficient data or no variance
    buffer <- 0.5
    if (dtw_emphasis>0){
      buffer <- warping_limit
    }
    if ((stats::var(test)==0 | length(test)<=2*buffer+1) | sum(abs(test))==0){
      isValidTest <- FALSE
      messages <- messages + 1
    }
    # If data and variance are sufficient and test vector was valid
    if (ThisMarket != ThatMarket & isValidTest==TRUE & var(ref)>0 & length(test)>2*warping_limit+1){
      sum_test <- abs(sum(test))
      sum_cntl <- abs(sum(ref))
      if (dtw_emphasis>0 & sum_test>0){
        # DTW with a Sakoe-Chiba band; normalize by test volume so
        # distances are comparable across markets of different size
        rawdist <- dtw(test, ref, window.type=sakoeChibaWindow, window.size=warping_limit)$distance
        dist <- rawdist / sum_test
      } else if (dtw_emphasis==0){
        dist <- 0
        rawdist <- 0
      } else{
        # Volume is zero: distance cannot be normalized -- sentinel value
        dist <- -1000000000
        rawdist <- -1000000000
      }
      distances[row, "Correlation"] <- cor(test, ref)
      distances[row, "populated"] <- 1
      distances[row, "RelativeDistance"] <- dist
      distances[row, "Skip"] <- FALSE
      distances[row, "Length"] <- length(test)
      distances[row, "SUMTEST"] <- sum_test
      distances[row, "SUMCNTL"] <- sum_cntl
      distances[row, "RAWDIST"] <- rawdist
      # Log-correlation only when both series are positive and non-constant
      # on the log scale; otherwise record the sentinel
      if (max(ref)>0 & max(test)>0 & sd(logplus(test))>0 & sd(logplus(ref))>0){
        distances[row, "Correlation_of_logs"] <- cor(logplus(test), logplus(ref))
      } else{
        distances[row, "Correlation_of_logs"] <- -1000000000
      }
      row <- row + 1
    } else{
      # Candidate could not be scored: keep the row (marked Skip) so the
      # caller can count and report unmatched markets
      if (ThisMarket != ThatMarket){
        messages <- messages + 1
        distances[row, "Skip"] <- TRUE
        if (dtw_emphasis==0){
          distances[row, "RelativeDistance"] <- 0
          distances[row, "RAWDIST"] <- 0
        } else{
          distances[row, "RelativeDistance"] <- -1000000000
          distances[row, "RAWDIST"] <- -1000000000
        }
        distances[row, "populated"] <- 1
        distances[row, "Correlation"] <- -1000000000
        distances[row, "Length"] <- 0
        distances[row, "SUMTEST"] <- 0
        distances[row, "SUMCNTL"] <- 0
        distances[row, "Correlation_of_logs"] <- -1000000000
        row <- row + 1
      }
    }
  }
  if(messages > 0){
    cat(paste0(messages, " markets were not matched with ", ThisMarket, " due to insufficient data or no variance. \n"))
    if (dtw_emphasis>0){
      cat(paste0("Since dtw_emphasis>0, more than 2*warping_limit+1 records are required in the matching period to match a market. \n"))
    } else{
      cat(paste0("Note that More than 2 records are required in the matching period to match a market. \n"))
    }
    cat("\n")
  }
  distances$matches <- matches
  distances$w <- dtw_emphasis
  distances$MatchingStartDate <- min(data$date_var)
  distances$MatchingEndDate <- max(data$date_var)
  # Filter down to only the top matches
  # Blend the distance rank and the (descending) correlation rank with
  # weight w = dtw_emphasis, then keep the `matches` best candidates and
  # turn the sentinels into NAs
  distances <-
    dplyr::filter(distances, populated==1) %>%
    dplyr::mutate(dist_rank=rank(RelativeDistance)) %>%
    dplyr::mutate(corr_rank=rank(-Correlation)) %>%
    dplyr::mutate(combined_rank=w*dist_rank+(1-w)*corr_rank) %>%
    dplyr::arrange(combined_rank) %>%
    dplyr::select(-dist_rank, -combined_rank, -corr_rank, -populated) %>%
    dplyr::mutate(rank=row_number()) %>%
    dplyr::filter(rank<=matches) %>%
    dplyr::select(-matches, -w) %>%
    dplyr::mutate(NORMDIST=dplyr::if_else(SUMTEST+SUMCNTL>0 & !(RAWDIST %in% c(-1000000000, 0)), 2*RAWDIST/(SUMTEST+SUMCNTL), -1000000000)) %>%
    dplyr::mutate(NORMDIST=dplyr::na_if(NORMDIST, -1000000000),
                  NORMDIST=dplyr::na_if(NORMDIST, 0),
                  RAWDIST=dplyr::na_if(RAWDIST, -1000000000),
                  RAWDIST=dplyr::na_if(RAWDIST, 0))
  # With no DTW emphasis the distance column is meaningless -- blank it out
  if (dtw_emphasis==0 & nrow(distances)>0){
    distances$RelativeDistance <- NA
  }
  return(distances)
}
stopif <- function(value, clause, message) {
  # Abort with `message` whenever `value` equals `clause`.
  # The inverse of stopifnot(): used for guard-clause style input checks.
  if (value == clause) {
    stop(message)
  }
  invisible(NULL)
}
check_inputs <- function(data=NULL, id=NULL, matching_variable=NULL, date_variable=NULL){
  # Validate the inputs to best_matches() before any processing starts.
  # Every violation aborts immediately with a descriptive error.
  stopif(is.null(data), TRUE, "ERROR: No data is provided")
  stopif(is.null(id), TRUE, "ERROR: No ID is provided")
  stopif(is.null(matching_variable), TRUE, "ERROR: No matching metric is provided")
  stopif(is.null(date_variable), TRUE, "ERROR: No date variable is provided")
  # All three named columns must exist in the data.frame
  stopif(id %in% names(data), FALSE, "ERROR: ID variable not found in input data")
  stopif(date_variable %in% names(data), FALSE, "ERROR: date variable not found in input data")
  stopif(matching_variable %in% names(data), FALSE, "ERROR: matching metric not found in input data")
  # Matching needs at least two distinct markets
  stopif(length(unique(data[[id]]))>1, FALSE, "ERROR: Need at least 2 unique markets")
  stopif(TRUE %in% is.na(data[[id]]), TRUE, "ERROR: NAs found in the market column")
  stopif(TRUE %in% is.null(data[[id]]), TRUE, "ERROR: NULLs found in the market column")
  stopif('' %in% unique(data[[id]]), TRUE, "ERROR: Blanks found in the market column")
  stopif(TRUE %in% is.na(data[[matching_variable]]), TRUE, "ERROR: NAs found in the matching variable")
  # inherits() instead of class(...) == "Date": class() can return a vector
  # of length > 1 (e.g. c("POSIXct", "POSIXt")), which would make the ==
  # comparison non-scalar and break the if() inside stopif().
  stopif(inherits(data[[date_variable]], "Date"), FALSE, "ERROR: date_variable is not a Date. Check your data frame or use as.Date().")
}
#' @importFrom reshape2 melt dcast
# Align a test market with one or more reference markets by date.
# Returns list(test_values, ref_values, dates): ref_values is a numeric
# vector when a single reference market is given, or a data.frame with
# columns x1..xk when several are given. Only dates with data for every
# series survive the inner join / na.omit.
create_market_vectors <- function(data, test_market, ref_market){
  ## nulling to avoid CRAN notes
  id_var <- NULL
  date_var <- NULL
  match_var <- NULL
  # Drop rows with a missing matching metric before joining
  d <- subset(data, !is.na(match_var))
  test <- subset(d, id_var==test_market)[,c("date_var", "match_var")]
  names(test)[2] <- "y"
  if (length(ref_market)==1){
    ref <- subset(d, id_var == ref_market[1])[,c("date_var", "match_var")]
    names(ref)[2] <- "x1"
    # Keep only dates present in both series
    f <- dplyr::inner_join(test, ref, by="date_var")
    return(list(as.numeric(f$y), as.numeric(f$x1), as.Date(f$date_var)))
  } else if (length(ref_market)>1){
    # De-dup, then pivot the reference markets wide: one column per market
    d <- dplyr::distinct(d, id_var, date_var, .keep_all = TRUE)
    ref <- reshape2::dcast(subset(d, id_var %in% ref_market), date_var ~ id_var, value.var="match_var")
    names(ref) <- c("date_var", paste0("x", seq(1:length(ref_market))))
    f <- data.frame(dplyr::inner_join(test, ref, by="date_var"))
    # Any date missing one of the reference series is dropped entirely
    f <- stats::na.omit(f)
    return(list(as.numeric(f$y), dplyr::select(f, num_range("x", 1:length(ref_market))), as.Date(f$date_var)))
  }
  # NOTE(review): with length(ref_market) == 0 neither branch runs and the
  # function returns NULL invisibly -- presumably callers never do this; verify
}
mape_no_zeros <- function(test, ref) {
  # Mean absolute percentage error, computed only over observations where
  # the actual (test) value is positive -- avoids division by zero.
  df <- data.frame(test = test, ref = ref)
  df <- subset(df, test > 0)
  mean(abs(df$test - df$ref) / df$test)
}
dw <- function(y, yhat) {
  # Durbin-Watson statistic, approximated as 2 * (1 - rho) where rho is the
  # lag-1 autocorrelation of the residuals. Values near 2 indicate little
  # serial correlation in the fit.
  resid <- y - yhat
  shifted <- lagp(resid, 1)
  n <- length(resid)
  rho <- stats::cor(resid[2:n], shifted[2:n])
  2 * (1 - rho)
}
#' For each market, find the best matching control market
#'
#' \code{best_matches} finds the best matching control markets for each market in the dataset
#' using dynamic time warping (\code{dtw} package). The algorithm simply loops through all viable candidates for each
#' market in a parallel fashion, and then ranks by distance and/or correlation.
#'
#' @param data input data.frame for analysis. The dataset should be structured as "stacked" time series (i.e., a panel dataset).
#' In other words, markets are rows and not columns -- we have a unique row for each area/time combination.
#' @param markets_to_be_matched Use this parameter if you only want to get control matches for a subset of markets or a single market
#' The default is NULL which means that all markets will be paired with matching markets
#' @param id_variable the name of the variable that identifies the markets
#' @param date_variable the time stamp variable
#' @param matching_variable the variable (metric) used to match the markets. For example, this could be sales or new customers
#' @param parallel set to TRUE for parallel processing. Default is TRUE
#' @param warping_limit the warping limit used for matching. Default is 1,
#' which means that a single query value can be mapped to at most 2 reference values.
#' @param start_match_period the start date of the matching period (pre period).
#' Must be a character of format "YYYY-MM-DD" -- e.g., "2015-01-01"
#' @param end_match_period the end date of the matching period (pre period).
#' Must be a character of format "YYYY-MM-DD" -- e.g., "2015-10-01"
#' @param matches Number of matching markets to keep in the output (to use less markets for inference, use the control_matches parameter when calling inference). Default is to keep all matches.
#' @param dtw_emphasis Number from 0 to 1. The amount of emphasis placed on dtw distances, versus correlation, when ranking markets.
#' Default is 1 (all emphasis on dtw). If emphasis is set to 0, all emphasis would be put on correlation, which is recommended when optimal splits are requested.
#' An emphasis of 0.5 would yield equal weighting.
#' @param suggest_market_splits if set to TRUE, best_matches will return suggested test/control splits based on correlation and market sizes. Default is FALSE.
#' For this option to be invoked, markets_to_be_matched must be NULL (i.e., you must run a full match).
#' Note that the algorithm will force test and control to have the same number of markets. So if the total number of markets is odd, one market will be left out.
#' @param splitbins Number of size-based bins used to stratify when splitting markets into test and control.
#' Only markets inside the same bin can be matched. More bins means more emphasis on market size when splitting.
#' Less bins means more emphasis on correlation. Default is 10.
#' @param log_for_splitting This parameter determines if optimal splitting is based on correlations of the raw
#' matching metric values or the correlations of log(matching metric). Only relevant if suggest_market_splits is TRUE. Default is FALSE.
#' @import foreach
#' @importFrom parallel detectCores
#' @import CausalImpact
#' @import dplyr
#' @import iterators
#' @import utils
#' @import dtw
#' @import utf8
#' @importFrom reshape2 melt
#' @importFrom stats sd
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#'
#' @export best_matches
#' @examples
#' \dontrun{
#' ##-----------------------------------------------------------------------
#' ## Find the best matches for the CPH airport time series
#' ##-----------------------------------------------------------------------
#' library(MarketMatching)
#' data(weather, package="MarketMatching")
#' mm <- best_matches(data=weather,
#' id_variable="Area",
#' markets_to_be_matched=c("CPH", "SFO"),
#' date_variable="Date",
#' matching_variable="Mean_TemperatureF",
#' parallel=FALSE,
#' start_match_period="2014-01-01",
#' end_match_period="2014-10-01")
#' head(mm$BestMatches)
#' }
#'
#' @usage
#' best_matches(data=NULL,
#' markets_to_be_matched=NULL,
#' id_variable=NULL,
#' date_variable=NULL,
#' matching_variable=NULL,
#' parallel=TRUE,
#' warping_limit=1,
#' start_match_period=NULL,
#' end_match_period=NULL,
#' matches=NULL,
#' dtw_emphasis=1,
#' suggest_market_splits=FALSE,
#' splitbins=10,
#' log_for_splitting=FALSE)
#'
#' @return Returns an object of type \code{market_matching}. The object has the
#' following elements:
#'
#' \item{\code{BestMatches}}{A data.frame that contains the best matches for each market. All stats reflect data after the market pairs have been joined by date. Thus SUMTEST and SUMCNTL can have smaller values than what you see in the Bins output table}
#' \item{\code{Data}}{The raw data used to do the matching}
#' \item{\code{MarketID}}{The name of the market identifier}
#' \item{\code{MatchingMetric}}{The name of the matching variable}
#' \item{\code{DateVariable}}{The name of the date variable}
#' \item{\code{SuggestedTestControlSplits}}{Suggested test/control splits. SUMTEST and SUMCNTL are the total market volumes, not volume after joining with other markets. They're greater or equal to the values in the BestMatches file.}
#' \item{\code{Bins}}{Bins used for splitting and corresponding volumes}
best_matches <- function(data=NULL, markets_to_be_matched=NULL, id_variable=NULL, date_variable=NULL, matching_variable=NULL, parallel=TRUE, warping_limit=1, start_match_period=NULL, end_match_period=NULL, matches=NULL, dtw_emphasis=1, suggest_market_splits=FALSE, splitbins=10, log_for_splitting=FALSE){
  ## Nulling to avoid angry notes
  match_var <- NULL
  id_var <- NULL
  date_var <- NULL
  suggested_split <- NULL
  Sizes <- NULL
  BestControl <- NULL
  Corr <- NULL
  Correlation <- NULL
  Correlation_of_logs <- NULL
  SUMCNTL <- NULL
  SUMTEST <- NULL
  Segment <- NULL
  Skip <- NULL
  Volume <- NULL
  control_market <- NULL
  market <- NULL
  max_rows <- NULL
  rows <- NULL
  short <- NULL
  test_market <- NULL
  ## Check the start date and end dates
  stopif(is.null(start_match_period), TRUE, "No start date provided")
  stopif(is.null(end_match_period), TRUE, "No end date provided")
  # Clean up the emphasis: clamp to [0, 1]; NULL falls back to 0
  # (correlation-only ranking)
  if (is.null(dtw_emphasis)){
    dtw_emphasis <- 0
  } else if (dtw_emphasis>1){
    dtw_emphasis <- 1
  } else if(dtw_emphasis<0){
    dtw_emphasis <- 0
  }
  ## check the inputs
  stopif(date_variable %in% names(data), FALSE, "ERROR: date variable not found in input data")
  # Warn (don't stop) if the date column isn't already a Date; it is coerced
  # with as.Date() below either way. class() can have length > 1 (POSIXct),
  # hence the two branches.
  if (length(class(data[[date_variable]]))>1){
    if (!("Date" %in% class(data[[date_variable]]))){
      cat("NOTE: Date variable converted to Date using as.Date() \n")
      cat("Recommended to double check your date variable \n")
      cat("\n")
    }
  } else if (class(data[[date_variable]]) != "Date"){
    cat("NOTE: Date variable converted to Date using as.Date() \n")
    cat("Recommended to double check your date variable \n")
    cat("\n")
  }
  data[[date_variable]] <- as.Date(data[[date_variable]]) ## trim the date variable
  check_inputs(data=data, id=id_variable, matching_variable=matching_variable, date_variable=date_variable)
  # Standardized internal column names used throughout the helpers
  data$date_var <- data[[date_variable]]
  data$id_var <- data[[id_variable]]
  data$match_var <- data[[matching_variable]]
  # Optimal splitting requires a full all-vs-all match, so it is disabled
  # when the user restricts markets_to_be_matched
  if (is.null(markets_to_be_matched)==FALSE & suggest_market_splits==TRUE){
    cat("The suggest_market_splits parameter has been turned off since markets_to_be_matched is not NULL \n")
    cat("Set markets_to_be_matched to NULL if you want optimized pairs \n")
    cat("\n")
  }
  # Default number of matches: all markets when splitting, otherwise 5.
  # A user-supplied value is also overridden when splitting is requested.
  if (is.null(matches)){
    if (is.null(markets_to_be_matched) & suggest_market_splits==TRUE){
      matches <- length(unique(data$id_var))
    } else{
      matches <- 5
    }
  } else{
    if (is.null(markets_to_be_matched) & suggest_market_splits==TRUE){
      matches <- length(unique(data$id_var))
      cat("The matches parameter has been overwritten for splitting to conduct a full search for optimized pairs \n")
      cat("\n")
    }
  }
  ## check for dups
  ddup <- dplyr::distinct(data, id_var, date_var)
  stopif(nrow(ddup)<nrow(data), TRUE, "ERROR: There are date/market duplicates in the input data")
  rm(ddup)
  ## reduce the width of the data.frame
  data <- dplyr::arrange(data, id_var, date_var) %>%
    ungroup() %>%
    dplyr::select(id_var, date_var, match_var)
  ## save a reduced version of the data
  # (returned unfiltered so inference() can use the post period later)
  saved_data <- data
  ## set up a list to hold all distance matrices
  all_distances <- list()
  ## filter the dates
  data <-
    dplyr::filter(data, date_var>=as.Date(start_match_period) & date_var<=as.Date(end_match_period)) %>%
    dplyr::group_by(id_var) %>%
    dplyr::mutate(rows=max(row_number())) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(max_rows=max(rows)) %>%
    dplyr::mutate(short=(rows<max_rows)) %>%
    dplyr::select(-rows, -max_rows, -short)
  ## check if any data is left
  stopif(nrow(data)>0, FALSE, "ERROR: no data left after filter for dates")
  ## get a vector of all markets that matches are wanted for. Check to ensure markets_to_be_matched exists in the data.
  if(is.null(markets_to_be_matched)){
    markets_to_be_matched <- unique(data$id_var)
  }else{
    markets_to_be_matched <- unique(markets_to_be_matched)
    for (k in 1:length(markets_to_be_matched)){
      stopif(markets_to_be_matched[k] %in% unique(data$id_var), FALSE, paste0("test market ", markets_to_be_matched[k], " does not exist"))
    }
  }
  ## loop through markets and compute distances
  # Serial path and foreach/doParallel path produce the same stacked result
  if (parallel == FALSE) {
    for (i in 1:length(markets_to_be_matched)) {
      all_distances[[i]] <- calculate_distances(markets_to_be_matched, data, id_variable, i, warping_limit, matches, dtw_emphasis)
    }
    shortest_distances <- data.frame(dplyr::bind_rows(all_distances))
  }else{
    # Leave one core free; never request more workers than markets
    ncore <- detectCores() - 1
    if (ncore>length(markets_to_be_matched)){
      ncore <- length(markets_to_be_matched)
    }
    registerDoParallel(ncore)
    loop_result <- foreach(i = 1:length(markets_to_be_matched)) %dopar%
      {
        calculate_distances(markets_to_be_matched, data, id_variable, i, warping_limit, matches, dtw_emphasis)
      }
    shortest_distances <- data.frame(dplyr::bind_rows(loop_result))
    stopImplicitCluster()
  }
  if (suggest_market_splits==TRUE){
    # ---- Optimal test/control splitting ----
    # Markets are stratified into size-based bins; pairs are then formed
    # greedily within each bin by descending correlation.
    sizes <- shortest_distances
    sizes$market <- sizes[[id_variable]]
    markets <- length(unique(sizes$market))
    maxbins <- floor(markets/2)
    bins <- maxbins
    # Reconcile the requested bin count with what the market count allows
    if (maxbins>splitbins){
      bins <- splitbins
      if (bins==0){bins <- 1}
    } else if (maxbins==0){
      bins <- 1
    } else if (splitbins>maxbins){
      bins <- maxbins
    } else if (splitbins==0){
      bins <- 1
    }
    bin_size <- floor(markets/bins)
    cat(paste0("\tOptimal test/control market splits will be generated, targeting ", bin_size, " markets in each bin. \n"))
    if (dtw_emphasis>0){
      cat("\n")
      cat("\tFYI: It is recommended to set dtw_emphasis to 0 when running optimal splits. \n")
      cat("\n")
    }
    # Bins need an even number of markets so every market can be paired
    if (bin_size %% 2 != 0){
      bin_size <- bin_size-1
      if (bin_size<1){
        bin_size <- 1
      }
    }
    ## True volume before joining with other markets
    true_volumes <- dplyr::group_by(data, id_var) %>%
      dplyr::summarise(Volume=sum(match_var)) %>%
      dplyr::ungroup() %>%
      dplyr::rename(market=id_var)
    # Assign each market to a size bin (largest volumes first) and split
    # into one data.frame per bin
    sizes <- dplyr::select(sizes, market) %>%
      dplyr::distinct(market, .keep_all=TRUE) %>%
      dplyr::left_join(true_volumes, by="market") %>%
      dplyr::arrange(-Volume) %>%
      dplyr::mutate(
        bin=floor((dplyr::row_number()-0.1)/bin_size)+1) %>%
      dplyr::select(market, bin, Volume) %>%
      dplyr::mutate(test_market=market,
                    control_market=market) %>%
      dplyr::group_split(bin)
    optimal_list <- list()
    j <- 1
    for (i in 1:length(sizes)){
      bin <- dplyr::pull(sizes[[i]], market)
      tdf <- shortest_distances
      tdf$test_market <- tdf[[id_variable]]
      # Pairing criterion: correlation of logs or raw correlation
      if (log_for_splitting==TRUE){
        tdf$Corr <- tdf$Correlation_of_logs
      } else{
        tdf$Corr <- tdf$Correlation
      }
      # Only pairings where both markets live in this bin are eligible
      tdf <- dplyr::filter(tdf, test_market %in% bin & BestControl %in% bin) %>%
        dplyr::arrange(-Corr) %>%
        dplyr::mutate(
          control_market=BestControl,
          Segment=i)
      # Greedy pairing: repeatedly take the highest-correlation pair, then
      # remove both markets from the pool until nothing is left
      rowsleft <- nrow(tdf)
      while(rowsleft>0){
        optimal_list[[j]] <- tdf[1,c("Segment", "test_market", "control_market", "Correlation_of_logs", "Correlation", "SUMTEST", "SUMCNTL", "Corr")]
        test <- tdf[1,"test_market"]
        cntl <- tdf[1,"control_market"]
        tdf <- dplyr::filter(tdf, !(test_market %in% c(test, cntl)))
        tdf <- dplyr::filter(tdf, !(control_market %in% c(test, cntl))) %>%
          dplyr::arrange(-Corr)
        rowsleft <- nrow(tdf)
        j <- j+1
      }
    }
    Sizes <- dplyr::bind_rows(sizes)
    # Replace joined-period volumes with true total volumes, rank the pairs
    # and compute cumulative volume coverage; convert sentinels to NA
    suggested_split <- dplyr::bind_rows(optimal_list) %>%
      dplyr::select(-SUMTEST, -SUMCNTL) %>%
      dplyr::ungroup() %>%
      dplyr::left_join(dplyr::select(Sizes, Volume, test_market), by="test_market") %>%
      dplyr::rename(SUMTEST=Volume) %>%
      dplyr::left_join(dplyr::select(Sizes, Volume, control_market), by="control_market") %>%
      dplyr::rename(SUMCNTL=Volume) %>%
      dplyr::arrange(Segment, -Corr) %>%
      dplyr::mutate(PairRank=row_number()) %>%
      dplyr::mutate(Volume=SUMTEST+SUMCNTL,
                    percent_of_volume=cumsum(Volume)/sum(Volume)) %>%
      dplyr::select(-Corr) %>%
      dplyr::group_by(Segment) %>%
      dplyr::mutate(markets=n()*2) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(Correlation_of_logs=dplyr::na_if(Correlation_of_logs, -1000000000),
                    Correlation=dplyr::na_if(Correlation, -1000000000))
    # Bin summary, flagging markets left out of the splits (odd counts)
    Sizes <- dplyr::select(Sizes, market, bin, Volume) %>%
      dplyr::group_by(bin) %>%
      dplyr::mutate(markets_in_bin=n()) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(excluded_from_optimal_splits=dplyr::if_else(market %in% c(suggested_split$test_market,suggested_split$control_market), 0, 1))
    if (nrow(Sizes)-nrow(suggested_split)*2>0){
      cat("\t", paste0(nrow(Sizes)-nrow(suggested_split)*2, " market(s) were excluded from the splits due the total number of markets being odd \n"))
      cat("\t Check the Bins output file to identify the market(s) that were excluded \n")
      cat("\n")
    }
  } else{
    suggested_split <- NULL
    Sizes <- NULL
  }
  ### Return the results
  # Drop candidate rows that could not be scored (Skip == TRUE)
  shortest_distances <-
    dplyr::filter(shortest_distances, Skip==FALSE) %>%
    dplyr::select(-Skip)
  object <- list(BestMatches=shortest_distances, Data=as.data.frame(saved_data), MarketID=id_variable, MatchingMetric=matching_variable, DateVariable=date_variable, SuggestedTestControlSplits=suggested_split, Bins=Sizes)
  class(object) <- "matched_market"
  return (object)
}
#' Given a test market, analyze the impact of an intervention
#'
#' \code{inference} Analyzes the causal impact of an intervention using the CausalImpact package, given a test market and a matched_market object from the best_matches function.
#' The function returns an object of type "market_inference" which contains the estimated impact of the intervention (absolute and relative).
#'
#' @param matched_markets A matched_market object created by the market_matching function
#' @param bsts_modelargs A list() that passes model parameters directly to bsts -- such as list(niter = 1000, nseasons = 52, prior.level.sd=0.1)
#' This parameter will overwrite the values specified in prior_level_sd and nseasons. ONLY use this if you're using intricate bsts settings
#' For most use-cases, using the prior_level_sd and nseasons parameters should be sufficient
#' @param test_market The name of the test market (character)
#' @param end_post_period The end date of the post period. Must be a character of format "YYYY-MM-DD" -- e.g., "2015-11-01"
#' @param alpha Desired tail-area probability for posterior intervals. For example, 0.05 yields 0.95 intervals
#' @param prior_level_sd Prior SD for the local level term (Gaussian random walk). Default is 0.01. The bigger this number is, the more wiggliness is allowed for the local level term.
#' Note that more wiggly local level terms also translate into larger posterior intervals
#' This parameter will be overwritten if you're using the bsts_modelargs parameter
#' @param control_matches Number of matching control markets to use in the analysis (default is 5)
#' @param analyze_betas Controls whether to test the model under a variety of different values for prior_level_sd.
#' @param nseasons Seasonality for the bsts model -- e.g., 52 for weekly seasonality
#' @import ggplot2
#' @export inference
#' @examples
#' \dontrun{
#' library(MarketMatching)
#' ##-----------------------------------------------------------------------
#' ## Analyze causal impact of a made-up weather intervention in Copenhagen
#' ## Since this is weather data it is a not a very meaningful example.
#' ## This is merely to demonstrate the function.
#' ##-----------------------------------------------------------------------
#' data(weather, package="MarketMatching")
#' mm <- best_matches(data=weather,
#' id_variable="Area",
#' markets_to_be_matched=c("CPH", "SFO"),
#' date_variable="Date",
#' matching_variable="Mean_TemperatureF",
#' parallel=FALSE,
#' warping_limit=1, # warping limit=1
#' dtw_emphasis=0, # rely only on dtw for pre-screening
#' matches=5, # request 5 matches
#' start_match_period="2014-01-01",
#' end_match_period="2014-10-01")
#' library(CausalImpact)
#' results <- inference(matched_markets=mm,
#' test_market="CPH",
#' analyze_betas=FALSE,
#' control_matches=5, # use all 5 matches for inference
#' end_post_period="2015-12-15",
#' prior_level_sd=0.002)
#' }
#' @usage
#' inference(matched_markets=NULL,
#' bsts_modelargs=NULL,
#' test_market=NULL,
#' end_post_period=NULL,
#' alpha=0.05,
#' prior_level_sd=0.01,
#' control_matches=5,
#' analyze_betas=FALSE,
#' nseasons=NULL)
#'
#' @return Returns an object of type \code{inference}. The object has the
#' following elements:
#' \item{\code{AbsoluteEffect}}{The estimated absolute effect of the intervention}
#' \item{\code{AbsoluteEffectLower}}{The lower limit of the estimated absolute effect of the intervention.
#' This is based on the posterior interval of the counterfactual predictions.
#' The width of the interval is determined by the \code{alpha} parameter.}
#' \item{\code{AbsoluteEffectUpper}}{The upper limit of the estimated absolute effect of the intervention.
#' This is based on the posterior interval of the counterfactual predictions.
#' The width of the interval is determined by the \code{alpha} parameter.}
#' \item{\code{RelativeEffectLower}}{Same as the above, just for relative (percentage) effects}
#' \item{\code{RelativeEffectUpper}}{Same as the above, just for relative (percentage) effects}
#' \item{\code{TailProb}}{Posterior probability of a non-zero effect}
#' \item{\code{PrePeriodMAPE}}{Pre-intervention period MAPE}
#' \item{\code{DW}}{Durbin-Watson statistic. Should be close to 2.}
#' \item{\code{PlotActualVersusExpected}}{Plot of actual versus expected using \code{ggplot2}}
#' \item{\code{PlotCumulativeEffect}}{Plot of the cumulative effect using \code{ggplot2}}
#' \item{\code{PlotPointEffect}}{Plot of the pointwise effect using \code{ggplot2}}
#' \item{\code{PlotActuals}}{Plot of the actual values for the test and control markets using \code{ggplot2}}
#' \item{\code{PlotPriorLevelSdAnalysis}}{Plot of DW and MAPE for different values of the local level SE using \code{ggplot2}}
#' \item{\code{PlotLocalLevel}}{Plot of the local level term using \code{ggplot2}}
#' \item{\code{TestData}}{A \code{data.frame} with the test market data}
#' \item{\code{ControlData}}{A \code{data.frame} with the data for the control markets}
#' \item{\code{PlotResiduals}}{Plot of the residuals using \code{ggplot2}}
#' \item{\code{TestName}}{The name of the test market}
#' \item{\code{ControlName}}{The name of the control market}
#' \item{\code{zooData}}{A \code{zoo} time series object with the test and control data}
#' \item{\code{Predictions}}{Actual versus predicted values}
#' \item{\code{CausalImpactObject}}{The CausalImpact object created}
#' \item{\code{Coefficients}}{The average posterior coefficients}
#'
#' @import zoo
#' @import bsts
#' @importFrom scales comma
#' @import Boom
inference <- function(matched_markets=NULL, bsts_modelargs=NULL, test_market=NULL, end_post_period=NULL, alpha=0.05, prior_level_sd=0.01, control_matches=5, analyze_betas=FALSE, nseasons=NULL){
  ## Analyze the causal impact of an intervention on test_market, using its
  ## best-matched control markets as regressors in a CausalImpact/bsts model.
  ## Returns an object of class "matched_market_inference" (see roxygen above).
  ## use nulling to avoid CRAN notes (non-standard evaluation in dplyr/ggplot2)
  id_var <- NULL
  BestControl <- NULL
  date_var <- NULL
  cum.effect <- NULL
  Date <- NULL
  Response <- NULL
  lower_bound <- NULL
  upper_bound <- NULL
  Predicted <- NULL
  Cumulative <- NULL
  match_var <- NULL
  SD <- NULL
  Beta <- NULL
  SumBeta <- NULL
  value <- NULL
  variable <- NULL
  LocalLevel <- NULL
  Residuals <- NULL
  Pointwise <- NULL
  ## validate inputs
  stopif(is.null(matched_markets), TRUE, "ERROR: Need to specify a matched market object")
  stopif(is.null(test_market), TRUE, "ERROR: Need to specify a test market")
  stopif(length(test_market)>1, TRUE, "ERROR: inference() can only analyze one test market at a time. Call the function separately for each test market")
  ## Model settings: bsts_modelargs (if supplied) takes precedence over the
  ## convenience parameters nseasons and prior_level_sd
  if (!is.null(bsts_modelargs) & !is.null(nseasons)){
    cat("\tNOTE: You're passing arguments directly to bsts while also specifying nseasons \n")
    cat("\tNOTE: bsts_modelargs will overwrite nseasons \n")
    cat("\n")
  }
  if (is.null(bsts_modelargs)){
    if (is.null(nseasons)){
      bsts_modelargs <- list(prior.level.sd=prior_level_sd)
    } else{
      bsts_modelargs <- list(nseasons=nseasons, prior.level.sd=prior_level_sd)
    }
  } else{
    if (analyze_betas==TRUE){
      ## the beta sensitivity analysis varies prior.level.sd, which would
      ## conflict with user-supplied bsts arguments -- turn it off
      analyze_betas <- FALSE
      cat("\tNOTE: analyze_betas turned off when bsts model arguments are passed directly \n")
      cat("\tConsider using the nseasons and prior_level_sd parameters instead \n")
      cat("\n")
    }
  }
  ## copy the distances; keep only the top control_matches matches per market
  mm <- dplyr::filter(matched_markets$BestMatches, rank<=control_matches)
  data <- matched_markets$Data
  mm$id_var <- mm[[names(mm)[1]]]
  mm <- dplyr::arrange(mm, id_var, BestControl)
  ## check if the test market exists
  stopif(test_market %in% unique(data$id_var), FALSE, paste0("ERROR: Test market ", test_market, " does not exist"))
  ## if an end date has not been provided, then choose the max of the data
  if (is.null(end_post_period)){
    end_post_period <- as.Date(max(subset(data, id_var==test_market)$date_var))
  }
  # filter for dates
  MatchingStartDate <- as.Date(subset(mm, id_var==test_market)$MatchingStartDate[1])
  MatchingEndDate <- as.Date(subset(mm, id_var==test_market)$MatchingEndDate[1])
  data <- dplyr::filter(data, date_var>=MatchingStartDate & date_var<=as.Date(end_post_period))
  ## get the control market name
  control_market <- subset(mm, id_var==test_market)$BestControl
  ## get the test and ref markets
  mkts <- create_market_vectors(data, test_market, control_market)
  y <- mkts[[1]]
  ref <- mkts[[2]]
  date <- mkts[[3]]
  end_post_period <- max(date)
  ## BUGFIX: use the test market's own matching end date. mm is sorted by
  ## id_var, so mm[1, "MatchingEndDate"] is not necessarily the test market's
  post_period <- date[date > as.Date(MatchingEndDate)]
  stopif(length(post_period)==0, TRUE, "ERROR: no valid data in the post period")
  post_period_start_date <- min(post_period)
  post_period_end_date <- max(post_period)
  ts <- zoo(cbind.data.frame(y, ref), date)
  ## print the settings
  cat("\t------------- Inputs -------------\n")
  cat(paste0("\tMarket ID: ", matched_markets$MarketID, "\n"))
  cat(paste0("\tDate Variable: ", matched_markets$DateVariable, "\n"))
  cat(paste0("\tMatching Metric: ", matched_markets$MatchingMetric, "\n"))
  cat("\n")
  cat(paste0("\tTest Market: ", test_market, "\n"))
  for (i in seq_along(control_market)){
    cat(paste0("\tControl Market ", i, ": ", control_market[i], "\n"))
  }
  cat("\n")
  cat(paste0("\tMatching (pre) Period Start Date: ", MatchingStartDate, "\n"))
  cat(paste0("\tMatching (pre) Period End Date: ", MatchingEndDate, "\n"))
  cat(paste0("\tPost Period Start Date: ", post_period_start_date, "\n"))
  cat(paste0("\tPost Period End Date: ", post_period_end_date, "\n"))
  cat("\n")
  cat(paste0("\tbsts parameters: \n"))
  modelparms <- names(bsts_modelargs)
  for (p in seq_along(bsts_modelargs)){
    cat(paste0("\t ", modelparms[p], ": ", bsts_modelargs[p], "\n"))
  }
  if (!("nseasons" %in% modelparms)){
    cat("\t No seasonality component (controlled for by the matched markets) \n")
  }
  cat(paste0("\tPosterior Intervals Tail Area: ", 100*(1-alpha), "%\n"))
  cat("\n")
  ## run the inference; fixed seed so the MCMC draws are reproducible
  pre.period <- c(as.Date(MatchingStartDate), as.Date(MatchingEndDate))
  post.period <- c(as.Date(post_period_start_date), as.Date(post_period_end_date))
  set.seed(2015)
  impact <- CausalImpact(ts, pre.period, post.period, alpha=alpha, model.args=bsts_modelargs)
  if(analyze_betas==TRUE){
    ## sensitivity analysis: re-estimate betas at 21 values of the local level
    ## prior sd, spanning [min(0.001, prior_level_sd), max(0.1, prior_level_sd)]
    ## BUGFIX: the loop fills 21 rows (i in 0:20), not 11
    betas <- data.frame(matrix(nrow=21, ncol=4))
    names(betas) <- c("SD", "SumBeta", "DW", "MAPE")
    ## step size is loop-invariant; hoisted out of the loop
    step <- (max(0.1, prior_level_sd) - min(0.001, prior_level_sd))/20
    for (i in 0:20){
      sd <- min(0.001, prior_level_sd) + step*i
      if (is.null(nseasons)){
        args <- list(prior.level.sd=sd)
      } else{
        args <- list(prior.level.sd=sd, nseasons=nseasons)
      }
      m <- CausalImpact(ts, pre.period, post.period, alpha=alpha, model.args=args)
      burn <- SuggestBurn(0.1, m$model$bsts.model)
      b <- sum(apply(m$model$bsts.model$coefficients[-(1:burn),], 2, CMean))
      betas[i+1, "SD"] <- sd
      betas[i+1, "SumBeta"] <- b
      ## pre period = rows where the cumulative effect is still zero
      preperiod <- subset(m$series, cum.effect == 0)
      betas[i+1, "DW"] <- dw(preperiod$response, preperiod$point.pred)
      betas[i+1, "MAPE"] <- mape_no_zeros(preperiod$response, preperiod$point.pred)
    }
  }
  burn <- SuggestBurn(0.1, impact$model$bsts.model)
  ## create statistics (row 2 of the CausalImpact summary = cumulative effect)
  results <- list()
  results[[1]] <- impact$summary$AbsEffect[2]
  results[[2]] <- impact$summary$AbsEffect.lower[2]
  results[[3]] <- impact$summary$AbsEffect.upper[2]
  results[[4]] <- impact$summary$RelEffect[2]
  results[[5]] <- impact$summary$RelEffect.lower[2]
  results[[6]] <- impact$summary$RelEffect.upper[2]
  results[[7]] <- impact$summary$p[2]
  ## compute mape and Durbin-Watson over the pre (matching) period
  preperiod <- subset(impact$series, cum.effect == 0)
  preperiod$res <- preperiod$response - preperiod$point.pred
  results[[8]] <- mape_no_zeros(preperiod$response, preperiod$point.pred)
  results[[9]] <- dw(preperiod$response, preperiod$point.pred)
  cat("\t------------- Model Stats -------------\n")
  cat(paste0("\tMatching (pre) Period MAPE: ", round(100*results[[8]], 2) , "%\n"))
  ## average posterior coefficients per control market (column 1 is the intercept)
  ## BUGFIX: wrap in matrix() -- data.frame(nrow=..., ncol=2) creates a 1-row
  ## frame with columns literally named "nrow" and "ncol"
  avg_coeffs <- data.frame(matrix(nrow=dim(impact$model$bsts.model$coefficients)[2]-1, ncol=2))
  names(avg_coeffs) <- c("Market", "AverageBeta")
  ## posterior means are loop-invariant; compute once instead of per iteration
  post_means <- apply(impact$model$bsts.model$coefficients[-(1:burn),], 2, CMean)
  for (i in 2:dim(impact$model$bsts.model$coefficients)[2]){
    avg_coeffs[i-1, "Market"] <- control_market[i-1]
    avg_coeffs[i-1, "AverageBeta"] <- post_means[i]
    cat(paste0("\tBeta ", i-1, " [", control_market[i-1], "]: ", round(avg_coeffs[i-1, "AverageBeta"], 4) , "\n"))
  }
  cat(paste0("\tDW: ", round(results[[9]], 2) , "\n"))
  cat("\n")
  ## shared y-axis limits so the observed/expected and raw-data plots align
  ymin <- min(min(impact$series$response), min(impact$series$point.pred.lower), min(ref), min(y))
  ymax <- max(max(impact$series$response), max(impact$series$point.pred.upper), max(ref), max(y))
  ## create actual versus predicted plots
  ## BUGFIX: stopif takes (value, clause, message); the clause was missing so
  ## the message was being passed as the comparison value
  stopif(length(date) != nrow(data.frame(impact$series)), TRUE, "ERROR: you might have holes (skipped weeks/days/months) in your time series ")
  avp <- data.frame(cbind(date, data.frame(impact$series)[,c("response", "point.pred", "point.pred.lower", "point.pred.upper")]))
  names(avp) <- c("Date", "Response", "Predicted", "lower_bound", "upper_bound")
  avp$test_market <- test_market
  results[[10]] <- ggplot(data=avp, aes(x=Date)) +
    geom_line(aes(y=Response, colour = "Observed"), size=1.2) +
    geom_ribbon(aes(ymin=lower_bound, ymax=upper_bound), fill="grey", alpha=0.3) +
    geom_line(aes(y=Predicted, colour = "Expected"), size=1.2) +
    theme_bw() + theme(legend.title = element_blank()) + ylab("") + xlab("") +
    geom_vline(xintercept=as.numeric(MatchingEndDate), linetype=2) +
    scale_y_continuous(labels = scales::comma, limits=c(ymin, ymax)) +
    ggtitle(paste0("Test Market: ",test_market))
  avp$test_market <- NULL
  ## create cumulative lift plot
  plotdf <- cbind.data.frame(as.Date(row.names(data.frame(impact$series))), data.frame(impact$series)[,c("cum.effect", "cum.effect.lower", "cum.effect.upper")])
  names(plotdf) <- c("Date", "Cumulative", "lower_bound", "upper_bound")
  results[[11]] <- ggplot(data=plotdf, aes(x=Date, y=Cumulative)) + geom_line(size=1.2) + theme_bw() +
    scale_y_continuous(labels = scales::comma) + ylab("Cumulative Effect") + xlab("") +
    geom_vline(xintercept=as.numeric(MatchingEndDate), linetype=2) +
    geom_ribbon(aes(ymin=lower_bound, ymax=upper_bound), fill="grey", alpha=0.3)
  ## create plots of the actual (raw) data for test and control markets
  plotdf <- data[data$id_var %in% c(test_market, control_market),]
  results[[12]] <- ggplot(data=plotdf, aes(x=date_var, y=match_var, colour=as.factor(id_var))) +
    geom_line() +
    theme_bw() + theme(legend.title = element_blank(), axis.title.x = element_blank()) + ylab("") + xlab("Date") +
    geom_vline(xintercept=as.numeric(MatchingEndDate), linetype=2) +
    scale_y_continuous(labels = scales::comma, limits=c(ymin, ymax))
  if(analyze_betas==TRUE){
    ## plot betas at various local level SDs
    ## BUGFIX: the betas column is named SumBeta (see names(betas) above), not Beta
    results[[13]] <- ggplot(data=betas, aes(x=SD, y=SumBeta)) +
      geom_line() +
      theme_bw() + theme(legend.title = element_blank()) +
      geom_vline(xintercept=as.numeric(prior_level_sd), linetype=2) + xlab("Local Level Prior SD")
    ## plot DWs and MAPEs at different SDs
    plotdf <- melt(data=betas, id="SD")
    results[[14]] <- ggplot(data=plotdf, aes(x=SD, y=value, colour=as.factor(variable))) + geom_line() +
      theme_bw() + theme(legend.title = element_blank()) +
      geom_vline(xintercept=as.numeric(prior_level_sd), linetype=2) + xlab("Local Level Prior SD") +
      facet_grid(variable ~ ., scales="free") + ylab("") + guides(colour="none")
  }
  ## local level (trend) state component, matching period only
  ## (burn was already computed above for this model; no need to recompute)
  plotdf <- cbind.data.frame(date, colMeans(impact$model$bsts.model$state.contributions[-(1:burn), "trend", ])) %>% filter(date<=as.Date(MatchingEndDate))
  names(plotdf) <- c("Date", "LocalLevel")
  results[[15]] <- ggplot(data=plotdf, aes(x=Date, y=LocalLevel)) +
    geom_line() +
    theme_bw() + theme(legend.title = element_blank()) +
    ylab("Local Level") + xlab("")
  ## matching-period residuals (observed minus predicted)
  plotdf <- cbind.data.frame(date, data.frame(impact$series)[,c("response", "point.pred")]) %>% dplyr::filter(date<=as.Date(MatchingEndDate))
  names(plotdf) <- c("Date", "y", "yhat")
  plotdf$Residuals <- plotdf$y - plotdf$yhat
  results[[16]] <- ggplot(data=plotdf, aes(x=Date, y=Residuals)) +
    geom_line() +
    theme_bw() + theme(legend.title = element_blank()) +
    xlab("")
  ## create pointwise lift plot
  plotdf <- cbind.data.frame(as.Date(row.names(data.frame(impact$series))), data.frame(impact$series)[,c("point.effect", "point.effect.lower", "point.effect.upper")])
  names(plotdf) <- c("Date", "Pointwise", "lower_bound", "upper_bound")
  results[[17]] <- ggplot(data=plotdf, aes(x=Date, y=Pointwise)) + geom_line(size=1.2) + theme_bw() +
    scale_y_continuous(labels = scales::comma) + ylab("Point Effect") + xlab("") +
    geom_vline(xintercept=as.numeric(MatchingEndDate), linetype=2) +
    geom_ribbon(aes(ymin=lower_bound, ymax=upper_bound), fill="grey", alpha=0.3)
  ### print results
  cat("\t------------- Effect Analysis -------------\n")
  cat(paste0("\tAbsolute Effect: ", round(results[[1]],2), " [", round(results[[2]],2), ", ", round(results[[3]],2), "]\n"))
  cat(paste0("\tRelative Effect: ", paste0(round(100*results[[4]],2), "%"), " [", paste0(round(100*results[[5]],2), "%"), ", ", paste0(round(100*results[[6]],2), "%"), "]\n"))
  cat(paste0("\tProbability of a causal impact: ", paste0(round(100*(1-results[[7]]),4), "%\n")))
  ### return the results
  ## NOTE: when analyze_betas is FALSE, results[[13]]/[[14]] are NULL and the
  ## corresponding elements of the returned list are NULL (same as before)
  object <- list(AbsoluteEffect=results[[1]], AbsoluteEffectLower=results[[2]], AbsoluteEffectUpper=results[[3]],
                 RelativeEffect=results[[4]], RelativeEffectLower=results[[5]], RelativeEffectUpper=results[[6]],
                 TailProb=results[[7]], PrePeriodMAPE=results[[8]], DW=results[[9]],
                 PlotActualVersusExpected=results[[10]], PlotCumulativeEffect=results[[11]], PlotPointEffect=results[[17]],
                 PlotActuals=results[[12]], PlotPriorLevelSdAnalysis=results[[14]],
                 PlotLocalLevel=results[[15]], TestData=y, ControlData=ref, PlotResiduals=results[[16]],
                 TestName=test_market, ControlName=control_market, ZooData=ts, Predictions=avp,
                 CausalImpactObject=impact, Coefficients=avg_coeffs)
  class(object) <- "matched_market_inference"
  return (object)
}
#' Given a test market, analyze the impact of fake interventions (prospective power analysis)
#'
#' \code{test_fake_lift} Given a matched_market object from the best_matches function, this function analyzes the causal impact of fake interventions using the CausalImpact package.
#' The function returns an object of type "market_inference" which contains the estimated impact of the intervention (absolute and relative).
#'
#' @param matched_markets A matched_market object created by the market_matching function
#' This parameter will overwrite the values specified in prior_level_sd and nseasons. ONLY use this if you're using intricate bsts settings
#' For most use-cases, using the prior_level_sd and nseasons parameters should be sufficient
#' @param test_market The name of the test market (character)
#' @param end_fake_post_period The end date of the post period. Must be a character of format "YYYY-MM-DD" -- e.g., "2015-11-01"
#' @param alpha Desired tail-area probability for posterior intervals. For example, 0.05 yields 0.95 intervals
#' @param prior_level_sd Prior SD for the local level term (Gaussian random walk). Default is 0.01. The bigger this number is, the more wiggliness is allowed for the local level term.
#' Note that more wiggly local level terms also translate into larger posterior intervals
#' This parameter will be overwritten if you're using the bsts_modelargs parameter
#' @param control_matches Number of matching control markets to use in the analysis (default is 5)
#' @param nseasons Seasonality for the bsts model -- e.g., 52 for weekly seasonality
#' @param max_fake_lift The maximum absolute fake lift -- e.g., 0.1 means that the max lift evaluated is 10 percent and the min lift is -10 percent
#' Note that randomization is injected into the lift, which means that the max lift will not be exactly as specified
#' @param steps The number of steps used to calculate the power curve (default is 10)
#' @param lift_pattern_type Lift pattern. Default is constant. The other choice is a random lift.
#'
#'
#' @import ggplot2
#' @export test_fake_lift
#' @examples
#' \dontrun{
#' library(MarketMatching)
#' ##-----------------------------------------------------------------------
#' ## Create a pseudo power curve for various levels of lift
#' ## Since this is weather data it is a not a very meaningful example.
#' ## This is merely to demonstrate the function.
#' ##-----------------------------------------------------------------------
#' data(weather, package="MarketMatching")
#' mm <- best_matches(data=weather,
#' id="Area",
#' markets_to_be_matched=c("CPH", "SFO"),
#' date_variable="Date",
#' matching_variable="Mean_TemperatureF",
#' warping_limit=1, # warping limit=1
#' dtw_emphasis=0, # rely only on dtw for pre-screening
#' matches=5, # request 5 matches
#' start_match_period="2014-01-01",
#' end_match_period="2014-10-01")
#' library(CausalImpact)
#' results <- test_fake_lift(matched_markets=mm,
#' test_market="CPH",
#' lift_pattern_type="constant",
#' control_matches=5, # use all 5 matches for inference
#' end_fake_post_period="2015-12-15",
#' prior_level_sd=0.002,
#' max_fake_lift=0.1)
#' }
#' @usage
#' test_fake_lift(matched_markets=NULL,
#' test_market=NULL,
#' end_fake_post_period=NULL,
#' alpha=0.05,
#' prior_level_sd=0.01,
#' control_matches=NULL,
#' nseasons=NULL,
#' max_fake_lift=NULL,
#' steps=10,
#' lift_pattern_type="constant")
#'
#' @return Returns an object of type \code{matched_market_power}. The object has the
#' following elements:
#' \item{\code{ResultsData}}{The results stored in a data.frame}