# Synthetic control method: Californian tobacco law

May 12, 2024

@author : jeremylhour

NB: before running this notebook, run "synth_setup.sh", which downloads the Abadie et al. (2010) data to data/MLAB_data.txt

In [None]:
# Start from an empty workspace (notebook convention kept from the original).
rm(list=ls())

packageList = c('numDeriv','Synth')
#for(pack in packageList) install.packages(pack)

# Attach every required package. library() stops with an error when a package
# is missing, whereas require() only returns FALSE and lets the notebook fail
# later with a less helpful "could not find function" message.
for(pack in packageList) library(pack, character.only = TRUE)

# DATA

## Formatting the data

In [None]:
# Location of the raw data file
DATA_PATH <- '../data/MLAB_data.txt'

# The raw table is transposed on load, so that each row gets one state name below
data <- data.frame(t(read.table(DATA_PATH)))

# Tobacco consumption variable names: SmokingCons1970, ..., SmokingCons2000
smokeNames <- paste0("SmokingCons", 1970:2000)

names <- c("State_ID", "Income", "RetailPrice", "Young", "BeerCons",
           "Smoking1988", "Smoking1980", "Smoking1975",
           smokeNames)
colnames(data) <- names

# Row labels, in the order of the rows of `data`
stateNames <- c(
    'Alabama', 'Arkansas', 'Colorado', 'Connecticut', 'Delaware', 'Georgia',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
    'Nevada', 'New Hampshire', 'New Mexico', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'West Virginia', 'Wisconsin', 'Wyoming', 'California'
)
rownames(data) <- stateNames

# Treatment indicator: California is the state with id = 3
data[["Treated"]] <- as.numeric(data[["State_ID"]] == 3)

head(data)

## FIGURE 10.3

In [None]:
# Column 1: California's yearly consumption.
# Column 2: unweighted mean of the untreated states, year by year.
plotdata <- ts(
    cbind(
        unlist(data[data[["Treated"]] == 1, smokeNames]),
        vapply(data[data[["Treated"]] == 0, smokeNames], mean, numeric(1)),
        deparse.level = 0
    ),
    start = 1970,
    frequency = 1
)

plot(
    plotdata,
    plot.type = "single",
    col = c("steelblue", "firebrick"),
    lwd = 2,
    lty = c(1, 6),
    xlab = "",
    ylab = "Tobacco consumption (packs per capita)",
    ylim = c(35, 150)
)

# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5, 0.5, 0.5, alpha = 1/4))
axis(1) ## add axes back
axis(2)
box()
legend(
    1971, 80,
    legend = c("California", "Other states"),
    col = c("steelblue", "firebrick"),
    lwd = 2,
    lty = c(1, 6)
)

We can observe that tobacco consumption per capita in California is significantly lower than the average of other states, and tends to decrease more rapidly. There is clearly no common trend.

In [None]:
# SAVE
# Black-and-white version of the figure, written to output/Fig_10_3.jpg
jpeg("output/Fig_10_3.jpg", res = 300, width = 15, height = 15, units = "cm")
plot(
    plotdata,
    plot.type = "single",
    col = c("black", "black"),
    lwd = 2,
    lty = c(1, 2),
    xlab = "",
    ylab = "Tobacco consumption (packs per capita)",
    ylim = c(35, 150)
)

# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5, 0.5, 0.5, alpha = 1/4))
axis(1) ## add axes back
axis(2)
box()
legend(
    1971, 80,
    legend = c("California", "Other states (average)"),
    col = c("black", "black"),
    lwd = 2,
    lty = c(1, 2)
)
dev.off()

# ESTIMATION

## Compute the solution

In [None]:
# Predictors and pre-treatment outcome names used to build the synth() inputs
predictorNames <- c("Income", "RetailPrice", "Young", "BeerCons",
                    "Smoking1988", "Smoking1980", "Smoking1975")
prePeriodNames <- paste0("SmokingCons", 1970:1988)

# "1" matrices hold the treated state, "0" matrices the untreated states;
# after transposition, variables are in rows and states in columns.
X1 <- t(data[data[["Treated"]] == 1, predictorNames])
X0 <- t(data[data[["Treated"]] == 0, predictorNames])

Z1 <- t(data[data[["Treated"]] == 1, prePeriodNames])
Z0 <- t(data[data[["Treated"]] == 0, prePeriodNames])

solution <- synth(
    X1 = X1,
    X0 = X0,
    Z0 = Z0,
    Z1 = Z1,
    custom.v = NULL,
    optimxmethod = "Nelder-Mead",
    verbose = TRUE
)

# Weights over the untreated states
W <- solution[["solution.w"]]
print(round(W, 3))

# Predictor weights returned by synth(), kept for later cells
V_sol <- solution[["solution.v"]]

Up to numerical error, we reproduce the results of Abadie et al. (2010).

## Table 10.2 and figure 10.4

In [None]:
# Predictor values for California, for the W-weighted combination of the
# untreated states, and for their plain average, rounded to one decimal.
tableau1 <- cbind(X1, X0 %*% W, apply(X0, 1, mean))
tableau1 <- round(tableau1, 1)
colnames(tableau1) <- c("California",
                        "Synthetic California",
                        "Average of 38 other states")
print(tableau1)

# Export with " & " separators and a " \\ " line ending
write.table(
    tableau1,
    "output/tableau1.txt",
    append = FALSE,
    quote = FALSE,
    sep = " & ",
    eol = " \\\\ \n",
    na = "--",
    dec = ".",
    row.names = TRUE,
    col.names = TRUE
)

We also replicate the results from Table 1. We observe that the synthetic California provides a much better counterfactual than the simple average of the other states.

In [None]:
# FIG 2
# Column 1: observed California series.
# Column 2: synthetic California, i.e. the untreated states' yearly
# consumption combined with the weights W.
plotdata = ts(cbind(unlist(data[data[,"Treated"]==1, smokeNames]),
                    unlist(t(as.matrix(data[data[,"Treated"]==0, smokeNames]))%*%W)),
              start=c(1970), frequency=1)


plot(plotdata, plot.type="single",
     col=c("steelblue","firebrick"), lwd=2,
     lty=c(1, 6),xlab="", ylab="Tobacco consumption (packs per capita)",
     ylim=c(35,150))
# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5,0.5,0.5,1/4))
axis(1) ## add axes back
axis(2)
box() 
legend(1971,80,
       legend=c("California", "Synthetic California"),
       col=c("steelblue","firebrick"), lwd=2,
       lty=c(1,6))

# FIG 3
# Gap between California and its synthetic counterpart, year by year
treatmentEffect_hat = data[data[,"Treated"]==1, smokeNames] - t(as.matrix(data[data[,"Treated"]==0, smokeNames]))%*%W
treatmentPlot = ts(unlist(treatmentEffect_hat),start=c(1970), frequency=1)

plot(treatmentPlot, plot.type="single",
     col=c("steelblue"), lwd=2,
     lty=c(1),xlab="", ylab="Tobacco consumption (packs per capita)",
     ylim=c(-30,30))
abline(h=0, lty=c(1))
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5,0.5,0.5,1/4))
axis(1) ## add axes back
axis(2)
box() 
# The legend used to be anchored at y = 80, outside ylim = c(-30, 30), so it
# was clipped and never displayed; it is now placed inside the plot region.
# A single legend entry also needs a single line type.
legend("bottomleft",
       legend=c("Treatment effect"),
       col=c("steelblue"), lwd=2,
       lty=c(1))

In [None]:
# SAVE
# Two-panel black-and-white figure written to output/Fig_10_4.jpg:
# left = California vs. synthetic California, right = estimated gap.
jpeg("output/Fig_10_4.jpg", res=300, width = 30, height = 15, units="cm")
par(mfrow=c(1,2))
plot(plotdata, plot.type="single",
     col=c("black", "black"), lwd=2,
     lty=c(1, 2),xlab="", ylab="Tobacco consumption (packs per capita)",
     ylim=c(35, 150))
# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5,0.5,0.5,1/4))
axis(1) ## add axes back
axis(2)
box() 
legend(1971, 60,
       legend=c("California", "Synthetic California"),
       col=c("black","black"), lwd=2,
       lty=c(1, 2))

plot(treatmentPlot, plot.type="single",
     col=c("black"), lwd=2,
     lty=c(1),xlab="", ylab="Tobacco consumption (packs per capita)",
     ylim=c(-30,30))
abline(h=0, lty=c(1))
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5,0.5,0.5,1/4))
axis(1) ## add axes back
axis(2)
box() 
# The legend used to be anchored at y = 80, outside ylim = c(-30, 30), so it
# was clipped and missing from the saved figure; keep it inside the panel.
legend("bottomleft",
       legend=c("Treatment effect"),
       col=c("black"), lwd=2,
       lty=c(1))
dev.off()

# INFERENCE

## Test

In [None]:
# We do as if every state is treated: each state in turn plays the role of the
# treated unit and all the others form its donor pool.
# Row i of both matrices belongs to the state with State_ID == i, so the ids
# are used directly as row indices; rows of states whose fit fails stay NA.
contrefactuels = matrix(nrow=nrow(data), ncol=length(smokeNames))
weights = matrix(nrow=nrow(data), ncol=nrow(data)-1)

# Loop-invariant variable names
predictorNames = c("Income", "RetailPrice", "Young", "BeerCons", "Smoking1988", "Smoking1980", "Smoking1975")
prePeriodNames = paste0("SmokingCons", 1970:1988)

for(i in data[, "State_ID"]){
    print(paste('Contrefactual computation for state', i))
    
    # setting up the matrices
    X1 = t(data[data[, "State_ID"]==i, predictorNames])
    X0 = t(data[data[, "State_ID"]!=i, predictorNames])

    Z1 = t(data[data[, "State_ID"]==i, prePeriodNames])
    Z0 = t(data[data[, "State_ID"]!=i, prePeriodNames])
    
    # Compute weights.
    # The handler's return value (NULL) signals the failure. Assigning a flag
    # inside the handler, as before, only modified a variable local to the
    # handler, so errors went unnoticed and the previous state's solution was
    # silently reused.
    # NOTE(review): `V` is not a documented argument of synth(); the documented
    # way to impose predictor weights is `custom.v`. Left unchanged because
    # switching would alter the placebo results -- confirm the intent.
    sol = tryCatch(
        synth(X1=X1, X0=X0, Z0=Z0, Z1=Z1, V=V_sol, optimxmethod = "Nelder-Mead", verbose=FALSE),
        error = function(e){
            print('Error for this state')
            message(conditionMessage(e))
            NULL
        })
    if(is.null(sol)) next

    W = sol$solution.w
    weights[i,] = W

    # Compute counterfactual: donor outcomes for all years combined with W
    contrefactuels[i,] = t(as.matrix(data[data[, "State_ID"]!=i, smokeNames]))%*%W
}

In [None]:
print(strrep("=", 80))
print("STATS : MSPE AND MSPE RATIO")
print(strrep("=", 80))

# Computation of the test statistics
testStats <- data.frame()
preTreatment <- paste0("SmokingCons", 1970:1988)
postTreatment <- paste0("SmokingCons", 1989:2000)

for (i in data[, 'State_ID']) {
    # Gap between state i and its counterfactual, for every year
    TE <- data[data[, 'State_ID'] == i, smokeNames] - contrefactuels[i, ]
    postMSPE <- mean(unlist(TE[postTreatment])^2)
    preMSPE <- mean(unlist(TE[preTreatment])^2)
    testStats[i, "MSPE"] <- postMSPE
    testStats[i, "MSPE_ratio"] <- postMSPE / preMSPE
}

# Rows are indexed by State_ID, so label them with the states sorted by id
rownames(testStats) <- rownames(data[order(data[, "State_ID"]), ])

hist(
    testStats[, 'MSPE'],
    breaks = 38,
    main = "",
    xlab = "MSPE",
    col = c(rep("white", 7), "steelblue", rep("white", 20))
)

hist(
    testStats[, 'MSPE_ratio'],
    breaks = 38,
    main = "",
    xlab = "post-treatment / pre-treatment MSPE ratio",
    col = c(rep("white", 24), "steelblue")
)

# Share of states whose statistic is at least as large as California's (id = 3)
pvalMSPE <- mean(testStats[, 'MSPE'] >= testStats[3, 'MSPE'])
pvalRatio <- mean(testStats[, 'MSPE_ratio'] >= testStats[3, 'MSPE_ratio'])
print(paste("MSPE, p-value :", round(pvalMSPE, 3)))
print(paste("MSPE ratio, p-value :", round(pvalRatio, 3)))

In [None]:
# Grey-scale version of the two histograms, written to output/Fig_10_5.jpg
jpeg("output/Fig_10_5.jpg", res = 300, width = 30, height = 15, units = "cm")
par(mfrow = c(1, 2))

hist(
    testStats[, 'MSPE'],
    breaks = 38,
    main = "",
    xlab = "MSPE",
    col = c(rep("white", 7), "darkgrey", rep("white", 20))
)

hist(
    testStats[, 'MSPE_ratio'],
    breaks = 38,
    main = "",
    xlab = "post-treatment / pre-treatment MSPE ratio",
    col = c(rep("white", 24), "darkgrey")
)
dev.off()



## Confidence intervals

In [None]:
compute.pval <- function(data, outcome, Wsol, C){
    #' Permutation p-value, at one date, for the null "treatment effect = C".
    #' Uses the gap between a state and its weighted comparison as statistic.
    #'
    #' For every state i other than California (id = 3), C is added to the
    #' outcome of state i and subtracted from California's outcome; for
    #' California itself the outcome is left untouched. The statistic of state
    #' i is its (modified) outcome minus the Wsol[i,]-weighted combination of
    #' the other states' (modified) outcomes.
    #'
    #' @param data: the dataframe; must contain "State_ID" and the outcome column
    #' @param outcome: tobacco consumption for that year, SmokingConsXXXX
    #' @param Wsol: the weights; row i holds the weights of the state with
    #'   State_ID == i over the other states, in the row order of data
    #' @param C: the constant of the null hypothesis
    #' @return list with p.val (share of states with |theta - C| at least as
    #'   large as California's) and theta.reshuffled (one statistic per
    #'   State_ID)
    
    stateIds = data[, "State_ID"]
    theta.reshuffled = numeric(nrow(data))
    
    for(i in stateIds){
        shift = as.numeric(i!=3)*C
        newOutcome = data[, outcome] + shift*(stateIds==i) - shift*(stateIds==3)
        # Use the weights passed as argument: the previous version ignored
        # Wsol and read a global object called `weights` instead.
        theta.reshuffled[i] = newOutcome[stateIds==i] - newOutcome[stateIds!=i]%*%Wsol[i,]
    }
    theta.obs = theta.reshuffled[3] # California id = 3
    p.val = mean(abs(theta.reshuffled - C) >= abs(theta.obs-C))
    return(list(p.val=p.val,
                theta.reshuffled=theta.reshuffled))
}

compute.pval.MSPE.ratio <- function(data, outcomes, postTreatment, Wsol, C){
    #' Permutation p-value based on the post/pre-treatment MSPE ratio, for the
    #' null "treatment effect = C in every post-treatment period".
    #'
    #' For post-treatment outcomes and every state i other than California
    #' (id = 3), C is added to the outcome of state i and subtracted from
    #' California's outcome, and C is then subtracted from the resulting gap.
    #' Pre-treatment outcomes are used as they are.
    #'
    #' @param data: the dataframe; must contain "State_ID" and all outcomes
    #' @param outcomes: tobacco consumption (column names, all periods)
    #' @param postTreatment: post-treatment period indicator (the names in
    #'   outcomes that are post-treatment)
    #' @param Wsol: the weights; row i holds the weights of the state with
    #'   State_ID == i over the other states, in the row order of data
    #' @param C: the constant of the null hypothesis
    #' @return list with p.val (share of states whose ratio is at least as
    #'   large as California's) and ratio.reshuffled (one ratio per State_ID)
    
    stateIds = data[, "State_ID"]
    isPost = outcomes %in% postTreatment
    ratio.reshuffled = numeric(nrow(data))
    
    for(i in stateIds){
        shift = as.numeric(i!=3)*C
        TE = numeric(length(outcomes))
        # seq_along() is safe when outcomes is empty, unlike 1:length()
        for(k in seq_along(outcomes)){
            newOutcome = data[, outcomes[k]]
            if(isPost[k]){
                newOutcome = newOutcome + shift*(stateIds==i) - shift*(stateIds==3)
            }
            # Use the weights passed as argument: the previous version ignored
            # Wsol and read a global object called `weights` instead.
            TE[k] = newOutcome[stateIds==i] - newOutcome[stateIds!=i]%*%Wsol[i,]
            if(isPost[k]) TE[k] = TE[k] - C
        }
        ratio.reshuffled[i] = mean(TE[isPost]^2) / mean(TE[!isPost]^2)
    }
    ratio.obs = ratio.reshuffled[3] # California id = 3
    p.val = mean(ratio.reshuffled >= ratio.obs)
    return(list(p.val=p.val,
                ratio.reshuffled=ratio.reshuffled))
}

In [None]:
# Point-wise confidence intervals, one per year from 1988 to 2000, obtained by
# searching for the values of C at which the p-value returned by compute.pval
# crosses alpha, above and below the observed gap.
alpha = .2
# One row per year in smokeNames; rows of years not visited by the loop stay NA
confidenceInterval = matrix(nrow=length(smokeNames), ncol=2)
rownames(confidenceInterval) = smokeNames

for(t in 1988:2000){
    outcome = paste0('SmokingCons', t)
    # Observed gap for California (id = 3): actual outcome minus counterfactual
    theta.obs = data[data[, 'State_ID']==3, outcome] - contrefactuels[3, smokeNames==outcome]
    
    # 1. Upper bound
    # A. Looking for an initial value b at which the p-value is below alpha.
    #    Start from the largest statistic across states and, while the p-value
    #    is still >= alpha, move b by a linear extrapolation of the p-value
    #    computed at b and b+eps.
    #    NOTE(review): the step divides by (res1$p.val - res0$p.val); if both
    #    p-values are equal this is a division by zero -- confirm this branch
    #    is never reached in practice.
    res0 = compute.pval(data, outcome, weights, C=theta.obs)
    b = max(res0$theta.reshuffled); eps = .01
    repeat{
        res0 = compute.pval(data, outcome, weights, C=b)
        if(res0$p.val < alpha) break
        res1 = compute.pval(data, outcome, weights, C=b+eps)
        b = b + (alpha - res0$p.val) * eps / (res1$p.val - res0$p.val)
    }
    # B. Dichotomy (bisection) on [a, b] for the sign change of p.val - alpha.
    #    a starts at theta.obs with f_a = 1-alpha (presumably because the
    #    p-value at C = theta.obs equals 1 -- TODO confirm). The midpoint
    #    replaces a when it has the same sign as f_a, and b otherwise; the
    #    search stops once the bracket is narrower than .001.
    a = theta.obs
    f_a = 1-alpha
    repeat{
        m = (a+b)/2
        res = compute.pval(data, outcome, weights, C=m)
        f_m = res$p.val - alpha
        if(f_m*f_a > 0){
            a = m
            f_a = f_m
        } else {
            b = m
        }
        if(abs(b-a) < .001) break
        }
    Cu = (a+b)/2
    
    # 2. Lower bound
    # A. Looking for an initial value a at which the p-value is below alpha,
    #    starting from the smallest statistic across states and using the
    #    p-values at a and a-eps for the extrapolation (eps is the value set
    #    in the upper-bound step; same division-by-zero caveat as above).
    res0 = compute.pval(data, outcome, weights, C=theta.obs)
    a = min(res0$theta.reshuffled)
    repeat{
        res0 = compute.pval(data, outcome, weights, C=a)
        if(res0$p.val < alpha) break
        res1 = compute.pval(data, outcome, weights, C=a-eps)
        a = a + (alpha - res0$p.val) * eps / (res0$p.val - res1$p.val)
    }
    # B. Dichotomy (bisection) on [a, b], mirror image of the upper-bound
    #    search: b starts at theta.obs and is the end that keeps the sign of
    #    f_b.
    b = theta.obs
    f_b = 1-alpha
    repeat{
        m = (a+b)/2
        res = compute.pval(data, outcome, weights, C=m)
        f_m = res$p.val - alpha
        if(f_m*f_b > 0){
            b = m
            f_b = f_m
        } else {
            a = m
        }
        if(abs(b-a) < .001) break
        }
    Cl = (a+b)/2
  
    # Note: the printed label shows alpha itself (0.2), not 1 - alpha
    print(paste0(t, ": ", alpha," confidence interval: [",round(Cl,2),",",round(Cu, 2),"]")) 
    # Store [lower, upper] in the row of the current year
    confidenceInterval[outcome, ] = c(Cl, Cu)
}

In [None]:
# Global confidence interval: values of a constant effect C (common to all
# post-treatment years) at which the MSPE-ratio p-value of
# compute.pval.MSPE.ratio crosses alpha.
#
# Changes with respect to the previous version:
#  - the two initial compute.pval.MSPE.ratio(..., C=theta.obs) calls were
#    removed: their result was overwritten before being read, they cost a full
#    pass over all states each, and they depended on `theta.obs` left over
#    from the last iteration of the point-wise cell;
#  - confidenceInterval_global is built directly as a numeric matrix instead
#    of a list-mode matrix derived from the data.frame treatmentEffect_hat.
alpha = .2

# 1. Upper bound
# A. Looking for an initial value b at which the p-value is below alpha.
#    NOTE(review): the step divides by (res1$p.val - res0$p.val); if both
#    p-values are equal this is a division by zero -- confirm this branch is
#    never reached in practice.
b = 0; eps = .01
repeat{
    res0 = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=b)
    if(res0$p.val < alpha) break
    res1 = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=b+eps)
    b = b + (alpha - res0$p.val) * eps / (res1$p.val - res0$p.val)
}
# B. Dichotomy (bisection) on [a, b] for the sign change of p.val - alpha;
#    a is a hard-coded starting point, assumed to have p.val - alpha > 0.
a = mean(treatmentPlot)-10
f_a = 1-alpha
repeat{
    m = (a+b)/2
    res = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=m)
    f_m = res$p.val - alpha
    if(f_m*f_a > 0){
        a = m
        f_a = f_m
    } else {
        b = m
    }
    if(abs(b-a) < .001) break
}
Cu = (a+b)/2

# 2. Lower bound
# A. Looking for an initial value a at which the p-value is below alpha
#    (same division-by-zero caveat as above).
a = -50
repeat{
    res0 = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=a)
    if(res0$p.val < alpha) break
    res1 = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=a-eps)
    a = a + (alpha - res0$p.val) * eps / (res0$p.val - res1$p.val)
}
# B. Dichotomy (bisection) on [a, b]; b is a hard-coded starting point,
#    assumed to have p.val - alpha > 0.
b = -20
f_b = 1-alpha
repeat{
    m = (a+b)/2
    res = compute.pval.MSPE.ratio(data, outcomes=smokeNames, postTreatment=postTreatment, Wsol=weights, C=m)
    f_m = res$p.val - alpha
    if(f_m*f_b > 0){
        b = m
        f_b = f_m
    } else {
        a = m
    }
    if(abs(b-a) < .001) break
}
Cl = (a+b)/2

print(paste0(alpha," confidence interval : [",round(Cl,2),",",round(Cu, 2),"]")) 

# Collect the data for the plot: constant bounds from 1988 onwards, NA before
confidenceInterval_global = matrix(NA_real_, nrow=length(smokeNames), ncol=2)
rownames(confidenceInterval_global) = smokeNames
confidenceInterval_global[c('SmokingCons1988', postTreatment),1] = Cl
confidenceInterval_global[c('SmokingCons1988', postTreatment),2] = Cu

In [None]:
# FIG 4
# Series, in order: estimated gap, point-wise bounds (lower, upper),
# global bounds (lower, upper).
treatmentInferencePlot <- ts(
    cbind(
        c(treatmentEffect_hat),
        confidenceInterval[, 1],
        confidenceInterval[, 2],
        confidenceInterval_global[, 1],
        confidenceInterval_global[, 2]
    ),
    start = 1970,
    frequency = 1
)

plot(
    treatmentInferencePlot,
    plot.type = "single",
    col = c("steelblue", "steelblue", "steelblue", "firebrick", "firebrick"),
    lwd = 2,
    lty = c(1, 3, 3, 6, 6),
    xlab = "",
    ylab = "Tobacco consumption (packs per capita)",
    ylim = c(-60, 30)
)
abline(h = 0, lty = c(1))

# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5, 0.5, 0.5, alpha = 1/4))
axis(1) ## add axes back
axis(2)
box()
legend(
    1971, -40,
    legend = c("Treatment effect",
               "Confidence interval (point-wise)",
               "Confidence interval (global)"),
    col = c("steelblue", "steelblue", "firebrick"),
    lwd = 2,
    lty = c(1, 3, 6)
)

In [None]:
# SAVE FIG
# Black-and-white version of the inference figure, written to output/Fig_10_6.jpg
jpeg("output/Fig_10_6.jpg", res = 300, width = 15, height = 15, units = "cm")
plot(
    treatmentInferencePlot,
    plot.type = "single",
    col = c("black", "black", "black", "black", "black"),
    lwd = 2,
    lty = c(1, 2, 2, 3, 3),
    xlab = "",
    ylab = "Tobacco consumption (packs per capita)",
    ylim = c(-60, 30)
)
abline(h = 0, lty = c(1))

# Shade everything from 1988 to the right edge of the plotting region
lim <- par("usr")
rect(1988, lim[3], lim[2], lim[4], col = rgb(0.5, 0.5, 0.5, alpha = 1/4))
axis(1) ## add axes back
axis(2)
box()
legend(
    1971, -40,
    legend = c("Treatment effect", "80% CI (point-wise)", "80% CI (global)"),
    col = c("black", "black", "black"),
    lwd = 2,
    lty = c(1, 2, 3)
)
dev.off()

## Using Chernozhukov et al. (2017)'s procedure

In [None]:
print(strrep("=", 80))
print("Computing weights under CHERNOZHUKOV ET AL. (2017)")
print(strrep("=", 80))

# Here the predictor matrices also contain the consumption of every year
# (smokeNames), in addition to the usual predictors.
conformalPredictors <- c("Income", "RetailPrice", "Young", "BeerCons",
                         "Smoking1988", "Smoking1980", "Smoking1975",
                         smokeNames)
conformalPrePeriod <- paste0("SmokingCons", 1970:1988)

X1 <- t(data[data[["Treated"]] == 1, conformalPredictors])
X0 <- t(data[data[["Treated"]] == 0, conformalPredictors])

Z1 <- t(data[data[["Treated"]] == 1, conformalPrePeriod])
Z0 <- t(data[data[["Treated"]] == 0, conformalPrePeriod])

solution <- synth(
    X1 = X1,
    X0 = X0,
    Z0 = Z0,
    Z1 = Z1,
    custom.v = NULL,
    optimxmethod = "Nelder-Mead",
    verbose = TRUE
)

# Weights over the untreated states for the conformal procedure
W_conformal <- solution[["solution.w"]]

In [None]:
# Permutation p-value: the observed sum of absolute post-treatment residuals is
# compared with the same sum computed on random permutations of the residuals
# across years.
n_perm = 10000

# Logical mask over smokeNames flagging the years 1989-2000
post_treat = smokeNames %in% paste0("SmokingCons", 1989:2000)
# Residuals: California minus the W_conformal-weighted untreated states, all years
u_hat = data[data[,"Treated"]==1, smokeNames] - t(as.matrix(data[data[,"Treated"]==0, smokeNames]))%*%W_conformal

# Observed statistic. The previous version indexed with `post.ind`, which is
# not defined anywhere in this notebook; the mask built above is `post_treat`.
stat_obs = sum(abs(u_hat[post_treat]))

# One permutation of the residuals per column (no seed is set, so the p-value
# varies slightly from run to run)
u_hat_p = matrix(unlist(replicate(n_perm, sample(u_hat, replace=FALSE))), ncol=n_perm)
stats_permut = apply(abs(u_hat_p[post_treat,]), 2, sum) # Permutation statistics
                                    
conformal_pval = (1+sum(stats_permut>=stat_obs))/(n_perm+1)
print(paste0("conformal inference p-value: ", round(conformal_pval, 3)))