-
Imputation engine for continuous data using a Dirichlet Process (DP) Gaussian-mixture model, introduced in Kim, H. J., Reiter, J. P., Wang, Q., Cox, L. H. and Karr, A. F. (2014), "Multiple imputation of missing or faulty values under linear constraints," Journal of Business and Economics Statistics, 32, 375-386"
-
More documentation will be added later.
-
Provided that
devtools
package has been installed on the system, one can install and load the library by
devtools::install_github("hang-j-kim/DPImputeCont")
library(DPImputeCont)
Implementing the package while drawing some diagnostic plots with the simulated true data : Having detailed comments which explain some options for the package. Simulation datasets used are 01_Sim_Population.RData
and 02_Sim_Sample.RData
.
unlink(".RData")
rm(list = ls(all = TRUE))
############# Fuctions to draw scatter plot #####################
one_panel_fn <- function(i_var, j_var, Input, Output, MAIN, XLIM = NULL, YLIM = NULL) {
plot(Input[, i_var], Input[, j_var], main = MAIN, xlab = paste("Var", i_var), ylab = paste("Var",
j_var), col = "red", pch = 1, cex = 0.4, xlim = XLIM, ylim = YLIM)
points(Output[, i_var], Output[, j_var], col = "blue", pch = 20, cex = 0.4)
} # one_panel_fn
scatter_plot_fn <- function(D_true, Y_imputed, File_Name, i_rep) {
Plot_Output <- Y_imputed
if (i_rep == 0) {
MAIN <- "Blue: Imputed data w/o chain convergence"
} else {
MAIN <- paste0("Blue: Imputed values ", i_rep)
} #
png(file = File_Name, width = 800, height = 700, pointsize = 20)
par(mfrow = c(2, 2), mai = c(0.8, 0.8, 0.4, 0.4), family = "serif", mgp = c(1.5, 0.5, 0))
one_panel_fn(i_var = 1, j_var = 2, Input = D_true, Output = Plot_Output, MAIN = "Red: True values",
XLIM = c(-4, 13), YLIM = c(0, 14))
one_panel_fn(i_var = 1, j_var = 3, Input = D_true, Output = Plot_Output, MAIN = MAIN, XLIM = c(-4,
13), YLIM = c(-2, 14))
one_panel_fn(i_var = 2, j_var = 3, Input = D_true, Output = Plot_Output, MAIN = "", XLIM = c(0,
14), YLIM = c(-2, 14))
one_panel_fn(i_var = 2, j_var = 4, Input = D_true, Output = Plot_Output, MAIN = "", XLIM = c(0,
14), YLIM = c(-3, 10))
dev.off()
} # scatter_plot_fn
############################################################################################################
library(DPImputeCont)
sessionInfo()
################## Input data formats
load("01_Sim_Population.RData")
load("02_Sim_Sample.Rdata")
head(D_sample)
varnames <- dimnames(D_sample)[[2]]
################## Read the data
data_obj <- readData(Y_in = D_sample, RandomSeed = 99)
################## Make and initialize the model
model_obj <- createModel(data_obj, K_mix_comp = 30)
# K_mix_comp is the number of the mixture components. The default value is 50. Empirically, K=30
# is enough for most datasets with the number of variables less than 5. After running the code, we
# can check if K_mix_comp is enough big by summarizing result_obj$draw_no_occ_cluster
Y_imputed <- Y_std <- model_obj$Y_mat
for (i_sample in 1:dim(Y_imputed)[[1]]) {
Y_imputed[i_sample, ] <- data_obj$mean_Y_input + data_obj$sd_Y_input * Y_std[i_sample, ]
}
scatter_plot_fn(D_true = D_pop, Y_imputed = Y_imputed, File_Name = "12_MI_ScatterPlots_0.png", i_rep = 0)
# run 1 iteration.
model_obj$Iterate()
# model_obj$.where_we_are
# run many iterations
burn <- 200
m_Imp <- 5
thin <- 100
# burn=10; m_Imp=5; thin=10
where_sample_EI <- burn + seq(from = thin, to = (m_Imp * thin), by = thin)
result_obj <- multipleImp(model_obj = model_obj, data_obj = data_obj, n_burnin = burn, m_Imp = m_Imp, interval_btw_Imp = thin)
################## Save results
save(data_obj = data_obj, model_obj = model_obj, result_obj = result_obj, varnames = varnames, where_sample_EI = where_sample_EI,
burn = burn, file = "11a_ImputedData.RData")
################## Check results
dim(result_obj$multiple_Imp)
head(result_obj$multiple_Imp[1, , ])
head(result_obj$multiple_Imp[m_Imp, , ])
################## Make plots
for (i_Imp in 1:m_Imp) {
scatter_plot_fn(D_true = D_pop, Y_imputed = result_obj$multiple_Imp[i_Imp, , ], File_Name = paste0("12_MI_ScatterPlots_",
i_Imp, ".png"), i_rep = i_Imp)
} # for (i_EI in 1:m)
png(file = "12_Imp_ConvCheck.png", width = 800, height = 700, pointsize = 20)
par(mfrow = c(2, 2), mai = c(0.8, 0.8, 0.4, 0.4), family = "serif", mgp = c(1.5, 0.5, 0)) # b l t r
i_var <- 1
plot(result_obj$draw_weighted_mu[, i_var], main = paste0("Weighted mu for variable ", i_var), type = "l",
ylab = "Weighted mu", xlab = "Iteration")
# abline(h = mean(result_obj$draw_weighted_mu[,i_var]), lty='dotted', col='green', lwd=2)
abline(h = True_Weighted_Mu[i_var], lty = "dotted", col = "green", lwd = 2)
abline(v = burn, col = "red", lwd = 2, lty = "dotted")
abline(v = where_sample_EI, col = "blue", lty = "dotted", lwd = 1.2)
i_var <- 2
plot(result_obj$draw_weighted_mu[, i_var], main = paste0("Weighted mu for variable ", i_var), type = "l",
ylab = "Weighted mu", xlab = "Iteration")
# abline(h = mean(result_obj$draw_weighted_mu[,i_var]), lty='dotted', col='green', lwd=2)
abline(h = True_Weighted_Mu[i_var], lty = "dotted", col = "green", lwd = 2)
abline(v = burn, col = "red", lwd = 2, lty = "dotted")
abline(v = where_sample_EI, col = "blue", lty = "dotted", lwd = 1.2)
plot(result_obj$draw_no_occ_cluster, main = "No. of occupying clusters", type = "l", ylab = "No. of clusters",
xlab = "Iteration", ylim = c(-0.5, 20))
# abline(h = mean(result_obj$draw_no_occ_cluster), lty='dotted', col='green', lwd=2)
abline(v = burn, col = "red", lwd = 2, lty = "dotted")
abline(v = where_sample_EI, col = "blue", lty = "dotted", lwd = 1.2)
plot(result_obj$draw_alpha, main = "alpha - Concentration param.", type = "l", ylab = "alpha", xlab = "Iteration")
# abline(h = mean(result_obj$draw_alpha), lty='dotted', col='green', lwd=2)
abline(v = burn, col = "red", lwd = 2, lty = "dotted")
abline(v = where_sample_EI, col = "blue", lty = "dotted", lwd = 1.2)
dev.off()
Implementing the package (in practice where we do not know the simulation true which was used to draw the scatter plot for comparison in the example above)
library(DPImputeCont)
sessionInfo()
#######################
load("02_Sim_Sample.Rdata")
head(D_sample)
varnames <- dimnames(D_sample)[[2]]
data_obj <- readData(Y_in = D_sample, RandomSeed = 99)
model_obj <- createModel(data_obj, K_mix_comp = 30)
burn <- 200
m_Imp <- 5
thin <- 100
result_obj <- multipleImp(model_obj = model_obj, data_obj = data_obj, n_burnin = burn, m_Imp = m_Imp, interval_btw_Imp = thin)
save(data_obj = data_obj, model_obj = model_obj, result_obj = result_obj, varnames = varnames, burn = burn, file = "11b_ImputedData.RData")
################## Check results
dim(result_obj$multiple_Imp)
head(result_obj$multiple_Imp[1, , ])
head(result_obj$multiple_Imp[m_Imp, , ])