When two columns have the same name: data_merge_module renames them in an unpredictable way #31

cicdguy · 2021-08-05T13:25:10Z

to reproduce:

library(random.cdisc.data)
library(teal.module.clinical)
ADSL <- radsl(cached = TRUE)
ADAE <- radae(cached = TRUE)
ADCM <- radcm(cached = TRUE)

#' Modify ADCM
ADCM$CMINDC <- paste0("Indication_", as.numeric(ADCM$CMDECOD))
ADCM$CMDOSE <- 1
ADCM$CMTRT <- ADCM$CMCAT
ADCM$CMDOSU <- "U"
ADCM$CMROUTE <- "CMROUTE"
ADCM$CMDOSFRQ <- "CMDOSFRQ"
ADCM$CMSTDY <- 1
ADCM[ADCM$CMCAT == "medcl B", ]$CMSTDY <- 20
ADCM[ADCM$CMCAT == "medcl C", ]$CMSTDY <- 150
ADCM$CMENDY <- 500
ADCM[ADCM$CMCAT == "medcl B", ]$CMENDY <- 700
ADCM[ADCM$CMCAT == "medcl C", ]$CMENDY <- 1000
ADCM$CMASTDTM <- ADCM$ASTDTM
ADCM$CMAENDTM <- ADCM$AENDTM
rtables::var_labels(
  ADCM[c("CMINDC", "CMTRT", "CMSTDY", "CMENDY")]
) <- c(
  "Indication",
  "Reported Name of Drug, Med, or Therapy",
  "Study Day of Start of Medication",
  "Study Day of End of Medication"
)
adcm_keys <- c("STUDYID", "USUBJID", "ASTDTM", "CMSEQ", "ATC1", "ATC2", "ATC3", "ATC4")

app <- init(
  data = cdisc_data(
    cdisc_dataset("ADSL", ADSL, code = "ADSL <- radsl(cached = TRUE)"),
    cdisc_dataset("ADAE", ADAE, code = "ADAE <- radae(cached = TRUE)"),
    cdisc_dataset("ADCM", ADCM,
                  code = 'ADCM <- radcm(cached = TRUE)
      ADCM$CMINDC <- paste0("Indication_", as.numeric(ADCM$CMDECOD))
      ADCM$CMDOSE <- 1
      ADCM$CMTRT <- ADCM$CMCAT
      ADCM$CMDOSU <- "U"
      ADCM$CMROUTE <- "CMROUTE"
      ADCM$CMDOSFRQ <- "CMDOSFRQ"
      ADCM$CMSTDY <- 1
      ADCM[ADCM$CMCAT == "medcl B", ]$CMSTDY <- 20
      ADCM[ADCM$CMCAT == "medcl C", ]$CMSTDY <- 150
      ADCM$CMENDY <- 500
      ADCM[ADCM$CMCAT == "medcl B", ]$CMENDY <- 700
      ADCM[ADCM$CMCAT == "medcl C", ]$CMENDY <- 1000
      ADCM$CMASTDTM <- ADCM$ASTDTM
      ADCM$CMAENDTM <- ADCM$AENDTM
      rtables::var_labels(
        ADCM[c("CMINDC", "CMTRT", "CMSTDY", "CMENDY")]) <- c(
          "Indication",
          "Reported Name of Drug, Med, or Therapy",
          "Study Day of Start of Medication",
          "Study Day of End of Medication")',
                  keys = adcm_keys
    ),
    check = TRUE
  ),
  modules = root_modules(
    tm_g_pp_patient_timeline(
      label = "Vitals",
      dataname_adae = "ADAE",
      dataname_adcm = "ADCM",
      parentname = "ADSL",
      patient_col = "USUBJID",
      plot_height = c(600L, 200L, 2000L),
      cmtrt = choices_selected(
        choices = variable_choices(ADCM, "CMTRT"),
        selected = "CMTRT",
      ),
      aeterm = choices_selected(
        choices = variable_choices(ADAE, "AETERM"),
        selected = c("AETERM")
      ),
      aetime_start = choices_selected(
        choices = variable_choices(ADAE, "ASTDTM"),
        selected = c("ASTDTM")
      ),
      aetime_end = choices_selected(
        choices = variable_choices(ADAE, "AENDTM"),
        selected = c("AENDTM")
      ),
      dstime_start = choices_selected(
        choices = variable_choices(ADCM, "CMASTDTM"),
        selected = c("CMASTDTM")
      ),
      dstime_end = choices_selected(
        choices = variable_choices(ADCM, "CMAENDTM"),
        selected = c("CMAENDTM")
      ),
      aerelday_start = choices_selected(
        choices = variable_choices(ADAE, "ASTDY"),
        selected = c("ASTDY")
      ),
      aerelday_end = choices_selected(
        choices = variable_choices(ADAE, "AENDY"),
        selected = c("AENDY")
      ),
      dsrelday_start = choices_selected(
        choices = variable_choices(ADCM, "ASTDY"),
        selected = c("ASTDY")
      ),
      dsrelday_end = choices_selected(
        choices = variable_choices(ADCM, "AENDY"),
        selected = c("AENDY")
      )
    )
  )
)

shinyApp(app$ui, app$server)

Notice that ASTDY and AENDY are columns that are passed in from both ADAE and ADCM

Inside of srv_g_patient_timeline:

  p_timeline_merged_data <- data_merge_module(
    datasets = datasets,
    data_extract = list(
      dsrelday_start, dsrelday_end,
      aerelday_start, aerelday_end,
      aeterm, aetime_start,
      aetime_end, dstime_start, dstime_end, cmtrt),
    input_id = c(
      "dsrelday_start", "dsrelday_end",
      "aerelday_start", "aerelday_end",
      "aeterm", "aetime_start",
      "aetime_end", "dstime_start", "dstime_end", "cmtrt")
  )

If you press the Show R code button on the app:

data_merge_module tries to resolve this conflict by renaming. However, the renaming is unpredictable.

If instead, the developer rearranges the order:

  p_timeline_merged_data <- data_merge_module(
    datasets = datasets,
    data_extract = list(
      aeterm, aetime_start,
      aetime_end, dstime_start, dstime_end, cmtrt,
      dsrelday_start, dsrelday_end,
      aerelday_start, aerelday_end),
    input_id = c(
      "aeterm", "aetime_start",
      "aetime_end", "dstime_start", "dstime_end", "cmtrt",
      "dsrelday_start", "dsrelday_end",
      "aerelday_start", "aerelday_end")
  )

Pressing SRC:

Solution:

ASTDY -> ADAE.ASTDY
AENDY -> ADAE.AENDY

ASTDY -> ADCM.ASTDY
AENDY -> ADCM.AENDY

This actually makes more sense, too.

Provenance:

Creator: junlue

The text was updated successfully, but these errors were encountered:

cicdguy · 2021-08-05T13:25:14Z

It adds input_id as a prefix. This should be unique inside the call (i.e. repetitions are not allowed). This however isn't true for a dataname. What if you need to do merge of multiple selectors from the same dataset?

SELECTOR1:
id: id1
ADAE
select: X, Y, Z
filter: COL_1 = VAL_1
key: X, Y

SELECTOR2:
id: id2
ADAE
select:  X, Y, Z
filter: COL_1 = VAL_2
key: X, Y

Provenance:

Creator: pawelru

cicdguy · 2021-08-05T13:25:17Z

@pawelru

I agree that my solution is too simplistic and would not work always work.

But then we should consider other solutions, because the issue still remains: the current renaming scheme is unpredictable.

Provenance:

Creator: junlue

kpagacz · 2021-08-27T08:39:51Z

It adds input_id as a prefix. This should be unique inside the call (i.e. repetitions are not allowed). This however isn't true for a dataname. What if you need to do merge of multiple selectors from the same dataset?
SELECTOR1:
id: id1
ADAE
select: X, Y, Z
filter: COL_1 = VAL_1
key: X, Y

SELECTOR2:
id: id2
ADAE
select:  X, Y, Z
filter: COL_1 = VAL_2
key: X, Y
Provenance:
Creator: pawelru

It does, but it doesn't add the same input_id to the same data_extract_spec if the order of input is scrambled. And it probably should, right?

cicdguy added the core label Aug 5, 2021

gogonzo transferred this issue from another repository Apr 8, 2022

gogonzo mentioned this issue Jun 2, 2022

Design data extract and data merge insightsengineering/NEST-roadmap#36

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

When two columns have the same name: data_merge_module renames them in an unpredictable way #31

When two columns have the same name: data_merge_module renames them in an unpredictable way #31

cicdguy commented Aug 5, 2021 •

edited by pawelru

Loading

cicdguy commented Aug 5, 2021

cicdguy commented Aug 5, 2021

kpagacz commented Aug 27, 2021

When two columns have the same name: data_merge_module renames them in an unpredictable way #31

When two columns have the same name: data_merge_module renames them in an unpredictable way #31

Comments

cicdguy commented Aug 5, 2021 • edited by pawelru Loading

cicdguy commented Aug 5, 2021

cicdguy commented Aug 5, 2021

kpagacz commented Aug 27, 2021

cicdguy commented Aug 5, 2021 •

edited by pawelru

Loading