
# RNHS Data Cleaning 
## J.Merritt 2023

Load required packages

In [36]:
Sys.setenv(JAGS_HOME = "C:/Program Files/JAGS/JAGS-4.0.0")

In [37]:
.libPaths("C:/Users/jkmer/anaconda3/envs/r4-base/Library")

load.lib <- c(#"lme4",
              #"ggplot2",
              #"ggpubr",
              #"directlabels",
              #"RColorBrewer",
              #"ggeffects",
              #"see",
              #"ggforce",
              #"yarrr",
              #"mgcv",
              #"gratia",
              #"tidymv",
              #"visreg",
              #"gamm4",
              "tidyverse",
              "dplyr",
              #"directlabels",
              #"confintr",
              #"Kendall",
              #"poolr",
              #"broom",
              #"modelr",
              #"gdata",
             #"LongituRF",
             "janitor",
             "zoo",
             "gt")
             #"scales",
             #"glue",
             #"purrr",
             #"htree",
             #"JMbayes") #note: modelr has been altered to include rmse_nl and residuals_nl described below

sapply(load.lib,require,character=TRUE)

Loading required package: gt



## Read in and clean data

In [38]:
setwd("C:/Users/jkmer/desktop")

In [39]:
scores <- read.csv("C:/Users/jkmer/desktop/2022_css/20221205_css.csv")
#xi_file <- read.csv("20220809_xci_bdnf_mecp2_grouped.csv")

In [40]:
clean <- clean_names(scores)

### Deidentification

In [41]:
#create new random identifier
ids <- clean %>% select(participant_id) %>% unique()
set.seed(321)
ids$new_id <- sample(9999, size = nrow(ids), replace = FALSE)
write.csv(ids, file='C:/Users/jkmer/Desktop/2023_css/new_id_lookup.csv', row.names=FALSE)
clean <- merge(clean, ids, by = 'participant_id')
#remove any spurious info or other identifiers
clean <- clean %>% select(-c('participant_id5201',
                             'participant_id5211',
                             'participant_id',
                             'interviewer_id',
                             'childs_dob',
                             'dob5201',
                             'rdcrn_protocol_id',
                             'visit_date'))
clean$participant_id <- clean$new_id
clean <- clean %>% select(-new_id)



In [42]:
head(clean)

Unnamed: 0_level_0,childs_gender,diagnosis,mutation,grouping1_genetic_mutation,grouping2,grouping3,visit,age_at_visit,age_of_onset_of_regression,onset_of_stereotypes,...,ambulation_at_this_visit_by_exam,hand_use,scoliosis,language_at_this_visit_by_exam,nonverbal_communication_at_this_vi,respiratory_dysfunction_at_this_vi,autonomic_symptoms_at_this_visit_by,epilepsy_seizures_at_this_visit,total_score,participant_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,...,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,Baseline,3.165,3,3,...,5,2,0,3,2,1,0,0,20,333
2,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,6 months,3.7399,3,3,...,5,4,0,3,3,3,1,3,29,333
3,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,12 months,4.2382,3,3,...,5,3,0,3,3,2,1,0,28,333
4,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,18 months,4.7365,3,3,...,5,3,0,3,2,1,1,0,26,333
5,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,"2 years, 6 months",5.7331,3,3,...,3,3,0,4,3,1,1,0,27,333
6,Female,Classic,Yes,MECP2 mutation,MECP2_R255X,R255X,3 years,6.1903,3,3,...,3,3,0,3,2,1,1,1,20,333


### Data cleanup

In [43]:
#convert na and missing values to R readable format
clean <- clean %>% mutate_if(is.character, na_if, c('NA')) %>%
          mutate_if(is.character, na_if, c('na')) %>%
          mutate_if(is.character, na_if, c('MISSING')) %>%
          mutate_if(is.character, na_if, c('missing')) %>%
          mutate_if(is.character, na_if, c('N/A')) %>%
          mutate_if(is.character, na_if, c('n/a')) %>%
          mutate_if(is.character, na_if, c('')) %>%
          mutate_if(is.character, na_if, c('UNKNOWN')) %>%
          mutate_if(is.character, na_if, c('unkown'))
    

In [44]:
#remove rows with na in participant id, age_at_visit, or any of the CSS components
clean <- clean %>% drop_na(c(participant_id, 
                    age_at_visit, 
                    age_of_onset_of_regression, 
                    onset_of_stereotypes, 
                    head_growth, 
                    somatic_growth_at_this_visit, 
                    independent_sitting_at_this_visit_b, 
                    ambulation_at_this_visit_by_exam, 
                    hand_use, scoliosis, 
                    language_at_this_visit_by_exam, 
                    nonverbal_communication_at_this_vi, 
                    respiratory_dysfunction_at_this_vi, 
                    autonomic_symptoms_at_this_visit_by, 
                    epilepsy_seizures_at_this_visit, 
                    total_score))

In [45]:
#age_of_onset_of_regression assigned to earliest non-zero reported value

corlag <- function(a) {
    ifelse(a != 0 & lag(a, default = 0) == 0, a, NA) }

clean <- clean %>%
             group_by(participant_id) %>% 
                     mutate(regression = corlag(age_of_onset_of_regression))

count_id <- clean %>%
    group_by(participant_id) %>%
    summarise(count_id=n())
clean <- merge(clean, count_id, by = "participant_id")

narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(regression = narep_func(count_id, regression))


replag <- function(a,b) {
    ifelse(a == 0, 
           a, 
           b)
}
clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(regressionclean = replag(age_of_onset_of_regression,
                                                                 regression))
clean <- select(clean, -regression)

In [46]:
#make chart of individuals showing improper changes in regression
clean$regression <- clean$age_of_onset_of_regression
clean$check <- clean$regressionclean - clean$regression
regression_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "regression", "regressionclean")]
regression_indiv$Age <- round(regression_indiv$age_at_visit, 1)
regression_indiv <- select(regression_indiv, -age_at_visit)
regression_count <- length(unique(regression_indiv$participant_id))
regression_count
regression_indiv <- head(regression_indiv, n=100)
gt_input <- regression_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
reg_cols = colnames(gt_input[,grepl( "regression" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(reg_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 


113,113,113,152,152,152,154,154,154,160,160,160,196,196,196,259,259,259,277,277,277,283,283,283,287,287,287,514,514,514,581,581,581,608,608,608,648,648,648,689,689,689
regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age,regression,regressionclean,Age
5.0,5.0,3.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5.0,5.0,4.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,5.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,6.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,8.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,5.0,9.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,3.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,4.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [47]:
#onset_of_stereotypes assigned to earliest non-zero reported value

corlag <- function(a) {
    ifelse(a != 0 & lag(a, default = 0) == 0, a, NA) }

clean <- clean %>%
             group_by(participant_id) %>%
                     mutate(stereotypes = corlag(onset_of_stereotypes))

narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(stereotypes = narep_func(count_id, stereotypes))


replag <- function(a,b) {
    ifelse(a == 0, 
           a, 
           b)
}
clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(stereotypesclean = replag(onset_of_stereotypes,
                                                                 stereotypes))
clean <- select(clean, -stereotypes)

In [48]:
#make chart of individuals showing improper changes in regression
clean$stereotypes <- clean$onset_of_stereotypes
clean$check <- clean$stereotypesclean - clean$stereotypes
stereotypes_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "stereotypes", "stereotypesclean")]
stereotypes_indiv$Age <- round(stereotypes_indiv$age_at_visit, 1)
stereotypes_indiv <- select(stereotypes_indiv, -age_at_visit)
stereotypes_count <- length(unique(stereotypes_indiv$participant_id))
stereotypes_count
stereotypes_indiv <- head(stereotypes_indiv, n=100)
gt_input <- stereotypes_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
ster_cols = colnames(gt_input[,grepl( "stereotypes" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(ster_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 

113,113,113,126,126,126,154,154,154,196,196,196,277,277,277,287,287,287,298,298,298,326,326,326,355,355,355,419,419,419,532,532,532,546,546,546,608,608,608
stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age,stereotypes,stereotypesclean,Age
2.0,2.0,3.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,4.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,2.0,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,2.0,5.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,6.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,8.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,9.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,4.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,5.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
#head growth should never decrease
corlag <- function(a){   
    ifelse(a >= lag(a, default = 0), a, lag(a, default = 0))
    ifelse(a >= lag(a, n=2, default = 0), a, lag(a, n=2, default = 0))
    ifelse(a >= lag(a, n=3, default = 0), a, lag(a, n=3, default = 0))
    ifelse(a >= lag(a, n=4, default = 0), a, lag(a, n=4, default = 0))
    ifelse(a >= lag(a, n=5, default = 0), a, lag(a, n=5, default = 0))  
}

clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(headgrowthclean = corlag(head_growth))
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(headgrowthclean = corlag(headgrowthclean)) 
   i <- i + 1 
   if(i > 10) {
      break
   }
}

In [50]:
corlag <- function(a, b){   
    ifelse(a == 3 & lag(a, default = 3) == 2, lag(a, default = 2), a)    
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(headgrowthclean = corlag(headgrowthclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [51]:
corlag <- function(a, b){   
    ifelse(a == 4 & lag(a, default = 4) == 2, lag(a, default = 2), a)   
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(headgrowthclean = corlag(headgrowthclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [52]:
corlag <- function(a, b){   
    ifelse(b > 2 & a == 4 & lag(a, default = 4) == 3, lag(a, default = 3), a)    
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(headgrowthclean = corlag(headgrowthclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [53]:
#make chart of individuals showing improper changes in headgrowth
clean$headgrowth <- clean$head_growth
clean$check <- clean$headgrowthclean - clean$headgrowth
headgrowth_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "headgrowth", "headgrowthclean")]
headgrowth_indiv$Age <- round(headgrowth_indiv$age_at_visit, 1)
headgrowth_indiv <- select(headgrowth_indiv, -age_at_visit)
headgrowth_count <- length(unique(headgrowth_indiv$participant_id))
headgrowth_count
headgrowth_indiv <- head(headgrowth_indiv, n=100)
gt_input <- headgrowth_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
head_cols = colnames(gt_input[,grepl( "headgrowth" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(head_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 

65,65,65,113,113,113,154,154,154,287,287,287,355,355,355,465,465,465,538,538,538,689,689,689,707,707,707,813,813,813,894,894,894
headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age,headgrowth,headgrowthclean,Age
2.0,2.0,8.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,9.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,2.0,10.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,1.0,12.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,1.0,14.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,2.0,16.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,3.8,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,4.3,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,0.0,0.0,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [54]:
#independent sitting,
 #once score different than 5, should never be 5 again
#create index column by participant id
#create column summing cumulatice independent sitting score
#if current value is 5, multiply by index number, then compare to actual sum, if values
#differ, replace 5 with NA, then forward replace NAs
####need to add check for up and down, can go 5(never) to 1 to 2 to 3 to 4, but never back down, ie ever increasing
clean <- clean %>% 
    group_by(participant_id) %>%
        mutate(row_id = row_number(participant_id))
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(sitting_sum = cumsum(independent_sitting_at_this_visit_b))

flagcheck <- function(a,b,c){
   ifelse( a != 5, 0,
          ifelse(a == 5 & b*a == c, 0, NA))
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(sitting = flagcheck(independent_sitting_at_this_visit_b, row_id, sitting_sum))

replacena <- function(a,b){
    ifelse(a == 0, b, a)
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(sittingclean = replacena(sitting, independent_sitting_at_this_visit_b))

narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(sittingclean = narep_func(count_id, sittingclean))

clean <- select(clean, -c(sitting_sum, sitting))
clean$sitting <- clean$independent_sitting_at_this_visit_b

In [55]:
#additional sitting filters
corlag <- function(a, b){   
    ifelse( a != 0, a,
         ifelse((a == 0 & lag(a, default = 0) == 0), a, 
         ifelse((a == 0 & lag(a, default = 4) == 4), a,
              ifelse((a == 0 & lag(a, default = 5) == 5), a, lag(a, default = 0)))))   
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(sittingclean = corlag(sittingclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [56]:
corlag <- function(a, b){   
    ifelse( a != 1, a,
         ifelse((a == 1 & lag(a, default = 1) == 1), a, 
         ifelse((a == 1 & lag(a, default = 4) == 4), a,
              ifelse((a == 1 & lag(a, default = 5) == 5), a, lag(a, default = 1)))))   
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(sittingclean = corlag(sittingclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [57]:
corlag <- function(a, b){   
    ifelse( a != 2, a,
         ifelse((a == 2 & lag(a, default = 2) == 2), a, 
         ifelse((a == 2 & lag(a, default = 4) == 4), a,
              ifelse((a == 2 & lag(a, default = 5) == 5), a, lag(a, default = 2)))))   
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(sittingclean = corlag(sittingclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [58]:
corlag <- function(a, b){   
    ifelse( a != 3, a,
         ifelse((a == 3 & lag(a, default = 3) == 3), a, 
         ifelse((a == 3 & lag(a, default = 4) == 4), a,
              ifelse((a == 3 & lag(a, default = 5) == 5), a, lag(a, default = 3)))))   
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(sittingclean = corlag(sittingclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [59]:
#if lost, go to previous val unless 5
#if handuse is recovered(ie from 3 to 0,1,2), these values should match previous scores
#create flex column of handuseclean scores with 3's replaced with NA, forward fill na, conditionally replace
#all values following 3's that are not 3's with lagged flex value
#unless previous is 4

#create flex column where losses (3) are replaced with NA if 3 is not the first value
clean$flex <- clean$sittingclean

na_replace <- function(a,b){
    ifelse(a > 1 & b == 4, NA, b)
    }

clean <- clean %>% group_by(participant_id) %>% mutate(flex = na_replace(row_id, flex))



#replace NA in flex column with previous value
narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(flex = narep_func(row_id, flex))

#if value is first entry, do nothing
#if previous value is 5, do nothing
#if previous value is 4 and current value doesn't match previous replacement in flex column
#replace a with previous value in flex, unless previous flex is 3
corlag <- function(a,b){
    ifelse(lag(a, default = 999) == 999, a,
    ifelse(lag(b, default = 0) == 5, a,
    ifelse(lag(b, default = 0) == 4, a,       
    ifelse( a != 4 & a != lag(b, default = 0), lag(b, default =0), a))))       
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(sittingclean = corlag(sittingclean, flex)) 
   i <- i + 1
   if(i > 20) {
      break
   }
}


In [60]:
#make chart of individuals showing improper changes in sitting
clean$check <- clean$sittingclean - clean$independent_sitting_at_this_visit_b
sitting_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "sitting", "sittingclean")]
sitting_indiv$Age <- round(sitting_indiv$age_at_visit, 1)
sitting_indiv <- select(sitting_indiv, -age_at_visit)
sitting_count <- length(unique(sitting_indiv$participant_id))
sitting_count
sitting_indiv <- head(sitting_indiv, n=100)
gt_input <- sitting_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
sit_cols = colnames(gt_input[,grepl( "sitting" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(sit_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 

65,65,65,113,113,113,143,143,143,159,159,159,160,160,160,220,220,220,259,259,259,288,288,288,333,333,333,355,355,355,364,364,364,419,419,419,471,471,471,476,476,476
sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age,sitting,sittingclean,Age
0.0,0.0,8.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,9.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,10.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,12.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,0.0,14.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,0.0,16.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,3.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,4.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,1.0,1.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,2.0,1.0,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [61]:
#ambulation
#same cleaning approach as for independent_sitting_at_this_visit_b
clean <- clean %>% 
    group_by(participant_id) %>%
        mutate(row_id = row_number(participant_id))

clean <- clean %>%
    group_by(participant_id) %>%
            mutate(ambulation_sum = cumsum(ambulation_at_this_visit_by_exam))

flagcheck <- function(a,b,c){
    ifelse( a != 5, 0,
           ifelse( a == 5 & b*a == c, 0, NA))
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(ambulation = flagcheck(ambulation_at_this_visit_by_exam, row_id, ambulation_sum))

replacena <- function(a,b){
    ifelse(a == 0, b, a)
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(ambulationclean = replacena(ambulation, ambulation_at_this_visit_by_exam))

narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(ambulationclean = narep_func(count_id, ambulationclean))

clean <- select(clean, -c(ambulation_sum, ambulation))
clean$ambulation <- clean$ambulation_at_this_visit_by_exam

In [62]:
#additional ambulation filters
corlag <- function(a, b){   
    ifelse( a != 0, a,
         ifelse((a == 0 & lag(a, default = 0) == 0), a, 
         ifelse((a == 0 & lag(a, default = 4) == 4), a,
              ifelse((a == 0 & lag(a, default = 5) == 5), a, lag(a, default = 0)))))    
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(ambulationclean = corlag(ambulationclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [63]:
corlag <- function(a, b){   
    ifelse( a != 1, a,
         ifelse((a == 1 & lag(a, default = 1) == 1), a, 
         ifelse((a == 1 & lag(a, default = 4) == 4), a,
              ifelse((a == 1 & lag(a, default = 5) == 5), a, lag(a, default = 1)))))    
}

i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(ambulationclean = corlag(ambulationclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [64]:
corlag <- function(a, b){   
    ifelse(a != 2, a,
        ifelse((a == 2 & lag(a, default = 1) == 1), a, 
        ifelse((a == 2 & lag(a, default = 2) == 2), a,
           ifelse((a == 2 & lag(a, default = 4) == 4), a,
              ifelse((a == 2 & lag(a, default = 5) == 5), a, lag(a, default = 2))))))    
}


i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(ambulationclean = corlag(ambulationclean, age_at_visit)) 
   i <- i + 1
   if(i > 10) {
      break
   }
}

In [65]:
#if lost (4), must return to previous occuring value, unless 5****


#create flex column where losses (3) are replaced with NA if 3 is not the first value
clean$flex <- clean$ambulationclean

na_replace <- function(a,b){
    ifelse(a > 1 & b == 4, NA, b)
    }

clean <- clean %>% group_by(participant_id) %>% mutate(flex = na_replace(row_id, flex))



#replace NA in flex column with previous value
narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(flex = narep_func(row_id, flex))

#if value is first entry, do nothing
#if previous value is 4, do nothing
#if previous value is 3 and current value doesn't match previous replacement in flex column
#replace a with previous value in flex, unless previous flex is 3
corlag <- function(a,b){
    ifelse(lag(a, default = 999) == 999, a,
    ifelse(lag(b, default = 0) == 5, a,
    ifelse(lag(b, default = 0) == 4, a,       
    ifelse( a != 4 & a != lag(b, default = 0), lag(b, default =0), a))))       
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(ambulationclean = corlag(ambulationclean, flex)) 
   i <- i + 1
   if(i > 20) {
      break
   }
}


In [66]:
#make chart of individuals showing improper changes in ambulation
clean$check <- clean$ambulationclean - clean$ambulation_at_this_visit_by_exam
ambulation_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "ambulation", "ambulationclean")]
ambulation_indiv$Age <- round(ambulation_indiv$age_at_visit, 1)
ambulation_indiv <- select(ambulation_indiv, -age_at_visit)
ambulation_count <- length(unique(ambulation_indiv$participant_id))
ambulation_count
ambulation_indiv <- head(ambulation_indiv, n=100)
gt_input <- ambulation_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
amb_cols = colnames(gt_input[,grepl( "ambulation" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(amb_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 

9,9,9,11,11,11,65,65,65,113,113,113,152,152,152,159,159,159,160,160,160,165,165,165,196,196,196,201,201,201,259,259,259,264,264,264
ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age,ambulation,ambulationclean,Age
0.0,0.0,28.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,29.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,30.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,31.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,32.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3.0,0.0,33.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4.0,4.0,34.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4.0,4.0,35.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,5.0,5.0,3.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,3.0,3.0,3.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [67]:
#hand use
#if any score other than 4 has been previously assigned, should never return to 4
#hand use
#if any score other than 4 has been previously assigned, should never return to 4
clean <- clean %>% 
    group_by(participant_id) %>%
        mutate(row_id = row_number(participant_id))

clean <- clean %>%
    group_by(participant_id) %>%
            mutate(handuse_sum = cumsum(hand_use))

flagcheck <- function(a,b,c){
    ifelse( a != 4, 0,
           ifelse(a == 4 & b*a == c, 0, NA))
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(handuse = flagcheck(hand_use, row_id, handuse_sum))

replacena <- function(a,b){
    ifelse(a == 0, b, a)
}
clean <- clean %>%
    group_by(participant_id) %>%
            mutate(handuseclean = replacena(handuse, hand_use))

narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(handuseclean = narep_func(count_id, handuseclean))

clean <- select(clean, -c(handuse_sum, handuse))
clean$handuse <- clean$hand_use
                             


In [68]:
#if handuse is recovered(ie from 3 to 0,1,2), these values should match previous scores
#create flex column of handuseclean scores with 3's replaced with NA, forward fill na, conditionally replace
#all values following 3's that are not 3's with lagged flex value
#unless previous is 4

#create flex column where losses (3) are replaced with NA if 3 is not the first value
clean$flex <- clean$handuseclean

na_replace <- function(a,b){
    ifelse(a > 1 & b == 3, NA, b)
    }

clean <- clean %>% group_by(participant_id) %>% mutate(flex = na_replace(row_id, flex))



#replace NA in flex column with previous value
narep_func <- function (a,b){
    ifelse(a > 1, na.locf(b), b)
}
    
clean <- clean %>% 
             group_by(participant_id) %>% mutate(flex = narep_func(row_id, flex))

#if value is first entry, do nothing
#if previous value is 4, do nothing
#if previous value is 3 and current value doesn't match previous replacement in flex column
#replace a with previous value in flex, unless previous flex is 3
corlag <- function(a,b){
    ifelse(lag(a, default = 999) == 999, a,
    ifelse(lag(b, default = 0) == 4, a,
    ifelse(lag(b, default = 0) == 3, a,       
    ifelse( a != 3 & a != lag(b, default = 0), lag(b, default =0), a))))       
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(handuseclean = corlag(handuseclean, flex)) 
   i <- i + 1
   if(i > 20) {
      break
   }
}


In [69]:
#handuse can never go from 0 to 1, 0 to 2, 1 to 2, or 2 to 1, 2 to 0, or 1 to 0
corlag <- function(a){
    ifelse( lag(a, default = 999) == 999, a,
        ifelse( a == 1 & lag(a) == 0, 0,
          ifelse( a == 1 & lag(a) == 2, 2,     
            ifelse( a == 2 & lag(a) == 0, 0,
                ifelse( a == 2 & lag(a) == 1, 1, 
                    ifelse( a == 0 & lag(a) == 1, 1,
                        ifelse( a == 0 & lag(a) == 2, 2, a)))))))
}
i <- 1
repeat {
  clean <- clean %>%
       group_by(participant_id) %>%
                                mutate(handuseclean = corlag(handuseclean)) 
   i <- i + 1
   if(i > 20) {
      break
   }
}



In [70]:
#create chart for individuals showing changes in hand use
clean$handuse <- clean$hand_use
clean$check <- clean$handuseclean - clean$handuse
handuse_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "handuse", "handuseclean")]
handuse_indiv$Age <- round(handuse_indiv$age_at_visit, 1)
handuse_indiv <- select(handuse_indiv, -age_at_visit)
handuse_count <- length(unique(handuse_indiv$participant_id))
handuse_count
handuse_indiv <- head(handuse_indiv, n=100)
gt_input <- handuse_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
hand_cols = colnames(gt_input[,grepl( "handuse" , names(gt_input) )])
gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(hand_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 
 gt:::as.tags.gt_tbl(gt_table) 


53,53,53,65,65,65,104,104,104,126,126,126,152,152,152,159,159,159,160,160,160,170,170,170,174,174,174,213,213,213,220,220,220,243,243,243,264,264,264,277,277,277
handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age,handuse,handuseclean,Age
0.0,0.0,10.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,11.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.0,0.0,11.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,12.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,13.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,14.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,15.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,3.0,3.0,8.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,3.0,3.0,9.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,3.0,3.0,10.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [71]:
#scoliosis, once 5, should stay 5
surgcheck <- function(a){
    ifelse(a == 5, 1, 0)
}
clean <- clean %>%
    group_by(participant_id) %>%
        mutate(surgery = surgcheck(scoliosis))
scolcheck <-function(a,b){
    ifelse( a !=5 & cumsum(b) > 0, 5, a)
}
clean <- clean %>%
    group_by(participant_id) %>%
        mutate(scoliosisclean = scolcheck(scoliosis, surgery))

In [72]:
#make chart of individuals showing changes in scoliosis from origina
clean$check <- clean$scoliosisclean - clean$scoliosis
scoliosis_indiv <- subset(clean, participant_id %in% 
                           as.vector(clean[clean$check != 0,]$participant_id))[,
                                c("participant_id", "age_at_visit", "scoliosis", "scoliosisclean")]
scoliosis_indiv$Age <- round(scoliosis_indiv$age_at_visit, 1)
scoliosis_indiv <- select(scoliosis_indiv, -age_at_visit)
scoliosis_count <- length(unique(scoliosis_indiv$participant_id))
scoliosis_count
gt_input <- scoliosis_indiv %>%
ungroup() %>%
mutate(row = row_number()) %>% 
pivot_longer(-c(participant_id, row)) %>%
mutate(name = gsub("\\.", " ", name)) %>%
pivot_wider(names_from=c(participant_id, name), values_from=value) %>%
select(-row)
scol_cols = colnames(gt_input[,grepl( "scoliosis" , names(gt_input) )])

gt_table <-  gt(gt_input) %>%
    data_color(columns =  all_of(scol_cols),
            colors = scales::col_factor(
            palette = c('blue','purple','green', 'yellow', 'orange', 'red'),    
            domain = c(0:5)
            )
            ) %>% 
  tab_spanner_delim(
      delim="_"
  ) %>% 
  sub_missing(
    columns=everything(),
    missing_text=""
  ) 

 gt:::as.tags.gt_tbl(gt_table) 


868,868,868,1379,1379,1379,1770,1770,1770,3463,3463,3463,6470,6470,6470,6756,6756,6756,7383,7383,7383,7551,7551,7551,7677,7677,7677
scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age,scoliosis,scoliosisclean,Age
3.0,3.0,17.8,,,,,,,,,,,,,,,,,,,,,,,,
3.0,3.0,18.8,,,,,,,,,,,,,,,,,,,,,,,,
5.0,5.0,22.3,,,,,,,,,,,,,,,,,,,,,,,,
4.0,5.0,23.2,,,,,,,,,,,,,,,,,,,,,,,,
,,,5.0,5.0,22.5,,,,,,,,,,,,,,,,,,,,,
,,,4.0,5.0,23.4,,,,,,,,,,,,,,,,,,,,,
,,,5.0,5.0,24.2,,,,,,,,,,,,,,,,,,,,,
,,,,,,5.0,5.0,29.5,,,,,,,,,,,,,,,,,,
,,,,,,2.0,5.0,33.8,,,,,,,,,,,,,,,,,,
,,,,,,,,,5.0,5.0,17.8,,,,,,,,,,,,,,,


In [73]:
clean$total_score_clean = clean$regressionclean +
                          clean$stereotypesclean +
                          clean$headgrowthclean +
                          clean$somatic_growth_at_this_visit +
                          clean$sittingclean +
                          clean$ambulationclean +
                          clean$handuseclean +
                          clean$scoliosisclean +
                          clean$language_at_this_visit_by_exam +
                          clean$nonverbal_communication_at_this_vi +
                          clean$respiratory_dysfunction_at_this_vi +
                          clean$autonomic_symptoms_at_this_visit_by +
                          clean$epilepsy_seizures_at_this_visit 

In [74]:
#export cleaned css
write.csv(clean, file='C:/Users/jkmer/Desktop/2023_css/20230201_CSS_5201_5211_cleaned.csv', row.names=FALSE)