# Create DAT analysis file

In [32]:
library(data.table);library(dplyr)
# Load the PPMI DaTscan tables (the duplicate checks are done further below).
#   dat_res: DATScan_Analysis.csv (joined below on PATNO and EVENT_ID)
#   dat_dx : visual interpretation assessment (VISINTRP is used below)
#   dat_loc: imaging table (SCNLOC is used below)
#   dx     : demographics / diagnosis table (DEMOG_DIAG.csv)
dat_res <- fread("../PDcohorts//PPMI/download181018/DATScan_Analysis.csv")
dat_dx <- fread(
  "../PDcohorts//PPMI/download181018/DaTSCAN_SPECT_Visual_Interpretation_Assessment.csv"
)
dat_loc <- fread("../PDcohorts//PPMI/download181018/DaTscan_Imaging.csv")
dx <- fread("../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv")

# Assign laterality from the SC visit and create mean values.
# `lateral` is the side ("R" or "L") whose caudate + putamen sum is larger at
# the SC visit; ties and left-larger cases are coded "L".
dat_lat <- dat_res %>%
  filter(EVENT_ID == "SC") %>%
  mutate(lateral = ifelse((CAUDATE_R + PUTAMEN_R) > (CAUDATE_L + PUTAMEN_L), "R", "L")) %>%
  select(PATNO, lateral) %>%
  # Keep exactly one laterality row per patient: a duplicated SC record would
  # otherwise multiply every row of that patient in the join below.
  distinct(PATNO, .keep_all = TRUE)

# inner_join: patients without an SC record have no laterality and are dropped.
dat_res <- inner_join(dat_res, dat_lat, by = "PATNO") %>%
  mutate(
    STRIATUM_MEAN = (PUTAMEN_R + PUTAMEN_L + CAUDATE_R + CAUDATE_L) / 4,
    PUTAMEN_MEAN = (PUTAMEN_R + PUTAMEN_L) / 2,
    CAUDATE_MEAN = (CAUDATE_R + CAUDATE_L) / 2
  ) %>%
  # *_IPS takes the value on the `lateral` side, *_CNT the value on the other side.
  mutate(
    CAUDATE_IPS = ifelse(lateral == "R", CAUDATE_R, CAUDATE_L),
    CAUDATE_CNT = ifelse(lateral == "R", CAUDATE_L, CAUDATE_R),
    PUTAMEN_IPS = ifelse(lateral == "R", PUTAMEN_R, PUTAMEN_L),
    PUTAMEN_CNT = ifelse(lateral == "R", PUTAMEN_L, PUTAMEN_R)
  )


# Solve duplication
# Diagnostic queries used to look for duplicated keys (kept for reference):
# dat_res %>% group_by(PATNO, EVENT_ID) %>% filter(n()>1)
# dat_dx %>% group_by(PATNO) %>% filter(n()>1)
# dat_loc %>% group_by(PATNO, EVENT_ID) %>% filter(n()>1)
# dat_res %>% filter(PATNO %in% c(3861, 4120))
# Keep the first row per PATNO so the later join on PATNO cannot duplicate rows.
dat_dx <- dat_dx %>% distinct(PATNO, .keep_all = TRUE)

In [33]:
# Combine data: start from the scan results and attach, in turn,
#   - VISINTRP from dat_dx (keyed by PATNO),
#   - SCNLOC from dat_loc (keyed by PATNO and EVENT_ID),
#   - the selected dx columns (keyed by PATNO).
df <- dat_res %>%
  left_join(dat_dx[, c("PATNO", "VISINTRP")], by = "PATNO") %>%
  left_join(
    dat_loc[, c("PATNO", "EVENT_ID", "SCNLOC")],
    by = c("PATNO", "EVENT_ID")
  ) %>%
  left_join(
    dx[, c("PATNO", "RECRUIT", "FEMALE", "BIRTHDT", "DX_INIT", "DIAG")],
    by = "PATNO"
  )

# Table: number of rows per visit (EVENT_ID) and RECRUIT value
with(df, table(EVENT_ID, RECRUIT))

        RECRUIT
EVENT_ID GENPD  HC  PD REGPD REGUN SWEDD
     SC    188 195 435     2     2    77
     ST      0   0  32     0     0     0
     U01     3   1  11     0     0     5
     U02     0   0   2     0     0     0
     V04     2   1 337     0     0     4
     V06    60   0 322     1     0    46
     V10     0   1 272     0     0     2

In the following analysis, exclude observations from small EVENT_ID × RECRUIT cells (count ≤ 40).    
#### Eliminate unexpected observations from small cohort/visit cells.

In [34]:
# Keep an untouched copy, then retain only the EVENT_ID x RECRUIT cells that
# hold more than 40 rows; the result is converted back to a plain data.frame.
df_original <- df
df <- df_original %>%
  group_by(EVENT_ID, RECRUIT) %>%
  filter(n() > 40) %>%
  ungroup() %>%
  data.frame()

# Table: cell counts after the filter
with(df, table(EVENT_ID, RECRUIT))

        RECRUIT
EVENT_ID GENPD  HC  PD SWEDD
     SC    188 195 435    77
     V04     0   0 337     0
     V06    60   0 322    46
     V10     0   0 272     0

#### Check the distribution of values

In [36]:
# Outcome columns analysed below: the seven derived DaTscan summaries.
# Listed by name (they are the columns at positions 8:14 of df) so that the
# selection does not silently shift if the column layout of df ever changes.
outcomes <- c(
  "STRIATUM_MEAN", "PUTAMEN_MEAN", "CAUDATE_MEAN",
  "CAUDATE_IPS", "CAUDATE_CNT", "PUTAMEN_IPS", "PUTAMEN_CNT"
)
# Flag outliers in one column and optionally draw its density.
#
# Arguments:
#   df    data frame containing the column
#   col   name of the column to inspect (character)
#   thres a value is an outlier when it lies more than `thres` standard
#         deviations from the mean of the non-missing values
#   mcex  cex.main passed to plot() for the panel title
#   plot  if TRUE, draw the kernel density of the retained values together
#         with a normal curve (green) using their mean and sd
#
# Returns the column as a plain vector of the original length, with outliers
# replaced by NA; values that were already NA stay NA.
draw_density <- function(df, col, thres = 4, mcex = 1, plot = TRUE) {
  vals <- as.vector(t(df[, col]))
  miss <- is.na(vals)
  N <- sum(!miss)
  X <- vals[!miss]
  M <- mean(X)
  S <- sd(X)
  is_out <- (X < M - thres * S) | (X > M + thres * S)
  # With fewer than two values sd() is NA and the comparison is NA:
  # treat such values as "not an outlier" instead of propagating NA.
  is_out[is.na(is_out)] <- FALSE
  Noutlr <- sum(is_out)
  if (plot) {
    kept <- X[!is_out]
    M <- mean(kept)
    S <- sd(kept)
    plot(density(kept), main = sprintf("%s, N=%.0f \n (%.0f outlier excluded)", col, N - Noutlr, Noutlr), cex.main = mcex)
    curve(dnorm(x, M, S), min(kept), max(kept), add = TRUE, col = "green")
  }
  # `is_out` is aligned with the non-missing values only, so translate it back
  # to positions in the full vector before blanking. Indexing `vals` directly
  # with `is_out` would hit the wrong elements whenever the column has NAs.
  vals[which(!miss)[is_out]] <- NA
  return(vals)
}
# Render one density panel per outcome (via draw_density) into a single PNG.
#   data          data frame holding the `outcomes` columns
#   file          path of the PNG file to write
#   width, height device size in pixels, passed to png()
save_density_panels <- function(data, file, width, height) {
  png(file, width = width, height = height)
  # Close the device even if a panel fails, so no graphics device is leaked.
  on.exit(dev.off(), add = TRUE)
  par(mfrow = c(2, 4)) # 2 x 4 grid of panels
  invisible(lapply(outcomes, function(x) draw_density(col = x, df = data)))
}

# PD at screening (SC)
df_sc <- df %>% filter(EVENT_ID == "SC")
save_density_panels(subset(df_sc, RECRUIT == "PD"), 'figure/dat_raw_PD_BL.png', 1600, 800)
# log transformation
df_sc_log <- df_sc
df_sc_log[outcomes] <- log(df_sc_log[outcomes])
save_density_panels(subset(df_sc_log, RECRUIT == "PD"), 'figure/dat_log_PD_BL.png', 1600, 800)


# PD V10
df_v10 <- df %>% filter(EVENT_ID == "V10")
save_density_panels(subset(df_v10, RECRUIT == "PD"), 'figure/dat_raw_PD_V10.png', 1200, 600)
# log transformation
df_v10_log <- df_v10
df_v10_log[outcomes] <- log(df_v10_log[outcomes])
save_density_panels(subset(df_v10_log, RECRUIT == "PD"), 'figure/dat_log_PD_V10.png', 1200, 600)

# HC (pooled with SWEDD) at screening, raw and log
save_density_panels(subset(df_sc, RECRUIT %in% c("HC", "SWEDD")), 'figure/dat_raw_HC_BL.png', 1200, 600)
save_density_panels(subset(df_sc_log, RECRUIT %in% c("HC", "SWEDD")), 'figure/dat_log_HC_BL.png', 1200, 600)


1. [raw density plot of value at screening for PD](figure/dat_raw_PD_BL.png)
2. [log density plot of value at screening for PD](figure/dat_log_PD_BL.png)
3. [raw density plot of value at visit 10 for PD](figure/dat_raw_PD_V10.png)
4. [log density plot of value at visit 10 for PD](figure/dat_log_PD_V10.png)
5. [raw density plot of value at screening for HC/SWEDD](figure/dat_raw_HC_BL.png)
6. [log density plot of value at screening for HC/SWEDD](figure/dat_log_HC_BL.png)

For PD, log transformation is better at baseline but shows relative deviation at V10    
For HC, the log-transformations are better.     
**For consistency, use the log-transformed values**

In [23]:
# Replace every outcome column by its natural log.
df[outcomes] <- log(df[outcomes])
# Tag the transformed columns with a "_log" suffix. The columns are located by
# name rather than by position, so the rename cannot hit the wrong columns if
# the layout of df changes.
names(df)[match(outcomes, names(df))] <- paste0(outcomes, "_log")
outcomes <- paste0(outcomes, "_log")
# exclude outliers: draw_density() returns each column with the values it
# flags set to NA (plot = FALSE, so nothing is drawn)
df[outcomes] <- lapply(outcomes, function(x) draw_density(df, col = x, plot = FALSE))

#### Check whether the scanning location affects the results at SC -> OK.

In [24]:
# For each outcome, regress the SC value on SCNLOC and keep the second
# coefficient row (estimate, std. error, t value, p value); the rows are
# stacked into one matrix, one row per outcome in the order of `outcomes`.
df_sc <- filter(df, EVENT_ID == "SC")
do.call(
  rbind,
  lapply(outcomes, function(outcome) {
    fit <- lm(df_sc[, outcome] ~ df_sc$SCNLOC)
    coef(summary(fit))[2, ]
  })
)

Estimate,Std. Error,t value,Pr(>|t|)
-0.004761792,0.07809518,-0.06097421,0.9513947
-0.066151249,0.10654847,-0.6208559,0.5348678
0.020006095,0.06745803,0.29657099,0.7668695
0.025402491,0.06461469,0.39313803,0.6943202
0.013879041,0.07381734,0.18801871,0.8509087
-0.074860345,0.10345239,-0.72362121,0.4695057
-0.055589046,0.11847114,-0.46922013,0.6390376


Scanning location (1,2) doesn't seem to affect the results

In [27]:
# See the chronological change: one boxplot panel per outcome, with
# EVENT_ID on the x axis and boxes filled by RECRUIT.
library(ggplot2)
# png("figure/chronological.png", width = 1800, height = 600)
# par(mfcol = c(2,6))
p <- lapply(outcomes, function(outcome) {
  # Work on a local copy so the shared df is never modified, and so each plot
  # carries its own Y column instead of depending on a loop variable.
  plot_df <- df
  plot_df$Y <- plot_df[[outcome]]
  ggplot(data = plot_df, aes(x = EVENT_ID, y = Y)) +
    geom_boxplot(aes(fill = RECRUIT)) +
    ylab(outcome)
})
names(p) <- outcomes
library(gridExtra)
# Arrange all panels in two rows and write them to a single image.
ggsave("figure/dat.png", arrangeGrob(grobs = p, nrow = 2), width = 12, height = 6, dpi = 100)

"Removed 2 rows containing non-finite values (stat_boxplot)."

[The chronological change](figure/dat.png)     
The signal decreases as time goes by.    
The distribution is right skewed. Try log-transformation

# Create laterality and mean outcome

In [29]:
# Print per-column summaries of df as a final check of the analysis table.
summary(df)

     PATNO         EVENT_ID           CAUDATE_R       CAUDATE_L    
 Min.   : 3000   Length:1932        Min.   :0.060   Min.   :0.200  
 1st Qu.: 3352   Class :character   1st Qu.:1.400   1st Qu.:1.420  
 Median : 3612   Mode  :character   Median :1.860   Median :1.870  
 Mean   : 9072                      Mean   :1.941   Mean   :1.946  
 3rd Qu.: 4023                      3rd Qu.:2.413   3rd Qu.:2.370  
 Max.   :72784                      Max.   :4.960   Max.   :4.750  
                                                                   
   PUTAMEN_R        PUTAMEN_L        lateral          STRIATUM_MEAN_log  
 Min.   :0.0300   Min.   :0.0100   Length:1932        Min.   :-1.386294  
 1st Qu.:0.5200   1st Qu.:0.5175   Class :character   1st Qu.: 0.008092  
 Median :0.7100   Median :0.7000   Mode  :character   Median : 0.258511  
 Mean   :0.9429   Mean   :0.9214                      Mean   : 0.274818  
 3rd Qu.:1.1200   3rd Qu.:1.0500                      3rd Qu.: 0.539048  
 Max.   :3.7

In [30]:
# Write the final analysis table to disk as CSV.
fwrite(x = df, file = "output/DAT.csv")