# Create DAT analysis file

In [65]:
library(data.table);library(dplyr)
# Load the three DaTscan exports and the derived demographics/diagnosis table.
dat_res <- fread('../PDcohorts//PPMI/download181018/DATScan_Analysis.csv')
dat_dx <- fread('../PDcohorts//PPMI/download181018/DaTSCAN_SPECT_Visual_Interpretation_Assessment.csv')
dat_loc <- fread('../PDcohorts//PPMI/download181018/DaTscan_Imaging.csv')
dx <- fread('../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv')

# Duplicate checks, kept for interactive use:
# dat_res %>% group_by(PATNO, EVENT_ID) %>% filter(n()>1)
# dat_dx %>% group_by(PATNO) %>% filter(n()>1)
# dat_loc %>% group_by(PATNO, EVENT_ID) %>% filter(n()>1)
# dat_res %>% filter(PATNO %in% c(3861, 4120))

# Resolve the duplication in dat_dx: keep a single row (the first one
# encountered) for each PATNO, retaining all of its columns.
dat_dx <- distinct(dat_dx, PATNO, .keep_all = TRUE)

In [66]:
# Combine data: start from the quantitative results and attach, in order,
# the visual interpretation (matched on PATNO), the scan location (matched on
# PATNO and EVENT_ID) and the demographic/diagnosis columns (matched on PATNO).
visual_read <- dat_dx[, c("PATNO", "VISINTRP")]
scan_site <- dat_loc[, c("PATNO", "EVENT_ID", "SCNLOC")]
demog <- dx[, c("PATNO", "RECRUIT", "FEMALE", "BIRTHDT", "DX_INIT", "DIAG")]

df <- left_join(dat_res, visual_read, by = "PATNO")
df <- left_join(df, scan_site, by = c("PATNO", "EVENT_ID"))
df <- left_join(df, demog, by = "PATNO")

# Table: number of rows per visit and recruitment group
with(df, table(EVENT_ID, RECRUIT))

        RECRUIT
EVENT_ID GENPD  HC  PD REGPD REGUN SWEDD
     SC    188 195 435     2     2    77
     ST      0   0  32     0     0     0
     U01     3  17  25     1     0     7
     U02     0   0   7     0     0     1
     V04     2   1 350     0     0     4
     V06    61   0 335     2     0    47
     V10     0   1 283     0     0     2

In the following analysis, exclude the observations from small cells (count ≤ 40).    
#### Eliminate unexpected observations from small cohorts.

In [67]:
# Keep the unfiltered table around, then retain only the rows whose
# EVENT_ID x RECRUIT cell holds more than 40 observations.
df_original <- df
large_cells <- filter(group_by(df_original, EVENT_ID, RECRUIT), n() > 40)
df <- data.frame(ungroup(large_cells))

# Table: cell counts after the filter
with(df, table(EVENT_ID, RECRUIT))

        RECRUIT
EVENT_ID GENPD  HC  PD SWEDD
     SC    188 195 435    77
     V04     0   0 350     0
     V06    61   0 335    47
     V10     0   0 283     0

#### Check the distribution of values

In [68]:
# Mask outliers in one column and, optionally, plot the density of what is left.
#
# Arguments:
#   df    - data frame (data.frame, tibble or data.table) holding the column.
#   col   - name of the column to examine (character scalar).
#   thres - a value is an outlier when it lies more than `thres` standard
#           deviations away from the mean of the non-missing values.
#   mcex  - `cex.main` used for the plot title.
#   plot  - if TRUE, draw the kernel density of the retained values and overlay
#           (in green) a normal curve with their mean and standard deviation.
#
# Returns:
#   The column as a plain vector of the same length as the input, with the
#   outliers replaced by NA. Values that were already NA stay NA.
draw_density <- function(df, col, thres = 4, mcex = 1, plot = TRUE) {
  values <- as.vector(df[[col]])
  observed <- !is.na(values)
  n_obs <- sum(observed)

  center <- mean(values[observed])
  spread <- sd(values[observed])

  # Build the outlier mask on the FULL-length vector. Computing it on the
  # NA-stripped values and then indexing the original vector with it would
  # misalign (and recycle) the mask whenever the column contains NAs.
  # `%in% TRUE` turns comparisons that evaluate to NA into FALSE.
  beyond <- (values < center - thres * spread) | (values > center + thres * spread)
  is_outlier <- beyond %in% TRUE
  n_outlier <- sum(is_outlier)

  if (plot) {
    kept <- values[observed & !is_outlier]
    kept_mean <- mean(kept)
    kept_sd <- sd(kept)
    plot(
      density(kept),
      main = sprintf("%s, N=%.0f \n (%.0f outlier excluded)", col, n_obs - n_outlier, n_outlier),
      cex.main = mcex
    )
    curve(dnorm(x, kept_mean, kept_sd), min(kept), max(kept), add = TRUE, col = "green")
  }

  values[is_outlier] <- NA
  values
}
# ---- Density plots of the raw and the log-transformed values ----
# Columns 3:6 of `df` are the four measurement columns handed to draw_density().
measure_cols <- names(df)[3:6]

# Write a 2x2 panel of density plots (one per measurement column) to a PNG.
#   file - path of the PNG to create
#   data - rows to plot
#   cols - names of the columns to plot
# Returns (invisibly) the list of vectors produced by draw_density().
save_density_panel <- function(file, data, cols = measure_cols) {
  png(file, width = 800, height = 800)
  # Close the device even if plotting fails, so no graphics device is leaked.
  on.exit(dev.off(), add = TRUE)
  par(mfrow = c(2, 2))
  invisible(lapply(cols, function(x) draw_density(col = x, df = data)))
}

# Return a copy of `data` whose measurement columns are log-transformed.
log_measures <- function(data, cols = measure_cols) {
  data[, cols] <- log(data[, cols])
  data
}

# PD at screening: raw, then log transformation
df_sc <- df %>% filter(EVENT_ID == "SC")
df_sc_log <- log_measures(df_sc)
save_density_panel('figure/dat_raw.png', subset(df_sc, RECRUIT == "PD"))
save_density_panel('figure/dat_log.png', subset(df_sc_log, RECRUIT == "PD"))

# PD V10: raw, then log transformation
df_v10 <- df %>% filter(EVENT_ID == "V10")
df_v10_log <- log_measures(df_v10)
save_density_panel('figure/dat_raw_V10.png', subset(df_v10, RECRUIT == "PD"))
save_density_panel('figure/dat_log_V10.png', subset(df_v10_log, RECRUIT == "PD"))

# HC (pooled with SWEDD) at screening: raw, then log transformation
save_density_panel('figure/dat_raw_HC.png', subset(df_sc, RECRUIT %in% c("HC", "SWEDD")))
save_density_panel('figure/dat_log_HC.png', subset(df_sc_log, RECRUIT %in% c("HC", "SWEDD")))


1. [raw density plot of value at screening for PD](figure/dat_raw.png)
2. [log density plot of value at screening for PD](figure/dat_log.png)
3. [raw density plot of value at visit 10 for PD](figure/dat_raw_V10.png)
4. [log density plot of value at visit 10 for PD](figure/dat_log_V10.png)
5. [raw density plot of value at screening for HC/SWEDD](figure/dat_raw_HC.png)
6. [log density plot of value at screening for HC/SWEDD](figure/dat_log_HC.png)

For PD, the raw values are normally distributed for the caudate, but not for the putamen.    
For HC, the log-transformations are better.     
**For consistency, use the log-transformed values**

In [69]:
# Log-transform the four measurement columns and mark the change in their names.
df[, 3:6] <- log(df[, 3:6])
log_cols <- paste0(names(df)[3:6], "_log")
names(df)[3:6] <- log_cols
# exclude outliers: draw_density() returns each column with its outliers set
# to NA; plotting is switched off here.
df[, 3:6] <- lapply(log_cols, function(x) draw_density(df, col = x, plot = FALSE))

#### Check if different scanning locations affect the results on SC-> OK.

In [70]:
# Screening rows only.
df_sc <- filter(df, EVENT_ID == "SC")

# Regress one measurement column on SCNLOC and return the second row of the
# coefficient table (estimate, standard error, t value, p-value).
scnloc_effect <- function(x) {
  fit <- lm(df_sc[, x] ~ df_sc$SCNLOC)
  coef(summary(fit))[2, ]
}

# One row per measurement column (columns 3:6 of df_sc).
do.call(rbind, lapply(names(df_sc)[3:6], scnloc_effect))

Estimate,Std. Error,t value,Pr(>|t|)
0.02647787,0.07119552,0.3719037,0.7100611
0.01280366,0.07024657,0.1822674,0.8554182
-0.0895334,0.1135306,-0.788628,0.4305585
-0.04091599,0.11359375,-0.3601958,0.7187939


Scanning location (1,2) doesn't affect the results

In [71]:
# see the chronological change
# library() stops with an error if a package is missing, whereas require()
# only warns and lets the cell fail later with a less helpful message.
library(ggplot2)
library(gridExtra)
# png("figure/chronological.png", width = 1800, height = 600)
# par(mfcol = c(2,6))

# One boxplot panel per measurement column (columns 3:6 of `df`): the value by
# visit (EVENT_ID), with boxes filled by recruitment group (RECRUIT).
measure_cols <- names(df)[3:6]
p <- lapply(measure_cols, function(C) {
  # Work on a local copy so the scratch column Y never touches the global `df`.
  panel_data <- df
  panel_data$Y <- panel_data[[C]]
  ggplot(data = panel_data, aes(x = EVENT_ID, y = Y)) +
    geom_boxplot(aes(fill = RECRUIT)) +
    ylab(C)
})
names(p) <- measure_cols

# Arrange the four panels in two rows and save them as one PNG.
ggsave("figure/dat.png", arrangeGrob(grobs = p, nrow = 2), width = 8, height = 8, dpi = 100)

"Removed 5 rows containing non-finite values (stat_boxplot)."

[The chronological change](figure/dat.png)     
The signal decreases as time goes by.    
The distribution is right skewed. Try log-transformation

In [72]:
# Print per-column summaries of the final data frame (ranges and NA counts)
# as a last check before it is written out.
summary(df)

     PATNO         EVENT_ID         CAUDATE_R_log     CAUDATE_L_log    
 Min.   : 3000   Length:1971        Min.   :-1.1087   Min.   :-1.0498  
 1st Qu.: 3326   Class :character   1st Qu.: 0.3436   1st Qu.: 0.3507  
 Median : 3605   Mode  :character   Median : 0.6206   Median : 0.6259  
 Mean   : 8979                      Mean   : 0.5867   Mean   : 0.5940  
 3rd Qu.: 4020                      3rd Qu.: 0.8796   3rd Qu.: 0.8660  
 Max.   :72784                      Max.   : 1.6014   Max.   : 1.5581  
                                    NA's   :6         NA's   :9        
 PUTAMEN_R_log      PUTAMEN_L_log        VISINTRP             SCNLOC     
 Min.   :-2.65926   Min.   :-2.65926   Length:1971        Min.   :1.000  
 1st Qu.:-0.65393   1st Qu.:-0.67334   Class :character   1st Qu.:1.000  
 Median :-0.34249   Median :-0.35668   Mode  :character   Median :1.000  
 Mean   :-0.25836   Mean   :-0.27556                      Mean   :1.026  
 3rd Qu.: 0.09531   3rd Qu.: 0.04879                  

In [73]:
# Write the processed data frame (log-transformed, outliers set to NA) to CSV.
fwrite(df, 'output/DAT.csv')