Script for outlier detection from LearningBrain data
=======================================

This is a sctipt for creating tidy dataframe containinig DVARS and FramewiseDisplacement for participants extracted from confound files generated by fmriprep pipeline. 


In [1]:
# Loading packages

library(tidyverse)

“package ‘tidyverse’ was built under R version 3.4.4”── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
“package ‘forcats’ was built under R version 3.4.4”── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [5]:
# Selecting subjects who finished the study

groups = read.csv('/home/finc/Dropbox/Projects/LearningBrain/github/LearningBrain_networks/data/behavioral/group_assignment.csv')
trained_subs <- groups %>% filter(group == 'Experimental' | group == 'Control') %>% select (sub)


In [6]:
subs <- as.vector(trained_subs$sub)
length(subs)

In [9]:
# Setting files
top_dir <- '/home/finc/Dropbox/Projects/LearningBrain/data/confounds/'
#subs <- list.files(top_dir)
sess <- c('ses-1', 'ses-2', 'ses-3', 'ses-4')
tasks <- c('dualnback','rest')


In [74]:
# Tidying dataframes and merging together by rows

confounds_all <- data.frame()
missing_data <- c()

for (sub in subs){
    for(ses in sess){
        for (task in tasks){
                        
            #sub_dir <- paste0(top_dir, sub, '/', ses, '/func/')           
            confounds_path <- paste0(top_dir, sub, '_', ses, '_task-', task, '_bold_confounds.csv')
            
            if(!file.exists(confounds_path)){
                missing_data <- rbind(missing_data, sub)
                }
            
            else {
                confounds <- read.csv(confounds_path)
                confounds <- confounds %>% select(stdDVARS:FramewiseDisplacement)
                cols <- c(1:4);    
                confounds[,cols] <- apply(confounds[,cols], 2, function(x) as.numeric(as.character(x)));
                confounds <- confounds %>% mutate(sub = sub) %>% mutate(task = task) %>% mutate(ses = ses) %>% select(sub, ses, task, everything())

                confounds_all <- rbind(confounds_all, confounds)
            }
        }
    }                               
}

“NAs introduced by coercion”

In [75]:
str(confounds_all)

'data.frame':	118070 obs. of  7 variables:
 $ sub                  : chr  "sub-01" "sub-01" "sub-01" "sub-01" ...
 $ ses                  : chr  "ses-1" "ses-1" "ses-1" "ses-1" ...
 $ task                 : chr  "dualnback" "dualnback" "dualnback" "dualnback" ...
 $ stdDVARS             : num  NA 1.19 1.21 1.25 1.09 ...
 $ non.stdDVARS         : num  NA 19.2 19.6 20.2 17.5 ...
 $ vx.wisestdDVARS      : num  NA 0.996 1.019 0.965 0.911 ...
 $ FramewiseDisplacement: num  NA 0.0879 0.0442 0.0713 0.0715 ...


In [82]:
missing_data

0,1
sub,sub-20
sub,sub-44


In [83]:
scrubbing <- confounds_all %>% mutate(scrubbing = as.numeric(FramewiseDisplacement > 0.5)) %>% 
    group_by(sub, ses, task) %>% 
    summarize(sum_scr = sum(scrubbing, na.rm = TRUE)*2, mean_FD = mean(FramewiseDisplacement, na.rm = TRUE))

head(scrubbing)

sub,ses,task,sum_scr,mean_FD
sub-01,ses-1,dualnback,14,0.10310018
sub-01,ses-1,rest,10,0.09710079
sub-01,ses-2,dualnback,10,0.11660174
sub-01,ses-2,rest,2,0.10325274
sub-01,ses-3,dualnback,36,0.15987912
sub-01,ses-3,rest,32,0.14850088


In [84]:
# subjects to exclude for dualnback (15% scrubbing frames)

scrubbing %>%  
    filter(task == 'dualnback') %>% 
    mutate(scr_perc = (sum_scr / 340) * 100) %>% 
    filter(scr_perc > 15 | mean_FD > 0.2)

sub,ses,task,sum_scr,mean_FD,scr_perc
sub-13,ses-2,dualnback,102,0.2937446,30.0
sub-13,ses-3,dualnback,74,0.2154746,21.76471
sub-13,ses-4,dualnback,96,0.254949,28.23529
sub-21,ses-1,dualnback,82,0.2666401,24.11765
sub-23,ses-3,dualnback,104,0.2968723,30.58824
sub-50,ses-2,dualnback,78,0.1688164,22.94118


In [49]:
# subjects to exclude for dualnback (10% scrubbing frames)

scrubbing %>%  
    filter(task == 'dualnback') %>% 
    mutate(scr_perc = (sum_scr / 340) * 100) %>% 
    filter(scr_perc > 10 | mean_FD > 0.2)

sub,ses,task,sum_scr,mean_FD,scr_perc
sub-01,ses-3,dualnback,36,0.1598791,10.58824
sub-13,ses-1,dualnback,38,0.1775225,11.17647
sub-13,ses-2,dualnback,102,0.2937446,30.0
sub-13,ses-3,dualnback,74,0.2154746,21.76471
sub-13,ses-4,dualnback,96,0.254949,28.23529
sub-15,ses-1,dualnback,36,0.1674185,10.58824
sub-21,ses-1,dualnback,82,0.2666401,24.11765
sub-23,ses-3,dualnback,104,0.2968723,30.58824
sub-46,ses-2,dualnback,38,0.1791175,11.17647
sub-50,ses-2,dualnback,78,0.1688164,22.94118


In [43]:
# subjects to exclude for rest (15% scrubbing frames)

scrubbing %>%  
    filter(task == 'rest') %>% 
    mutate(scr_perc = (sum_scr / 305) * 100) %>% 
    filter(scr_perc > 15 | mean_FD > 0.2)

sub,ses,task,sum_scr,mean_FD,scr_perc
sub-21,ses-1,rest,46,0.1756784,15.08197
sub-46,ses-2,rest,54,0.2540169,17.70492
sub-47,ses-3,rest,66,0.2986916,21.63934


In [50]:
# subjects to exclude for rest (15% scrubbing frames)

scrubbing %>%  
    filter(task == 'rest') %>% 
    mutate(scr_perc = (sum_scr / 305) * 100) %>% 
    filter(scr_perc > 10 | mean_FD > 0.2)

sub,ses,task,sum_scr,mean_FD,scr_perc
sub-01,ses-3,rest,32,0.1485009,10.4918
sub-21,ses-1,rest,46,0.1756784,15.08197
sub-25,ses-1,rest,38,0.1500476,12.45902
sub-30,ses-2,rest,36,0.165682,11.80328
sub-46,ses-2,rest,54,0.2540169,17.70492
sub-47,ses-3,rest,66,0.2986916,21.63934
