In [1]:
library(doParallel)
library(foreach)
source("/home/hcs2152/github/REDITs/REDIT_regression.R")

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# Locations of the per-timepoint REDIT input tables (tab-separated)
file_2dpf <- '/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/SPRINT/Output/A2I_Editing/2dpf/2dpf_REDIT_input.tsv'
file_5dpf <- '/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/SPRINT/Output/A2I_Editing/5dpf/5dpf_REDIT_input.tsv'

# Both files are parsed the same way: first line is the header, tab-delimited
load_redit_input <- function(path) {
  read.table(path, header = TRUE, sep = "\t")
}

data_2dpf <- load_redit_input(file_2dpf)
data_5dpf <- load_redit_input(file_5dpf)

In [3]:
# Inner join of the two timepoints on the row-label column "X": only labels
# present in both tables are kept (merge()'s default, all = FALSE).
merged_df <- merge(x = data_2dpf, y = data_5dpf, by = "X")

# Display the joined table
merged_df

X,Ctrl.01.coverage,Ctrl.02.coverage,Ctrl.03.coverage,NO.01.coverage,NO.02.coverage,NO.03.coverage,Ctrl.04.coverage,Ctrl.05.coverage,Ctrl.06.coverage,NO.04.coverage,NO.05.coverage,NO.06.coverage
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1_10033434 Edited,12,10,12,8,10,13,9,16,20,23,22,14
1_10033434 Non-Edited,6,7,4,2,3,3,6,12,11,9,9,11
1_10033438 Edited,12,9,12,7,9,12,9,15,18,21,21,14
1_10033438 Non-Edited,12,9,12,7,9,12,9,15,18,21,21,13
1_11784543 Edited,75,115,80,59,66,42,130,130,149,220,163,171
1_11784543 Non-Edited,8,11,10,8,13,10,16,20,9,46,14,17
1_11784563 Edited,77,115,88,60,65,45,128,134,154,218,169,177
1_11784563 Non-Edited,20,22,14,9,4,10,38,26,35,43,28,46
1_11784567 Edited,77,124,94,63,65,48,133,136,161,224,184,185
1_11784567 Non-Edited,36,78,49,36,34,30,91,90,89,153,115,102


In [6]:
# Build the covariate table as described by the REDIT tutorial: one row per
# sample column. Within each timepoint there are three control samples followed
# by three mutant samples; the six 2 dpf samples come before the six 5 dpf ones.
sample_group <- rep(rep(c("control", "mutant"), each = 3), times = 2)
sample_dpf <- rep(c(2, 5), each = 6)

the_covariates <- data.frame(group = sample_group, dpf = sample_dpf)
the_covariates

group,dpf
<chr>,<dbl>
control,2
control,2
control,2
mutant,2
mutant,2
mutant,2
control,5
control,5
control,5
mutant,5


In [7]:
# Pull the row labels (e.g. "1_10033434 Edited" / "1_10033434 Non-Edited") out
# of the merge key column "X" and keep only the per-sample count columns.
# The key is addressed by name rather than by position: re-running this cell
# then stops with an error instead of silently discarding a sample column.
stopifnot("X" %in% names(merged_df))
ids <- merged_df[["X"]]

# Keep every column except the key; drop = FALSE keeps merged_df a data frame
# even if only a single count column remains.
merged_df <- merged_df[, names(merged_df) != "X", drop = FALSE]

In [8]:
# Strip everything from the first space onwards so that only the site name
# remains (the "Edited" / "Non-Edited" suffix is removed).
site_labels <- sub(" .*", "", ids)

# Each site appears once per row type, so collapse repeats to one row per site
new_ids <- unique(data.frame(ID = site_labels))

In [9]:
# Split the count table into one chunk per site for processing.
# Every site is expected to occupy `rows_per_matrix` consecutive rows
# (its "Edited" row followed by its "Non-Edited" row).
rows_per_matrix <- 2

# 0-based chunk index for every row; seq_len() is safe for an empty table
chunk_index <- (seq_len(nrow(merged_df)) - 1) %/% rows_per_matrix

# Guard the positional pairing: if the inner join dropped one row of a site,
# every later chunk would silently mix rows from two different sites.
row_sites <- sub(" .*", "", ids)
stopifnot(
  length(ids) == nrow(merged_df),
  nrow(merged_df) %% rows_per_matrix == 0,
  all(vapply(
    split(row_sites, chunk_index),
    function(sites) length(unique(sites)) == 1L,
    logical(1)
  ))
)

matrix_list <- split(merged_df, chunk_index)

In [10]:
# matrix_list holds one data frame per site at this point; convert each of
# them to a plain matrix before handing it to the regression.
matrix_list <- lapply(matrix_list, function(site_counts) {
  as.matrix(site_counts)
})

In [11]:
# Start a worker cluster and register it as the %dopar% backend.
# One core is left free for the main session. detectCores() may return NA and
# returns 1 on a single-core machine, so clamp the worker count to at least 1;
# otherwise makeCluster() would be asked for 0 (or NA) workers and fail.
noCores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# outfile = "" leaves worker output unredirected (see ?makeCluster)
cl <- makeCluster(noCores, outfile = "")
registerDoParallel(cl, cores = noCores)

In [12]:
# Show how many per-site chunks will be processed by the parallel loop
length(matrix_list)

In [13]:
# Run REDIT_regression on every per-site matrix in parallel and stack the
# resulting p-values into a one-column matrix (column "p_value"), one row per
# site in the order of matrix_list.
# seq_along() avoids the 1:0 trap when matrix_list is empty.
# The cluster is only needed for this loop, so it is shut down afterwards
# (also when the loop fails) to avoid leaking worker processes. Re-run the
# cluster set-up cell before running this cell again.
output_matrix <- tryCatch(
  foreach(i = seq_along(matrix_list), .combine = 'rbind') %dopar% {
    current_matrix <- matrix_list[[i]]
    regression_info <- REDIT_regression(data = current_matrix, covariates = the_covariates)
    # Keep only the `dpf.p.value` element of the result
    # (presumably the p-value of the dpf covariate - confirm in REDIT_regression.R)
    as.matrix(data.frame(p_value = regression_info$dpf.p.value))
  },
  finally = stopCluster(cl)
)

In [14]:
# Display the combined per-site p-values
output_matrix

p_value
0.138631078
1.000000000
0.430952169
0.071173904
0.227277609
1.000000000
0.883242627
0.030138082
0.449845118
0.553671045


In [15]:
# Attach the site IDs to their p-values. The two objects are matched purely by
# row position, so require equal row counts first: cbind() on a data frame can
# otherwise recycle the shorter input without any warning.
stopifnot(nrow(new_ids) == NROW(output_matrix))
p_vals_per_site <- cbind(new_ids, output_matrix)

In [16]:
# Add Benjamini-Hochberg adjusted p-values as a new column
raw_p_values <- p_vals_per_site[["p_value"]]
p_vals_per_site[["p_adj_BH"]] <- p.adjust(raw_p_values, method = "BH")

In [17]:
# Display the per-site table: ID, raw p-value and BH-adjusted p-value
p_vals_per_site

Unnamed: 0_level_0,ID,p_value,p_adj_BH
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,1_10033434,0.138631078,0.8823425
3,1_10033438,1.000000000,1.0000000
5,1_11784543,0.430952169,1.0000000
7,1_11784563,0.071173904,0.7055186
9,1_11784567,0.227277609,0.9969175
11,1_11784568,1.000000000,1.0000000
13,1_11784569,0.883242627,1.0000000
15,1_11784571,0.030138082,0.4873298
17,1_11784572,0.449845118,1.0000000
19,1_11784577,0.553671045,1.0000000


In [18]:
# Destination path for the per-site p-value table (tab-separated)
p_file <- '/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/SPRINT/Output/A2I_Editing/SPRINT_combined_p_values.tsv'

In [19]:
# Save the p-value table as a tab-separated file, without quoting and without
# the row-name column.
write.table(
  x = p_vals_per_site,
  file = p_file,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)