In [22]:
library(doParallel)
library(foreach)
source("/home/hcs2152/github/REDITs/REDIT_LLR.R")

In [23]:
# Location of the tab-separated REDIT input table.
file_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/SPRINT/Output/A2I_Editing/2dpf/2dpf_REDIT_input.tsv"

# Load the table into a data frame; the first line supplies the column names.
data <- read.table(file = file_path, sep = "\t", header = TRUE)

In [24]:
# Display the freshly loaded table for a visual sanity check.
data

X,Ctrl.01.coverage,Ctrl.02.coverage,Ctrl.03.coverage,NO.01.coverage,NO.02.coverage,NO.03.coverage
<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1_209063 Edited,69,75,80,79,67,95
1_209063 Non-Edited,8,8,15,17,13,14
1_209113 Edited,35,36,44,36,27,40
1_209113 Non-Edited,16,12,16,11,8,18
1_209121 Edited,35,36,43,36,27,39
1_209121 Non-Edited,35,35,42,36,27,39
1_209134 Edited,33,34,40,37,26,32
1_209134 Non-Edited,9,11,8,5,10,8
1_665434 Edited,13,6,8,9,14,12
1_665434 Non-Edited,7,4,3,3,7,4


In [25]:
# Start a worker cluster for the parallel per-site loop, leaving one core free.
# detectCores() may return NA when the core count cannot be determined and
# returns 1 on a single-core host; in both cases fall back to a single worker
# instead of asking makeCluster() for NA or 0 nodes.
available_cores <- detectCores()
noCores <- if (is.na(available_cores)) 1L else max(1L, available_cores - 1L)

# outfile = "" leaves the workers' stdout/stderr unredirected.
cl <- makeCluster(noCores, outfile = "")
registerDoParallel(cl, cores = noCores)

# NOTE: call stopCluster(cl) once all parallel work has finished so the worker
# processes are released.

In [26]:
# The first column holds the row labels ("<site> Edited" / "<site> Non-Edited").
# Coerce to character so the later string splitting also works when the column
# was read in as a factor (the read.table default before R 4.0).
ids <- as.character(data[[1]])

# Keep only the per-sample count columns. drop = FALSE guarantees the result
# stays a data frame even if just one sample column remains; without it R
# would collapse the result to a bare vector and the later split() would
# no longer yield per-site tables.
data <- data[, -1, drop = FALSE]


In [27]:
# Each editing site occupies two consecutive rows (Edited, then Non-Edited).
rows_per_matrix <- 2

# Refuse to continue on a table that cannot be cut into whole sites: a stray
# row would silently shift every following site by one.
stopifnot(nrow(data) %% rows_per_matrix == 0)

# Chunk the table into one data frame per site. seq_len() is used rather than
# seq(nrow(data)) because the latter yields c(1, 0) for an empty table.
matrix_list <- split(data, (seq_len(nrow(data)) - 1) %/% rows_per_matrix)

In [28]:
# Build a one-column data frame of site IDs: the part of each row label that
# precedes the first space (e.g. "1_209063 Edited" -> "1_209063").
# vapply() with an explicit character(1) template guarantees a character
# vector regardless of input, and the small extractor function replaces the
# `[` operator reference, which was not valid R as written in this cell.
site_of_row <- vapply(
  strsplit(as.character(ids), " ", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)
new_ids <- data.frame(ID = site_of_row)

# Each site appears twice (Edited / Non-Edited rows); keep one row per site.
new_ids <- unique(new_ids)

In [29]:
# Check: display the de-duplicated site IDs (one row per editing site).
new_ids

Unnamed: 0_level_0,ID
Unnamed: 0_level_1,<chr>
1,1_209063
3,1_209113
5,1_209121
7,1_209134
9,1_665434
11,1_959013
13,1_2000352
15,1_2001689
17,1_2001697
19,1_2001698


In [30]:
# matrix_list is expected to hold one data frame per site at this point;
# coerce every element to a plain matrix.
matrix_list <- lapply(X = matrix_list, FUN = as.matrix)

In [31]:
# One matrix per editing site, so the list length is the number of sites.
num_editing_sites <- length(x = matrix_list)

# Echo the count as a sanity check.
print(x = num_editing_sites)

[1] 11146


In [32]:
# Group label for each sample column, in column order:
# three control samples followed by three mutant samples.
groups <- rep(c("ctrl", "mutant"), each = 3L)

In [33]:
# Run REDIT_LLR on every site in parallel and stack the per-site p-values into
# a one-column matrix ("p_value"), one row per site in list order.
# seq_len() is used instead of 1:num_editing_sites so that zero sites means
# zero iterations rather than iterating over c(1, 0).
output_matrix <- foreach(
  i = seq_len(num_editing_sites),
  .combine = "rbind"
) %dopar% {
  site_counts <- matrix_list[[i]]
  regression_info <- REDIT_LLR(data = site_counts, groups = groups)
  as.matrix(data.frame(p_value = regression_info$p.value))
}

In [34]:
# Site IDs and p-values are matched purely by position, so both inputs must
# have exactly one row per site. Check this explicitly: cbind() on a data frame
# can recycle the shorter input when the row counts divide evenly, which would
# attach p-values to the wrong sites without any error.
stopifnot(nrow(new_ids) == NROW(output_matrix))
p_vals_per_site <- cbind(new_ids, output_matrix)

In [35]:
# Display the per-site table of IDs and raw p-values.
p_vals_per_site

Unnamed: 0_level_0,ID,p_value
Unnamed: 0_level_1,<chr>,<dbl>
1,1_209063,0.5436740
3,1_209113,0.9712274
5,1_209121,0.9958361
7,1_209134,0.9691585
9,1_665434,0.8508294
11,1_959013,0.9994736
13,1_2000352,0.7410046
15,1_2001689,0.6346250
17,1_2001697,0.8642627
19,1_2001698,0.8990815


In [36]:
# Add a Benjamini-Hochberg adjusted p-value column computed across all sites.
raw_p_values <- p_vals_per_site[["p_value"]]
p_vals_per_site[["p_adj_BH"]] <- p.adjust(p = raw_p_values, method = "BH")

In [37]:
# Display the table again, now including the p_adj_BH column.
p_vals_per_site

Unnamed: 0_level_0,ID,p_value,p_adj_BH
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,1_209063,0.5436740,1
3,1_209113,0.9712274,1
5,1_209121,0.9958361,1
7,1_209134,0.9691585,1
9,1_665434,0.8508294,1
11,1_959013,0.9994736,1
13,1_2000352,0.7410046,1
15,1_2001689,0.6346250,1
17,1_2001697,0.8642627,1
19,1_2001698,0.8990815,1


In [38]:
# List the distinct adjusted p-values to see whether any site differs from the rest.
unique(p_vals_per_site$p_adj_BH)

In [39]:
# Write the results next to the input they were computed from.
# The previous hard-coded path pointed at the 5dpf directory and file name
# although this notebook reads its input from the 2dpf directory, so 2dpf
# results would have been filed under the 5dpf label. Deriving the output
# location from file_path keeps the two in sync for any timepoint.
# NOTE(review): if the 5dpf data was actually intended, change file_path
# instead; the output will follow.
timepoint <- basename(dirname(file_path))
p_file <- file.path(
  dirname(file_path),
  paste0("SPRINT_p_values_", timepoint, ".tsv")
)

In [40]:
# Save the per-site p-values as a tab-separated file with a header row,
# without quoting and without a row-name column.
write.table(
  x = p_vals_per_site,
  file = p_file,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)