In [1]:
library(doParallel)
library(foreach)
library(stringr)
source("/home/hcs2152/github/REDITs/REDIT_LLR.R")

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# Location of the tab-separated input table
file_path <- '/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/5dpf/Merged/JACUSA2_5dpf_REDITs_input.tsv'

# Load the table into a data frame; the file's first line supplies the
# column names
data <- read.table(
  file = file_path,
  sep = "\t",
  header = TRUE
)

In [3]:
# Start a local cluster for the foreach/%dopar% loop, leaving one core free.
# detectCores() may return NA (count unknown) or 1, which would make
# `detectCores() - 1` NA or 0 and cause makeCluster() to fail, so always fall
# back to at least one worker.
noCores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# outfile = "" leaves the workers' stdout/stderr un-redirected
cl <- makeCluster(noCores, outfile = "")

# The worker count comes from the cluster object itself, so no separate
# `cores` argument is needed when `cl` is supplied.
registerDoParallel(cl)

# NOTE(review): nothing visible in this notebook stops the cluster; call
# stopCluster(cl) once all parallel work has finished so the worker
# processes are released.

In [4]:
# Display the input table as read from file_path (first column holds the
# per-row ID / count-type label, the remaining columns hold the counts)
data

ID___Count_Type,Ctrl.04Edited_And_NonEdited__Counts_Counts,Ctrl.05Edited_And_NonEdited__Counts_Counts,Ctrl.06Edited_And_NonEdited__Counts_Counts,NO.04Edited_And_NonEdited__Counts_Counts,NO.05Edited_And_NonEdited__Counts_Counts,NO.06Edited_And_NonEdited__Counts_Counts
<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1_10023876___Edited_AG,14,6,1,18,11,32
1_10023876___Non_Edited_AG,68,39,97,63,87,92
1_10032409___Edited_AG,13,15,25,5,33,24
1_10032409___Non_Edited_AG,8,20,15,13,10,13
1_10034006___Edited_AG,5,10,3,4,2,6
1_10034006___Non_Edited_AG,17,16,22,34,26,29
1_10034007___Edited_AG,16,13,8,16,12,5
1_10034007___Non_Edited_AG,5,13,17,22,16,28
1_10034010___Edited_AG,10,20,17,30,22,31
1_10034010___Non_Edited_AG,11,4,2,4,3,6


In [5]:
# Keep the row labels (first column) aside; the site IDs are derived from
# them later and re-attached to the results.
ids <- data[, 1]

# Drop the label column so only the count columns remain. drop = FALSE keeps
# the result a data frame even when a single count column is left; without it
# the table would collapse to a plain vector.
data <- data[, -1, drop = FALSE]

In [6]:
# Display the table again: only the count columns remain after the label
# column was removed
data

Ctrl.04Edited_And_NonEdited__Counts_Counts,Ctrl.05Edited_And_NonEdited__Counts_Counts,Ctrl.06Edited_And_NonEdited__Counts_Counts,NO.04Edited_And_NonEdited__Counts_Counts,NO.05Edited_And_NonEdited__Counts_Counts,NO.06Edited_And_NonEdited__Counts_Counts
<int>,<int>,<int>,<int>,<int>,<int>
14,6,1,18,11,32
68,39,97,63,87,92
13,15,25,5,33,24
8,20,15,13,10,13
5,10,3,4,2,6
17,16,22,34,26,29
16,13,8,16,12,5
5,13,17,22,16,28
10,20,17,30,22,31
11,4,2,4,3,6


In [7]:
# Split the table into one chunk per editing site. Each site occupies
# `rows_per_matrix` consecutive rows (in the input shown above: the edited
# counts followed by the non-edited counts).
rows_per_matrix <- 2

# A row count that is not a multiple of rows_per_matrix would leave the last
# site with missing rows, so fail early instead of testing a partial chunk.
stopifnot(nrow(data) %% rows_per_matrix == 0)

# Rows 1..k map to chunk 0, rows k+1..2k to chunk 1, and so on. seq_len()
# yields an empty index for an empty table, whereas seq(0) would give c(1, 0).
matrix_list <- split(data, (seq_len(nrow(data)) - 1) %/% rows_per_matrix)

In [8]:
# Build one ID per editing site, to be attached to the REDITs results.
# Step 1: pull the "<digits>_<digits>___<type>_<AG|TC>" part out of each label.
# Step 2: strip the "Edited" / "Non_Edited" piece so both rows of a site
#         share the same ID.
new_ids <- data.frame(
  ID = gsub(
    "_(Edited|Non_Edited)_",
    "_",
    str_extract(ids, "(\\d+_\\d+)___\\w+_(AG|TC)")
  ),
  stringsAsFactors = FALSE
)

# Keep only the first occurrence of each ID (original row names are retained)
new_ids <- new_ids[!duplicated(new_ids$ID), , drop = FALSE]

In [9]:
# Display the de-duplicated site IDs
new_ids

Unnamed: 0_level_0,ID
Unnamed: 0_level_1,<chr>
1,1_10023876___AG
3,1_10032409___AG
5,1_10034006___AG
7,1_10034007___AG
9,1_10034010___AG
11,1_10034017___AG
13,1_10034018___AG
15,1_10034020___AG
17,1_10034028___AG
19,1_10054490___AG


In [10]:
# Convert every per-site chunk (a data frame produced by split()) into a
# matrix; list names are carried over unchanged
matrix_list <- lapply(
  matrix_list,
  function(site_counts) as.matrix(site_counts)
)

In [11]:
# Number of per-site matrices in matrix_list, i.e. how many editing sites
# will be tested; also used as the upper bound of the parallel loop
num_editing_sites <- length(matrix_list)

# Print the count as a sanity check
print(num_editing_sites)

[1] 43337


In [12]:
# Group label for each count column, in column order: the first three
# columns are labelled 'ctrl', the last three 'mutant'
groups <- rep(c("ctrl", "mutant"), each = 3)

In [13]:
# Run REDIT_LLR on every per-site matrix in parallel and stack the resulting
# p-values with rbind into a one-column matrix named `p_value`.
# seq_len() is used instead of 1:num_editing_sites: with zero sites the latter
# iterates over c(1, 0) and indexes matrix_list out of range.
output_matrix <- foreach(
  i = seq_len(num_editing_sites),
  .combine = "rbind"
) %dopar% {
  current_matrix <- matrix_list[[i]]
  regression_info <- REDIT_LLR(data = current_matrix, groups = groups)
  # Value of the last expression is this iteration's result
  as.matrix(data.frame(p_value = regression_info$p.value))
}

In [14]:
# Put the site IDs and the p-value column side by side in one data frame
# (column names are taken as-is, without name checking)
p_vals_per_site <- data.frame(new_ids, output_matrix, check.names = FALSE)

In [15]:
# Check: display the combined table of site IDs and p-values
p_vals_per_site

Unnamed: 0_level_0,ID,p_value
Unnamed: 0_level_1,<chr>,<dbl>
1,1_10023876___AG,0.175430013
3,1_10032409___AG,0.449639922
5,1_10034006___AG,0.159227748
7,1_10034007___AG,0.362987219
9,1_10034010___AG,0.105605119
11,1_10034017___AG,0.999725359
13,1_10034018___AG,0.391949692
15,1_10034020___AG,0.873991312
17,1_10034028___AG,0.996342025
19,1_10054490___AG,0.336390509


In [16]:
# Add a column of Benjamini-Hochberg ("BH") adjusted p-values computed over
# all sites in the table
p_vals_per_site[["p_adj_BH"]] <- p.adjust(
  p_vals_per_site[["p_value"]],
  method = "BH"
)

In [17]:
# Display the table with the BH-adjusted p-value column added
p_vals_per_site

Unnamed: 0_level_0,ID,p_value,p_adj_BH
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,1_10023876___AG,0.175430013,0.5782644
3,1_10032409___AG,0.449639922,0.7666742
5,1_10034006___AG,0.159227748,0.5620158
7,1_10034007___AG,0.362987219,0.7152876
9,1_10034010___AG,0.105605119,0.5177689
11,1_10034017___AG,0.999725359,1.0000000
13,1_10034018___AG,0.391949692,0.7336434
15,1_10034020___AG,0.873991312,0.9702227
17,1_10034028___AG,0.996342025,1.0000000
19,1_10054490___AG,0.336390509,0.6984132


In [65]:
# Destination of the per-site results table
p_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/5dpf/JACUSA2_5dpf_p_values.tsv"

# Write tab-separated, without quoting and without the row-name column
write.table(
  x = p_vals_per_site,
  file = p_file,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)