In [1]:
library(doParallel)
library(foreach)
library(stringr)
source("/home/hcs2152/github/REDITs/REDIT_regression.R")

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# Location of the merged REDITs input table (tab-separated)
file_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/Merged/JACUSA2_all_dpf_REDITs_input.tsv"

# Load the table; the first line of the file supplies the column names
data <- read.table(
  file = file_path,
  header = TRUE,
  sep = "\t"
)

In [3]:
# Initiate a cluster, leaving one core free for the main session.
# detectCores() may return NA, and on a single-core machine
# detectCores() - 1 is 0; either value makes makeCluster() fail, so fall
# back to a single worker in those cases.
detected_cores <- detectCores()
noCores <- if (is.na(detected_cores)) 1L else max(1L, detected_cores - 1L)

# outfile = "" leaves worker output un-redirected
cl <- makeCluster(noCores, outfile = "")
registerDoParallel(cl, cores = noCores)

# NOTE: call stopCluster(cl) once all parallel work is finished to release
# the worker processes.

In [4]:
# Show the loaded table; at this point it still includes the ID column
data

ID___Count_Type,Ctrl.01Edited_And_NonEdited__Counts_Counts,Ctrl.02Edited_And_NonEdited__Counts_Counts,Ctrl.03Edited_And_NonEdited__Counts_Counts,Ctrl.04Edited_And_NonEdited__Counts_Counts,Ctrl.05Edited_And_NonEdited__Counts_Counts,Ctrl.06Edited_And_NonEdited__Counts_Counts,NO.01Edited_And_NonEdited__Counts_Counts,NO.02Edited_And_NonEdited__Counts_Counts,NO.03Edited_And_NonEdited__Counts_Counts,NO.04Edited_And_NonEdited__Counts_Counts,NO.05Edited_And_NonEdited__Counts_Counts,NO.06Edited_And_NonEdited__Counts_Counts
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1_10023876___Edited_AG,9,9,8,14,6,1,17,6,6,18,11,32
1_10023876___Non_Edited_AG,40,54,46,68,39,97,18,21,41,63,87,92
1_10054490___Edited_AG,102,19,218,232,154,60,172,180,161,355,212,271
1_10054490___Non_Edited_AG,471,384,263,790,726,1035,243,363,440,780,1268,707
1_10057975___Edited_AG,42,19,46,96,85,36,46,69,32,228,142,178
1_10057975___Non_Edited_AG,155,115,84,540,494,594,92,147,129,461,535,337
1_10313231___Edited_AG,12,36,20,33,21,17,10,6,15,18,24,25
1_10313231___Non_Edited_AG,1,7,8,4,8,2,16,8,8,10,9,15
1_10314652___Edited_AG,29,25,21,21,20,32,10,11,16,30,67,34
1_10314652___Non_Edited_AG,52,48,35,30,13,28,20,34,27,63,13,28


In [5]:
# Set the first column (the row identifiers) aside
ids <- data[[1]]

# Keep only the remaining columns for further processing.
# drop = FALSE keeps `data` a data frame even when a single column is left;
# without it the result would collapse to a bare vector.
data <- data[, -1, drop = FALSE]

In [6]:
# Show the table again now that the ID column has been removed
data

Ctrl.01Edited_And_NonEdited__Counts_Counts,Ctrl.02Edited_And_NonEdited__Counts_Counts,Ctrl.03Edited_And_NonEdited__Counts_Counts,Ctrl.04Edited_And_NonEdited__Counts_Counts,Ctrl.05Edited_And_NonEdited__Counts_Counts,Ctrl.06Edited_And_NonEdited__Counts_Counts,NO.01Edited_And_NonEdited__Counts_Counts,NO.02Edited_And_NonEdited__Counts_Counts,NO.03Edited_And_NonEdited__Counts_Counts,NO.04Edited_And_NonEdited__Counts_Counts,NO.05Edited_And_NonEdited__Counts_Counts,NO.06Edited_And_NonEdited__Counts_Counts
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
9,9,8,14,6,1,17,6,6,18,11,32
40,54,46,68,39,97,18,21,41,63,87,92
102,19,218,232,154,60,172,180,161,355,212,271
471,384,263,790,726,1035,243,363,440,780,1268,707
42,19,46,96,85,36,46,69,32,228,142,178
155,115,84,540,494,594,92,147,129,461,535,337
12,36,20,33,21,17,10,6,15,18,24,25
1,7,8,4,8,2,16,8,8,10,9,15
29,25,21,21,20,32,10,11,16,30,67,34
52,48,35,30,13,28,20,34,27,63,13,28


In [7]:
# Split the table into separate chunks for processing: every
# `rows_per_matrix` consecutive rows form one chunk.
rows_per_matrix <- 2

# A trailing partial chunk would mean some site is missing one of its rows,
# so stop early instead of handing an incomplete chunk to the regression.
stopifnot(nrow(data) %% rows_per_matrix == 0)

# seq_len() yields an empty index for a zero-row table, whereas
# seq(nrow(data)) would yield c(1, 0).
matrix_list <- split(data, (seq_len(nrow(data)) - 1) %/% rows_per_matrix)

In [8]:
# Build the site IDs that are appended after REDITs has run on the data.
# Keep the "<digits>_<digits>___<label>_(AG|TC)" part of each raw ID.
new_ids <- data.frame(
  ID = str_extract(ids, "(\\d+_\\d+)___\\w+_(AG|TC)"),
  stringsAsFactors = FALSE
)

# str_extract() gives NA for any ID that does not fit the pattern. unique()
# below would collapse those into a single row, leaving fewer IDs than
# sites, so report them instead of staying silent.
if (anyNA(new_ids$ID)) {
  warning(
    sum(is.na(new_ids$ID)),
    " ID(s) did not match the expected pattern and were set to NA",
    call. = FALSE
  )
}

# Remove the Edited/Non_Edited label so both rows of a site share one ID
new_ids$ID <- gsub("_(Edited|Non_Edited)_", "_", new_ids$ID)

# Remove duplicates based on the modified 'ID' column
new_ids <- unique(new_ids)

In [9]:
# Show the de-duplicated site IDs
new_ids

Unnamed: 0_level_0,ID
Unnamed: 0_level_1,<chr>
1,1_10023876___AG
3,1_10054490___AG
5,1_10057975___AG
7,1_10313231___AG
9,1_10314652___AG
11,1_10325782___AG
13,1_10325785___AG
15,1_10379135___AG
17,1_10432785___AG
19,1_10684036___AG


In [10]:
# Convert every chunk to a matrix (expects each element of matrix_list to be
# a data frame)
matrix_list <- lapply(matrix_list, function(site_counts) as.matrix(site_counts))

In [11]:
# Record how many matrices matrix_list holds and print that count
print(num_editing_sites <- length(matrix_list))

[1] 23983


In [57]:
# Construct the covariate matrix as described by the REDIT tutorial:
# six "control" rows followed by six "mutant" rows, and within each group
# three rows at dpf 2 followed by three rows at dpf 5.
# (presumably one row per sample, in count-column order - confirm)
the_covariates <- data.frame(
  group = rep(c("control", "mutant"), each = 6),
  dpf = rep(rep(c(2, 5), each = 3), times = 2)
)
the_covariates
     

group,dpf
<chr>,<dbl>
control,2
control,2
control,2
control,5
control,5
control,5
mutant,2
mutant,2
mutant,2
mutant,5


In [58]:
# Run REDIT_regression on every element of matrix_list in parallel and
# stack the `group.mutant.p.value` of each fit into a one-column matrix
# named `p_value` (combined row-wise with rbind).
# seq_along() is used so an empty matrix_list yields no iterations, whereas
# 1:length(matrix_list) would iterate over c(1, 0).
output_matrix <- foreach(i = seq_along(matrix_list), .combine = "rbind") %dopar% {
  current_matrix <- matrix_list[[i]]
  regression_info <- REDIT_regression(
    data = current_matrix,
    covariates = the_covariates
  )
  # The value of the last expression is this iteration's result
  as.matrix(data.frame(p_value = regression_info$group.mutant.p.value))
}

In [59]:
# Show the p-values collected by the foreach loop
output_matrix

p_value
0.0442094628
0.0662903807
0.0092256133
0.0005234153
0.8984568719
0.3259710215
0.2834954047
0.6257129526
0.5186447508
0.0849740930


In [60]:
# Combine IDs with p-values.
# The two inputs are matched purely by row position, so require exactly one
# ID per p-value row; a mismatch could otherwise be recycled or fail with a
# less helpful error. identical() also catches a NULL output_matrix.
stopifnot(identical(nrow(new_ids), nrow(output_matrix)))
p_vals_per_site <- cbind(new_ids, output_matrix)

In [61]:
# Show the IDs alongside their p-values
p_vals_per_site

Unnamed: 0_level_0,ID,p_value
Unnamed: 0_level_1,<chr>,<dbl>
1,1_10023876___AG,0.0442094628
3,1_10054490___AG,0.0662903807
5,1_10057975___AG,0.0092256133
7,1_10313231___AG,0.0005234153
9,1_10314652___AG,0.8984568719
11,1_10325782___AG,0.3259710215
13,1_10325785___AG,0.2834954047
15,1_10379135___AG,0.6257129526
17,1_10432785___AG,0.5186447508
19,1_10684036___AG,0.0849740930


In [54]:
# Add a column of p-values adjusted with the "BH" method of p.adjust()
p_vals_per_site[["p_adj_BH"]] <- p.adjust(
  p_vals_per_site[["p_value"]],
  method = "BH"
)

In [62]:
# Show the table after the p-value adjustment step
p_vals_per_site

Unnamed: 0_level_0,ID,p_value
Unnamed: 0_level_1,<chr>,<dbl>
1,1_10023876___AG,0.0442094628
3,1_10054490___AG,0.0662903807
5,1_10057975___AG,0.0092256133
7,1_10313231___AG,0.0005234153
9,1_10314652___AG,0.8984568719
11,1_10325782___AG,0.3259710215
13,1_10325785___AG,0.2834954047
15,1_10379135___AG,0.6257129526
17,1_10432785___AG,0.5186447508
19,1_10684036___AG,0.0849740930


In [56]:
# Destination for the per-site p-value table
p_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/JACUSA2_all_dpf_p_values.tsv"

# Save as tab-separated text, without quoting and without row names
write.table(
  x = p_vals_per_site,
  file = p_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)