This notebook identifies and extracts A to G editing sites meaning sites edited from A to G for the positive strand and T to C for the negative strand. 

In [1]:
library(tidyr)
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load in filtered data
filtered_data <- read.table("/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_merged_filtered_data.tsv", sep = "\t", stringsAsFactors = FALSE)

In [3]:
# Read in individual samples with associated meta data
directory_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles"
single_files <- list.files(path = directory_path, pattern = "\\.tsv$", full.names = TRUE)

# Use lapply to read each tsv file into a dataframe
dataframes_list <- lapply(single_files, function(file) {
    read.table(file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
})

names(dataframes_list) <- gsub("\\.tsv$", "", tools::file_path_sans_ext(basename(single_files)))

In [4]:
# Assign dataframes to a variable 
df1 <- dataframes_list[["Ctrl-01_bases"]]
df2 <- dataframes_list[["Ctrl-02_bases"]]
df3 <- dataframes_list[["Ctrl-03_bases"]]
df4 <- dataframes_list[["Ctrl-04_bases"]]
df5 <- dataframes_list[["Ctrl-05_bases"]]
df6 <- dataframes_list[["Ctrl-06_bases"]]
df7 <- dataframes_list[["NO-01_bases"]]
df8 <- dataframes_list[["NO-02_bases"]]
df9 <- dataframes_list[["NO-03_bases"]]
df10 <- dataframes_list[["NO-04_bases"]]
df11 <- dataframes_list[["NO-05_bases"]]
df12 <- dataframes_list[["NO-06_bases"]]

In [5]:
# Fix vector column names for filtered_data
# Read the first row as column names
col_names <- filtered_data[1, ]

# Remove the first row from the DataFrame
filtered_data <- filtered_data[-1, ]

# Set the column names
colnames(filtered_data) <- col_names

In [6]:
# First, filter out any sites that have already been filtered out by Filter 1
filtered_df1 <- df1[df1$ID %in% filtered_data$ID, ]
filtered_df2 <- df2[df2$ID %in% filtered_data$ID, ]
filtered_df3 <- df3[df3$ID %in% filtered_data$ID, ]
filtered_df4 <- df4[df4$ID %in% filtered_data$ID, ]
filtered_df5 <- df5[df5$ID %in% filtered_data$ID, ]
filtered_df6 <- df6[df6$ID %in% filtered_data$ID, ]
filtered_df7 <- df7[df7$ID %in% filtered_data$ID, ]
filtered_df8 <- df8[df8$ID %in% filtered_data$ID, ]
filtered_df9 <- df9[df9$ID %in% filtered_data$ID, ]
filtered_df10 <- df10[df10$ID %in% filtered_data$ID, ]
filtered_df11 <- df11[df11$ID %in% filtered_data$ID, ]
filtered_df12 <- df12[df12$ID %in% filtered_data$ID, ]

In [7]:
# Split Base_counts into separate columns
split_base_counts <- function(df) {
  base_counts <- strsplit(df$Base_counts, ",")
  base_counts <- matrix(unlist(base_counts), ncol = 4, byrow = TRUE)
  colnames(base_counts) <- c("A_count", "C_count", "G_count", "T_count")
  df <- cbind(df, base_counts)
  return(df)
}

In [8]:
filtered_df1 <- split_base_counts(filtered_df1)
filtered_df2 <- split_base_counts(filtered_df2)
filtered_df3 <- split_base_counts(filtered_df3)
filtered_df4 <- split_base_counts(filtered_df4)
filtered_df5 <- split_base_counts(filtered_df5)
filtered_df6 <- split_base_counts(filtered_df6)
filtered_df7 <- split_base_counts(filtered_df7)
filtered_df8 <- split_base_counts(filtered_df8)
filtered_df9 <- split_base_counts(filtered_df9)
filtered_df10 <- split_base_counts(filtered_df10)
filtered_df11 <- split_base_counts(filtered_df11)
filtered_df12 <- split_base_counts(filtered_df12)

df_list = list(filtered_df1, filtered_df2, filtered_df3, filtered_df4, filtered_df5, filtered_df6, filtered_df7, filtered_df8, filtered_df9, filtered_df10, filtered_df11, filtered_df12)

In [9]:
# Grab A to G edited sites (positive strand)
filtered_df1_A_G <- subset(filtered_df1, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df2_A_G <- subset(filtered_df2, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df3_A_G <- subset(filtered_df3, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df4_A_G <- subset(filtered_df4, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df5_A_G <- subset(filtered_df5, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df6_A_G <- subset(filtered_df6, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df7_A_G <- subset(filtered_df7, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df8_A_G <- subset(filtered_df8, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df9_A_G <- subset(filtered_df9, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df10_A_G <- subset(filtered_df10, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df11_A_G <- subset(filtered_df11, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)
filtered_df12_A_G <- subset(filtered_df12, ref == 'A' & strand == '+' & A_count >= 0 & G_count > 0 & C_count == 0 & T_count == 0)



In [10]:
filtered_df1_A_G

Unnamed: 0_level_0,ID,strand,ref,Base_counts,A_count,C_count,G_count,T_count
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
815,1_14972,+,A,60110,6,0,11,0
931,1_19878,+,A,50150,5,0,15,0
1039,1_22057,+,A,11010,11,0,1,0
1044,1_22087,+,A,10100,1,0,10,0
1124,1_29949,+,A,190780,19,0,78,0
3955,1_141499,+,A,38050,38,0,5,0
3956,1_141501,+,A,39010,39,0,1,0
3957,1_141508,+,A,160160,16,0,16,0
3958,1_141509,+,A,10280,1,0,28,0
3959,1_141510,+,A,10260,1,0,26,0


In [11]:
merge_data_frames <- function(df1, df2) {
  merge(df1, df2, by = c("ID", "strand", "ref"), all = FALSE)
}
df_list = list(filtered_df1_A_G, filtered_df2_A_G, filtered_df3_A_G, filtered_df4_A_G, filtered_df5_A_G, filtered_df6_A_G, filtered_df7_A_G, filtered_df8_A_G, filtered_df9_A_G, filtered_df10_A_G, filtered_df11_A_G, filtered_df12_A_G)
merged_AG_df <- Reduce(merge_data_frames, df_list)

“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’, ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’, ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”


In [12]:
# Add a tag for editing type
merged_AG_df$tag <- "AG"

In [13]:
merged_AG_df

ID,strand,ref,Base_counts.x,A_count.x,C_count.x,G_count.x,T_count.x,Base_counts.y,A_count.y,⋯,A_count.x,C_count.x,G_count.x,T_count.x,Base_counts.y,A_count.y,C_count.y,G_count.y,T_count.y,tag
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>,<chr>,<chr>,<chr>
1_10023876,+,A,40090,40,0,9,0,54090,54,⋯,87,0,11,0,920320,92,0,32,0,AG
1_10054490,+,A,47101020,471,0,102,0,3840190,384,⋯,1268,0,212,0,70702710,707,0,271,0,AG
1_10057975,+,A,1550420,155,0,42,0,1150190,115,⋯,535,0,142,0,33701780,337,0,178,0,AG
1_10313231,+,A,10120,1,0,12,0,70360,7,⋯,9,0,24,0,150250,15,0,25,0,AG
1_10314652,+,A,520290,52,0,29,0,480250,48,⋯,13,0,67,0,280340,28,0,34,0,AG
1_10325782,+,A,130190,13,0,19,0,5060,5,⋯,4,0,37,0,100390,10,0,39,0,AG
1_10325785,+,A,150250,15,0,25,0,5060,5,⋯,3,0,51,0,120400,12,0,40,0,AG
1_10379135,+,A,50160,5,0,16,0,90280,9,⋯,16,0,78,0,140240,14,0,24,0,AG
1_10432785,+,A,170160,17,0,16,0,20060,20,⋯,16,0,40,0,220130,22,0,13,0,AG
1_10684036,+,A,170790,17,0,79,0,260570,26,⋯,1,0,133,0,601160,6,0,116,0,AG


In [14]:
# Grab T to C editing sites (negative strand)
filtered_df1_T_C <- subset(filtered_df1, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df2_T_C <- subset(filtered_df2, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df3_T_C <- subset(filtered_df3, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df4_T_C <- subset(filtered_df4, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df5_T_C <- subset(filtered_df5, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df6_T_C <- subset(filtered_df6, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df7_T_C <- subset(filtered_df7, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df8_T_C <- subset(filtered_df8, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df9_T_C <- subset(filtered_df9, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df10_T_C <- subset(filtered_df10, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df11_T_C <- subset(filtered_df11, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)
filtered_df12_T_C <- subset(filtered_df12, ref == 'T' & strand == '-' & T_count >= 0 & C_count > 0 & A_count == 0 & G_count == 0)

In [15]:
merge_data_frames <- function(df1, df2) {
  merge(df1, df2, by = c("ID", "strand", "ref"), all = FALSE)
}
df_list = list(filtered_df1_T_C, filtered_df2_T_C, filtered_df3_T_C, filtered_df4_T_C, filtered_df5_T_C, filtered_df6_T_C, filtered_df7_T_C, filtered_df8_T_C, filtered_df9_T_C, filtered_df10_T_C, filtered_df11_T_C, filtered_df12_T_C)
merged_TC_df <- Reduce(merge_data_frames, df_list)

“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’, ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”
“column names ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’, ‘Base_counts.x’, ‘A_count.x’, ‘C_count.x’, ‘G_count.x’, ‘T_count.x’, ‘Base_counts.y’, ‘A_count.y’, ‘C_count.y’, ‘G_count.y’, ‘T_count.y’ are duplicated in the result”


In [16]:
# Add a tag for editing type
merged_TC_df$tag <- "TC"

In [17]:
merged_TC_df

ID,strand,ref,Base_counts.x,A_count.x,C_count.x,G_count.x,T_count.x,Base_counts.y,A_count.y,⋯,A_count.x,C_count.x,G_count.x,T_count.x,Base_counts.y,A_count.y,C_count.y,G_count.y,T_count.y,tag
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>.1,<chr>,<chr>,<chr>,<chr>
1_10067239,-,T,0530151,0,53,0,151,0260107,0,⋯,0,115,0,241,0810161,0,81,0,161,TC
1_1006825,-,T,021015,0,21,0,15,011042,0,⋯,0,25,0,29,02038,0,2,0,38,TC
1_10068596,-,T,0159095,0,159,0,95,093095,0,⋯,0,583,0,0,0407034,0,407,0,34,TC
1_1009780,-,T,015023,0,15,0,23,010022,0,⋯,0,11,0,44,015018,0,15,0,18,TC
1_10822937,-,T,01002,0,10,0,2,016011,0,⋯,0,18,0,8,01507,0,15,0,7,TC
1_10822972,-,T,01003,0,10,0,3,014016,0,⋯,0,16,0,8,012013,0,12,0,13,TC
1_10823089,-,T,0908,0,9,0,8,014022,0,⋯,0,17,0,8,08027,0,8,0,27,TC
1_10838290,-,T,034024,0,34,0,24,043045,0,⋯,0,30,0,44,024063,0,24,0,63,TC
1_10897372,-,T,064019,0,64,0,19,062028,0,⋯,0,126,0,44,063076,0,63,0,76,TC
1_10901303,-,T,085025,0,85,0,25,098047,0,⋯,0,137,0,45,01050111,0,105,0,111,TC


In [18]:
write.table(merged_AG_df, "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_AG_sites.tsv", sep = "\t", row.names = FALSE, quote = FALSE)
write.table(merged_TC_df, "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_TC_sites.tsv", sep = "\t", row.names = FALSE, quote = FALSE)