This notebook removes any sites that are not located on chromosomes 1-25. Additionally, sites where all samples have less than 10 percent editing, or where all samples have more than 90 percent editing (to account for overamplification), are removed.

In [1]:
library(dplyr)
library(tidyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load the merged editing table from a tab-separated file.
# Point `file` at your own merged data before running this cell.
data <- read.table(
  file = "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_merged_data.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)

In [3]:
# The first row of `data` holds the column labels: keep a copy of that row,
# then drop it from the table and apply it as the column names in one step.
col_names <- data[1, ]
data <- setNames(data[-1, ], col_names)

In [4]:
# Display the table for inspection
data

Unnamed: 0_level_0,ID,Ctrl.01_bases_Non_Edited_Count,Ctrl.01_bases_Edited_Count,Ctrl.01_bases_Edited_Count_Proportion,Ctrl.02_bases_Non_Edited_Count,Ctrl.02_bases_Edited_Count,Ctrl.02_bases_Edited_Count_Proportion,Ctrl.03_bases_Non_Edited_Count,Ctrl.03_bases_Edited_Count,Ctrl.03_bases_Edited_Count_Proportion,⋯,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion,col.names
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
2,1_5476,42,0,0,22,0,0,30,0,0,⋯,37,0,0,41,0,0,21,0,0,TRUE
3,1_5479,11,31,0.738095238095238,2,20,0.909090909090909,3,26,0.896551724137931,⋯,9,28,0.756756756756757,2,39,0.951219512195122,4,18,0.818181818181818,TRUE
4,1_5504,47,0,0,28,0,0,32,1,0.0303030303030303,⋯,50,0,0,43,0,0,29,0,0,TRUE
5,1_5505,48,0,0,28,0,0,32,0,0,⋯,49,0,0,43,0,0,29,0,0,TRUE
6,1_5510,45,0,0,27,0,0,32,0,0,⋯,48,0,0,33,1,0.0294117647058824,25,0,0,TRUE
7,1_5523,48,0,0,28,0,0,26,2,0.0714285714285714,⋯,52,0,0,34,0,0,28,0,0,TRUE
8,1_5536,6,27,0.818181818181818,5,22,0.814814814814815,4,20,0.833333333333333,⋯,12,36,0.75,1,31,0.96875,4,21,0.84,TRUE
9,1_5539,33,0,0,29,0,0,20,4,0.166666666666667,⋯,47,0,0,32,0,0,27,0,0,TRUE
10,1_5979,39,0,0,43,0,0,24,0,0,⋯,21,0,0,29,1,0.0333333333333333,24,0,0,TRUE
11,1_5998,46,0,0,47,0,0,25,0,0,⋯,35,1,0.0277777777777778,30,0,0,33,0,0,TRUE


In [5]:
# Split each site ID at the underscore into `chromosome` and `position`
# columns so that rows can be filtered by chromosome.
data <- data %>%
  separate(col = ID, into = c("chromosome", "position"), sep = "_")

In [6]:
# Keep only sites located on chromosomes 1-25.
#
# `chromosome` is a character column (it was split out of the ID string), so
# `chromosome >= 1 & chromosome <= 25` would be evaluated as a text
# comparison: "3" through "9" sort after "25" and would be dropped, while a
# label such as "100" would pass. Matching against the exact set of allowed
# labels avoids that, and rows with any other label (or a missing one) are
# removed because %in% returns FALSE for them.
valid_chromosomes <- as.character(1:25)
data_filtered <- data[data$chromosome %in% valid_chromosomes, ]


In [7]:
# Columns to carry over unchanged: everything except the two helper columns
# (and any existing ID column, which is rebuilt below).
other_cols <- setdiff(names(data_filtered), c("chromosome", "position", "ID"))

# Rebuild the site ID as "<chromosome>_<position>"
data_filtered$ID <- with(data_filtered, paste(chromosome, position, sep = "_"))

# Drop the helper columns and put 'ID' first, followed by the remaining
# columns in their existing order
data_filtered <- data_filtered[, c("ID", other_cols)]


In [8]:
# Display the chromosome-filtered table for inspection
data_filtered

Unnamed: 0_level_0,ID,Ctrl.01_bases_Non_Edited_Count,Ctrl.01_bases_Edited_Count,Ctrl.01_bases_Edited_Count_Proportion,Ctrl.02_bases_Non_Edited_Count,Ctrl.02_bases_Edited_Count,Ctrl.02_bases_Edited_Count_Proportion,Ctrl.03_bases_Non_Edited_Count,Ctrl.03_bases_Edited_Count,Ctrl.03_bases_Edited_Count_Proportion,⋯,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion,col.names
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
2,1_5476,42,0,0,22,0,0,30,0,0,⋯,37,0,0,41,0,0,21,0,0,TRUE
3,1_5479,11,31,0.738095238095238,2,20,0.909090909090909,3,26,0.896551724137931,⋯,9,28,0.756756756756757,2,39,0.951219512195122,4,18,0.818181818181818,TRUE
4,1_5504,47,0,0,28,0,0,32,1,0.0303030303030303,⋯,50,0,0,43,0,0,29,0,0,TRUE
5,1_5505,48,0,0,28,0,0,32,0,0,⋯,49,0,0,43,0,0,29,0,0,TRUE
6,1_5510,45,0,0,27,0,0,32,0,0,⋯,48,0,0,33,1,0.0294117647058824,25,0,0,TRUE
7,1_5523,48,0,0,28,0,0,26,2,0.0714285714285714,⋯,52,0,0,34,0,0,28,0,0,TRUE
8,1_5536,6,27,0.818181818181818,5,22,0.814814814814815,4,20,0.833333333333333,⋯,12,36,0.75,1,31,0.96875,4,21,0.84,TRUE
9,1_5539,33,0,0,29,0,0,20,4,0.166666666666667,⋯,47,0,0,32,0,0,27,0,0,TRUE
10,1_5979,39,0,0,43,0,0,24,0,0,⋯,21,0,0,29,1,0.0333333333333333,24,0,0,TRUE
11,1_5998,46,0,0,47,0,0,25,0,0,⋯,35,1,0.0277777777777778,30,0,0,33,0,0,TRUE


In [9]:
# Collect the names of the columns used for filtering: every column whose
# name contains "Proportion".
proportion_columns <- colnames(data_filtered)[
  grepl("Proportion", colnames(data_filtered))
]

In [10]:
# Flag uninformative sites: rows where every sample's editing proportion is
# below 0.1, or every sample's editing proportion is above 0.9.
#
# The proportion columns may hold text rather than numbers (the table is shown
# with <chr> columns), so convert them to a numeric matrix first; comparing
# the raw strings would order them alphabetically instead of by value.
proportions <- as.matrix(data_filtered[proportion_columns])
storage.mode(proportions) <- "double"

# A row is "all low" when no value reaches 0.1, and "all high" when no value
# is at or below 0.9. The two cases are tested separately so that a site
# mixing low and high samples is kept, as the description of this filter
# states. Missing proportions are ignored (na.rm = TRUE), which also keeps NA
# out of the row mask used for indexing below; a row with no usable
# proportions at all is removed.
all_low <- rowSums(proportions >= 0.1, na.rm = TRUE) == 0
all_high <- rowSums(proportions <= 0.9, na.rm = TRUE) == 0
rows_to_remove <- all_low | all_high

# Filter the data frame based on the conditions stated above
df_filtered_new <- data_filtered[!rows_to_remove, ]

In [11]:
# Drop the `col.names` column, keeping every other column as it is
df_filtered_new <- df_filtered_new[, names(df_filtered_new) != "col.names"]

In [21]:
# Display the final filtered table for inspection
df_filtered_new

Unnamed: 0_level_0,ID,Ctrl.01_bases_Non_Edited_Count,Ctrl.01_bases_Edited_Count,Ctrl.01_bases_Edited_Count_Proportion,Ctrl.02_bases_Non_Edited_Count,Ctrl.02_bases_Edited_Count,Ctrl.02_bases_Edited_Count_Proportion,Ctrl.03_bases_Non_Edited_Count,Ctrl.03_bases_Edited_Count,Ctrl.03_bases_Edited_Count_Proportion,⋯,NO.03_bases_Edited_Count_Proportion,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
3,1_5479,11,31,0.738095238095238,2,20,0.909090909090909,3,26,0.896551724137931,⋯,0.91304347826087,9,28,0.756756756756757,2,39,0.951219512195122,4,18,0.818181818181818
8,1_5536,6,27,0.818181818181818,5,22,0.814814814814815,4,20,0.833333333333333,⋯,0.885714285714286,12,36,0.75,1,31,0.96875,4,21,0.84
9,1_5539,33,0,0,29,0,0,20,4,0.166666666666667,⋯,0,47,0,0,32,0,0,27,0,0
15,1_6119,31,0,0,31,0,0,27,0,0,⋯,0.111111111111111,31,0,0,24,0,0,32,0,0
17,1_6643,32,0,0,21,0,0,35,0,0,⋯,0,18,0,0,32,0,0,15,0,0
91,1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,21,2,0.0869565217391304,22,0,0
92,1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,21,2,0.0869565217391304,1646,11,0.00663850331925166
93,1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,1592,4,0.0025062656641604,22,0,0
94,1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,1592,4,0.0025062656641604,1646,11,0.00663850331925166
95,1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,1674,12,0.00711743772241993,21,2,0.0869565217391304,22,0,0


In [47]:
# Save the filtered table as a tab-separated file, without row names and
# without quoting any values.
write.table(
  x = df_filtered_new,
  file = "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_merged_filtered_data.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)