### Filtering JACUSA2 Output
This script calculates the editing proportion of each site per sample so that further filtering can be done.


In [1]:
# Load required libraries
library(dplyr)
library(readr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
#' Add edited / non-edited read counts to a per-sample base-count table
#'
#' Reads a tab-separated file with a header row and (at least) the columns
#' `ID`, `ref` and `Base_counts`, splits `Base_counts` into four per-base
#' counts, and writes a table with the columns `ID`, `Non_Edited_Count`,
#' `Edited_Count` and `Edited_Count_Proportion` to
#' `<output_directory>/<input base name>___Edited_Column_Added.tsv`.
#'
#' `Non_Edited_Count` is the count of the base named in `ref`; `Edited_Count`
#' is the sum of the four counts minus `Non_Edited_Count`. Rows whose `ref` is
#' not one of A/C/G/T get `NA` for all three derived columns. Rows with zero
#' total count get `NaN` as proportion (0 / 0).
#'
#' @param file_path Path to the input TSV file.
#' @param output_directory Directory the result file is written to. It is
#'   created (recursively) when it does not exist yet.
#' @return `NULL`, invisibly. Called for its side effects: it prints the first
#'   rows of the result and writes the output file.
process_file <- function(file_path, output_directory) {
  bases <- c("A", "C", "G", "T")
  # NOTE(review): assumes Base_counts lists the counts in A,C,G,T order --
  # confirm against the tool that produced the input files.
  count_cols <- paste0(bases, "_count")

  # Force `ref` and `Base_counts` to be read as text. Without this, read.table
  # converts a `ref` column that contains only "T" to logical TRUE, and the
  # comparison against "T" below would then never match.
  df <- read.table(
    file_path,
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE,
    colClasses = c(ref = "character", Base_counts = "character")
  )

  missing_cols <- setdiff(c("ID", "ref", "Base_counts"), colnames(df))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required column(s) in ", file_path, ": ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Split the comma-separated counts; a missing Base_counts value becomes a
  # row of four NAs so that it propagates as NA instead of breaking the shape.
  split_counts <- strsplit(df$Base_counts, ",", fixed = TRUE)
  split_counts[is.na(df$Base_counts)] <- list(rep(NA_character_, length(bases)))

  # Refuse ragged rows: silently recycling them would yield wrong counts.
  malformed <- which(lengths(split_counts) != length(bases))
  if (length(malformed) > 0) {
    stop(
      "Base_counts must contain exactly ", length(bases),
      " comma-separated values; first offending row in ", file_path, ": ",
      malformed[1],
      call. = FALSE
    )
  }

  # Works for zero rows as well (yields a 0 x 4 matrix).
  count_matrix <- matrix(
    as.numeric(unlist(split_counts)),
    ncol = length(bases),
    byrow = TRUE,
    dimnames = list(NULL, count_cols)
  )

  # Pick, per row, the count of the reference base. match() returns NA for any
  # ref outside A/C/G/T, and an NA index yields an NA count.
  ref_col <- match(df$ref, bases)
  non_edited <- count_matrix[cbind(seq_len(nrow(df)), ref_col)]
  edited <- rowSums(count_matrix, na.rm = TRUE) - non_edited

  edited_df <- data.frame(
    ID = df$ID,
    Non_Edited_Count = non_edited,
    Edited_Count = edited,
    Edited_Count_Proportion = edited / (edited + non_edited),
    stringsAsFactors = FALSE
  )

  print("Final DataFrame:")
  print(head(edited_df))

  # Name the output after the input file, minus its extension.
  base_name <- tools::file_path_sans_ext(basename(file_path))
  output_file_path <- file.path(
    output_directory,
    paste0(base_name, "___Edited_Column_Added.tsv")
  )

  # write.table cannot create missing directories itself.
  if (!dir.exists(output_directory)) {
    dir.create(output_directory, recursive = TRUE, showWarnings = FALSE)
  }

  write.table(
    edited_df,
    file = output_file_path,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )
}

In [4]:
# Directory holding the per-sample *_bases.tsv inputs; results are written to
# its "Edited" subdirectory.
input_directory_path <- file.path(
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2",
  "all_dpf",
  "all_dpf_singles"
)
output_directory_path <- file.path(input_directory_path, "Edited")

In [5]:
# Collect the full paths of every .tsv file directly inside the input
# directory (no recursion into subdirectories) and show them.
file_list <- list.files(
  path = input_directory_path,
  pattern = "\\.tsv$",
  full.names = TRUE
)
print(file_list)

 [1] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-01_bases.tsv"
 [2] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-02_bases.tsv"
 [3] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-03_bases.tsv"
 [4] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-04_bases.tsv"
 [5] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-05_bases.tsv"
 [6] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/Ctrl-06_bases.tsv"
 [7] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/NO-01_bases.tsv"  
 [8] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/NO-02_bases.tsv"  
 [9] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/NO-03_bases.tsv"  
[10] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_singles/NO-04_bases.tsv"  
[11] "/mnt

In [6]:
# Process every input file in turn; each call writes one result file into
# the output directory.
for (tsv_path in file_list) {
  process_file(file_path = tsv_path, output_directory = output_directory_path)
}

[1] "Final DataFrame:"
      ID Non_Edited_Count Edited_Count Edited_Count_Proportion
1 1_5476               42            0               0.0000000
2 1_5479               11           31               0.7380952
3 1_5504               47            0               0.0000000
4 1_5505               48            0               0.0000000
5 1_5510               45            0               0.0000000
6 1_5523               48            0               0.0000000
[1] "Final DataFrame:"
      ID Non_Edited_Count Edited_Count Edited_Count_Proportion
1 1_5476               22            0               0.0000000
2 1_5479                2           20               0.9090909
3 1_5504               28            0               0.0000000
4 1_5505               28            0               0.0000000
5 1_5510               27            0               0.0000000
6 1_5523               28            0               0.0000000
[1] "Final DataFrame:"
      ID Non_Edited_Count Edited_Count Edited_Cou