This is a continuation of Step 9 to format the REDITs input. This script pivots the Edited/Non-edited counts per sample.

In [1]:
library(tidyr)
library(dplyr)
library(readr)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
#Specify input directories
AG_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG"
TC_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/TC"

In [3]:
#Specify output directories
AG_output_dir <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Pivoted"
TC_output_dir <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/TC/Pivoted"

In [4]:
process_file <- function(file_path, output_directory) {
  # Extract prefix from the file name
  prefix <- sub("^(.*?)_", "\\1", tools::file_path_sans_ext(basename(file_path)))
  
  # Read the TSV file
  df <- read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  df <- df %>%
    select(1, 3, 2, everything())

  cat("Head of the input file", prefix, ":\n")
  print(head(df))

  # Pivot and manipulate the data
  stacked_df <- df %>%
    pivot_longer(
        cols = -ID,  # Specify the columns to pivot
        names_to = "Count_Type",
        values_to = "Counts"
    ) %>%
    mutate(
        Count_Type = rep(c("Edited", "Non_Edited"), length.out = n()),
        ID = factor(ID, levels = unique(ID))
    ) %>%
    arrange(ID, Count_Type)

  
  # Rename the "Counts" column to "Prefix_Counts"
  colnames(stacked_df)[colnames(stacked_df) == "Counts"] <- paste(prefix, "Counts", sep = "_")
  
  # Create the final stacked data frame
  stacked_df <- stacked_df %>%
    mutate(ID___Count_Type = paste(ID, Count_Type, sep = "___")) %>%
    select(ID___Count_Type, everything()) %>%
    select(-c(ID, Count_Type))

  cat("Head of the stacked data frame final", prefix, ":\n")
  print(head(stacked_df))
  
  # Write the result to a TSV file in the output directory
  output_file_path <- file.path(output_directory, paste(prefix, "stacked.tsv", sep = "_"))
  write.table(stacked_df, file = output_file_path, sep = "\t", quote = FALSE, row.names = FALSE)
  # Print a success message
  cat("*Table written successfully to:*", output_file_path, "\n")
  
}

In [5]:
# List all TSV files in the directory
file_list_AG <- list.files(AG_path, pattern = ".tsv$", full.names = TRUE)
file_list_TC <- list.files(TC_path, pattern = ".tsv$", full.names = TRUE)

In [6]:
print(file_list_AG)
print(file_list_TC)

 [1] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-01_Edited_And_NonEdited__Counts.tsv"
 [2] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-02_Edited_And_NonEdited__Counts.tsv"
 [3] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-03_Edited_And_NonEdited__Counts.tsv"
 [4] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-04_Edited_And_NonEdited__Counts.tsv"
 [5] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-05_Edited_And_NonEdited__Counts.tsv"
 [6] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Ctrl-06_Edited_And_NonEdited__Counts.tsv"
 [7] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/NO-01_Edited_And_NonEdited__Counts.tsv"  
 [8] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/NO-02_Edited_And_NonEdited__Counts.tsv"  
 [9] "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/A

In [7]:
# Process each file and store the results in the output directory
lapply(file_list_AG, function(file_path) {
  process_file(file_path, AG_output_dir)
})

lapply(file_list_TC, function(file_path) {
  process_file(file_path, TC_output_dir)
})

Head of the input file Ctrl-01Edited_And_NonEdited__Counts :
          ID Ctrl.01_Edited Ctrl.01_Nonedited
1 1_10023876              9                40
2 1_10054490            102               471
3 1_10057975             42               155
4 1_10313231             12                 1
5 1_10314652             29                52
6 1_10325782             19                13
Head of the stacked data frame final Ctrl-01Edited_And_NonEdited__Counts :
[90m# A tibble: 6 × 2[39m
  ID___Count_Type         `Ctrl-01Edited_And_NonEdited__Counts_Counts`
  [3m[90m<chr>[39m[23m                                                          [3m[90m<int>[39m[23m
[90m1[39m 1_10023876___Edited                                                9
[90m2[39m 1_10023876___Non_Edited                                           40
[90m3[39m 1_10054490___Edited                                              102
[90m4[39m 1_10054490___Non_Edited                                          471
[90m5[39

Head of the input file Ctrl-01Edited_And_NonEdited__Counts :
          ID Ctrl.01_Edited Ctrl.01_Nonedited
1 1_10067239            151                53
2  1_1006825             15                21
3 1_10068596             95               159
4  1_1009780             23                15
5 1_10822937              2                10
6 1_10822972              3                10
Head of the stacked data frame final Ctrl-01Edited_And_NonEdited__Counts :
[90m# A tibble: 6 × 2[39m
  ID___Count_Type         `Ctrl-01Edited_And_NonEdited__Counts_Counts`
  [3m[90m<chr>[39m[23m                                                          [3m[90m<int>[39m[23m
[90m1[39m 1_10067239___Edited                                              151
[90m2[39m 1_10067239___Non_Edited                                           53
[90m3[39m 1_1006825___Edited                                                15
[90m4[39m 1_1006825___Non_Edited                                            21
[90m5[39