This script merges all of the samples into one for REDITs analysis. 

In [1]:
library(dplyr)
library(purrr)
library(tidyverse)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.0     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mreadr    [39m 2.1.5     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Specify the input directory containing TSV files that were pivoted
AG_input <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/AG/Pivoted"
TC_input <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/TC/Pivoted"

Since we are merging A/G edited and T/C edited sites, we are going to add a tag onto the ID for each before the merge to keep track.

In [3]:
AG_files <- list.files(AG_input, pattern = "\\.tsv$", full.names = TRUE)
TC_files <- list.files(TC_input, pattern = "\\.tsv$", full.names = TRUE)

In [4]:
# Add tag to AG files
for (file_path in AG_files) {
  # Read the TSV file into a data frame
  df <- read_tsv(file_path)

  # Add "_AG" at the end of the "ID___Count_Type" column values
  df <- df %>%
    mutate(ID___Count_Type = paste(ID___Count_Type, "_AG", sep = ""))

  # Save the modified data frame back to the file
  write_tsv(df, file_path)
}

[1mRows: [22m[34m26288[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): ID___Count_Type
[32mdbl[39m (1): Ctrl-01Edited_And_NonEdited__Counts_Counts

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m26288[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): ID___Count_Type
[32mdbl[39m (1): Ctrl-02Edited_And_NonEdited__Counts_Counts

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m26288[39m [1mColumns: [22m[34m2[39m
[3

In [5]:
#Add tag to TC files
for (file_path in TC_files) {
  # Read the TSV file into a data frame
  df <- read_tsv(file_path)

  # Add "_TC" at the end of the "ID___Count_Type" column values
  df <- df %>%
    mutate(ID___Count_Type = paste(ID___Count_Type, "_TC", sep = ""))

  # Save the modified data frame back to the file
  write_tsv(df, file_path)
}

[1mRows: [22m[34m21678[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): ID___Count_Type
[32mdbl[39m (1): Ctrl-01Edited_And_NonEdited__Counts_Counts

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m21678[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): ID___Count_Type
[32mdbl[39m (1): Ctrl-02Edited_And_NonEdited__Counts_Counts

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m21678[39m [1mColumns: [22m[34m2[39m
[3

In [6]:
#Define an output directory
output_directory <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/Merged"

In [7]:
# Convert to data list
AG_data_list <- lapply(AG_files, function(file) read.delim(file, sep = "\t"))
TC_data_list <- lapply(TC_files, function(file) read.delim(file, sep = "\t"))

In [8]:
# Merge by intersect only for AG and TC separately first.
AG_merged_data <- Reduce(function(x, y) merge(x, y, by = "ID___Count_Type", all = FALSE), AG_data_list)
TC_merged_data <- Reduce(function(x, y) merge(x, y, by = "ID___Count_Type", all = FALSE), TC_data_list)

In [9]:
# Now merge AG and TC dataframes 
combined_df <- bind_rows(AG_merged_data, TC_merged_data)

In [10]:
combined_df

ID___Count_Type,Ctrl.01Edited_And_NonEdited__Counts_Counts,Ctrl.02Edited_And_NonEdited__Counts_Counts,Ctrl.03Edited_And_NonEdited__Counts_Counts,Ctrl.04Edited_And_NonEdited__Counts_Counts,Ctrl.05Edited_And_NonEdited__Counts_Counts,Ctrl.06Edited_And_NonEdited__Counts_Counts,NO.01Edited_And_NonEdited__Counts_Counts,NO.02Edited_And_NonEdited__Counts_Counts,NO.03Edited_And_NonEdited__Counts_Counts,NO.04Edited_And_NonEdited__Counts_Counts,NO.05Edited_And_NonEdited__Counts_Counts,NO.06Edited_And_NonEdited__Counts_Counts
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1_10023876___Edited_AG,9,9,8,14,6,1,17,6,6,18,11,32
1_10023876___Non_Edited_AG,40,54,46,68,39,97,18,21,41,63,87,92
1_10054490___Edited_AG,102,19,218,232,154,60,172,180,161,355,212,271
1_10054490___Non_Edited_AG,471,384,263,790,726,1035,243,363,440,780,1268,707
1_10057975___Edited_AG,42,19,46,96,85,36,46,69,32,228,142,178
1_10057975___Non_Edited_AG,155,115,84,540,494,594,92,147,129,461,535,337
1_10313231___Edited_AG,12,36,20,33,21,17,10,6,15,18,24,25
1_10313231___Non_Edited_AG,1,7,8,4,8,2,16,8,8,10,9,15
1_10314652___Edited_AG,29,25,21,21,20,32,10,11,16,30,67,34
1_10314652___Non_Edited_AG,52,48,35,30,13,28,20,34,27,63,13,28


In [54]:
# Write merged_data to a TSV file
write.table(combined_df, file = file.path(output_directory, "JACUSA2_all_dpf_REDITs_input.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)