# 1. Counting Retained Reads with Chopper

In [1]:
library(dplyr)
library(ggplot2)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Switch the working directory to the analysis folder so the relative paths
# used in this notebook ("./Untrimmed", "./chopper_filter") resolve against it.
# NOTE(review): this path looks specific to one machine -- confirm or adjust
# before running the notebook elsewhere.
setwd("~/Documents/Collaborations/mangrove/analyses_R")

In [3]:
# Input locations: untrimmed reads and the chopper-filtered reads
raw_dir <- "./Untrimmed"
filt_dir <- "./chopper_filter"

# Raw FASTQ files: collect the full paths, then show them
raw_files <- list.files(
  path = raw_dir,
  pattern = "\\.fastq$",
  full.names = TRUE
)
print("Raw files:")
print(raw_files)

# Filtered FASTQ files: collect the full paths, then show them
filt_files <- list.files(
  path = filt_dir,
  pattern = "_filtered\\.fastq$",
  full.names = TRUE
)
print("Filtered files:")
print(filt_files)

[1] "Raw files:"
[1] "./Untrimmed/Sample-ID1-A1.fastq" "./Untrimmed/Sample-ID2-A2.fastq"
[3] "./Untrimmed/Sample-ID3-A3.fastq" "./Untrimmed/Sample-ID4-A4.fastq"
[5] "./Untrimmed/Sample-ID5-B1.fastq" "./Untrimmed/Sample-ID6-B2.fastq"
[7] "./Untrimmed/Sample-ID7-B3.fastq" "./Untrimmed/Sample-ID8-B4.fastq"
[1] "Filtered files:"
[1] "./chopper_filter/Sample-ID1-A1_filtered.fastq"
[2] "./chopper_filter/Sample-ID2-A2_filtered.fastq"
[3] "./chopper_filter/Sample-ID3-A3_filtered.fastq"
[4] "./chopper_filter/Sample-ID4-A4_filtered.fastq"
[5] "./chopper_filter/Sample-ID5-B1_filtered.fastq"
[6] "./chopper_filter/Sample-ID6-B2_filtered.fastq"
[7] "./chopper_filter/Sample-ID7-B3_filtered.fastq"
[8] "./chopper_filter/Sample-ID8-B4_filtered.fastq"


In [4]:
# Count the reads (records) in a FASTQ file.
#
# The file is streamed in chunks instead of being loaded whole, so memory use
# stays bounded for large FASTQ files. Blank lines at the very end of the file
# are ignored so that a trailing newline does not distort the count.
#
# Args:
#   file:       Path to a FASTQ file with four lines per read.
#   chunk_size: Maximum number of lines read per pass (default 100000).
#
# Returns:
#   The number of complete reads as a numeric scalar. If the number of lines
#   is not a multiple of four, a warning is raised and only the complete
#   four-line records are counted.
count_reads <- function(file, chunk_size = 100000L) {
  # `base::file` is spelled out because the argument is also called `file`
  con <- base::file(file, open = "r")
  on.exit(close(con), add = TRUE)

  n_lines <- 0         # kept as double so huge files cannot overflow an integer
  trailing_blank <- 0  # blank lines seen since the last non-blank line
  repeat {
    chunk <- readLines(con, n = chunk_size, warn = FALSE)
    if (length(chunk) == 0L) {
      break
    }
    n_lines <- n_lines + length(chunk)
    filled <- which(nzchar(chunk))
    if (length(filled) == 0L) {
      trailing_blank <- trailing_blank + length(chunk)
    } else {
      trailing_blank <- length(chunk) - max(filled)
    }
  }
  n_lines <- n_lines - trailing_blank

  if (n_lines %% 4 != 0) {
    warning(
      "'", file, "' has ", n_lines, " lines, which is not a multiple of 4; ",
      "the file may be truncated or malformed",
      call. = FALSE
    )
  }

  n_lines %/% 4  # each read occupies 4 lines
}

# Build the per-sample table of raw and filtered read counts.
#
# The table is assembled from whole vectors rather than by growing a data
# frame with rbind() inside a loop, which copies the table on every iteration.

# Sample IDs are the raw file names without the ".fastq" extension
sample_ids <- sub("\\.fastq$", "", basename(raw_files))

# Expected filtered file for each sample. sprintf() is used because it returns
# a zero-length result when there are no samples, keeping all columns aligned.
filt_paths <- file.path(filt_dir, sprintf("%s_filtered.fastq", sample_ids))

# Count reads; a sample without a filtered file gets NA instead of an error
raw_counts <- vapply(raw_files, count_reads, numeric(1), USE.NAMES = FALSE)
filt_counts <- vapply(
  filt_paths,
  function(path) {
    if (file.exists(path)) count_reads(path) else NA_real_
  },
  numeric(1),
  USE.NAMES = FALSE
)

results <- data.frame(
  ID_Sample = sample_ids,
  raw_reads = raw_counts,
  filt_reads = filt_counts
)

# Print results
print(results)

      ID_Sample raw_reads filt_reads
1 Sample-ID1-A1      1742       1134
2 Sample-ID2-A2      4344       2971
3 Sample-ID3-A3      2890       1882
4 Sample-ID4-A4      1628        965
5 Sample-ID5-B1      2689       1686
6 Sample-ID6-B2      3540       2375
7 Sample-ID7-B3      1873       1231
8 Sample-ID8-B4      2654       1693


In [5]:
# Save the read-count table as a tab-separated file in the working directory.
# (The stray empty argument in the original call has been removed; the output
# file name and format are unchanged.)
write.table(results, file = "read_counts_chopper", sep = "\t", row.names = FALSE)