# Pipeline Counts

Keeping track of the number of sequences at each step of the processing pipeline

In [6]:
# loading libraries
library(tidyverse)

### raw reads

In [31]:
# load the per-sample QC summary table and preview the first rows
novogene_qc <- read.csv(
  file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/processing/qc_outputs/novogene_qc.summary.csv'
)
head(novogene_qc)

Unnamed: 0_level_0,Sample,Raw.reads,Raw.data,Effective...,Error...,Q20...,Q30...,GC...
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,C1_H6_W62_gi,65297220,9794583000,98.99,0.01,99.47,97.68,42.3
2,B4_W6_O67_gi,73117894,10967684100,99.06,0.01,99.42,97.5,42.09
3,H3_W6_B66_gi,60215268,9032290200,98.67,0.01,99.43,97.47,42.28
4,W6_H6_G71_gi,66355872,9953380800,98.96,0.01,99.47,97.66,42.38
5,H1_B6_B61_gi,69802722,10470408300,98.84,0.01,99.45,97.64,42.12
6,H4_H2_B19_gi,79683032,11952454800,98.81,0.01,99.44,97.63,42.47


In [24]:
# start the pipeline table: keep the sample ID and the raw read count,
# renaming the count column to 'raw.reads' in the same step
pipe_count <- novogene_qc %>%
  select(Sample, raw.reads = Raw.reads)

head(pipe_count)

Unnamed: 0_level_0,Sample,raw.reads
Unnamed: 0_level_1,<chr>,<int>
1,C1_H6_W62_gi,65297220
2,B4_W6_O67_gi,73117894
3,H3_W6_B66_gi,60215268
4,W6_H6_G71_gi,66355872
5,H1_B6_B61_gi,69802722
6,H4_H2_B19_gi,79683032


### trimmed reads

to run in command line:
```
grep -H "Reads written" *.txt > reads.written.tab
```

In [25]:
# the grep output is colon-delimited (file name : label : count), so split on ':'
trim.reads <- read.table(
  file = '/scratch4/workspace/julia_mcdonough_student_uml_edu-novogene_dwnld/trimmed_all/reads.written.tab',
  sep = ':',
  header = FALSE
)
head(trim.reads)

Unnamed: 0_level_0,V1,V2,V3
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,B1_B1_O01_gi_1.fq.gz_trimming_report.txt,Reads written (passing filters),"35,532,537 (100.0%)"
2,B1_B1_O01_gi_2.fq.gz_trimming_report.txt,Reads written (passing filters),"35,532,537 (100.0%)"
3,B1_Nu_O03_gi_1.fq.gz_trimming_report.txt,Reads written (passing filters),"50,278,071 (100.0%)"
4,B1_Nu_O03_gi_2.fq.gz_trimming_report.txt,Reads written (passing filters),"50,278,071 (100.0%)"
5,B1_W5_O50_gi_1.fq.gz_trimming_report.txt,Reads written (passing filters),"33,889,636 (100.0%)"
6,B1_W5_O50_gi_2.fq.gz_trimming_report.txt,Reads written (passing filters),"33,889,636 (100.0%)"


In [26]:
# Turn the "Reads written" lines into one trimmed-read total per sample.
#   V1 = trimming report file name, e.g. B1_B1_O01_gi_1.fq.gz_trimming_report.txt
#   V3 = read count followed by a percentage, e.g. "35,532,537 (100.0%)"
trimmed <- trim.reads %>%
  mutate(
    # file name up to the literal ".fq" (sample name plus the _1/_2 suffix);
    # the dot is escaped so it cannot match an arbitrary character
    Sample = str_remove(V1, "\\.fq.*$"),
    # drop the "(100.0%)" suffix, then the thousands separators and whitespace
    trimmed.reads = V3 %>%
      str_remove("\\(.*$") %>%
      str_remove_all("[,\\s]") %>%
      as.numeric(),
    # sample name without the trailing _1/_2
    sample_base = str_remove(Sample, "_[12]$")
  ) %>%
  select(Sample, trimmed.reads, sample_base)

# Sum reads over the _1 and _2 report files of each sample
trimmed_summary <- trimmed %>%
  group_by(sample_base) %>%
  summarise(trimmed.reads = sum(trimmed.reads, na.rm = TRUE))

# plain data frame keyed by 'Sample' so it can be merged with the other counts
trim.df <- trimmed_summary %>%
  select(Sample = sample_base, trimmed.reads) %>%
  as.data.frame()

head(trim.df)

Unnamed: 0_level_0,Sample,trimmed.reads
Unnamed: 0_level_1,<chr>,<dbl>
1,B1_B1_O01_gi,71065074
2,B1_Nu_O03_gi,100556142
3,B1_W5_O50_gi,67779272
4,B2_B5_O51_gi,76008956
5,B2_C4_O40_gi,77162786
6,B2_Nu_O12_gi,102409148


In [27]:
# join raw and trimmed read counts on the shared 'Sample' column
# (merge() defaults keep only samples present in both tables)
pipe.counts <- merge(x = pipe_count, y = trim.df, by = 'Sample')
head(pipe.counts)

Unnamed: 0_level_0,Sample,raw.reads,trimmed.reads
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,B1_B1_O01_gi,71065074,71065074
2,B1_Nu_O03_gi,100556142,100556142
3,B1_W5_O50_gi,67779272,67779272
4,B2_B5_O51_gi,76008956,76008956
5,B2_C4_O40_gi,77162786,77162786
6,B2_Nu_O12_gi,102409148,102409148


### aligned reads
after running `hisat2`

to run in command line:
```
grep -H "overall alignment rate" *.log >align_rate.csv
```

Reporting the overall alignment rate, which combines concordantly aligned pairs, discordantly aligned pairs, and pairs where only one mate aligned.

In [34]:
# read in the grep output; it is colon-delimited (log file name : rate text)
align.rate <- read.csv('/scratch4/workspace/julia_mcdonough_student_uml_edu-novogene_dwnld/hisat2-align/align_rate.csv', header = FALSE, sep = ':')

# format sample name: swap the trailing '.log' extension for '_gi'
# (escaped and anchored so only a literal '.log' at the end is replaced,
# never some other '<char>log' inside the sample name)
align.rate$V1 <- sub('\\.log$', '_gi', align.rate$V1)

# remove text from alignment rate, leaving the percentage as a number
align.rate$V2 <- as.numeric(
  sub('% overall alignment rate', '', align.rate$V2, fixed = TRUE)
)

# rename columns
colnames(align.rate) <- c('Sample', 'overall.align.rate.percent')

head(align.rate)

Unnamed: 0_level_0,Sample,overall.align.rate.percent
Unnamed: 0_level_1,<chr>,<dbl>
1,B1_B1_O01_gi,81.18
2,B1_Nu_O03_gi,81.56
3,B1_W5_O50_gi,82.27
4,B2_B5_O51_gi,82.88
5,B2_C4_O40_gi,82.03
6,B2_Nu_O12_gi,83.02


In [35]:
# add the alignment rate to the read-count table, matching rows on 'Sample'
counts <- merge(x = pipe.counts, y = align.rate, by = 'Sample')
head(counts)

Unnamed: 0_level_0,Sample,raw.reads,trimmed.reads,overall.align.rate.percent
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>
1,B1_B1_O01_gi,71065074,71065074,81.18
2,B1_Nu_O03_gi,100556142,100556142,81.56
3,B1_W5_O50_gi,67779272,67779272,82.27
4,B2_B5_O51_gi,76008956,76008956,82.88
5,B2_C4_O40_gi,77162786,77162786,82.03
6,B2_Nu_O12_gi,102409148,102409148,83.02


In [36]:
# calculate number of aligned reads based on rate:
# convert the percentage to a fraction and apply it to the trimmed read count.
# The full column name is spelled out rather than relying on `$` partial
# matching, which silently breaks on tibbles or if another column shares the prefix.
# NOTE(review): the result is an estimate derived from the reported percentage,
# so it is generally not a whole number.
counts$aligned.reads <- (counts$overall.align.rate.percent / 100) * counts$trimmed.reads

head(counts)

Unnamed: 0_level_0,Sample,raw.reads,trimmed.reads,overall.align.rate.percent,aligned.reads
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>
1,B1_B1_O01_gi,71065074,71065074,81.18,57690627
2,B1_Nu_O03_gi,100556142,100556142,81.56,82013589
3,B1_W5_O50_gi,67779272,67779272,82.27,55762007
4,B2_B5_O51_gi,76008956,76008956,82.88,62996223
5,B2_C4_O40_gi,77162786,77162786,82.03,63296633
6,B2_Nu_O12_gi,102409148,102409148,83.02,85020075


### read counting
from `featureCounts`

### write csv

In [37]:
# save the per-sample pipeline counts table, without a row-name column
write.csv(
  x = counts,
  file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/processing/qc_outputs/pipeline_counts.csv',
  row.names = FALSE
)