# Totalling pipeline counts

## 0. load libraries

In [2]:
library(tidyverse)

## 1. load csv

In [3]:
counts <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/processing/pipeline_counts/CV_CE18_pipeline_counts.csv')
head(counts)

Unnamed: 0_level_0,sample_ID,raw_reads,trim.galore,bowtie2_align,paired_reads,unpaired_reads,marked_dups,htseq.count_features
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,2018--BBB-WBO-B21-CV,60015908,9142520,1987316,2020021,14421,9850,1240797
2,2018--BBB-WBV-B70-CV,47021020,7920020,1702318,1733903,12637,9107,1066412
3,2018--BBO-BBO-B16-CV,55592344,6907492,1504682,1527667,11081,4699,948276
4,2018--BBO-BBY-B27-CV,36591468,4437592,963810,978519,6871,3552,607039
5,2018--BBO-WBO-B16-CV,59769484,9603844,2076074,2109689,14556,10334,1295441
6,2018--BBO-WBV-B64-CV,49958272,8379816,1812804,1840935,13313,8994,1147073


In [5]:
bam_files <- list.files(path = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/BEDtools/working_BAM_sequences/", pattern = "*.bam", full.names = FALSE)

# Remove everything after 'CV'
files_cv <- sub('CV.*', 'CV', bam_files)

# Remove duplicates
unique_CVsamples <- unique(files_cv)
unique_CVsamples
length(unique_CVsamples)

In [6]:
hyp_counts <- counts[counts$sample_ID %in% unique_CVsamples,]
dim(hyp_counts)

## summary statistics 
for sequencing results section of paper

In [7]:
sum(hyp_counts$raw_reads)

In [8]:
sum(hyp_counts$trim.galore)/sum(hyp_counts$raw_reads) * 100

In [21]:
sum(hyp_counts$paired_reads)

In [24]:
mean(hyp_counts$htseq.count_features)

median(hyp_counts$htseq.count_features)

In [25]:
sum(hyp_counts$htseq.count_features)/sum(hyp_counts$raw_reads) * 100

### Formatting table for publication

In [9]:
library(kableExtra)


Attaching package: ‘kableExtra’


The following object is masked from ‘package:dplyr’:

    group_rows




In [17]:
# removing the column with the htseq.count_features bc it's outdated
counts_woHT <- counts[,-ncol(counts)]

# change sample ID to sample to match fc_df
colnames(counts_woHT)[1] <- 'Sample'
head(counts_woHT)

Unnamed: 0_level_0,Sample,raw_reads,trim.galore,bowtie2_align,paired_reads,unpaired_reads,marked_dups
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,2018--BBB-WBO-B21-CV,60015908,9142520,1987316,2020021,14421,9850
2,2018--BBB-WBV-B70-CV,47021020,7920020,1702318,1733903,12637,9107
3,2018--BBO-BBO-B16-CV,55592344,6907492,1504682,1527667,11081,4699
4,2018--BBO-BBY-B27-CV,36591468,4437592,963810,978519,6871,3552
5,2018--BBO-WBO-B16-CV,59769484,9603844,2076074,2109689,14556,10334
6,2018--BBO-WBV-B64-CV,49958272,8379816,1812804,1840935,13313,8994


In [11]:
fc_stats <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/processing/pipeline_counts/gene_featureCounts_stats.csv')

# remove extra characters in the column names
colnames(fc_stats) <- gsub("^X2018\\.{2}(.*)\\_sorted\\.bam", "2018--\\1", colnames(fc_stats))
colnames(fc_stats) <- gsub("\\.", "-", colnames(fc_stats))

# transposing df
fc_transpose <- t(fc_stats) 

# convert to df
fc_df <- as.data.frame(fc_transpose)

# assign columns as first row
colnames(fc_df) <- fc_df[1,] 

# remove that first row bc now they are the column names
fc_df <- fc_df[-1,]

fc_df$Sample <- rownames(fc_df)

fc_df <- fc_df %>% 
# move sample name to front
select(Sample, everything())

# remove rownames
rownames(fc_df) <- NULL

# only look at samples used in analysis (HC, CH, HH, or CC)
fc_df <- fc_df[!grepl("P", fc_df[,1]),]

head(fc_df)

Unnamed: 0_level_0,Sample,Assigned,Unassigned_Unmapped,Unassigned_Read_Type,Unassigned_Singleton,Unassigned_MappingQuality,Unassigned_Chimera,Unassigned_FragmentLength,Unassigned_Duplicate,Unassigned_MultiMapping,Unassigned_Secondary,Unassigned_NonSplit,Unassigned_NoFeatures,Unassigned_Overlapping_Length,Unassigned_Ambiguity
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,2018--BBB-WBO-B21-CV,1215661,192979,0,0,0,0,0,0,0,0,0,219043,0,56552
2,2018--BBB-WBV-B70-CV,1059670,182271,0,0,0,0,0,0,0,0,0,197678,0,46467
3,2018--BBO-BBO-B16-CV,853106,136711,0,0,0,0,0,0,0,0,0,167516,0,36063
4,2018--BBO-BBY-B27-CV,569055,93418,0,0,0,0,0,0,0,0,0,108495,0,27001
5,2018--BBO-WBO-B16-CV,1232808,209302,0,0,0,0,0,0,0,0,0,230694,0,53396
6,2018--BBO-WBV-B64-CV,1120224,185652,0,0,0,0,0,0,0,0,0,212700,0,52055


In [12]:
dim(fc_df)

In [25]:
# add fc_df$Assigned to counts df
reads <- merge(counts_woHT, fc_df, by = 'Sample') 

# remove extra rows from featureCounts df
reads <- reads[,1:8]

# change assigned column name
colnames(reads)[8] <- 'featureCounts_assigned'

head(reads)

Unnamed: 0_level_0,Sample,raw_reads,trim.galore,bowtie2_align,paired_reads,unpaired_reads,marked_dups,featureCounts_assigned
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,2018--BBB-WBO-B21-CV,60015908,9142520,1987316,2020021,14421,9850,1215661
2,2018--BBB-WBV-B70-CV,47021020,7920020,1702318,1733903,12637,9107,1059670
3,2018--BBO-BBO-B16-CV,55592344,6907492,1504682,1527667,11081,4699,853106
4,2018--BBO-BBY-B27-CV,36591468,4437592,963810,978519,6871,3552,569055
5,2018--BBO-WBO-B16-CV,59769484,9603844,2076074,2109689,14556,10334,1232808
6,2018--BBO-WBV-B64-CV,49958272,8379816,1812804,1840935,13313,8994,1120224


In [26]:
write.csv(reads, '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/processing/pipeline_counts/pipeline_counts.csv', row.names=FALSE)

## calculating total number of reads used in final analysis 

aka assigned column by featureCounts

In [24]:
sum(as.numeric(fc_df$Assigned))

mean(as.numeric(fc_df$Assigned))

## publishable data table

In [30]:
reads2 <- reads %>%
  select('Sample', 'raw_reads', 'trim.galore', 'bowtie2_align', 'paired_reads', 'featureCounts_assigned')
head(reads2)

reads2 %>%
  kbl(booktabs = TRUE, escape = FALSE) %>%
  kable_classic_2(full_width = FALSE) %>%
  row_spec(0, bold = TRUE) %>%
  save_kable(file = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/processing/pipeline_counts/pipeline_counts.html", self_contained = TRUE)

Unnamed: 0_level_0,Sample,raw_reads,trim.galore,bowtie2_align,paired_reads,featureCounts_assigned
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<chr>
1,2018--BBB-WBO-B21-CV,60015908,9142520,1987316,2020021,1215661
2,2018--BBB-WBV-B70-CV,47021020,7920020,1702318,1733903,1059670
3,2018--BBO-BBO-B16-CV,55592344,6907492,1504682,1527667,853106
4,2018--BBO-BBY-B27-CV,36591468,4437592,963810,978519,569055
5,2018--BBO-WBO-B16-CV,59769484,9603844,2076074,2109689,1232808
6,2018--BBO-WBV-B64-CV,49958272,8379816,1812804,1840935,1120224
