# Table of Contents
- [Processing Functions](#Process-functions)
- [Import Data](#import-data)
  - [Histograms](#Histograms)
  - [Counts files](#counts)
- [Processing](#processing)
- [Plotting](#plotting)
  - [HeLaS3 titration of H3K9ac](#LP78-HeLaS3-titration-of-H3K9ac)
  - [HCT116-Rpb1 degron treatment with TRP/IAA, Biological Replicate 1](#LP-88-HCT116-Rpb1-degron-Biological-Replicate-1)
  - [HCT116-Rpb1 degron treatment with TRP/IAA, Biological Replicate 2](#LP-91-HCT116-Rpb1-degron-Biological-Replicate-2)
  - [HCT116-Rpb1 degron treatment with TRP/IAA, Biological Replicates 3 and 4](#LP-95-HCT116-Rpb1-degron-Biological-Replicates-3-and-4)
- [Spike-in Normalize](#spike-in-normalize)
  - [dual norm with confidence intervals](#dual-norm-with-confidence-intervals)

---

In [None]:
library(tidyverse)
theme_set(theme_classic())
library(DescTools)
library(RColorBrewer)

# Processing Functions

## Processing HOMER Histograms

The functions: `process_histograms_dm6` `process_histograms_sac3` and `process_histograms_hg38` work with histogram files generated with HOMER annotatePeaks with -hist option. More details are available [on the HOMER tutorial website](http://homer.ucsd.edu/homer/ngs/quantification.html).

Making the histogram: 

`annotatePeaks.pl [region_to_center_on] [genome_version] -size [size_to_plot] -hist [bp_resolution] -d [tag directories] > histogram_regions_genome_datasource.txt`

Example for a TSS histogram in the hg38 genome: 

`annotatePeaks.pl tss hg38 -size 4000 -hist 25 -d sample1-tagdir sample2-tagdir input1-tagdir input2-tagdir > hist_tss_hg38_samples12.txt`

Input file: histogram_regions_genome_datasource.txt 

Output: dataframe in tidy (long) format, with additional columns describing the metadata originally in each sample name:
- Cell Type
- Treatment
- Timepoint
- Biological Replicate
- Antibody (usually I have multiple IP samples from each biological sample/replicate)
- Technical Replicate (separate library preps from the same biological sample/replicate)

In [None]:
# Shared implementation behind the process_histograms_*() wrappers.
#
# Cleans the column names of a HOMER `annotatePeaks.pl -hist` table, keeps the
# per-sample Coverage columns, reshapes them to long format and splits every
# sample name into its underscore-separated metadata fields.
#
# x               data frame read from a HOMER histogram file. Its first column
#                 is renamed to "Distance_from_tss".
# suffix_pattern  regular expression that is replaced by ".Coverage" in the
#                 column names (genome-/experiment-specific suffix).
# suffix_cols     literal handed to contains() to pick the columns on which
#                 `suffix_pattern` is replaced.
#
# Returns (visibly) a long-format table with columns Distance_from_tss, Sample,
# Coverage, cell, treatment, timepoint, biorep, antibody and techrep.
.process_histograms_common <- function(x, suffix_pattern, suffix_cols) {
  colnames(x)[1] <- "Distance_from_tss"

  # Underscore-delimited field located `n_after` fields before the end of each
  # sample name (0 = last field); NA when the name has too few fields.
  field_from_end <- function(sample_name, n_after) {
    str_match(sample_name, sprintf("([^_]+)(?:_[^_]+){%d}$", n_after))[, 2]
  }

  x %>%
    ### first two rename_with uses are specific to my sample names
    # get rid of suffix to samples, leaving only sample name.Coverage
    rename_with(
      function(nm) gsub(suffix_pattern, ".Coverage", nm),
      contains(suffix_cols)
    ) %>%
    # get rid of any prefix before cell type (ex: directory path)
    rename_with(function(nm) gsub(".+\\_HCT", "HCT", nm), contains("HCT")) %>%
    ### below two uses of rename_with are general to HOMER output
    rename_with(
      function(nm) gsub("\\.[[:digit:]]$", "_minus", nm),
      contains("Tags")
    ) %>%
    rename_with(function(nm) gsub("\\.\\.\\.", "_", nm), contains("Tags")) %>%
    # select coverage columns only (plus the position column)
    select(all_of("Distance_from_tss"), contains("Coverage")) %>%
    pivot_longer(
      cols = -"Distance_from_tss",
      names_to = "Sample",
      values_to = "Coverage"
    ) %>%
    # sample names are read right-to-left:
    # cell_treatment_timepoint_biorep_antibody_techrep
    mutate(
      cell = field_from_end(Sample, 5L),
      treatment = field_from_end(Sample, 4L),
      timepoint = str_remove(field_from_end(Sample, 3L), "TSA"),
      biorep = field_from_end(Sample, 2L),
      antibody = field_from_end(Sample, 1L),
      techrep = field_from_end(Sample, 0L)
    )
}

# Tidy a HOMER histogram computed on the fly (dm6) spike-in genome.
# `.x` is unused and kept only so existing calls keep working.
# NOTE(review): "\\.+\\.Coverage" only matches runs of literal dots directly
# before ".Coverage"; if the intent was to strip a ".<genome>.tagdir" suffix
# the pattern may need to be "\\..+\\.Coverage" - confirm against real headers.
process_histograms_dm6 <- function(x, .x) {
  .process_histograms_common(
    x,
    suffix_pattern = "\\.+\\.Coverage",
    suffix_cols = "tagdir"
  )
}

# Tidy a HOMER histogram computed on the yeast (sacCer3) spike-in genome.
# `.x` is unused and kept only so existing calls keep working.
# Renames that were already disabled in the original code, kept for reference:
#   rename_with(~ gsub("_H3K9ac", "_1_H3K9ac", .x), contains("Hela"))
#   rename_with(~ gsub("_input", "_1_input", .x), contains("sync"))
process_histograms_sac3 <- function(x, .x) {
  .process_histograms_common(
    x,
    suffix_pattern = ".concat.+Coverage",
    suffix_cols = "tagdir"
  )
}

# Tidy a HOMER histogram computed on the human (hg38) genome.
# `.x` is unused and kept only so existing calls keep working.
process_histograms_hg38 <- function(x, .x) {
  .process_histograms_common(
    x,
    suffix_pattern = ".concat.hg38.tagdir.Coverage",
    suffix_cols = "HCT"
  )
}

## Processing HOMER counts files

The function `process_counts_annotpeaks()` takes HOMER annotatePeaks files generated without the -hist option. Instead of generating average Coverage across all regions specified, the resulting annotated counts file is a matrix, with rows for each region. The first 19 columns are metadata describing each region, the remaining columns are the Read Normalized Tag Counts for each sample at each region.

Relevant Metadata Columns: 
1. **PeakID**
2. **Chr**
3. **Start**
4. **End**
5. Strand - strand that nearest annotated gene/TSS is located on, not necessarily strand of your data! Remember ChIP-seq/ATAC-seq is unstranded by definition. 
6. Not Used - ignore
7. Focus.Ratio.Region.Size
8. **Annotation** - if annotating on TSS regions, all will be promoter-TSS
9. Detailed Annotation
10. **Distance to TSS** - if annotating on TSS regions, all will be 0.

In [4]:
# Tidy the column names of a HOMER annotatePeaks counts table (no -hist).
#
# counts_annotpeaks  data frame read from the annotated counts file; its first
#                    column is renamed to "PeakID".
# .x                 unused; kept only so existing calls keep working.
#
# Returns (visibly) the same table with shortened sample column names. Values
# are untouched - only names change.
#
# The renames run in sequence and each selector is evaluated on the names left
# by the previous step, so their order matters. contains() ignores case by
# default, so contains("Tag") also selects names containing "tagdir".
process_counts_annotpeaks <- function(counts_annotpeaks, .x) {
  colnames(counts_annotpeaks)[1] <- "PeakID"

  counts_annotpeaks %>%
    # drop the genome/tag-directory suffix
    rename_with(~ gsub(".hg38.tagdir", "", .x), contains("tagdir")) %>%
    # collapse everything between ".dedup" and the last ".Tag"
    rename_with(~ gsub(".dedup.+.Tag", ".Tag", .x), contains("tagdir")) %>%
    rename_with(~ gsub("\\.concat", "", .x), contains("Tag")) %>%
    # strip any prefix (ex: directory path) before the cell type
    rename_with(~ gsub(".+HCT", "HCT", .x), contains("Tag")) %>%
    rename_with(~ gsub(".+Hela", "Hela", .x), contains("Tag")) %>%
    # a trailing ".<digit>" becomes "_minus"
    rename_with(~ gsub("\\.[[:digit:]]$", "_minus", .x), contains("Tag")) %>%
    # remove HOMER's ".Count..." description that follows ".Tag"
    rename_with(~ gsub("\\.Count.+", "", .x), contains("Tag"))
}

# import data

## Histograms

There are 3 types of histogram files: 
1. TSS histograms: useful for H3K27ac, H3K4me3, Rpb1, other active histone marks...
2. Peak histograms: calculated around HOMER-defined peak sets, can do for individual IP targets
3. RNA histograms: calculated around gene bodies, useful for H3K36me3, Rpb1

Note that gene bodies can also be grouped by activity (calculated here by untreated/control Rpb1 levels) or gene length

In [None]:
# Raw HOMER `annotatePeaks.pl -hist` tables, one data frame per
# experiment x genome x region set, read as tab-delimited text.
# LP78/LP88/LP91 paths are relative to the working directory; LP95 paths live
# under ~/data.
# NOTE(review): the Processing section also uses hist_*_LP81, hist_*_LP92,
# hist_over100kb_hg38_LP88, hist_shortrna_hg38_LP95 and
# hist_over100kb_hg38_LP95, none of which are read in this cell - confirm they
# are loaded elsewhere before running it.

# LP 78 - Mitotic H3K9ac titration
hist_tss_dm6_LP78 <- read.delim("LP_78/data/dm6_data/dm6_nodups_tagdirs/hist_tss_dm6_LP78.txt")
hist_tss_sac3_LP78 <- read.delim("LP_78/data/sac3_data/sac3_nodups_tagdirs/hist_tss_sac3_LP78.txt")
hist_tss_hg38_LP78 <- read.delim("LP_78/data/hg38_data/hg38_nodups_tagdirs/hist_tss_hg38_LP78_cleaned.txt")
hist_peaks_hg38_LP78 <- read.delim("LP_78/data/hg38_data/hg38_nodups_tagdirs/hist_mergedpeaks_hg38_LP78.txt")

# LP 88 - HCT116-Rpb1 degron biological replicate 1
# (fly = dm6, yeast = sac3, human = hg38; "rna" = gene-body histograms)
hist_rna_dm6_LP88 <- read.delim("LP_88_combined_fastqs/dm6_data/dm6_tagdirs/hist_rna_dm6_LP88.txt")
hist_tss_dm6_LP88 <- read.delim("LP_88_combined_fastqs/dm6_data/dm6_tagdirs/hist_tss_dm6_LP88.txt")
hist_K4me1_dm6_LP88 <- read.delim("LP_88_combined_fastqs/dm6_data/dm6_tagdirs/hist_H3K4me1peaks_dm6_LP88.txt")
hist_K36me3_dm6_LP88 <- read.delim("LP_88_combined_fastqs/dm6_data/dm6_tagdirs/hist_H3K36me3peaks_dm6_LP88.txt")
hist_rna_sac3_LP88 <- read.delim("LP_88_combined_fastqs/sac3_data/sac3_tagdirs/hist_rna_sac3_LP88.txt")
hist_tss_sac3_LP88 <- read.delim("LP_88_combined_fastqs/sac3_data/sac3_tagdirs/hist_tss_sac3_LP88.txt")
hist_tss_hg38_LP88 <- read.delim("LP_88_combined_fastqs/hg38_data/hg38_tagdirs/hist_tss_hg38_LP88.txt")
# NOTE(review): hist_K27ac_hg38_LP88 is not passed to any process_histograms_*
# call in the Processing section - confirm whether it is still needed.
hist_K27ac_hg38_LP88 <- read.delim("LP_88_combined_fastqs/hg38_data/hg38_tagdirs/hist_H3K27acpeaks_hg38_LP88.txt")

# LP 91 - HCT116-Rpb1 degron biological replicate 2, technical replicate 1/2
hist_rna_dm6_LP91 <- read.delim("LP_91/dm6_data/dm6_tagdirs/hist_rna_dm6_LP91.txt")
hist_tss_dm6_LP91 <- read.delim("LP_91/dm6_data/dm6_tagdirs/hist_tss_dm6_LP91.txt")
hist_K4me1_dm6_LP91 <- read.delim("LP_91/dm6_data/dm6_tagdirs/hist_H3K4me1peaks_dm6_LP91.txt")
hist_K36me3_dm6_LP91 <- read.delim("LP_91/dm6_data/dm6_tagdirs/hist_H3K36me3peaks_dm6_LP91.txt")
hist_rna_sac3_LP91 <- read.delim("LP_91/sac3_data/sac3_tagdirs/hist_rna_sac3_LP91.txt")
hist_tss_sac3_LP91 <- read.delim("LP_91/sac3_data/sac3_tagdirs/hist_tss_sac3_LP91.txt")

# "shortrna" is read from the 5-20 kb histogram file
hist_shortrna_hg38_LP91 <- read.delim("LP_91/hg38_data/hg38_tagdirs/hist_5to20kb_hg38_LP91.txt")
hist_tss_hg38_LP91 <- read.delim("LP_91/hg38_data/hg38_tagdirs/hist_tss_hg38_LP91.txt")
# NOTE(review): hist_rna_hg38_LP91 is not processed in the Processing section.
hist_rna_hg38_LP91 <- read.delim("LP_91/hg38_data/hg38_tagdirs/hist_rna_hg38_LP91.txt")

# LP 95 - HCT116-Rpb1 degron biological replicates 3 and 4
hist_rna_dm6_LP95 <- read.delim("~/data/250627_nextseq2000_LP95/dm6_data/dm6_tagdirs/hist_rna_dm6_LP95.txt")
hist_tss_dm6_LP95 <- read.delim("~/data/250627_nextseq2000_LP95/dm6_data/dm6_tagdirs/hist_tss_dm6_LP95.txt")
hist_rna_sac3_LP95 <- read.delim("~/data/250627_nextseq2000_LP95/sac3_data/sac3_tagdirs/hist_rna_sac3_LP95.txt")
hist_tss_sac3_LP95 <- read.delim("~/data/250627_nextseq2000_LP95/sac3_data/sac3_tagdirs/hist_tss_sac3_LP95.txt")

## Counts

In [None]:
# Annotated HOMER counts tables (one row per region, one tag-count column per
# sample), read as tab-delimited text from absolute and relative paths.

# LP78: 1 kb chr19 windows; the file name indicates read-normalized vs
# spike-normalized counts.
counts_chr19_hg38_LP78 <- read.delim("~/data/experiments/LP_78/data/hg38_data/counts_chr19_1kb_hg38_LP78_readnormvsspike.txt")

# NOTE(review): the LP91 object is read from a file named *_LP88_LP91.txt -
# presumably it holds both experiments; confirm.
counts_tss_hg38_LP91 <- read.delim("LP_91/hg38_data/hg38_tagdirs/counts_tss_hg38_LP88_LP91.txt")
counts_tss_hg38_LP95 <- read.delim("~/data/250627_nextseq2000_LP95/hg38_data/hg38_tagdirs/counts_tss_hg38_LP95.txt")
counts_allHCT_4hrTRP_H3K27ac <- read.delim("~/data/dualspike_natgen_resubmit1/counts_allHCT_bioreps_4hrTRP_DMSO_H3K27ac_readnorm_alldata.txt")

counts_rna_hg38_spikenorm <- read.delim("~/data/dualspike_natgen_resubmit1/counts_rna_hg38_Rpb1_H3K36me3_spikenorm.txt")

counts_Hela_tss <- read.delim("~/data/experiments/LP_81_combined_fastqs/hg38_data/hg38_normalized_tagdirs/counts_alltss_hg38_Hela_spikenorm.txt")
counts_Hela <- read.delim("~/data/dualspike_natgen_resubmit1/counts_allHela_H3K27ac_hg38_LP100_LP81_spikenorm.txt")

# Processing

In [None]:
# Convert every raw histogram table to tidy long format; each result is stored
# as <input name>_long.
# NOTE(review): from LP81 onward the sac3 and hg38 TSS tables are passed to
# process_histograms_dm6() rather than the sac3/hg38 variants - presumably
# because those column names lack the ".concat" suffix; confirm.
# NOTE(review): hist_*_LP81, hist_*_LP92, hist_over100kb_hg38_LP88,
# hist_shortrna_hg38_LP95 and hist_over100kb_hg38_LP95 are not read in the
# import cell shown in this notebook; those calls fail with "object not found"
# unless the tables are loaded elsewhere.

# LP78
hist_tss_dm6_LP78_long <- process_histograms_dm6(hist_tss_dm6_LP78)
hist_tss_sac3_LP78_long <- process_histograms_sac3(hist_tss_sac3_LP78)
hist_tss_hg38_LP78_long <- process_histograms_hg38(hist_tss_hg38_LP78)
hist_peaks_hg38_LP78_long <- process_histograms_hg38(hist_peaks_hg38_LP78)

# LP81
hist_tss_dm6_LP81_long <- process_histograms_dm6(hist_tss_dm6_LP81)
hist_tss_sac3_LP81_long <- process_histograms_dm6(hist_tss_sac3_LP81)
hist_tss_hg38_LP81_long <- process_histograms_dm6(hist_tss_hg38_LP81)
hist_K4me1_dm6_LP81_long <- process_histograms_dm6(hist_K4me1_dm6_LP81)

# LP88
hist_rna_dm6_LP88_long <- process_histograms_dm6(hist_rna_dm6_LP88)
hist_tss_dm6_LP88_long <- process_histograms_dm6(hist_tss_dm6_LP88)
hist_K36me3_dm6_LP88_long <- process_histograms_dm6(hist_K36me3_dm6_LP88)
hist_K4me1_dm6_LP88_long <- process_histograms_dm6(hist_K4me1_dm6_LP88)
hist_rna_sac3_LP88_long <- process_histograms_dm6(hist_rna_sac3_LP88)
hist_tss_sac3_LP88_long <- process_histograms_dm6(hist_tss_sac3_LP88)
hist_tss_hg38_LP88_long <- process_histograms_dm6(hist_tss_hg38_LP88)
hist_over100kb_hg38_LP88_long <- process_histograms_hg38(hist_over100kb_hg38_LP88)

# LP91
hist_rna_dm6_LP91_long <- process_histograms_dm6(hist_rna_dm6_LP91)
hist_tss_dm6_LP91_long <- process_histograms_dm6(hist_tss_dm6_LP91)
hist_K36me3_dm6_LP91_long <- process_histograms_dm6(hist_K36me3_dm6_LP91)
hist_K4me1_dm6_LP91_long <- process_histograms_dm6(hist_K4me1_dm6_LP91)
hist_rna_sac3_LP91_long <- process_histograms_dm6(hist_rna_sac3_LP91)
hist_tss_sac3_LP91_long <- process_histograms_dm6(hist_tss_sac3_LP91)

hist_shortrna_hg38_LP91_long <- process_histograms_hg38(hist_shortrna_hg38_LP91)
hist_tss_hg38_LP91_long <- process_histograms_hg38(hist_tss_hg38_LP91)

# LP92
hist_tss_dm6_LP92_long <- process_histograms_dm6(hist_tss_dm6_LP92)
hist_tss_sac3_LP92_long <- process_histograms_dm6(hist_tss_sac3_LP92)

# LP95
hist_rna_dm6_LP95_long <- process_histograms_dm6(hist_rna_dm6_LP95)
hist_tss_dm6_LP95_long <- process_histograms_dm6(hist_tss_dm6_LP95)
hist_rna_sac3_LP95_long <- process_histograms_dm6(hist_rna_sac3_LP95)
hist_tss_sac3_LP95_long <- process_histograms_dm6(hist_tss_sac3_LP95)

hist_shortrna_hg38_LP95_long <- process_histograms_hg38(hist_shortrna_hg38_LP95)
hist_over100kb_hg38_LP95_long <- process_histograms_hg38(hist_over100kb_hg38_LP95)

In [None]:
# Shorten the sample column names of every counts table.
# Each table is overwritten in place, so the raw column names are no longer
# available after this cell has run.
counts_chr19_hg38_LP78 <- process_counts_annotpeaks(counts_chr19_hg38_LP78)
counts_tss_hg38_LP91 <- process_counts_annotpeaks(counts_tss_hg38_LP91)
counts_tss_hg38_LP95 <- process_counts_annotpeaks(counts_tss_hg38_LP95)

counts_allHCT_4hrTRP_H3K27ac <- process_counts_annotpeaks(counts_allHCT_4hrTRP_H3K27ac)

counts_Hela <- process_counts_annotpeaks(counts_Hela)

counts_rna_hg38_spikenorm <- process_counts_annotpeaks(counts_rna_hg38_spikenorm)

counts_Hela_tss <- process_counts_annotpeaks(counts_Hela_tss)

In [None]:
# Reduce each Annotation to its leading label: drop everything from the first
# "(" onward, then remove every space.
counts_rna_hg38_spikenorm$Annotation <- gsub(
  " ", "",
  gsub("\\(.*", "", as.character(counts_rna_hg38_spikenorm$Annotation))
)

# Figure 1

In [None]:
# Fly (dm6) spike-in coverage around TSSs for the LP78 HeLa titration, one
# line per sample, coloured by the numeric part of `timepoint` (the "inter"
# suffix is stripped) and faceted by antibody.
LP78_hist_tss_dm6 <- hist_tss_dm6_LP78_long %>%
  arrange(as.factor(as.numeric(str_remove(timepoint, "inter")))) %>%
  filter(grepl("Hela", Sample)) %>%
  ggplot(aes(
    x = Distance_from_tss, y = Coverage, group = Sample,
    color = as.factor(as.numeric(str_remove(timepoint, "inter")))
  )) +
  scale_color_manual(
    values = colorRampPalette(brewer.pal(9, "Greys"))(7)[2:7],
    name = "% Human Interphase Cells"
  ) +
  geom_line(linewidth = 1.1, alpha = 0.8) +
  facet_wrap(~ antibody) +
  labs(title = "Fly signal at TSSs", x = "Distance from TSS Center")

# To write the figure to an SVG file, uncomment BOTH the svg() line and the
# dev.off() line below. They must stay paired: dev.off() without a device
# opened here either fails on the null device or closes a device this cell
# does not own.
# Open SVG device (base R)
#svg("LP78_hist_tss_dm6.svg", width = 16, height = 7)

# Print the ggplot
print(LP78_hist_tss_dm6)

# Close the device to save the file
#dev.off()

In [None]:
# Preview the yeast TSS table once sorted by the numeric part of `timepoint`.
head(
  arrange(
    hist_tss_sac3_LP78_long,
    as.factor(as.numeric(str_remove(timepoint, "inter")))
  )
)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 7)

# Yeast (sacCer3) spike-in coverage around TSSs for the LP78 HeLa titration,
# one line per sample, coloured by the numeric part of `timepoint` and faceted
# by antibody with free axes.
LP78_hist_tss_sac3 <- hist_tss_sac3_LP78_long %>%
  arrange(as.factor(as.numeric(str_remove(timepoint, "inter")))) %>%
  filter(grepl("Hela", Sample)) %>%
  ggplot(aes(
    x = Distance_from_tss, y = Coverage, group = Sample,
    color = as.factor(as.numeric(str_remove(timepoint, "inter")))
  )) +
  scale_color_manual(
    values = colorRampPalette(brewer.pal(9, "Reds"))(7)[2:7],
    name = "% Human Interphase Cells"
  ) +
  geom_line(linewidth = 1.1, alpha = 0.6) +
  facet_wrap(~ antibody, scales = "free") +
  labs(title = "Yeast signal at TSSs", x = "Distance from TSS Center")

# To write the figure to an SVG file, uncomment BOTH the svg() line and the
# dev.off() line below. They must stay paired: dev.off() without a device
# opened here either fails on the null device or closes a device this cell
# does not own.
# Open SVG device (base R)
#svg("LP78_hist_tss_sac3.svg", width = 16, height = 7)

# Print the ggplot
print(LP78_hist_tss_sac3)

# Close the device to save the file
#dev.off()

In [None]:
# Human (hg38) coverage around TSSs for the LP78 HeLa titration, one line per
# sample, coloured by the numeric part of `timepoint` and faceted by antibody.
hist_tss_hg38_LP78_long %>%
  arrange(as.factor(as.numeric(str_remove(timepoint, "inter")))) %>%
  filter(grepl("Hela", Sample)) %>%
  ggplot(aes(
    x = Distance_from_tss,
    y = Coverage,
    group = Sample,
    color = as.factor(as.numeric(str_remove(timepoint, "inter")))
  )) +
  geom_line(linewidth = 1.1, alpha = 0.8) +
  scale_color_manual(
    name = "% Human Interphase Cells",
    values = colorRampPalette(brewer.pal(9, "Blues"))(7)[2:7]
  ) +
  facet_wrap(~ antibody) +
  labs(title = "Human signal at TSSs", x = "Distance from TSS Center")

### LP78 H3K9ac titration scatterplots

In [None]:
# Mean tag count per peak for every sample group under each normalization,
# returned wide: one "<SampleGroup>.<Normalization>" column per combination.
counts_chr19_hg38_LP78_avg <- counts_chr19_hg38_LP78 %>%
  # long format: one row per peak x tag column
  pivot_longer(
    cols = ends_with(".Tag"),
    names_to = "SampleColumn",
    values_to = "TagValue"
  ) %>%
  mutate(
    # normalization type is encoded in the column-name suffix
    Normalization = case_when(
      str_detect(SampleColumn, "\\.normalized\\.tagdir\\.Tag$") ~ "spike_in",
      str_detect(SampleColumn, "\\.hg38\\.GCcheck\\.tagdir\\.Tag$") ~ "read_depth",
      TRUE ~ NA_character_
    ),
    # sample name = column name without either normalization suffix
    Sample = str_remove(
      str_remove(SampleColumn, "\\.normalized\\.tagdir\\.Tag$"),
      "\\.hg38\\.GCcheck\\.tagdir\\.Tag$"
    ),
    # trailing "_<digits>" is the replicate; the remainder is the group
    Replicate = str_extract(Sample, "_\\d+$"),
    SampleGroup = str_remove(Sample, "_\\d+$")
  ) %>%
  select(-SampleColumn) %>%
  # average replicates within peak x SampleGroup x Normalization, carrying
  # the peak annotation columns through as grouping keys
  group_by(across(all_of(c(
    "PeakID", "Chr", "Start", "End", "Strand", "Peak.Score",
    "Focus.Ratio.Region.Size", "Annotation", "Detailed.Annotation",
    "Distance.to.TSS", "Nearest.PromoterID", "Entrez.ID",
    "Nearest.Unigene", "Nearest.Refseq", "Nearest.Ensembl",
    "Gene.Name", "Gene.Alias", "Gene.Description", "Gene.Type",
    "SampleGroup", "Normalization"
  )))) %>%
  summarise(MeanTag = mean(TagValue, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = c(SampleGroup, Normalization),
    names_glue = "{SampleGroup}.{Normalization}",
    values_from = MeanTag
  )

In [None]:
options(repr.plot.width = 5.7, repr.plot.height = 5)

# Per-region H3K9ac signal: 0% interphase sample (x) against 100% interphase
# sample (y) on log-log axes. Grey points are read-depth normalized, black
# points are spike-in normalized; geom_abline() draws the default y = x line.
ggplot(counts_chr19_hg38_LP78_avg) +
  geom_abline() +
  geom_point(
    aes(x = HelaS3_100sync_0inter_1_H3K9ac.read_depth,
        y = HelaS3_0sync_100inter_1_H3K9ac.read_depth),
    alpha = 0.05, color = "grey"
  ) +
  geom_point(
    aes(x = HelaS3_100sync_0inter_1_H3K9ac.spike_in,
        y = HelaS3_0sync_100inter_1_H3K9ac.spike_in),
    alpha = 0.05, color = "black"
  ) +
  #geom_smooth(aes(x = HelaS3_100sync_0inter_1_H3K9ac.spike_in,  y = HelaS3_25sync_75inter_1_H3K9ac.spike_in),
  #    method = "lm", se = FALSE, color = "#93c68c", size = 1.2) +
  scale_x_log10() +
  scale_y_log10() +
  coord_cartesian(xlim = c(0.2, 650), ylim = c(0.2, 650)) +
  theme(axis.text = element_text(size = 20),
        axis.title = element_text(size = 20)) +
  # y label now names the quantity the same way the x label does
  labs(x = "Avg 0% Interphase Signal", y = "Avg 100% Interphase Signal")

# Figure 3

### *H. sapiens* data

We quantify signal at regions biologically relevant for each antibody:
- H3K27ac: at TSSs or H3K27ac peaks
- H3K4me3: at TSSs or H3K4me3 peaks
- H3K4me1: at H3K4me1 peaks
- H3K36me3: at gene bodies or H3K36me3 peaks
- Rpb1: at TSSs or Rpb1 peaks

In [None]:
options(repr.plot.width = 14, repr.plot.height = 9)

# Human coverage around TSSs for LP88: one line per sample, coloured by
# timepoint, one free-scaled panel per antibody.
hist_tss_hg38_LP88_long %>%
  ggplot(aes(
    x = Distance_from_tss,
    y = Coverage,
    group = Sample,
    color = timepoint
  )) +
  geom_line(linewidth = 1.1, alpha = 0.8) +
  scale_color_manual(
    name = "Timepoint",
    values = c("firebrick", "slateblue", "goldenrod")
  ) +
  facet_wrap(~ antibody, scales = "free") +
  labs(title = "Human signal at TSS", x = "Distance from TSS Center")

# Calculate spike-in SNRs

In [None]:
# Trapezoid area under each sample's Coverage curve inside a position window,
# for histograms whose first column is a fractional position (defaults cover
# 0.04 to 1).
#
# input_df   long-format histogram table. The first column holds the bin
#            position; `Sample` and `Coverage` columns are required.
# Sample, Coverage, colname1
#            kept for backward compatibility only. They are never evaluated;
#            the `Sample` and `Coverage` columns are always used.
# lower, upper
#            inclusive bounds of the window on the bin position.
#
# Returns a data frame with one row per unique sample, in order of first
# appearance: `AUC_peaks` (numeric area) and `ID` (sample name). A sample with
# no rows inside the window gets NA.
histograms_signalarea_calculate_sizegiven <- function(input_df, Sample, Coverage, colname1,
                                                      lower = 0.04, upper = 1) {
  # get a list of unique sample names
  samples <- unique(input_df$Sample)

  # make sure first column is named properly
  colnames(input_df)[1] <- "Distance_from_center"

  # keep only the bins inside [lower, upper]
  position <- input_df$Distance_from_center
  in_window <- !is.na(position) & position >= lower & position <= upper
  input_df <- input_df[in_window, , drop = FALSE]

  # x and y are taken from the same rows of each sample, so the pairing does
  # not depend on every sample sharing identical bins in identical order, and
  # the areas stay numeric throughout (no round trip through character).
  auc_values <- vapply(
    as.character(samples),
    function(sample_id) {
      rows <- input_df[which(input_df$Sample == sample_id), , drop = FALSE]
      if (nrow(rows) == 0) {
        return(NA_real_)
      }
      DescTools::AUC(
        rows$Distance_from_center,
        rows$Coverage,
        method = "trapezoid"
      )
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  data.frame(AUC_peaks = auc_values, ID = samples, stringsAsFactors = FALSE)
}

In [None]:
# Trapezoid area under each sample's Coverage curve inside a window around the
# region centre (defaults cover -200 to 700).
#
# input_df   long-format histogram table. The first column holds the bin
#            position; `Sample` and `Coverage` columns are required.
# Sample, Coverage, colname1
#            kept for backward compatibility only. They are never evaluated;
#            the `Sample` and `Coverage` columns are always used.
# lower, upper
#            inclusive bounds of the window on the bin position.
#
# Returns a data frame with one row per unique sample, in order of first
# appearance: `AUC_peaks` (numeric area) and `ID` (sample name). A sample with
# no rows inside the window gets NA.
histograms_signalarea_calculatetss <- function(input_df, Sample, Coverage, colname1,
                                               lower = -200, upper = 700) {
  # get a list of unique sample names
  samples <- unique(input_df$Sample)

  # make sure first column is named properly
  colnames(input_df)[1] <- "Distance_from_center"

  # keep only the bins inside [lower, upper]
  position <- input_df$Distance_from_center
  in_window <- !is.na(position) & position >= lower & position <= upper
  input_df <- input_df[in_window, , drop = FALSE]

  # x and y are taken from the same rows of each sample, so the pairing does
  # not depend on every sample sharing identical bins in identical order, and
  # the areas stay numeric throughout (no round trip through character).
  auc_values <- vapply(
    as.character(samples),
    function(sample_id) {
      rows <- input_df[which(input_df$Sample == sample_id), , drop = FALSE]
      if (nrow(rows) == 0) {
        return(NA_real_)
      }
      DescTools::AUC(
        rows$Distance_from_center,
        rows$Coverage,
        method = "trapezoid"
      )
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  data.frame(AUC_peaks = auc_values, ID = samples, stringsAsFactors = FALSE)
}

In [None]:
# Spike-in signal area per sample for every experiment.
# TSS-centred histograms go through histograms_signalarea_calculatetss();
# fly gene-body ("rna") and peak histograms go through
# histograms_signalarea_calculate_sizegiven().
# `colname1 = colname1` passes an unevaluated name: both functions only capture
# it as a symbol, so no object called `colname1` has to exist.
# NOTE(review): the yeast "rna" histograms are sent through the TSS variant
# while the fly "rna" histograms use the size-given variant - confirm this is
# intended.
# NOTE(review): the *_LP81_long and *_LP92_long inputs depend on tables that
# are not read in the import cell shown in this notebook.
fly_tss_signal_LP78 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP78_long, colname1 = colname1)
yeast_tss_signal_LP78 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP78_long, colname1 = colname1)

fly_tss_signal_LP81 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP81_long, colname1 = colname1)
yeast_tss_signal_LP81 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP81_long, colname1 = colname1)
fly_K4me1_signal_LP81 <- histograms_signalarea_calculatetss(hist_K4me1_dm6_LP81_long, colname1 = colname1)

fly_rna_signal_LP88 <- histograms_signalarea_calculate_sizegiven(hist_rna_dm6_LP88_long, colname1 = colname1)
fly_K36me3_signal_LP88 <- histograms_signalarea_calculate_sizegiven(hist_K36me3_dm6_LP88_long, colname1 = colname1)
fly_K4me1_signal_LP88 <- histograms_signalarea_calculate_sizegiven(hist_K4me1_dm6_LP88_long, colname1 = colname1)

fly_tss_signal_LP88 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP88_long, colname1 = colname1)
yeast_rna_signal_LP88 <- histograms_signalarea_calculatetss(hist_rna_sac3_LP88_long, colname1 = colname1)
yeast_tss_signal_LP88 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP88_long, colname1 = colname1)
#yeast_K4me1_signal_LP88 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP88_long, colname1 = colname1)

fly_rna_signal_LP91 <- histograms_signalarea_calculate_sizegiven(hist_rna_dm6_LP91_long, colname1 = colname1)
fly_tss_signal_LP91 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP91_long, colname1 = colname1)
fly_K36me3_signal_LP91 <- histograms_signalarea_calculate_sizegiven(hist_K36me3_dm6_LP91_long, colname1 = colname1)
fly_K4me1_signal_LP91 <- histograms_signalarea_calculate_sizegiven(hist_K4me1_dm6_LP91_long, colname1 = colname1)
yeast_rna_signal_LP91 <- histograms_signalarea_calculatetss(hist_rna_sac3_LP91_long, colname1 = colname1)
yeast_tss_signal_LP91 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP91_long, colname1 = colname1)

fly_tss_signal_LP92 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP92_long, colname1 = colname1)
yeast_tss_signal_LP92 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP92_long, colname1 = colname1)

fly_rna_signal_LP95 <- histograms_signalarea_calculate_sizegiven(hist_rna_dm6_LP95_long, colname1 = colname1)
fly_tss_signal_LP95 <- histograms_signalarea_calculatetss(hist_tss_dm6_LP95_long, colname1 = colname1)
yeast_rna_signal_LP95 <- histograms_signalarea_calculatetss(hist_rna_sac3_LP95_long, colname1 = colname1)
yeast_tss_signal_LP95 <- histograms_signalarea_calculatetss(hist_tss_sac3_LP95_long, colname1 = colname1)

In [None]:
# First 30 HeLa rows of the LP81 fly H3K4me1 signal table.
fly_K4me1_signal_LP81 %>%
  filter(grepl("Hela", ID)) %>%
  head(n = 30)

In [None]:
# Copy of fly_tss_signal with metadata fields parsed out of the sample ID,
# counted from the end of the underscore-separated name.
fly_tss_signal_plot <- fly_tss_signal %>%
  mutate(
    antibody = str_match(ID, '([^_]+)(?:_[^_]+){1}$')[, 2],
    timepoint = str_match(ID, '([^_]+)(?:_[^_]+){3}$')[, 2],
    treatment = str_match(ID, '([^_]+)(?:_[^_]+){4}$')[, 2],
    biorep = str_match(ID, '([^_]+)(?:_[^_]+){2}$')[, 2]
  )
head(fly_tss_signal_plot)

In [None]:
# For each signal table: remove ".Coverage" from the sample IDs and give the
# first (area) column a name that identifies its source.
fly_peak_K27ac_signal$ID <- str_remove_all(as.character(fly_peak_K27ac_signal$ID), "\\.Coverage")
names(fly_peak_K27ac_signal)[1] <- "fly_K27ac_signal"

fly_peak_K4me3_signal$ID <- str_remove_all(as.character(fly_peak_K4me3_signal$ID), "\\.Coverage")
names(fly_peak_K4me3_signal)[1] <- "fly_K4me3_signal"

fly_peak_K36me3_signal$ID <- str_remove_all(as.character(fly_peak_K36me3_signal$ID), "\\.Coverage")
names(fly_peak_K36me3_signal)[1] <- "fly_K36me3_signal"

fly_tss_signal$ID <- str_remove_all(as.character(fly_tss_signal$ID), "\\.Coverage")
names(fly_tss_signal)[1] <- "fly_tss_signal"

yeast_tss_signal$ID <- str_remove_all(as.character(yeast_tss_signal$ID), "\\.Coverage")
names(yeast_tss_signal)[1] <- "yeast_tss_signal"
#head(fly_tss_IPs_signal)

## spike-in normalized plots

In [None]:
# Put the two spike-in normalizations side by side, then derive
#   confidence = |flynorm - yeastnorm|
#   dualnorm   = mean of flynorm and yeastnorm
genecov_all_norm_sep_method_LP88 <- genecov_all_norm_LP88 %>%
  pivot_wider(names_from = method, values_from = Coverage) %>%
  mutate(
    confidence = abs(flynorm - yeastnorm),
    dualnorm = (flynorm + yeastnorm) / 2
  )

# Back to long format: one row per spike-in method, keeping the derived columns.
genecov_all_norm_sep_method_LP88_long <- pivot_longer(
  genecov_all_norm_sep_method_LP88,
  cols = c("flynorm", "yeastnorm"),
  names_to = "spike.in",
  values_to = "Normalized.Coverage"
)

In [None]:
# Line plot of `dualnorm` against `Distance_from_tss`, one line per Sample,
# coloured by the numeric part of `timepoint` on a Blues ramp.
#
# input_df     long-format table with Distance_from_tss, dualnorm, Sample and
#              timepoint columns. Columns 3, 5, 6 and 8 are additionally read
#              by position for the title, legend and y label.
#              NOTE(review): assumes column 3 is the cell type, column 5 the
#              condition (same values as `timepoint`) and column 6 the
#              antibody - confirm against the tables actually passed in.
# brewer_cols  number of colours requested from the "Blues" palette (coerced
#              with as.numeric()).
#
# Returns a ggplot object.
plot_histograms_stdcurve <- function(input_df, brewer_cols) {
  xvar <- colnames(input_df)[1]
  condition_num <- length(unique(input_df[[5]]))
  brewer_pallet <- as.numeric(brewer_cols)

  # One string each for the title; pasting whole columns would produce one
  # title element per row.
  antibody_name <- paste(unique(input_df[[6]]), collapse = ", ")
  cell_name <- paste(unique(input_df[[3]]), collapse = ", ")

  # Legend labels sorted ascending so they line up with the factor levels of
  # the colour aesthetic, which are sorted as well (NAs kept, placed last).
  condition_labels <- as.character(sort(
    as.numeric(stringr::str_remove(unique(input_df[[5]]), "inter")),
    na.last = TRUE
  ))

  input_df |>
    ggplot2::ggplot(ggplot2::aes(
      x = Distance_from_tss, y = dualnorm, group = Sample,
      color = as.factor(as.numeric(stringr::str_remove(timepoint, "inter")))
    )) +
    ggplot2::geom_line(alpha = 0.9, linewidth = 1.1) +
    ggplot2::labs(title = paste("Histogram of", antibody_name, "in", cell_name),
                  x = xvar,
                  y = colnames(input_df[8])) +
    ggplot2::scale_color_manual(
      # skip the three palest shades of the interpolated ramp
      values = colorRampPalette(
        RColorBrewer::brewer.pal(brewer_pallet, "Blues"))(condition_num + 4)[(4):(condition_num + 4)],
      name = "Condition",
      labels = condition_labels) +
    ggplot2::theme_classic() +
    ggplot2::theme(legend.position = c(0.84, 0.76),
                   legend.background = ggplot2::element_rect(
                     linewidth = 0.7, linetype = "solid",
                     colour = "grey20"))
}

In [None]:
# Dual spike-in normalized histogram for one antibody and biological replicate.
#
# Keeps the rows for `antibody_select` and `biorep_select` whose treatment is
# "DMSO" or `treatment_select`, draws one line per spike-in normalization
# (linetype) and a ribbon of dualnorm +/- confidence / 2, coloured by
# timepoint. `regions` is only used in the title and x-axis label.
# Returns a ggplot object.
plot_histograms_dualnorm_natgen <- function(input_df, treatment_select, biorep_select, antibody_select, regions) {
  timepoint_colours <- c("firebrick", "slateblue", "goldenrod2")

  plot_data <- input_df %>%
    filter(
      antibody == antibody_select,
      biorep == biorep_select,
      treatment %in% c("DMSO", treatment_select)
    )

  ggplot(
    plot_data,
    aes(
      x = Distance_from_tss,
      group = interaction(treatment, timepoint, biorep, techrep, spike.in),
      color = timepoint
    )
  ) +
    geom_line(aes(y = .data[["Normalized.Coverage"]], linetype = spike.in)) +
    geom_ribbon(
      aes(
        ymin = dualnorm - confidence * 0.5,
        ymax = dualnorm + confidence * 0.5,
        fill = timepoint
      ),
      alpha = 0.4, color = NA
    ) +
    scale_color_manual(values = timepoint_colours) +
    scale_fill_manual(values = timepoint_colours) +
    scale_linetype_manual(
      name = "spike-in species",
      values = c("dotted", "dashed"),
      labels = c("fly normalized", "yeast normalized")
    ) +
    labs(
      title = paste("Dual spike-in normalized", antibody_select, "around", regions, treatment_select),
      subtitle = paste("Biological replicate", biorep_select),
      x = paste("Distance from", regions, "center"),
      y = "Normalized Coverage"
    ) +
    theme(legend.position = c(0.86, 0.75))
}

In [None]:
# Scatter plot of each sample's signal area (trapezoid area under its
# `dualnorm` curve) against its condition.
#
# input_df   long-format table. The first column holds the bin position;
#            `Sample`, `dualnorm` and `timepoint` columns are required.
#            Column 3 is read by position for the default title
#            (NOTE(review): assumed to be the cell type - confirm).
# Sample, dualnorm, colname1
#            kept for backward compatibility only. They are never evaluated;
#            the `Sample` and `dualnorm` columns are always used.
#
# Side effect: prints the per-sample condition vector as a QC check.
# Returns a ggplot object; the title can be overridden by adding labs().
histograms_signalarea_stdcurve <- function(input_df, Sample, dualnorm, colname1) {
  # get a list of unique sample names
  samples <- unique(input_df$Sample)

  # make sure first column is named properly
  colnames(input_df)[1] <- "Distance_from_center"

  # x and y come from the same rows of each sample, so the pairing holds even
  # if samples do not share identical bins, and areas stay numeric throughout.
  auc_values <- vapply(
    as.character(samples),
    function(sample_id) {
      rows <- input_df[which(input_df$Sample == sample_id), , drop = FALSE]
      if (nrow(rows) == 0) {
        return(NA_real_)
      }
      DescTools::AUC(
        rows$Distance_from_center,
        rows$dualnorm,
        method = "trapezoid"
      )
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  # Condition of each sample = its timepoint without the "inter" suffix, read
  # from the sample's first row so it is aligned with `samples` and does not
  # require a bin at exactly -2000.
  first_row <- match(samples, input_df$Sample)
  treatment_vect <- str_remove(input_df$timepoint[first_row], "inter")
  # QC the treatment_vect
  print(treatment_vect)

  AUC_peaks_df <- data.frame(
    AUC_peaks = auc_values,
    condition = as.numeric(treatment_vect)
  )

  # graph labels in title custom
  cell_type <- paste(unique(input_df[[3]]), collapse = ", ")
  AUC_name <- colnames(AUC_peaks_df)[1]

  # six colours as before; more only when there are more than six conditions
  n_colours <- max(6, length(unique(AUC_peaks_df$condition)))

  # plot
  ggplot2::ggplot(
    AUC_peaks_df,
    ggplot2::aes(x = condition, color = as.factor(condition), y = AUC_peaks)
  ) +
    ggplot2::geom_point(size = 5, alpha = 0.7) +
    ggplot2::scale_color_manual(
      values = colorRampPalette(RColorBrewer::brewer.pal(9, "Spectral"))(n_colours),
      name = "Condition") +
    ggplot2::theme_classic() +
    ggplot2::labs(title = paste(as.character(AUC_name), "in", as.character(cell_type), "cells"),
                  x = "Condition",
                  y = "Signal Area")
}

In [None]:
options(repr.plot.width = 9, repr.plot.height = 7)

# Signal-area plot with the default title replaced.
histograms_signalarea_stdcurve(genecov_all_norm_sep_method, colname1 = colname1) +
  ggtitle("K9ac around Peaks", subtitle = "Read normalized")