In [None]:
load("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/scripts/pks/Nissle_processed.RData")
library(tidyverse)
library(reshape2)
library(magrittr)
library(lemon)
library(BSgenome)
library(MutationalPatterns)
library(plyr)
library(data.table)
library(vroom)
library(ggpubr)
library(cowplot)
library(patchwork)
library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(sjPlot)
library(org.Hs.eg.db)
library(GenomicFeatures)
library(AnnotationDbi)
library(org.Hs.eg.db)

# Name of the BSgenome reference package, passed to the MutationalPatterns calls below.
ref_genome <- "BSgenome.Hsapiens.UCSC.hg38"
# Gene ranges from the knownGene TxDb; the variable shares its name with the genes() accessor.
genes <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
# Restore the objects stored in vcfs.RData into the workspace.
load(file = "vcfs.RData")

- Coding (snv, indel fractions, signature refit)
- Driver genes (snv, indel fractions, signature refit)

Get coding SNVs (a gene-level annotation function is also defined below, but it is set aside for now).

In [None]:
# Transcript database used for every coding-sequence lookup in this notebook.
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Coding sequences grouped per gene (one list element per gene).
cds <- cdsBy(x = txdb, by = "gene")

In [None]:
### Genes aren't specific enough here so we will use coding sequences
#' Annotate single-position variants with an overlapping gene symbol.
#'
#' @param df Data frame with columns `chr`, `position` and `strand`
#'   (one row per variant).
#' @param genes Ranges object whose metadata has a `gene_id` column; the IDs
#'   are looked up as ENTREZ IDs.
#' @return `df` with an added character column `gene_symbol`; rows without an
#'   overlap are `NA`. When a row overlaps several genes, the symbol of the
#'   last reported hit is kept.
annotate_with_genes <- function(df, genes) {

  gr <- GRanges(
    seqnames = df$chr,
    ranges = IRanges(start = df$position, end = df$position),
    strand = df$strand
  )

  # No ignore.strand argument is passed, so the findOverlaps() default applies.
  hits <- findOverlaps(gr, genes)

  # initialize column
  df$gene_symbol <- NA_character_

  # mapIds() refuses an empty key vector, so stop here when nothing overlaps
  # (same guard as annotate_with_coding_genes).
  if (length(hits) == 0) {
    return(df)
  }

  gene_ids <- mcols(genes)$gene_id[subjectHits(hits)]

  symbols <- mapIds(
    org.Hs.eg.db,
    keys = gene_ids,
    column = "SYMBOL",
    keytype = "ENTREZID",
    multiVals = "first"
  )

  # assign symbols
  df$gene_symbol[queryHits(hits)] <- symbols

  df
}

In [None]:
#' Annotate single-position variants with the gene whose coding sequence they hit.
#'
#' @param df Data frame with columns `chr`, `position` and `strand`
#'   (one row per variant).
#' @param txdb TxDb object used to derive per-gene coding sequences.
#' @param cds Optional result of `cdsBy(txdb, by = "gene")`. Supplying it
#'   avoids rebuilding the coding-sequence list on every call when the function
#'   is applied to many samples. `NULL` (default) computes it from `txdb`.
#' @return `df` with an added character column `gene_symbol_coding`; rows that
#'   do not overlap a coding sequence are `NA`. When a row overlaps several
#'   genes, the symbol of the last reported hit is kept.
annotate_with_coding_genes <- function(df, txdb, cds = NULL) {

  gr <- GRanges(
    seqnames = df$chr,
    ranges   = IRanges(start = df$position, end = df$position),
    strand   = df$strand
  )

  if (is.null(cds)) {
    cds <- cdsBy(txdb, by = "gene")
  }

  # Strand is ignored: a variant counts as coding whichever strand the gene is on.
  hits <- findOverlaps(gr, cds, ignore.strand = TRUE)

  # initialize column
  df$gene_symbol_coding <- NA_character_

  # no coding hits -> return early (mapIds() cannot be called without keys)
  if (length(hits) == 0) {
    return(df)
  }

  # cdsBy(by = "gene") names each element by gene ID; looked up as ENTREZ below
  gene_ids <- names(cds)[subjectHits(hits)]

  symbols <- mapIds(
    org.Hs.eg.db,
    keys      = gene_ids,
    column    = "SYMBOL",
    keytype   = "ENTREZID",
    multiVals = "first"
  )

  df$gene_symbol_coding[queryHits(hits)] <- symbols

  df
}

In [None]:
# Take the first two samples as a small trial set for the annotation step.
contexts_test <- contexts[1:2]

In [None]:
# Trial run of the coding annotation on the two-sample subset.
contexts_test_coding <- lapply(
  contexts_test,
  function(sample_df) annotate_with_coding_genes(sample_df, txdb = txdb)
)

In [None]:
# Annotate every sample's SNV table with coding-gene symbols.
contexts_coding <- lapply(
  contexts,
  function(sample_df) annotate_with_coding_genes(sample_df, txdb = txdb)
)

In [None]:
# Checkpoint the whole workspace after the annotation step.
save.image("colibactincoding.RData")

In [None]:
# Per-sample number of SNVs that received a coding-gene symbol.
# vapply() pins the result to one integer per sample, so an empty or
# malformed input fails loudly instead of silently changing the column type.
snv_coding <- data.frame(
  sample = names(contexts_coding),
  n_with_gene = vapply(
    contexts_coding,
    function(df) sum(!is.na(df$gene_symbol_coding)),
    integer(1)
  ),
  row.names = NULL
)

In [None]:
# Inspect the annotated SNV table of sample 029_DC_T.
contexts_coding$'029_DC_T'

In [None]:
# Total number of coding-annotated SNVs across all samples.
sum(snv_coding[["n_with_gene"]])

In [None]:
# Print the per-sample coding SNV counts.
snv_coding

Now that I have the coding SNVs, let's filter for the pks (colibactin) motif.

In [None]:
# Keep only the rows that were assigned a coding-gene symbol, per sample.
contexts_coding_filtered <- lapply(
  contexts_coding,
  function(sample_df) {
    has_symbol <- !is.na(sample_df$gene_symbol_coding)
    sample_df[has_symbol, ]
  }
)

In [None]:
# Group the coding-only SNV tables by the `injection` category.
# The logical masks are positional, so the rows of `categories` are assumed to
# be in the same order as the samples in contexts_coding_filtered -- TODO confirm.
context_list <- lapply(
  c(normal = "normal", adenoma = "adenoma", carcinoma = "carcinoma"),
  function(stage) contexts_coding_filtered[categories$injection == stage]
)

In [None]:
# Flatten the per-category lists back into one list of per-sample tables,
# drop duplicated rows and keep only mutations whose `type` starts with "T".
# dplyr verbs are namespaced explicitly because plyr and several Bioconductor
# packages are attached after tidyverse and can mask same-named functions.
# (The former setNames(x, names(x)) step was a no-op and has been dropped.)
contexts_TN_sample <- context_list %>%
  purrr::flatten() %>%
  purrr::map(function(sample_df) {
    sample_df %>%
      dplyr::distinct() %>%
      dplyr::filter(grepl("^T", type))
  })

In [None]:
# Stack all per-sample tables into one data.table; the list names become the
# `name` column so every row keeps its sample of origin.
ext_context_sample <- data.table::rbindlist(l = contexts_TN_sample, idcol = "name")

In [None]:
# Second half (entries 49-96) of the 96 trinucleotide channels, and the
# SBS88 values for those same rows of the signature table.
TRIPLETS_48 <- TRIPLETS_96[49:96]
SBS88_TN <- pull(dplyr::slice(as.data.table(signatures), 49:96), "SBS88")

In [None]:
# Derive the motif columns used for the pks-motif count:
#   pos34         - characters 7-8 of the extended context string
#   trinucleotide - factor ordered by TRIPLETS_48
#   select        - "AA" when pos34 is "AA", otherwise "other"
# dplyr::mutate is called explicitly: plyr is attached after tidyverse, so a
# bare mutate() would resolve to plyr::mutate. The stray trailing comma after
# the last argument has been removed as well.
ext_context_sample <- ext_context_sample %>%
  dplyr::mutate(
    pos34 = substr(context, 7, 8),
    trinucleotide = factor(trinucleotide, levels = TRIPLETS_48),
    select = factor(
      ifelse(pos34 == "AA", "AA", "other"),
      levels = c("other", "AA")
    )
  )

In [None]:
# Count, per sample, the rows flagged "AA" in `select`.
# Samples without any such row do not appear in the result.
snv_load <- ext_context_sample[select == "AA", list(n_AA = .N), by = "name"]

In [None]:
# Print the per-sample AA-motif SNV counts.
snv_load

In [None]:
# Combine AA-motif counts with total coding SNV counts per sample.
# snv_load only has rows for samples with at least one AA-motif SNV, so an
# inner join would silently drop samples whose count is zero. Keep every
# sample that went through the motif analysis (names of contexts_TN_sample)
# and record a count of 0 for those, matching how the indel tables treat
# missing motif counts.
coding_snv_burden <- merge(
  snv_load,
  snv_coding[snv_coding$sample %in% names(contexts_TN_sample), , drop = FALSE],
  by.x = "name",
  by.y = "sample",
  all.y = TRUE
)
coding_snv_burden$n_AA[is.na(coding_snv_burden$n_AA)] <- 0L

In [None]:
# Export the coding SNV burden table.
write.csv(
  x = coding_snv_burden,
  file = "colibactin_snv_burden_coding.csv",
  row.names = FALSE
)

Let's get coding indels + fraction

In [None]:
# Print the indel calls. vcfs_indel is not created in this section; presumably
# it comes from one of the loaded .RData files -- verify.
vcfs_indel

In [None]:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Flatten the per-gene coding-sequence list into a single ranges object.
# use.names = TRUE keeps the gene ID of the originating list element as the
# name of each range (keep ENTREZ IDs as names).
cds_by_gene <- cdsBy(txdb, by = "gene")
cds_gr <- unlist(cds_by_gene, use.names = TRUE)
rm(cds_by_gene)

In [None]:
#' Flag variants that overlap a coding sequence and attach gene information.
#'
#' @param gr Ranges object of variants.
#' @param cds_gr Coding-sequence ranges whose names are gene IDs (looked up as
#'   ENTREZ IDs).
#' @return `gr` with three metadata columns: `coding` (logical), `gene_id` and
#'   `gene_symbol` (character, `NA` when there is no overlap). Overlaps are
#'   strand-agnostic; with several hits per variant the last one is kept.
annotate_indels_coding <- function(gr, cds_gr) {

  # defaults for variants without any coding overlap
  mcols(gr)$coding      <- FALSE
  mcols(gr)$gene_id     <- NA_character_
  mcols(gr)$gene_symbol <- NA_character_

  overlaps <- findOverlaps(gr, cds_gr, ignore.strand = TRUE)

  if (length(overlaps) > 0) {
    variant_idx <- queryHits(overlaps)
    entrez_ids  <- names(cds_gr)[subjectHits(overlaps)]

    gene_names <- mapIds(
      org.Hs.eg.db,
      keys      = entrez_ids,
      column    = "SYMBOL",
      keytype   = "ENTREZID",
      multiVals = "first"
    )

    mcols(gr)$coding[variant_idx]      <- TRUE
    mcols(gr)$gene_id[variant_idx]     <- entrez_ids
    mcols(gr)$gene_symbol[variant_idx] <- gene_names
  }

  gr
}

In [None]:
#' Flag variants that fall inside a gene and attach gene information.
#'
#' @param gr Ranges object of variants.
#' @param genes Gene ranges whose metadata has a `gene_id` column (looked up as
#'   ENTREZ IDs).
#' @return `gr` with three metadata columns: `genic` (logical), `gene_id` and
#'   `gene_symbol` (character, `NA` when there is no overlap). Overlaps are
#'   strand-agnostic; with several hits per variant the last one is kept.
annotate_indels_genic <- function(gr, genes) {

  # defaults for variants outside every gene
  mcols(gr)$genic       <- FALSE
  mcols(gr)$gene_id     <- NA_character_
  mcols(gr)$gene_symbol <- NA_character_

  overlaps <- findOverlaps(gr, genes, ignore.strand = TRUE)

  if (length(overlaps) > 0) {
    variant_idx <- queryHits(overlaps)
    entrez_ids  <- mcols(genes)$gene_id[subjectHits(overlaps)]

    gene_names <- mapIds(
      org.Hs.eg.db,
      keys      = entrez_ids,
      column    = "SYMBOL",
      keytype   = "ENTREZID",
      multiVals = "first"
    )

    mcols(gr)$genic[variant_idx]       <- TRUE
    mcols(gr)$gene_id[variant_idx]     <- entrez_ids
    mcols(gr)$gene_symbol[variant_idx] <- gene_names
  }

  gr
}

In [None]:
# Add coding / gene metadata to every sample's indel calls.
vcfs_indel_annotated <- endoapply(
  vcfs_indel,
  function(sample_gr) annotate_indels_coding(sample_gr, cds_gr = cds_gr)
)

In [None]:
# Spot-check the coding annotation on the third sample.
vcfs_indel_annotated[[3]]

In [None]:
# Add genic / gene metadata to every sample's indel calls.
vcfs_indel_genic <- endoapply(
  vcfs_indel,
  function(sample_gr) annotate_indels_genic(sample_gr, genes = genes)
)

In [None]:
# Spot-check the genic annotation on the third sample.
vcfs_indel_genic[[3]]

In [None]:
# Per-sample number of indels flagged as coding.
# vapply() guarantees one integer per sample instead of relying on sapply()'s
# input-dependent simplification.
indel_coding_counts <- data.frame(
  sample = names(vcfs_indel_annotated),
  n_coding = vapply(
    vcfs_indel_annotated,
    function(gr) sum(gr$coding, na.rm = TRUE),
    integer(1)
  ),
  row.names = NULL
)

In [None]:
#table(unlist(vcfs_indel_annotated)$coding)
#sapply(vcfs_indel_annotated, function(gr) table(gr$coding))
# Genic vs. non-genic indel counts: pooled over all samples, then per sample.
table(mcols(unlist(vcfs_indel_genic))$genic)
sapply(vcfs_indel_genic, function(sample_gr) table(mcols(sample_gr)$genic))


In [None]:
# Keep only the indels flagged as coding, per sample.
vcfs_indel_coding_only <- endoapply(
  vcfs_indel_annotated,
  function(sample_gr) sample_gr[mcols(sample_gr)$coding]
)

In [None]:
# Keep only the indels flagged as genic, per sample.
vcfs_indel_genic_only <- endoapply(
  vcfs_indel_genic,
  function(sample_gr) sample_gr[mcols(sample_gr)$genic]
)

In [None]:
# Coding indel burden: total coding indels per sample, how many of them pass
# select_context_indel(), and that share as a fraction.
indel_loads <- as.data.frame(lengths(vcfs_indel_coding_only))
colnames(indel_loads) <- "total_indels"
id_contexts <- MutationalPatterns::get_indel_context(vcfs_indel_coding_only, ref_genome)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts <- count_indel_contexts(id_contexts)
in_pks_motif <- lengths(id_pks_contexts)
# Align by sample name; samples missing from the motif counts get 0.
indel_loads$in_pks_motif <- in_pks_motif[rownames(indel_loads)]
indel_loads$in_pks_motif <- replace(
  indel_loads$in_pks_motif,
  is.na(indel_loads$in_pks_motif),
  0L
)
# NOTE(review): a sample with zero coding indels yields 0 / 0 = NaN here.
indel_loads$fraction_pksmotif <- with(indel_loads, in_pks_motif / total_indels)
write.csv(indel_loads, file = "colibactin_id_burden_coding.csv", row.names = TRUE)



In [None]:
# Genic indel burden: total genic indels per sample, how many of them pass
# select_context_indel(), and that share as a fraction.
indel_loads <- as.data.frame(lengths(vcfs_indel_genic_only))
colnames(indel_loads) <- "total_indels"
id_contexts <- MutationalPatterns::get_indel_context(vcfs_indel_genic_only, ref_genome)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts <- count_indel_contexts(id_contexts)
in_pks_motif <- lengths(id_pks_contexts)
# Align by sample name; samples missing from the motif counts get 0.
indel_loads$in_pks_motif <- in_pks_motif[rownames(indel_loads)]
indel_loads$in_pks_motif <- replace(
  indel_loads$in_pks_motif,
  is.na(indel_loads$in_pks_motif),
  0L
)
# NOTE(review): a sample with zero genic indels yields 0 / 0 = NaN here.
indel_loads$fraction_pksmotif <- with(indel_loads, in_pks_motif / total_indels)
write.csv(indel_loads, file = "colibactin_id_burden_genic.csv", row.names = TRUE)



Let's refit for signatures...

In [None]:
# Need to get mut_mat of just coding SNVs

In [225]:
# Reuse the genic annotator (named for indels) on the SBS calls.
vcfs_sbs_genic <- endoapply(
  vcfs_sbs,
  function(sample_gr) annotate_indels_genic(sample_gr, genes = genes)
)

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' ret

In [226]:
# Keep only the SBS calls flagged as genic, per sample.
vcfs_sbs_genic_only <- endoapply(
  vcfs_sbs_genic,
  function(sample_gr) sample_gr[mcols(sample_gr)$genic]
)

In [231]:
# Reuse the coding annotator (named for indels) on the SBS calls.
vcfs_sbs_coding <- endoapply(
  vcfs_sbs,
  function(sample_gr) annotate_indels_coding(sample_gr, cds_gr = cds_gr)
)

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' ret

In [232]:
# Keep only the SBS calls flagged as coding, per sample.
vcfs_sbs_coding_only <- endoapply(
  vcfs_sbs_coding,
  function(sample_gr) sample_gr[mcols(sample_gr)$coding]
)

In [233]:
# Refit the coding-only SBS mutation matrix to the known signatures and keep
# the relative (column-normalised) SBS88 contribution of each sample.
mut_mat <- mut_matrix(vcfs_sbs_coding_only, ref_genome)
fit_res <- fit_to_signatures(mut_mat, as.matrix(sigs_known))
fit_res_clones_sbs <- prop.table(fit_res$contribution, margin = 2) %>%
  as.data.frame() %>%
  rownames_to_column("Signature") %>%
  pivot_longer(cols = -Signature) %>%
  dplyr::filter(Signature == "SBS88")
# No `by` is given, so merge() joins on all column names shared with `categories`.
fit_res_clones_sbs <- merge(fit_res_clones_sbs, categories)

In [235]:
# Export the SBS88 refit for coding SNVs.
write.csv(
  x = fit_res_clones_sbs,
  file = "SBS88_refit_coding.csv",
  row.names = FALSE
)

In [None]:
# Indel context counts for the coding-only indels (input to the ID refit).
indel_loads <- as.data.frame(lengths(vcfs_indel_coding_only))
colnames(indel_loads) <- "total_indels"
id_contexts <- MutationalPatterns::get_indel_context(vcfs_indel_coding_only, ref_genome)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts_coding <- count_indel_contexts(id_contexts)

In [None]:
# Refit coding indel counts to ID1, ID2 and ID18 and keep the relative
# (column-normalised) ID18 contribution of each sample.
id_sigs_select <- id_signatures[, c("ID1", "ID2", "ID18")]
fit_res_id <- fit_to_signatures(indel_counts_coding, as.matrix(id_sigs_select))
fit_res_clones <- prop.table(fit_res_id$contribution, margin = 2) %>%
  as.data.frame() %>%
  rownames_to_column("Signature") %>%
  pivot_longer(cols = -Signature) %>%
  dplyr::filter(Signature == "ID18")
# No `by` is given, so merge() joins on all column names shared with `categories`.
fit_res_clones <- merge(fit_res_clones, categories)

In [None]:
# Export the ID18 refit for coding indels.
write.csv(
  x = fit_res_clones,
  file = "ID18_refit_coding.csv",
  row.names = FALSE
)

In [None]:
# Indel context counts for the genic-only indels (input to the ID refit).
indel_loads <- as.data.frame(lengths(vcfs_indel_genic_only))
colnames(indel_loads) <- "total_indels"
id_contexts <- MutationalPatterns::get_indel_context(vcfs_indel_genic_only, ref_genome)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts_genic <- count_indel_contexts(id_contexts)

In [None]:
# Refit genic indel counts to ID1, ID2 and ID18 and keep the relative
# (column-normalised) ID18 contribution of each sample.
id_sigs_select <- id_signatures[, c("ID1", "ID2", "ID18")]
fit_res_id <- fit_to_signatures(indel_counts_genic, as.matrix(id_sigs_select))
fit_res_clones <- prop.table(fit_res_id$contribution, margin = 2) %>%
  as.data.frame() %>%
  rownames_to_column("Signature") %>%
  pivot_longer(cols = -Signature) %>%
  dplyr::filter(Signature == "ID18")
# No `by` is given, so merge() joins on all column names shared with `categories`.
fit_res_clones <- merge(fit_res_clones, categories)

In [None]:
# Export the ID18 refit for genic indels.
write.csv(
  x = fit_res_clones,
  file = "ID18_refit_genic.csv",
  row.names = FALSE
)

Are there any driver mutations in our samples (snv or indel)?

In [None]:
# Gene symbols checked as driver genes in the summaries below.
driverGenes <- c(
  "APC", "TP53", "KRAS", "BRAF", "PIK3CA",
  "SMAD4", "FBXW7", "TCF7L2", "FAT4", "ATM"
)

In [None]:
# Per-sample summary of coding SNVs that hit a driver gene:
#   n_with_gene  - number of SNV rows whose coding symbol is a driver gene
#   driver_genes - the distinct driver symbols hit, ";"-separated (NA if none)
# vapply() fixes the type of each column (one integer / one string per sample)
# instead of relying on sapply()'s simplification. `%in%` never returns NA, so
# no na.rm is needed in the sum.
driver_gene_check <- data.frame(
  sample = names(contexts_coding),
  n_with_gene = vapply(
    contexts_coding,
    function(df) sum(df$gene_symbol_coding %in% driverGenes),
    integer(1)
  ),
  driver_genes = vapply(
    contexts_coding,
    function(df) {
      hits <- unique(df$gene_symbol_coding[df$gene_symbol_coding %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1)
  ),
  row.names = NULL
)

In [None]:
# Print the per-sample driver-gene SNV summary.
driver_gene_check

In [None]:
# Export the driver-gene SNV summary.
write.csv(
  x = driver_gene_check,
  file = "colibactin_driver_coding_gene_snvs.csv",
  row.names = FALSE
)

In [None]:
# Number of samples with at least one coding SNV in a driver gene.
sum(driver_gene_check[["n_with_gene"]] > 0)

In [None]:
# Per-sample summary of genic indels that hit a driver gene:
#   n_with_gene  - number of indels whose gene symbol is a driver gene
#   driver_genes - the distinct driver symbols hit, ";"-separated (NA if none)
# vapply() fixes the type of each column instead of relying on sapply()'s
# simplification. `%in%` never returns NA, so no na.rm is needed in the sum.
driver_indel_summary <- data.frame(
  sample = names(vcfs_indel_genic_only),
  n_with_gene = vapply(
    vcfs_indel_genic_only,
    function(gr) sum(gr$gene_symbol %in% driverGenes),
    integer(1)
  ),
  driver_genes = vapply(
    vcfs_indel_genic_only,
    function(gr) {
      hits <- unique(gr$gene_symbol[gr$gene_symbol %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1)
  ),
  row.names = NULL
)

In [None]:
# Print the per-sample driver-gene indel summary.
driver_indel_summary

In [None]:
# Export the driver-gene indel summary.
write.csv(
  x = driver_indel_summary,
  file = "colibactin_driver_gene_IDs.csv",
  row.names = FALSE
)

In [None]:
# Per-sample summary of coding indels that hit a driver gene:
#   n_with_gene  - number of indels whose gene symbol is a driver gene
#   driver_genes - the distinct driver symbols hit, ";"-separated (NA if none)
# The match must be made on `gene_symbol`: `coding` is the TRUE/FALSE flag set
# by annotate_indels_coding(), so comparing it with gene symbols can never
# match and every sample would be reported as having no driver indels.
# NOTE(review): this reuses the name driver_indel_summary and therefore
# replaces the genic summary built earlier in the notebook.
driver_indel_summary <- data.frame(
  sample = names(vcfs_indel_coding_only),
  n_with_gene = vapply(
    vcfs_indel_coding_only,
    function(gr) sum(gr$gene_symbol %in% driverGenes),
    integer(1)
  ),
  driver_genes = vapply(
    vcfs_indel_coding_only,
    function(gr) {
      hits <- unique(gr$gene_symbol[gr$gene_symbol %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1)
  ),
  row.names = NULL
)