In [None]:
# Restore a previously processed workspace. The path is absolute, so this cell
# only runs on a host where that file system is mounted.
load("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/scripts/pks/Nissle_processed.RData")
# NOTE(review): attach order matters. plyr is attached after tidyverse, so
# unqualified mutate(), summarise(), arrange(), rename() and count() resolve
# to the plyr versions, not dplyr's, for the rest of the notebook.
library(tidyverse)
library(reshape2)
library(magrittr)
library(lemon)
library(BSgenome)
library(MutationalPatterns)
library(plyr)
library(data.table)
library(vroom)
library(ggpubr)
library(cowplot)
library(patchwork)
library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(sjPlot)
library(org.Hs.eg.db)
library(GenomicFeatures)
library(AnnotationDbi)
# Already attached a few lines up; this second call has no further effect.
library(org.Hs.eg.db)

# Name of the reference genome package, passed to get_indel_context() below.
ref_genome = "BSgenome.Hsapiens.UCSC.hg38"
# Gene-level ranges from the knownGene TxDb. The variable shares its name with
# the genes() function; R still finds the function when `genes(...)` is called.
genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
# Loaded relative to the working directory.
# NOTE(review): presumably this provides `vcfs_indel` used below - confirm.
load("vcfs.RData")

- Coding (snv, indel fractions, signature refit)
- Driver genes (snv, indel fractions, signature refit)

Get coding SNVs (gene-level annotation is also available, but let's worry about that later)

In [None]:
# Transcript database (UCSC hg38 knownGene) used for the coding annotation.
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Coding-sequence ranges grouped per gene.
cds <- GenomicFeatures::cdsBy(txdb, by = "gene")

In [None]:
### Genes aren't specific enough here so we will use coding sequences
#' Annotate single-position variants with the symbol of an overlapping gene
#'
#' @param df Data frame with one row per variant and the columns `chr`,
#'   `position` and `strand`.
#' @param genes Gene ranges with a `gene_id` metadata column; the IDs are
#'   looked up in org.Hs.eg.db as Entrez IDs.
#' @param ignore.strand Passed to `findOverlaps()`. The default `FALSE` keeps
#'   the strand-aware matching this function has always used; pass `TRUE` to
#'   match regardless of strand, as the coding/indel annotators in this
#'   notebook do.
#' @return `df` with an extra character column `gene_symbol`, `NA` where the
#'   variant overlaps no gene.
annotate_with_genes <- function(df, genes, ignore.strand = FALSE) {

  # Default: no gene assigned
  df$gene_symbol <- NA_character_

  gr <- GRanges(
    seqnames = df$chr,
    ranges = IRanges(start = df$position, end = df$position),
    strand = df$strand
  )

  hits <- findOverlaps(gr, genes, ignore.strand = ignore.strand)

  # mapIds() refuses an empty key vector, so stop here when nothing overlaps
  if (length(hits) == 0) {
    return(df)
  }

  gene_ids <- mcols(genes)$gene_id[subjectHits(hits)]

  symbols <- mapIds(
    org.Hs.eg.db,
    keys = gene_ids,
    column = "SYMBOL",
    keytype = "ENTREZID",
    multiVals = "first"
  )

  # A variant overlapping several genes appears several times in
  # queryHits(hits); a later hit overwrites an earlier one.
  df$gene_symbol[queryHits(hits)] <- symbols

  df
}

In [None]:
#' Annotate single-position variants with the symbol of an overlapping CDS
#'
#' @param df Data frame with one row per variant and the columns `chr`,
#'   `position` and `strand`.
#' @param txdb TxDb object from which coding sequences are extracted when
#'   `cds` is not supplied.
#' @param cds Optional result of `cdsBy(txdb, by = "gene")`. Supplying it
#'   avoids re-extracting the coding sequences on every call, which matters
#'   when this function is applied to many samples.
#' @return `df` with an extra character column `gene_symbol_coding`, `NA`
#'   where the variant overlaps no coding sequence.
annotate_with_coding_genes <- function(df, txdb, cds = NULL) {

  # Default: no coding gene assigned
  df$gene_symbol_coding <- NA_character_

  gr <- GRanges(
    seqnames = df$chr,
    ranges   = IRanges(start = df$position, end = df$position),
    strand   = df$strand
  )

  if (is.null(cds)) {
    cds <- cdsBy(txdb, by = "gene")
  }

  hits <- findOverlaps(gr, cds, ignore.strand = TRUE)

  # No coding hits -> return early (mapIds() refuses an empty key vector)
  if (length(hits) == 0) {
    return(df)
  }

  # The list names are the gene IDs, looked up below as Entrez IDs
  gene_ids <- names(cds)[subjectHits(hits)]

  symbols <- mapIds(
    org.Hs.eg.db,
    keys      = gene_ids,
    column    = "SYMBOL",
    keytype   = "ENTREZID",
    multiVals = "first"
  )

  # A variant overlapping the CDS of several genes appears several times in
  # queryHits(hits); a later hit overwrites an earlier one.
  df$gene_symbol_coding[queryHits(hits)] <- symbols

  df
}

In [None]:
# First two samples only, for a quick trial run of the annotation.
contexts_test <- contexts[1:2]

In [None]:
# Trial run of the coding annotation on the two test samples.
contexts_test_coding <- lapply(
  contexts_test,
  function(df) annotate_with_coding_genes(df, txdb = txdb)
)

In [None]:
# Coding annotation for every sample.
contexts_coding <- lapply(
  contexts,
  function(df) annotate_with_coding_genes(df, txdb = txdb)
)

In [None]:
# Checkpoint the whole workspace so the annotation does not have to be rerun.
save.image("colibactincoding.RData")

In [None]:
# Per-sample number of SNVs that received a coding gene symbol.
# vapply() pins the result to one integer per sample, so an empty or
# unexpected input fails loudly instead of silently changing the column type.
snv_coding <- data.frame(
  sample = names(contexts_coding),
  n_with_gene = vapply(
    contexts_coding,
    function(df) sum(!is.na(df$gene_symbol_coding)),
    integer(1),
    USE.NAMES = FALSE
  ),
  row.names = NULL
)

In [None]:
# Inspect the annotated SNV table of one sample.
contexts_coding$`029_DC_T`

In [None]:
# Total number of coding SNVs across all samples.
sum(snv_coding[["n_with_gene"]])

In [None]:
# Display the per-sample coding SNV counts.
snv_coding

Now that I have coding SNVs, let's filter for the PKS motif

In [None]:
# Keep only the SNVs that fall in a coding sequence, sample by sample.
contexts_coding_filtered <- lapply(contexts_coding, function(df) {
  keep_rows <- !is.na(df$gene_symbol_coding)
  df[keep_rows, ]
})

In [None]:
# Split the samples by tissue category.
# NOTE(review): the logical index assumes the rows of `categories` are in the
# same order as the samples in `contexts_coding_filtered` - confirm.
context_list <- lapply(
  setNames(nm = c("normal", "adenoma", "carcinoma")),
  function(stage) contexts_coding_filtered[categories$injection == stage]
)

In [None]:
# One table per sample (category level removed), de-duplicated and restricted
# to substitutions whose `type` starts with "T". Element names are carried
# through unchanged, so no renaming step is needed afterwards.
contexts_TN_sample <- purrr::map(
  purrr::flatten(context_list),
  function(sample_df) {
    filter(distinct(sample_df), grepl("^T", type))
  }
)

In [None]:
# Stack the per-sample tables; the sample name goes into the `name` column.
ext_context_sample <- data.table::rbindlist(contexts_TN_sample, idcol = "name")

In [None]:
# Second half (entries 49-96) of the 96 trinucleotide classes, and the SBS88
# weights for the same rows.
# NOTE(review): presumably these are the T>N classes - confirm against the
# ordering of TRIPLETS_96 / `signatures`.
TRIPLETS_48 <- TRIPLETS_96[49:96]
SBS88_TN <- pull(dplyr::slice(as.data.table(signatures), 49:96), "SBS88")

In [None]:
# Flag SNVs whose extended context has "AA" at characters 7-8.
ext_context_sample <- mutate(
  ext_context_sample,
  pos34 = substr(context, 7, 8),
  trinucleotide = factor(trinucleotide, levels = TRIPLETS_48),
  select = factor(
    ifelse(pos34 == "AA", "AA", "other"),
    levels = c("other", "AA")
  )
)

In [None]:
# Number of AA-motif SNVs per sample. Samples without any such SNV get no row.
snv_load <- ext_context_sample[select == "AA", list(n_AA = .N), by = "name"]

In [None]:
# Display the per-sample AA-motif SNV counts.
snv_load

In [None]:
# Combine the AA-motif counts with the coding SNV totals.
# snv_load has no row for samples without AA-motif SNVs, so an inner join
# would silently drop them from the burden table. Keep every sample from
# snv_coding and report a count of 0 instead.
coding_snv_burden <- merge(
  snv_load,
  snv_coding,
  by.x = "name",
  by.y = "sample",
  all.y = TRUE
)
coding_snv_burden$n_AA[is.na(coding_snv_burden$n_AA)] <- 0L

In [None]:
# Export the coding SNV burden table.
write.csv(
  x = coding_snv_burden,
  file = "colibactin_snv_burden_coding.csv",
  row.names = FALSE
)

Let's get coding indels + fraction

In [None]:
# Display the indel call set.
# NOTE(review): not created in this notebook; presumably restored by one of
# the load() calls at the top - confirm.
vcfs_indel

In [126]:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Flatten the per-gene CDS list into a single set of ranges; use.names = TRUE
# keeps the ENTREZ IDs as the range names.
cds_gr <- unlist(GenomicFeatures::cdsBy(txdb, by = "gene"), use.names = TRUE)

In [127]:
#' Flag indels that overlap a coding sequence and attach the gene
#'
#' @param gr Ranges of the indels of one sample.
#' @param cds_gr Coding-sequence ranges whose names are the gene IDs (looked
#'   up in org.Hs.eg.db as Entrez IDs).
#' @return `gr` with the metadata columns `coding` (logical), `gene_id` and
#'   `gene_symbol` (character, `NA` when the indel overlaps no CDS).
#'   Overlaps are computed ignoring strand.
annotate_indels_coding <- function(gr, cds_gr) {

  # Defaults: not coding, no gene
  mcols(gr)$coding      <- FALSE
  mcols(gr)$gene_id     <- NA_character_
  mcols(gr)$gene_symbol <- NA_character_

  overlaps <- findOverlaps(gr, cds_gr, ignore.strand = TRUE)

  if (length(overlaps) > 0) {
    indel_idx <- queryHits(overlaps)
    entrez    <- names(cds_gr)[subjectHits(overlaps)]

    gene_names <- mapIds(
      org.Hs.eg.db,
      keys      = entrez,
      column    = "SYMBOL",
      keytype   = "ENTREZID",
      multiVals = "first"
    )

    mcols(gr)$coding[indel_idx]      <- TRUE
    mcols(gr)$gene_id[indel_idx]     <- entrez
    mcols(gr)$gene_symbol[indel_idx] <- gene_names
  }

  gr
}

In [185]:
#' Flag indels that overlap a gene and attach the gene
#'
#' @param gr Ranges of the indels of one sample.
#' @param genes Gene ranges with a `gene_id` metadata column (looked up in
#'   org.Hs.eg.db as Entrez IDs).
#' @return `gr` with the metadata columns `genic` (logical), `gene_id` and
#'   `gene_symbol` (character, `NA` when the indel overlaps no gene).
#'   Overlaps are computed ignoring strand.
annotate_indels_genic <- function(gr, genes) {

  # Defaults: not genic, no gene
  mcols(gr)$genic       <- FALSE
  mcols(gr)$gene_id     <- NA_character_
  mcols(gr)$gene_symbol <- NA_character_

  overlaps <- findOverlaps(gr, genes, ignore.strand = TRUE)

  if (length(overlaps) > 0) {
    indel_idx <- queryHits(overlaps)
    entrez    <- mcols(genes)$gene_id[subjectHits(overlaps)]

    gene_names <- mapIds(
      org.Hs.eg.db,
      keys      = entrez,
      column    = "SYMBOL",
      keytype   = "ENTREZID",
      multiVals = "first"
    )

    mcols(gr)$genic[indel_idx]       <- TRUE
    mcols(gr)$gene_id[indel_idx]     <- entrez
    mcols(gr)$gene_symbol[indel_idx] <- gene_names
  }

  gr
}

In [128]:
# Coding annotation for the indels of every sample; endoapply() returns an
# object of the same class as its input.
vcfs_indel_annotated <- endoapply(
  vcfs_indel,
  function(gr) annotate_indels_coding(gr, cds_gr = cds_gr)
)

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' ret

In [131]:
# Spot-check the coding annotation on the third sample.
vcfs_indel_annotated[[3]]

GRanges object with 133 ranges and 8 metadata columns:
                      seqnames              ranges strand | paramRangeID
                         <Rle>           <IRanges>  <Rle> |     <factor>
    chr1_1544024_CA_C     chr1     1544024-1544025      * |           NA
   chr1_28603518_A_AT     chr1            28603518      * |           NA
   chr1_33934715_GC_G     chr1   33934715-33934716      * |           NA
   chr1_36286516_TG_T     chr1   36286516-36286517      * |           NA
   chr1_66128218_GT_G     chr1   66128218-66128219      * |           NA
                  ...      ...                 ...    ... .          ...
  chr22_31978916_GT_G    chr22   31978916-31978917      * |           NA
   chrX_94110030_GA_G     chrX   94110030-94110031      * |           NA
  chrX_108898492_GA_G     chrX 108898492-108898493      * |           NA
  chrX_133874071_CA_C     chrX 133874071-133874072      * |           NA
    chrY_4587895_TA_T     chrY     4587895-4587896      * |          

In [186]:
# Gene-level annotation for the indels of every sample.
vcfs_indel_genic <- endoapply(
  vcfs_indel,
  function(gr) annotate_indels_genic(gr, genes = genes)
)

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' ret

In [187]:
# Spot-check the gene-level annotation on the third sample.
vcfs_indel_genic[[3]]

GRanges object with 133 ranges and 8 metadata columns:
                      seqnames              ranges strand | paramRangeID
                         <Rle>           <IRanges>  <Rle> |     <factor>
    chr1_1544024_CA_C     chr1     1544024-1544025      * |           NA
   chr1_28603518_A_AT     chr1            28603518      * |           NA
   chr1_33934715_GC_G     chr1   33934715-33934716      * |           NA
   chr1_36286516_TG_T     chr1   36286516-36286517      * |           NA
   chr1_66128218_GT_G     chr1   66128218-66128219      * |           NA
                  ...      ...                 ...    ... .          ...
  chr22_31978916_GT_G    chr22   31978916-31978917      * |           NA
   chrX_94110030_GA_G     chrX   94110030-94110031      * |           NA
  chrX_108898492_GA_G     chrX 108898492-108898493      * |           NA
  chrX_133874071_CA_C     chrX 133874071-133874072      * |           NA
    chrY_4587895_TA_T     chrY     4587895-4587896      * |          

In [134]:
# Per-sample number of indels flagged as coding.
# vapply() pins the result to one integer per sample, so an empty or
# unexpected input fails loudly instead of silently changing the column type.
indel_coding_counts <- data.frame(
  sample = names(vcfs_indel_annotated),
  n_coding = vapply(
    vcfs_indel_annotated,
    function(gr) sum(gr$coding, na.rm = TRUE),
    integer(1),
    USE.NAMES = FALSE
  ),
  row.names = NULL
)

In [193]:
#table(unlist(vcfs_indel_annotated)$coding)
#sapply(vcfs_indel_annotated, function(gr) table(gr$coding))
# Genic vs non-genic indels: overall, then per sample.
table(mcols(unlist(vcfs_indel_genic))$genic)
sapply(vcfs_indel_genic, function(indels) table(mcols(indels)$genic))



FALSE  TRUE 
18932 27875 

$`029_DC_T`

FALSE  TRUE 
   25    52 

$`030_CE_T`

FALSE  TRUE 
   73   110 

$`031_RE_T`

FALSE  TRUE 
   62    71 

$`058_RE_T`

FALSE  TRUE 
   27    53 

$`103_RE`

FALSE  TRUE 
   51    89 

$`116_RE_T`

FALSE  TRUE 
   73   145 

$`116_TR`

FALSE  TRUE 
   50    84 

$`12b_D12_84_TR`

FALSE  TRUE 
   31    56 

$`148_SI`

FALSE  TRUE 
   50    90 

$`17_27AS`

FALSE  TRUE 
   22    28 

$`18_27RE`

FALSE  TRUE 
   25    46 

$`19_27TR`

FALSE  TRUE 
   10    17 

$`20_40DC`

FALSE  TRUE 
   47    76 

$`21_116CE`

FALSE  TRUE 
   51    79 

$`22_76TR`

FALSE  TRUE 
   34    62 

$`23_180RE`

FALSE  TRUE 
   78   110 

$`24_130DC`

FALSE  TRUE 
   54    78 

$`27_CE`

FALSE  TRUE 
   18    26 

$`27_DC`

FALSE  TRUE 
   14    25 

$`29_74TR`

FALSE  TRUE 
   19    47 

$`29_CE`

FALSE  TRUE 
   48    63 

$`30_130RE`

FALSE  TRUE 
   49    73 

$`30_TR`

FALSE  TRUE 
   46    55 

$`32_29TR`

FALSE  TRUE 
   49    62 

$`33_130AS`

FALSE  TRUE 
   33    79 

$`40_AS`

FALSE  TRU

In [135]:
# Keep only the coding indels of each sample.
vcfs_indel_coding_only <- endoapply(
  vcfs_indel_annotated,
  function(gr) gr[mcols(gr)$coding]
)

In [197]:
# Keep only the genic indels of each sample.
vcfs_indel_genic_only <- endoapply(
  vcfs_indel_genic,
  function(gr) gr[mcols(gr)$genic]
)

In [None]:
# Coding indel burden per sample and the share that sits in the pks motif.
# NOTE(review): `indel_loads`, `id_contexts`, `id_pks_contexts`, `indel_counts`
# and `in_pks_motif` are also written by the genic cell below; whichever cell
# ran last decides what the ID signature refit further down sees.
indel_loads <- as.data.frame(lengths(vcfs_indel_coding_only))
colnames(indel_loads) <- "total_indels"

id_contexts <- MutationalPatterns::get_indel_context(
  vcfs_indel_coding_only,
  ref_genome
)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts <- count_indel_contexts(id_contexts)

# Align the motif counts to the samples by name; samples missing from the
# motif counts are set to 0.
in_pks_motif <- lengths(id_pks_contexts)
indel_loads$in_pks_motif <- in_pks_motif[rownames(indel_loads)]
indel_loads$in_pks_motif[is.na(indel_loads$in_pks_motif)] <- 0L

# A sample with zero coding indels yields NaN here (0 / 0).
indel_loads$fraction_pksmotif <- with(indel_loads, in_pks_motif / total_indels)

write.csv(
  indel_loads,
  file = "colibactin_id_burden_coding.csv",
  row.names = TRUE
)



In [200]:
# Genic indel burden per sample and the share that sits in the pks motif.
# NOTE(review): `indel_loads`, `id_contexts`, `id_pks_contexts`, `indel_counts`
# and `in_pks_motif` are also written by the coding cell above; whichever cell
# ran last decides what the ID signature refit further down sees.
indel_loads <- as.data.frame(lengths(vcfs_indel_genic_only))
colnames(indel_loads) <- "total_indels"

id_contexts <- MutationalPatterns::get_indel_context(
  vcfs_indel_genic_only,
  ref_genome
)
id_pks_contexts <- lapply(id_contexts, select_context_indel, type = "Strelka")
indel_counts <- count_indel_contexts(id_contexts)

# Align the motif counts to the samples by name; samples missing from the
# motif counts are set to 0.
in_pks_motif <- lengths(id_pks_contexts)
indel_loads$in_pks_motif <- in_pks_motif[rownames(indel_loads)]
indel_loads$in_pks_motif[is.na(indel_loads$in_pks_motif)] <- 0L

# A sample with zero genic indels yields NaN here (0 / 0).
indel_loads$fraction_pksmotif <- with(indel_loads, in_pks_motif / total_indels)

write.csv(
  indel_loads,
  file = "colibactin_id_burden_genic.csv",
  row.names = TRUE
)



Let's refit for signatures...

In [None]:
# TODO: build a mut_mat from the coding SNVs only before running the refit.

In [None]:
# Refit the SBS profiles to the known signatures.
# NOTE(review): `mut_mat` is not built in this notebook; presumably it comes
# from a loaded workspace and is not restricted to coding SNVs - confirm.
fit_res <- MutationalPatterns::fit_to_signatures(
  mut_mat,
  as.matrix(sigs_known)
)

In [None]:
# Relative SBS88 contribution per sample (columns scaled to sum to 1), joined
# with the sample categories on their shared column(s).
fit_res_clones_sbs <- fit_res$contribution %>%
  prop.table(margin = 2) %>%
  as.data.frame() %>%
  rownames_to_column(var = "Signature") %>%
  pivot_longer(cols = -Signature) %>%
  filter(Signature == "SBS88") %>%
  merge(categories)

In [None]:
# Refit the indel profiles to ID1, ID2 and ID18 only, then keep the relative
# ID18 contribution per sample and join it with the sample categories.
# NOTE(review): `indel_counts` is written by both the coding and the genic
# burden cells; the refit uses whichever ran last.
id_sigs_select <- id_signatures[, c("ID1", "ID2", "ID18")]
fit_res_id <- fit_to_signatures(indel_counts, as.matrix(id_sigs_select))
fit_res_clones <- fit_res_id$contribution %>%
  prop.table(margin = 2) %>%
  as.data.frame() %>%
  rownames_to_column(var = "Signature") %>%
  pivot_longer(cols = -Signature) %>%
  filter(Signature == "ID18") %>%
  merge(categories)

Are there any driver mutations in our samples (snv or indel)?

In [203]:
# Gene symbols treated as drivers in the checks below.
driverGenes <- c(
  "APC", "TP53", "KRAS", "BRAF", "PIK3CA",
  "SMAD4", "FBXW7", "TCF7L2", "FAT4", "ATM"
)

In [None]:
# Per sample: how many coding SNVs hit a driver gene, and which genes
# (";"-separated, NA when none).
# vapply() pins each column to its intended type, so an empty or unexpected
# input fails loudly instead of silently changing the column type.
driver_gene_check <- data.frame(
  sample = names(contexts_coding),
  n_with_gene = vapply(
    contexts_coding,
    function(df) sum(df$gene_symbol_coding %in% driverGenes),
    integer(1),
    USE.NAMES = FALSE
  ),
  driver_genes = vapply(
    contexts_coding,
    function(df) {
      hits <- unique(df$gene_symbol_coding[df$gene_symbol_coding %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1),
    USE.NAMES = FALSE
  ),
  row.names = NULL
)

In [None]:
# Display the per-sample driver-gene SNV summary.
driver_gene_check

In [None]:
# Export the driver-gene SNV summary.
write.csv(
  x = driver_gene_check,
  file = "colibactin_driver_coding_gene_snvs.csv",
  row.names = FALSE
)

In [None]:
# Number of samples with at least one coding SNV in a driver gene.
sum(driver_gene_check[["n_with_gene"]] > 0)

In [209]:
# Per sample: how many genic indels hit a driver gene, and which genes
# (";"-separated, NA when none).
# vapply() pins each column to its intended type, so an empty or unexpected
# input fails loudly instead of silently changing the column type.
driver_indel_summary <- data.frame(
  sample = names(vcfs_indel_genic_only),
  n_with_gene = vapply(
    vcfs_indel_genic_only,
    function(gr) sum(gr$gene_symbol %in% driverGenes),
    integer(1),
    USE.NAMES = FALSE
  ),
  driver_genes = vapply(
    vcfs_indel_genic_only,
    function(gr) {
      hits <- unique(gr$gene_symbol[gr$gene_symbol %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1),
    USE.NAMES = FALSE
  ),
  row.names = NULL
)

In [210]:
# Display the per-sample driver-gene indel summary.
driver_indel_summary

sample,n_with_gene,driver_genes
<chr>,<int>,<chr>
029_DC_T,0,
030_CE_T,0,
031_RE_T,0,
058_RE_T,0,
103_RE,1,FAT4
116_RE_T,0,
116_TR,0,
12b_D12_84_TR,0,
148_SI,0,
17_27AS,0,


In [212]:
# Export the driver-gene indel summary.
write.csv(
  x = driver_indel_summary,
  file = "colibactin_driver_gene_IDs.csv",
  row.names = FALSE
)

In [213]:
# Per sample: how many coding indels hit a driver gene, and which genes
# (";"-separated, NA when none).
# The gene names live in the `gene_symbol` metadata column; `coding` is only
# the TRUE/FALSE flag, so matching it against driverGenes could never hit.
# NOTE: this reuses the name `driver_indel_summary` and therefore replaces the
# genic summary in the workspace.
driver_indel_summary <- data.frame(
  sample = names(vcfs_indel_coding_only),
  n_with_gene = vapply(
    vcfs_indel_coding_only,
    function(gr) sum(gr$gene_symbol %in% driverGenes),
    integer(1),
    USE.NAMES = FALSE
  ),
  driver_genes = vapply(
    vcfs_indel_coding_only,
    function(gr) {
      hits <- unique(gr$gene_symbol[gr$gene_symbol %in% driverGenes])
      if (length(hits) == 0) NA_character_ else paste(hits, collapse = ";")
    },
    character(1),
    USE.NAMES = FALSE
  ),
  row.names = NULL
)