In [52]:
library(dndscv)
library(VariantAnnotation)
library(GenomicRanges)
library(dndscv)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicFeatures)
library(dplyr)

load("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda")

In [58]:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

tx_gr <- transcripts(txdb)

In [None]:
tx_meta <- as.data.frame(mcols(tx_gr))

tx2gene <- data.frame(
  tx = tx_meta$tx_name,
  gene_id = tx_meta$tx_id,
  stringsAsFactors = FALSE
)

In [74]:
# Drop transcripts without gene IDs
tx2gene <- tx2gene[!is.na(tx2gene$gene_id), ]

In [75]:
tx_tbl <- data.frame(
  tx = names(cds_by_tx),
  cds_len = cds_lengths,
  stringsAsFactors = FALSE
)

tx_tbl <- merge(tx_tbl, tx2gene, by = "tx")


In [77]:
best_tx <- tx_tbl %>%
  group_by(gene_id) %>%
  slice_max(cds_len, n = 1) %>%
  ungroup()

In [78]:
cds_table <- do.call(
  rbind,
  lapply(seq_len(nrow(best_tx)), function(i) {

    tx <- best_tx$tx[i]
    gene_id <- best_tx$gene_id[i]
    gr <- cds_by_tx[[tx]]

    data.frame(
      gene_id   = gene_id,
      gene_name = gene_id,   # map to symbols later if desired
      cds_id    = tx,
      chr       = as.character(seqnames(gr)),
      start     = start(gr),
      end       = end(gr),
      cds_start = start(gr),
      cds_end   = end(gr),
      length    = sum(width(gr)),
      strand    = as.character(strand(gr)),
      stringsAsFactors = FALSE
    )
  })
)


In [34]:
vcf_to_dndscv_strict <- function(vcf_file, genome = "hg38") {

  vcf <- readVcf(vcf_file, genome = genome)

  # Expand multiallelics
  vcf <- expand(vcf)

  rr  <- rowRanges(vcf)
  ref <- as.character(ref(vcf))
  alt <- as.character(unlist(alt(vcf)))

  if (length(ref) == 0) {
    return(NULL)
  }

  df <- data.frame(
    chr = as.character(seqnames(rr)),
    pos = start(rr),
    ref = ref,
    mut = alt,
    stringsAsFactors = FALSE
  )

  ## -----------------------------
  ## HARD FILTERS
  ## -----------------------------

  # NA alleles
  df <- df[!is.na(df$ref) & !is.na(df$mut), ]

  # Uppercase
  df$ref <- toupper(df$ref)
  df$mut <- toupper(df$mut)

  # SNVs only
  df <- df[nchar(df$ref) == 1 & nchar(df$mut) == 1, ]

  # A/C/G/T only
  bases <- c("A", "C", "G", "T")
  df <- df[df$ref %in% bases & df$mut %in% bases, ]

  # ref != alt
  df <- df[df$ref != df$mut, ]

  # Chromosome names
  df$chr <- sub("^chr", "", df$chr)

  # ðŸ”‘ CRITICAL: handle empty result
  if (nrow(df) == 0) {
    return(NULL)
  }

  # Sample ID
  df$sampleID <- sub("\\.vcf(\\.gz)?$", "", basename(vcf_file))

  df
}


In [36]:
vcf_dir <- "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/overlap_variants/snvs_all/input"

vcfs <- list.files(vcf_dir, pattern = "\\.vcf(\\.gz)?$", full.names = TRUE)

mut_list <- lapply(vcfs, vcf_to_dndscv_strict)


In [44]:
library(dndscv)
library(GenomicRanges)

load("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda")
cds_gr <- do.call(
  c,
  lapply(RefCDS, function(g) {
    if (length(g$cds_start) == 0) return(NULL)

    GRanges(
      seqnames = rep(g$chr, length(g$cds_start)),
      ranges   = IRanges(start = g$cds_start, end = g$cds_end)
    )
  })
)



In [45]:
mut_gr <- GRanges(
  seqnames = mutations$chr,
  ranges   = IRanges(start = mutations$pos, end = mutations$pos)
)


In [48]:
head(mutations, 5)


Unnamed: 0_level_0,sampleID,chr,pos,ref,mut
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>
1,029_DC_T,1,3714100,T,G
2,029_DC_T,1,4475981,C,A
3,029_DC_T,1,4595543,G,T
4,029_DC_T,1,5846422,G,A
5,029_DC_T,1,7908798,C,A


In [37]:
# Drop empty samples
mut_list <- Filter(Negate(is.null), mut_list)



In [38]:
mutations <- do.call(rbind, mut_list)

In [39]:
stopifnot(
  nrow(mutations) > 0,
  all(!is.na(mutations$ref)),
  all(!is.na(mutations$mut)),
  all(nchar(mutations$ref) == 1),
  all(nchar(mutations$mut) == 1),
  all(mutations$ref %in% c("A","C","G","T")),
  all(mutations$mut %in% c("A","C","G","T"))
)

table(nchar(mutations$ref), nchar(mutations$mut))
unique(c(mutations$ref, mutations$mut))


< table of extent 0 x 0 >

NULL

In [4]:
mut_list <- lapply(vcf_files, vcf_to_dndscv)


In [None]:
mutations <- do.call(rbind, mut_list)


In [6]:
mutations$sampleID <- gsub("\\.nuclear_all\\.intersection$",
                           "",
                           mutations$sampleID)


In [7]:
mutations$chr <- sub("^chr", "", mutations$chr)


In [None]:
mutations <- mutations[!is.na(ref) & !is.na(mut)]

In [10]:
save.image(file = "selection.RData")

In [41]:
load("selection.RData")

In [None]:
head(mutations)

In [None]:
dndsout <- dndscv(
  mutations,
  refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda"
)


In [11]:
mutations <- mutations[order(mutations$sampleID,
                             mutations$chr,
                             mutations$pos), ]

adjacent <- with(mutations,
                 ave(pos, sampleID, chr,
                     FUN = function(x) c(FALSE, diff(x) == 1)))

mutations_clean <- mutations[!adjacent, ]
# there's 1 adjacent mutation recovered


In [12]:
# After your cleaning step
nrow(mutations_clean)
table(mutations_clean$ref, mutations_clean$mut)

   
        A     C     G     T
  A     0 19380 62874 49354
  C 82209     0 21591 98953
  G 98089 21461     0 78001
  T 49290 61608 19438     0

In [None]:
dnds_tmp <- try(
  dndscv(
    mutations,
    refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda",
    max_muts_per_gene_per_sample = Inf
  ),
  silent = TRUE
)



In [15]:
mut_per_sample <- table(mutations_clean$sampleID)

keep <- names(mut_per_sample[mut_per_sample >= 20])
mutations_clean <- mutations_clean[mutations_clean$sampleID %in% keep, ]


In [18]:
length(keep)

In [100]:
dndsout <- dndscv(
  mutations_clean,
  refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda", verbose = TRUE
)


ERROR: Error in dndscv(mutations_clean, refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda", : unused argument (verbose = TRUE)


In [20]:
nrow(mutations)
length(unique(mutations$sampleID))


In [21]:
mutations <- mutations[order(mutations$sampleID,
                             mutations$chr,
                             mutations$pos), ]

is_adjacent <- with(
  mutations,
  ave(pos, sampleID, chr,
      FUN = function(x) {
        c(FALSE, diff(x) <= 1)
      })
)

table(is_adjacent)  # <- this will be VERY large

mutations_clean <- mutations[!is_adjacent, ]


is_adjacent
     0      1 
662248      1 

In [24]:
mutations <- mutations[!is.na(mutations$ref) & !is.na(mutations$mut), ]
mutations$ref <- toupper(mutations$ref)
mutations$mut <- toupper(mutations$mut)
unique(c(mutations$ref, mutations$mut))
table(nchar(mutations$ref), nchar(mutations$mut))



   
         1
  1 662249

In [27]:
unique(c(mutations$ref, mutations$mut))
# "T" "C" "G" "A"
table(nchar(mutations$ref), nchar(mutations$mut))
# 1 x 1 = 662,249


   
         1
  1 662249

In [99]:
dndsout <- dndscv(
  mutations = mutations,
  refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda",
  verbose = TRUE
)


ERROR: Error in dndscv(mutations = mutations, refdb = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/refcds_GRCh38_hg38.rda", : unused argument (verbose = TRUE)
