In [1]:
# Restore the objects saved in ProcessData.RData into the current environment.
# NOTE(review): later cells use objects they never create (e.g. `matrix`,
# `v3.4`, `rows_to_remove`, `Hiatt_novaf`) -- presumably they come from this
# file; confirm before running cells out of order.
load("ProcessData.RData")

Figure 2: Biological age-effect of SNVs
SNV/indel burden with age (boxplot corrected by age) 
MAKE THE LEE-SIX SUPP FIGURE THAT STARTED IT ALL
SUBTYPES
Transition/transversion
Spectra + AMSD
Stacked bar of subtypes/signatures
MSI
Signatures present in samples? We’ll think about it
Hiatt vs. combined?

Compare per-year aggregation? (Look at the Lynch paper.)


In [5]:
library(mutspecdist)
library(dplyr)
library(tidyr)
library(stringr)
library(sigfit)

In [None]:
# Permutation test on the cosine distance between two aggregated spectra.
#
# Each input set is collapsed column-wise (means or sums over its rows), the
# cosine distance between the two collapsed vectors is taken as the observed
# statistic, and a null distribution is built by repeatedly re-drawing
# `nrow(set1)` rows from the pooled rows of both sets.
#
# Arguments:
#   set1, set2   Matrix or data.frame; rows are the units that get permuted,
#                columns are the spectrum categories. Both must have the same
#                columns because they are stacked with rbind().
#   mean_or_sum  "mean": rows are fractions (row totals must not exceed 1) and
#                are averaged. "sum": rows are counts and are added up.
#   n_sim        Number of permutations.
#   seed         Optional RNG seed. When NULL the current RNG state is used
#                as-is, so a seed set by the caller is respected.
#
# Returns a list with
#   cosine  observed cosine distance,
#   p       share of permuted distances >= observed, floored at 1 / n_sim,
#   sims    numeric vector of the n_sim permuted distances.
#
# Stops with an error when an input is neither matrix nor data.frame, when
# `mean_or_sum` is not one of the two allowed values, or when the row totals
# do not fit the chosen mode.
#
# NOTE(review): cosine_dist() is not defined in this block; presumably it is
# provided by the mutspecdist package loaded at the top of the notebook.
amsd <- function(set1,
                 set2,
                 mean_or_sum = "mean",  # or "sum"
                 n_sim = 1000,
                 seed = NULL) {
  tol <- 1e-8

  # Validate inputs
  if (!is.matrix(set1) && !is.data.frame(set1)) {
    stop("set1 must be a matrix or data.frame")
  }
  if (!is.matrix(set2) && !is.data.frame(set2)) {
    stop("set2 must be a matrix or data.frame")
  }

  if (!mean_or_sum %in% c("mean", "sum")) {
    stop("Argument 'mean_or_sum' must be either 'mean' or 'sum'")
  }

  largest_row_total <- max(c(rowSums(set1), rowSums(set2)))

  if (mean_or_sum == "mean" && largest_row_total - 1 > tol) {
    stop("Spectra fractions should add up to 1 when running 'mean_or_sum' = 'mean'")
  }

  # Only reject "sum" when *every* row looks fractional. The previous check
  # took max() of the logical vector, so a single row with total <= 1 (for
  # example a sample with one mutation) wrongly triggered this error.
  if (mean_or_sum == "sum" && largest_row_total <= 1 + tol) {
    stop("Run 'mean_or_sum' = 'mean' for fractional mutation spectra that add up to 1")
  }

  # Define aggregation function
  aggregate_spectra <- if (mean_or_sum == "mean") {
    function(data) colMeans(data, na.rm = TRUE)
  } else {
    function(data) colSums(data, na.rm = TRUE)
  }

  # Compute observed distance
  observed_distance <- cosine_dist(
    aggregate_spectra(set1),
    aggregate_spectra(set2)
  )[[1]]

  # Prepare permutation dataset
  combined_set <- rbind(set1, set2)
  n_total <- nrow(combined_set)
  group_size <- nrow(set1)

  # set.seed(NULL) would re-initialise the RNG and throw away any seed the
  # caller set beforehand, so only touch the RNG when a seed was supplied.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Run permutations
  dist_rands <- numeric(n_sim)
  for (k in seq_len(n_sim)) {
    indices <- sample(seq_len(n_total), group_size)
    spectra_group1 <- aggregate_spectra(combined_set[indices, , drop = FALSE])
    spectra_group2 <- aggregate_spectra(combined_set[-indices, , drop = FALSE])
    dist_rands[k] <- cosine_dist(spectra_group1, spectra_group2)[[1]]
  }

  # Calculate p-value; the floor keeps it from being reported as exactly 0
  p_value <- max(c(mean(dist_rands >= observed_distance), 1 / n_sim))

  list(cosine = observed_distance, p = p_value, sims = dist_rands)
}

In [None]:
# Rewrite bracketed mutation labels as "<ref context>><alt context>".
#
# The first character is taken as the left flank, the last character as the
# right flank, and the text between "[" and "]" is split on ">" into ref and
# alt, e.g. "A[C>T]G" -> "ACG>ATG".
#
# x: character vector of labels (any length).
# Returns a character vector of the same length as `x`.
#
# Labels without a "[...]" part, or without ">" inside it, yield "NA" in the
# corresponding position of the output rather than an error.
convert_mutation <- function(x) {
  if (length(x) == 0) {
    return(character(0))
  }

  # Extract the left base, ref>alt, and right base
  left <- str_sub(x, 1, 1)
  change <- str_extract(x, "(?<=\\[).+?(?=\\])")  # grabs C>A etc
  right <- str_sub(x, -1, -1)

  # Split ref>alt into one row per label, one column per piece. Pieces must be
  # read column-wise: linear indexing (ref_alt[1], ref_alt[2]) is only correct
  # for a single label and silently mixes labels for longer input.
  ref_alt <- str_split(change, ">", simplify = TRUE)
  ref <- ref_alt[, 1]
  alt <- if (ncol(ref_alt) >= 2) {
    ref_alt[, 2]
  } else {
    rep(NA_character_, nrow(ref_alt))
  }

  # Construct new notation
  paste0(left, ref, right, ">", left, alt, right)
}

# Rewrite the mutation labels of both tables with convert_mutation().
# vapply() pins the result to one string per label.
matrix <- matrix %>%
  mutate(MutationType = vapply(MutationType, convert_mutation, character(1)))

v3.4 <- v3.4 %>%
  mutate(Type = vapply(Type, convert_mutation, character(1)))

# Transpose so that the former columns become rows and the mutation labels
# become column names. The first column (the labels) is dropped before
# transposing.
v3.4switched <- t(as.data.frame(v3.4[-1]))

colnames(v3.4switched) <- v3.4$Type

matrixswitched <- t(as.data.frame(matrix[-1]))

colnames(matrixswitched) <- matrix$MutationType

# Strip only a leading "X" from the row names; removing every "X" would also
# mangle ids that contain the letter elsewhere.
# NOTE(review): presumably the prefix is the one R adds to column names that
# start with a digit -- confirm against the raw sample ids.
rownames(matrixswitched) <- sub("^X", "", rownames(matrixswitched))

# drop = FALSE keeps a matrix even if a single row survives the filter.
matrixswitched <- matrixswitched[
  !(row.names(matrixswitched) %in% rows_to_remove), ,
  drop = FALSE
]

In [None]:
# Split the sample metadata by side and collect the sample ids of each side.
Right <- subset(Hiatt_novaf, side == "Right")
Left <- subset(Hiatt_novaf, side == "Left")

Right_list <- Right$crypt_sample
Left_list <- Left$crypt_sample

# Replace the metadata subsets with the matching spectrum rows.
# drop = FALSE keeps a matrix even if only one sample matches.
Right <- matrixswitched[rownames(matrixswitched) %in% Right_list, , drop = FALSE]
Left <- matrixswitched[rownames(matrixswitched) %in% Left_list, , drop = FALSE]

df_Hiatt <- df_Hiatt[!(row.names(df_Hiatt) %in% lowcov), ]

df_LScrypt <- df_LeeSix[!(row.names(df_LeeSix) %in% HLS), ]

df_Hiatt <- as.data.frame(df_Hiatt) / rowSums(as.data.frame(df_Hiatt))

# make sure Hiatt samples are here...
# NOTE(review): df_Hiatt is reassigned below from matrixswitched_prop, so the
# `lowcov` filter and normalisation applied to it above do not carry over.
matrixswitched_prop <- matrixswitched / rowSums(matrixswitched)
matrixswitched_prop <- as.data.frame(matrixswitched_prop)
df_Hiatt <- matrixswitched_prop[rownames(matrixswitched_prop) %in% Hiatt_list, ]
df_LeeSix <- matrixswitched_prop[rownames(matrixswitched_prop) %in% LeeSix_list, ]

#df_Hiatt <- df_Hiatt %>%
#  tibble::rownames_to_column(var = "crypt_subject")

# matrixswitched_prop keeps the sample ids in its row names, not in a column,
# so merging on "crypt_sample" needs the ids copied into a column first.
# check.names = FALSE keeps the mutation-type column names untouched.
# NOTE(review): assumes combined_df has a `crypt_sample` column -- confirm.
matrix <- merge(
  data.frame(
    crypt_sample = rownames(matrixswitched_prop),
    matrixswitched_prop,
    check.names = FALSE
  ),
  combined_df,
  by = "crypt_sample"
)

# Convert the per-side counts to row fractions for the "mean" run.
Right <- as.data.frame(Right) / rowSums(as.data.frame(Right))
Left <- as.data.frame(Left) / rowSums(as.data.frame(Left))

# Run AMSD
exp_v_con <- amsd(Right,
                  Left,
                  mean_or_sum = "mean",
                  n_sim = 10000,
                  seed = 123)
plot_amsd_histogram(exp_v_con)

# Run with the "sum" option on mutation counts rather than mutation type
# fractions, which changes the permutation cosine distance distribution, but
# not the p-value, indicating the result is not driven by high- or low-mutation
# outliers

LScell <- as.data.frame(df_LScell) / rowSums(as.data.frame(df_LScell))

LScrypt <- as.data.frame(df_LScrypt) / rowSums(as.data.frame(df_LScrypt))

# The "sum" mode needs the un-normalised tables: amsd() stops with an error
# when it is given row fractions together with mean_or_sum = "sum".
# NOTE(review): assumes df_LScell / df_LScrypt hold raw counts -- confirm.
exp_v_con <- amsd(as.data.frame(df_LScell),
                  as.data.frame(df_LScrypt),
                  mean_or_sum = "sum",
                  n_sim = 10000,
                  seed = 123)
plot_amsd_histogram(exp_v_con)