Load libraries and data

In [None]:
library(dplyr)
library(stringr)
library(ggplot2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [None]:
# ---- Study data frames ----

# Region labels (matched case-insensitively below) that place a sample on the
# right or left side.
pattern_right <- "cecum|asc_col|trans_col"
pattern_left  <- "desc_col|sig_col|rectum"

# Hiatt: read the table, then append sex, side and the study label.
Hiatt <- read.csv(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/Hiattdataframe.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  mutate(
    # NOTE(review): every Hiatt row is set to "male" -- confirm this is intended.
    sex = "male",
    # Rows whose region matches neither pattern keep the region value as side.
    side = case_when(
      str_detect(region, regex(pattern_right, ignore_case = TRUE)) ~ "Right",
      str_detect(region, regex(pattern_left, ignore_case = TRUE)) ~ "Left",
      TRUE ~ region
    ),
    # Origin column so the studies can be told apart after stacking.
    study = "Hiatt"
  )

# LeeSix: read, harmonise names and labels with Hiatt, keep the needed
# columns, tag the study, and drop ileum samples.
LeeSix <- read.csv(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixdataframe.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  rename(
    crypt_sample = subject_id,
    donor = patient
  ) %>%
  mutate(
    # Get cohort to match Hiatt.
    cohort = ifelse(
      cohort_status == "bowel_cancer_screening_Yes",
      "carcinoma",
      "normal"
    ),
    # Get region to match Hiatt. Conditions are tried top to bottom and the
    # first hit wins; anything unmatched falls back to the raw `site` value.
    # NOTE(review): "AC", "SC" and "Tr" are matched case-insensitively anywhere
    # in `phenotype` (e.g. "sc" also occurs in "ascending"/"descending") --
    # confirm the phenotype vocabulary makes this safe.
    region = case_when(
      grepl("Caecum", phenotype, ignore.case = TRUE) ~ "cecum",
      grepl("AC", phenotype, ignore.case = TRUE) ~ "asc_col",
      grepl("Sigmoid", phenotype, ignore.case = TRUE) ~ "sig_col",
      grepl("SC", phenotype, ignore.case = TRUE) ~ "sig_col",
      grepl("Tr", phenotype, ignore.case = TRUE) ~ "trans_col",
      grepl("Transverse", site, ignore.case = TRUE) ~ "trans_col",
      TRUE ~ site
    ),
    # Same side rule as for Hiatt.
    side = case_when(
      str_detect(region, regex(pattern_right, ignore_case = TRUE)) ~ "Right",
      str_detect(region, regex(pattern_left, ignore_case = TRUE)) ~ "Left",
      TRUE ~ region
    )
  ) %>%
  # Keep the important stuff.
  select(
    crypt_sample, region, side, donor, sex, age,
    cohort, coverage, unique_SNVs, T.G, T.C,
    C.T, C.A, T.A, C.G, CpG
  ) %>%
  # Origin column so the studies can be told apart after stacking.
  mutate(study = "LeeSix") %>%
  # TODO: maybe exclude HLS because it's single cell?
  # Ileum samples are excluded.
  filter(region != "Ileum")

In [3]:
# ---- Additional data: per-sample depth totals ----
# The file is read without a header row; its first two columns are named
# crypt_sample and depth.
denom <- read.csv(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/mosdepth/per_base/depth8_totals.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(denom)[1:2] <- c("crypt_sample", "depth")

# Inner-join depth onto each study table (samples without a depth row are
# dropped by merge's default), then coerce depth to numeric.
Hiatt <- merge(denom, Hiatt, by = "crypt_sample")
Hiatt[["depth"]] <- as.numeric(Hiatt[["depth"]])

LeeSix <- merge(denom, LeeSix, by = "crypt_sample")
LeeSix[["depth"]] <- as.numeric(LeeSix[["depth"]])

In [4]:
# Level orders used for the factor columns below.
region_order <- c("Right", "cecum", "asc_col", "trans_col", "desc_col", "sig_col", "rectum", "Left")
cohort_order <- c("normal", "adenoma", "carcinoma")
side_order <- c("Right", "Left")

# Stack both studies and impose the level orders. Values that are not listed
# in the corresponding *_order vector become NA.
combined_df <- rbind(Hiatt, LeeSix)
combined_df[["region"]] <- factor(combined_df[["region"]], levels = region_order)
combined_df[["cohort"]] <- factor(combined_df[["cohort"]], levels = cohort_order)
combined_df[["side"]] <- factor(combined_df[["side"]], levels = side_order)

# Re-derive the per-study tables from the stacked table so they carry the
# same factor columns.
Hiatt <- combined_df[combined_df$study %in% "Hiatt", , drop = FALSE]
LeeSix <- combined_df[combined_df$study %in% "LeeSix", , drop = FALSE]

# Re-apply factor(): existing factors keep their level order but lose unused
# levels; character columns become factors with default (sorted) levels.
combined_df[c("cohort", "study", "donor", "side", "sex")] <- lapply(
  combined_df[c("cohort", "study", "donor", "side", "sex")],
  factor
)

Hiatt[c("cohort", "donor", "side", "sex")] <- lapply(
  Hiatt[c("cohort", "donor", "side", "sex")],
  factor
)

LeeSix[c("cohort", "donor", "side", "sex")] <- lapply(
  LeeSix[c("cohort", "donor", "side", "sex")],
  factor
)



In [5]:
# Read the colibactin coding/genic table.
colibactin_coding_genic <- read.csv(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/scripts/pks/colibactin_coding_genic.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# The first column holds the sample id; name it like the study tables' key.
names(colibactin_coding_genic)[1] <- "crypt_sample"

# Keep only the columns used downstream, in this order.
colibactin_coding_genic <- colibactin_coding_genic[
  ,
  c(
    "crypt_sample",
    "n_with_driver_gene_coding",
    "driver_coding_snv",
    "n_with_driver_gene_id",
    "driver_gene_id",
    "n_with_driver_gene_snv",
    "driver_genes_snv",
    "total_coding_indels",
    "in_pks_motif_coding",
    "fraction_pksmotif_coding",
    "total_indels_genic",
    "in_pks_motif_genic",
    "fraction_pksmotif_genic",
    "n_AA_genic",
    "n_with_gene_snv",
    "ID18_refit_coding",
    "ID18_refit_genic",
    "SBS88_refit_coding",
    "SBS88_refit_genic",
    "n_AA_coding",
    "n_with_coding_snv"
  ),
  drop = FALSE
]


In [6]:
# Tab-separated signature activity tables (one row per sample).
SBSActivities <- read.delim(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/snvs/SBS288/Suggested_Solution/COSMIC_SBS288_Decomposed_Solution/Activities/COSMIC_SBS288_Activities.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
IDActivities <- read.delim(
  "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/ID/ID83/Suggested_Solution/COSMIC_ID83_Decomposed_Solution/Activities/COSMIC_ID83_Activities.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Full outer join of two activity tables on their "Samples" column.
# Change `all` to get a different join type.
merge_function <- function(x, y) {
  merge(x, y, all = TRUE, by = "Samples")
}

# Fold every activity table into one wide table keyed by sample.
activities_list <- list(SBSActivities, IDActivities)
Activities <- Reduce(merge_function, activities_list)
Activities <- rename(Activities, crypt_sample = Samples)

# A sample missing from one of the inputs gets NA after the outer join;
# record those activities as 0.
Activities[is.na(Activities)] <- 0

# Inner-join the activities, then the colibactin table, onto each table.
Hiatt <- Hiatt %>%
  merge(Activities, by = "crypt_sample") %>%
  merge(colibactin_coding_genic, by = "crypt_sample")

LeeSix <- LeeSix %>%
  merge(Activities, by = "crypt_sample") %>%
  merge(colibactin_coding_genic, by = "crypt_sample")

combined_df <- combined_df %>%
  merge(Activities, by = "crypt_sample") %>%
  merge(colibactin_coding_genic, by = "crypt_sample")



In [7]:
# ---- Mutation loads and refit signature values ----
# Reads six per-sample tables and inner-joins each onto Hiatt, LeeSix and
# combined_df, and adds two ratio columns (snv_perc, ID18_perc).
#
# All inputs are read before any table is modified, so a missing or malformed
# file stops the cell without leaving the three tables partially merged.

# Helpers used only inside this cell. They are removed at the end so they do
# not end up in the workspace written by save.image().

# Full path of a file in the project's scripts/pks directory.
pks_file <- function(file_name) {
  file.path(
    "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/scripts/pks",
    file_name
  )
}

# Read a CSV with `name` and `value` columns and return a two-column table:
# crypt_sample (from `name`) and `value_name` (from `value`).
read_refit <- function(file_name, value_name) {
  refit <- read.csv(pks_file(file_name), stringsAsFactors = FALSE, header = TRUE)
  absent <- setdiff(c("name", "value"), names(refit))
  if (length(absent) > 0) {
    stop(
      file_name, " is missing column(s): ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  refit <- refit[, c("name", "value"), drop = FALSE]
  names(refit) <- c("crypt_sample", value_name)
  refit
}

# Inner-join `extra` onto every table in `tables` using the same key(s).
merge_into_each <- function(tables, extra, by = "crypt_sample") {
  lapply(tables, merge, y = extra, by = by)
}

# Add tbl[[out]] = tbl[[num]] / tbl[[den]]. Columns are looked up by exact
# name: `tbl$num` would silently fall back to a unique partial match (for
# example `total_indels` -> `total_indels_genic`) when the column is absent.
add_ratio <- function(tbl, out, num, den) {
  absent <- setdiff(c(num, den), names(tbl))
  if (length(absent) > 0) {
    stop(
      "Cannot compute ", out, "; missing column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  tbl[[out]] <- tbl[[num]] / tbl[[den]]
  tbl
}

# -- Read every input first ------------------------------------------------
indel_loads <- read.csv(
  pks_file("2026_01_26_indel_loads.csv"),
  stringsAsFactors = FALSE,
  header = TRUE
) %>%
  rename(
    crypt_sample = name,
    cohort = injection
  )

snv_loads <- read.csv(
  pks_file("2026_01_26_snv_loads.csv"),
  stringsAsFactors = FALSE,
  header = TRUE
) %>%
  rename(crypt_sample = name)

SBS88_loads <- read_refit("2026_01_26_SBS88_refit_abs.csv", "SBS88_refit_load")
ID18_loads <- read_refit("2026_01_26_ID18_refit_abs.csv", "ID18_refit_load")
SBS88_percs <- read_refit("2026_01_26_SBS88_refit.csv", "SBS88_refit_perc")
ID18_percs <- read_refit("2026_01_26_ID18_refit.csv", "ID18_refit_perc")

# -- Apply the same joins to all three tables -------------------------------
study_tables <- list(Hiatt = Hiatt, LeeSix = LeeSix, combined_df = combined_df)

# Indel loads are keyed on sample and cohort; everything else on sample only.
study_tables <- merge_into_each(
  study_tables, indel_loads,
  by = c("crypt_sample", "cohort")
)
study_tables <- merge_into_each(study_tables, snv_loads)

# Ratio columns. `unique_SNVs` is on the study tables already; `n_AA`,
# `total_indels` and `ID18` are presumably supplied by snv_loads, indel_loads
# and the signature activities respectively -- add_ratio() stops if any is
# absent. A zero denominator yields NaN/Inf, as with plain division.
study_tables <- lapply(
  study_tables, add_ratio,
  out = "snv_perc", num = "n_AA", den = "unique_SNVs"
)
study_tables <- lapply(
  study_tables, add_ratio,
  out = "ID18_perc", num = "ID18", den = "total_indels"
)

# Refit loads and percentages, in the original column order.
study_tables <- merge_into_each(study_tables, SBS88_loads)
study_tables <- merge_into_each(study_tables, ID18_loads)
study_tables <- merge_into_each(study_tables, SBS88_percs)
study_tables <- merge_into_each(study_tables, ID18_percs)

Hiatt <- study_tables[["Hiatt"]]
LeeSix <- study_tables[["LeeSix"]]
combined_df <- study_tables[["combined_df"]]

# Drop the cell-local helpers and the temporary list.
rm(study_tables, pks_file, read_refit, merge_into_each, add_ratio)



In [12]:
# Keep only samples with coverage above 9; rows with NA coverage are dropped.
combined_df <- combined_df[which(combined_df$coverage > 9), , drop = FALSE]
Hiatt <- Hiatt[which(Hiatt$coverage > 9), , drop = FALSE]
LeeSix <- LeeSix[which(LeeSix$coverage > 9), , drop = FALSE]

In [13]:
# Write every object in the workspace to ProcessData.RData in the current
# working directory.
save.image("ProcessData.RData")