In [1]:
library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(survival)
library(survminer)
library(tibble)
library(edgeR)
library(biomaRt)

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, ro

In [2]:
# Load the protein-coding gene list as a character vector, one entry per line
# of the file. The entries are later matched against biomaRt's
# `external_gene_name`, so they are presumably HGNC gene symbols -- TODO confirm
# against the file contents.
# NOTE(review): the path is specific to one user's home directory; adjust it
# before running elsewhere.
protein_coding_genes <- readLines("~/Downloads/Annan_Project/Survival/Prot_cod_list_107")

In [3]:
# Set up the Ensembl BioMart connection for the human gene dataset. The
# resulting `ensembl` handle is passed as `mart =` to the getBM() lookups
# further down (gene symbols and transcript lengths).
# NOTE(review): no Ensembl release is pinned here, so annotation presumably
# follows whatever release the default host serves -- confirm if results must
# be reproducible.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

In [4]:
# Convert raw read counts to log2(TPM + 1).
#
# counts:       numeric matrix with one row per gene and one column per sample.
# gene_lengths: numeric vector of gene lengths, one entry per row of `counts`
#               (the division recycles it down every column).
# Returns a matrix with the same layout as `counts` holding log2(TPM + 1).
counts_to_tpm <- function(counts, gene_lengths) {
  # Length-normalised read rate for every gene in every sample.
  per_length <- counts / gene_lengths

  # One scaling denominator per sample (column).
  sample_totals <- colSums(per_length)

  # Work on the transpose so the per-sample totals recycle along the samples,
  # then flip back to genes x samples.
  scaled <- t(per_length) * 1e6 / sample_totals
  log2(t(scaled) + 1)
}

In [5]:
# Fit one Cox proportional-hazards model per gene, adjusted for age and sex.
#
# expr_matrix: numeric matrix of expression values with genes in rows and
#              samples in columns. Row names are reported as the gene labels
#              and may contain duplicates.
# clin_df:     per-sample clinical table providing the columns `age`,
#              `gender`, `OS.time` and `OS`. Its rows must be in the same
#              order as the columns of `expr_matrix`.
#
# Returns a data.frame with one row per successfully fitted gene and columns
#   Gene - row name of the gene in `expr_matrix`
#   HR   - exp(coef) of the expression term
#   pval - p-value of the expression term ("Pr(>|z|)")
#   FDR  - `pval` adjusted with p.adjust(method = "fdr")
# Genes with five or fewer distinct expression values (after dropping
# incomplete samples), or whose coxph() call raises an error, are omitted.
run_cox_analysis <- function(expr_matrix, clin_df) {
  # data.frame() would silently recycle a shorter column when the lengths
  # happen to divide, pairing expression values with the wrong patients.
  if (!is.null(dim(expr_matrix)) && ncol(expr_matrix) != nrow(clin_df)) {
    stop(
      "expr_matrix has ", ncol(expr_matrix), " samples but clin_df has ",
      nrow(clin_df), " rows",
      call. = FALSE
    )
  }

  genes <- rownames(expr_matrix)
  n_genes <- length(genes)

  # The clinical covariates are identical for every gene; build them once.
  covariates <- data.frame(
    age = clin_df$age,
    sex = clin_df$gender,
    OS.time = clin_df$OS.time,
    OS = clin_df$OS
  )

  # Preallocate instead of growing a data.frame with rbind() on every gene.
  hr <- rep(NA_real_, n_genes)
  pval <- rep(NA_real_, n_genes)
  fitted <- logical(n_genes)

  for (i in seq_along(genes)) {
    # Index by position, not by name: with duplicated row names a name lookup
    # always returns the first matching row.
    df <- covariates
    df$expr <- as.numeric(expr_matrix[i, ])
    df <- df[complete.cases(df), ]

    # Skip genes that are (nearly) constant across the usable samples.
    if (length(unique(df$expr)) <= 5) {
      next
    }

    cox <- tryCatch(
      coxph(Surv(OS.time, OS) ~ expr + age + sex, data = df),
      error = function(e) NULL
    )
    if (is.null(cox)) {
      next
    }

    coefs <- summary(cox)$coefficients
    hr[i] <- unname(coefs["expr", "exp(coef)"])
    pval[i] <- unname(coefs["expr", "Pr(>|z|)"])
    fitted[i] <- TRUE
  }

  results <- data.frame(
    # as.character() keeps the column present when there are no row names.
    Gene = as.character(genes[fitted]),
    HR = hr[fitted],
    pval = pval[fitted],
    stringsAsFactors = FALSE
  )
  results$FDR <- p.adjust(results$pval, method = "fdr")
  results
}

In [6]:
# Cancer types to analyze: a character vector of GDC project IDs that the
# processing loop iterates over. Currently restricted to TCGA-LAML; the full
# pan-cancer list is kept below, commented out.
tcga_projects <- c("TCGA-LAML")
#tcga_projects <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC",
#                   "TCGA-CHOL", "TCGA-COAD", "TCGA-DLBC", "TCGA-ESCA",
#                   "TCGA-GBM", "TCGA-HNSC", "TCGA-KICH", "TCGA-KIRC",
#                   "TCGA-KIRP", "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC",
#                   "TCGA-LUAD", "TCGA-LUSC", "TCGA-MESO", "TCGA-OV",
#                   "TCGA-PAAD", "TCGA-PCPG", "TCGA-PRAD", "TCGA-READ",
#                   "TCGA-SARC", "TCGA-SKCM", "TCGA-STAD", "TCGA-TGCT",
#                   "TCGA-THCA", "TCGA-THYM", "TCGA-UCEC", "TCGA-UCS",
#                   "TCGA-UVM")

In [8]:
# Loop through each TCGA project: obtain STAR gene counts, convert the
# protein-coding genes to log2(TPM + 1), and write per-gene Cox regression
# results to "<project>_protein_coding_cox.tsv".
#
# Uses objects defined earlier in the notebook: `tcga_projects`,
# `protein_coding_genes`, `ensembl`, counts_to_tpm() and run_cox_analysis().
# A project whose output file already exists is skipped; an error in one
# project is reported and the loop moves on to the next.
for (project in tcga_projects) {
  cat("Processing", project, "...\n")
  output_file <- paste0(project, "_protein_coding_cox.tsv")
  rds_file <- paste0(project, "_summarized.rds")

  if (file.exists(output_file)) {
    cat("=> Skipping", project, "- output exists\n")
    next
  }

  tryCatch({
    # Step 1: Load the cached SummarizedExperiment, or query & download it.
    # The cache was previously written but never read, so every rerun
    # repeated the full download and preparation.
    if (file.exists(rds_file)) {
      cat("=> Reusing cached", rds_file, "\n")
      data <- readRDS(rds_file)
    } else {
      # TCGA-LAML samples are peripheral blood; the other projects in the
      # list are queried as solid primary tumours.
      # NOTE(review): some projects (e.g. TCGA-SKCM) are dominated by
      # metastatic samples -- confirm "Primary Tumor" is the intended cohort.
      sample_type <- if (identical(project, "TCGA-LAML")) {
        "Primary Blood Derived Cancer - Peripheral Blood"
      } else {
        "Primary Tumor"
      }
      query <- GDCquery(
        project = project,
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "STAR - Counts",
        sample.type = sample_type
      )
      GDCdownload(query, method = "api", files.per.chunk = 20)
      data <- GDCprepare(query, summarizedExperiment = TRUE)
      saveRDS(data, rds_file)
    }

    # Step 2: Extract and clean expression data.
    # assay(data) returns the first assay of the SummarizedExperiment.
    counts <- assay(data)
    # Pseudo-autosomal Y copies carry IDs such as "ENSG...<version>_PAR_Y".
    # Stripping the version would collapse them onto their X counterparts and
    # create duplicated row names, so drop them first (no-op if none exist).
    counts <- counts[!grepl("_PAR_Y$", rownames(counts)), , drop = FALSE]
    rownames(counts) <- gsub("\\..*", "", rownames(counts))  # Remove version numbers

    # Step 3: Map Ensembl IDs to Gene Symbols
    gene_map <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
                      filters = "ensembl_gene_id",
                      values = rownames(counts),
                      mart = ensembl)
    gene_map <- gene_map[gene_map$external_gene_name %in% protein_coding_genes, ]
    gene_map <- gene_map[!duplicated(gene_map$ensembl_gene_id), ]
    rownames(gene_map) <- gene_map$ensembl_gene_id

    # Step 4: Filter to protein-coding genes
    counts <- counts[rownames(counts) %in% rownames(gene_map), , drop = FALSE]

    # Step 5: Get gene lengths and compute log(TPM + 1).
    # A gene's length is taken as the median length of its transcripts.
    gene_lengths <- getBM(attributes = c("ensembl_gene_id", "transcript_length"),
                          filters = "ensembl_gene_id",
                          values = rownames(counts),
                          mart = ensembl)
    gene_lengths <- aggregate(transcript_length ~ ensembl_gene_id, gene_lengths, median)
    rownames(gene_lengths) <- gene_lengths$ensembl_gene_id

    # Align lengths with counts; genes without a length are dropped.
    gene_lengths_vec <- gene_lengths[rownames(counts), "transcript_length"]
    keep <- !is.na(gene_lengths_vec)
    counts <- counts[keep, , drop = FALSE]
    gene_lengths_vec <- gene_lengths_vec[keep]

    # Compute TPM
    tpm_expr <- counts_to_tpm(counts, gene_lengths_vec)

    # Rename rows with gene symbols.
    # Several Ensembl IDs can share one symbol, so row names may repeat.
    rownames(tpm_expr) <- gene_map[rownames(tpm_expr), "external_gene_name"]

    # Step 6: Process clinical data.
    # A sample counts as an event when days_to_death is recorded; otherwise
    # it is censored at days_to_last_follow_up.
    # NOTE(review): a deceased patient with a missing days_to_death is treated
    # as censored here -- consider deriving OS from vital_status instead.
    clin <- colData(data)
    clin$OS.time <- ifelse(is.na(clin$days_to_death), clin$days_to_last_follow_up, clin$days_to_death)
    clin$OS <- ifelse(is.na(clin$days_to_death), 0, 1)
    clin$age <- as.numeric(clin$age_at_diagnosis)
    clin$gender <- as.factor(clin$gender)

    # Step 7: Match expression and clinical samples.
    # The positional index relies on colData rows being in the same order as
    # the assay columns.
    keep_samples <- which(!is.na(clin$OS.time) & !is.na(clin$OS) & !is.na(clin$age) & !is.na(clin$gender))
    clin <- clin[keep_samples, ]
    tpm_expr <- tpm_expr[, keep_samples, drop = FALSE]

    # Match sample order
    common_samples <- intersect(colnames(tpm_expr), rownames(clin))
    if (length(common_samples) == 0) {
      # Fail here rather than write a header-only file that would make later
      # runs skip this project.
      stop("no samples with complete survival, age and gender data")
    }
    clin <- clin[common_samples, ]
    tpm_expr <- tpm_expr[, common_samples, drop = FALSE]

    # Step 8: Run Cox regression
    res <- run_cox_analysis(tpm_expr, clin)

    # Step 9: Save results
    write.table(res, file = output_file, sep = "\t", row.names = FALSE, quote = FALSE)
    cat("=> Finished", project, "\n")

  }, error = function(e) {
    cat("!! Error processing", project, ":", conditionMessage(e), "\n")
  })
}

Processing TCGA-LAML ...


--------------------------------------

o GDCquery: Searching in GDC database

--------------------------------------

Genome of reference: hg38

--------------------------------------------

oo Accessing GDC. This might take a while...

--------------------------------------------

ooo Project: TCGA-LAML

--------------------

oo Filtering results

--------------------

ooo By data.type

ooo By workflow.type

ooo By sample.type

----------------

oo Checking data

----------------

ooo Checking if there are duplicated cases

ooo Checking if there are results for the query

-------------------

o Preparing output

-------------------

Downloading data for project TCGA-LAML

GDCdownload will download 151 files. A total of 639.823772 MB

Downloading chunk 1 of 8 (20 files, size = 84.748568 MB) as Fri_Jun_20_07_52_30_2025_0.tar.gz



Downloading: 22 MB        

Downloading chunk 2 of 8 (20 files, size = 84.762474 MB) as Fri_Jun_20_07_52_30_2025_1.tar.gz



Downloading: 22 MB      

Downloading chunk 3 of 8 (20 files, size = 84.761009 MB) as Fri_Jun_20_07_52_30_2025_2.tar.gz



Downloading: 22 MB      

Downloading chunk 4 of 8 (20 files, size = 84.696685 MB) as Fri_Jun_20_07_52_30_2025_3.tar.gz



Downloading: 22 MB      

Downloading chunk 5 of 8 (20 files, size = 84.739132 MB) as Fri_Jun_20_07_52_30_2025_4.tar.gz



Downloading: 22 MB      

Downloading chunk 6 of 8 (20 files, size = 84.726816 MB) as Fri_Jun_20_07_52_30_2025_5.tar.gz



Downloading: 22 MB      

Downloading chunk 7 of 8 (20 files, size = 84.765898 MB) as Fri_Jun_20_07_52_30_2025_6.tar.gz



Downloading: 22 MB      

Downloading chunk 8 of 8 (11 files, size = 46.62319 MB) as Fri_Jun_20_07_52_30_2025_7.tar.gz





Starting to add information to samples

 => Add clinical information to samples

 => Adding TCGA molecular information from marker papers

 => Information will have prefix 'paper_' 

Available assays in SummarizedExperiment : 
  => unstranded
  => stranded_first
  => stranded_second
  => tpm_unstrand
  => fpkm_unstrand
  => fpkm_uq_unstrand

“Loglik converged before variable  1 ; coefficient may be infinite. ”
“Loglik converged before variable  1 ; coefficient may be infinite. ”


=> Finished TCGA-LAML 
