In [7]:
# MAKE_volcano_plot_Rscript
# This script generates volcano plots from linear regression results

library(ggplot2)    # For plotting
library(ggrepel)    # For non-overlapping text labels

In [38]:
# nolint start

# Draw one labelled volcano plot (effect size vs. -log10 adjusted p-value) per
# omics type and comparison, from the linear-regression result tables, and
# save each plot as a PDF under <regression_dir>/volcano_plot/.

omics_list <- c("metabolomics", "proteomics")   # Omics types to process
condition_list <- c("cVSpos", "cVSneg")         # Comparison conditions

# Directory holding the per-omics regression tables; plots go in a subfolder.
regression_dir <- "../../../analysis/statistics/linear_regression"
volcano_dir <- file.path(regression_dir, "volcano_plot")

# Significance cut-offs (loop-invariant, so defined once up front).
cohend_threshold <- 0.5            # Threshold for effect size (Cohen's D)
# neg_log10_padj_threshold <- 1.30103  # = -log10(0.05)
neg_log10_padj_threshold <- 2.0    # = -log10(0.01)

# Fixed y-axis range of the plot.
ylim_bottom <- 0
ylim_top <- 6

# Per-condition x-axis label and colour for features with positive effect size.
# Features with negative effect size are always drawn in the control colour.
denominator_color <- "#78AF3F"          # Control color
condition_styles <- list(
    cVSpos = list(
        xaxis_label = "Effect size (Cohen's D): ACPA+ RA vs. Control",
        numerator_color = "#636363"     # ACPA+ color
    ),
    cVSneg = list(
        xaxis_label = "Effect size (Cohen's D): ACPA– RA vs. Control",
        numerator_color = "#B57623"     # ACPA– color
    )
)

# Fail early on a condition with no styling instead of silently reusing the
# label/colours left over from a previous iteration.
unknown_conditions <- setdiff(condition_list, names(condition_styles))
if (length(unknown_conditions) > 0) {
    stop("No plot styling defined for condition(s): ",
         paste(unknown_conditions, collapse = ", "), call. = FALSE)
}

# pdf() cannot create missing directories, so make sure the target exists.
dir.create(volcano_dir, showWarnings = FALSE, recursive = TRUE)

for (omics in omics_list) {
    for (condition in condition_list) {

        # Load linear regression result file (feature names in first column)
        data_file <- file.path(
            regression_dir, omics,
            paste0("linear_regression.", condition, ".", omics, ".tsv")
        )
        print(data_file)
        input_df <- read.csv(data_file, sep = "\t", header = TRUE, row.names = 1)

        # Both columns are required; report a clear error if either is absent.
        if (is.null(input_df$cohen_d) || is.null(input_df$adj_pval)) {
            stop("Expected columns 'cohen_d' and 'adj_pval' in ", data_file,
                 call. = FALSE)
        }

        style <- condition_styles[[condition]]
        xaxis_label <- style$xaxis_label
        numerator_color <- style$numerator_color

        # Plot data: x = effect size, y = -log10 of the adjusted p-value.
        gene_list <- rownames(input_df)  # Feature names
        df <- data.frame(
            cohen_d = input_df$cohen_d,
            neg_log10_padj = -log10(input_df$adj_pval),
            row.names = gene_list
        )
        print(dim(df))
        df$genes <- gene_list  # feature names as a column too (for labels)

        # Significant features: above the p-value line AND beyond the
        # effect-size line on either side. subset() drops rows with NA.
        sig_subset <- subset(df, neg_log10_padj > neg_log10_padj_threshold)
        sig_red_subset <- subset(sig_subset, cohen_d > cohend_threshold)
        sig_blue_subset <- subset(sig_subset, cohen_d < -cohend_threshold)

        # Debug printout
        print("---------------")
        print(paste0(omics, ": ", condition))
        print(paste0("up: ", nrow(sig_red_subset)))
        print(paste0("down: ", nrow(sig_blue_subset)))
        print("---------------")

        # Set figure title and output path
        figure_title <- paste0(omics, ": ", condition)
        output_pdf <- file.path(
            volcano_dir,
            paste0(omics, ".", condition, ".volcano.label.pdf")
        )

        # Build volcano plot
        # NOTE(review): the y values are derived from adj_pval, but the axis
        # label says "P-value" -- confirm whether the label should say
        # "adjusted P-value".
        plot_pdf <- ggplot(df, aes(x = cohen_d, y = neg_log10_padj)) +
            coord_cartesian(xlim = c(-1.2, 1.2),
                            ylim = c(ylim_bottom, ylim_top)) +
            geom_point(colour = "#DCDCDC", size = 2.5, stroke = 0) +  # Background dots
            geom_hline(yintercept = neg_log10_padj_threshold,
                       colour = "#BEBEBE", linetype = "dashed") +
            geom_vline(xintercept = c(cohend_threshold, -cohend_threshold),
                       colour = "#BEBEBE", linetype = "dashed") +
            geom_point(data = sig_red_subset, colour = numerator_color,
                       size = 2.5, stroke = 0) +
            geom_point(data = sig_blue_subset, colour = denominator_color,
                       size = 2.5, stroke = 0) +
            # ggrepel may leave crowded points unlabelled (it warns when it
            # does); raise max.overlaps here if every label is required.
            geom_text_repel(data = sig_red_subset, aes(label = genes),
                            colour = numerator_color, size = 2) +
            geom_text_repel(data = sig_blue_subset, aes(label = genes),
                            colour = denominator_color, size = 2) +
            ylab("-log10 (P-value)") + xlab(xaxis_label) +
            ggtitle(figure_title) +
            theme_bw() +
            theme(axis.line = element_line(colour = "black"),
                  panel.grid.major = element_blank(),
                  panel.grid.minor = element_blank(),
                  panel.border = element_blank(),
                  panel.background = element_blank())

        # Save volcano plot to PDF. The device is closed even if rendering
        # fails, so a broken plot cannot leave a stray open graphics device.
        pdf(output_pdf)
        tryCatch(
            print(plot_pdf),
            finally = dev.off()
        )
    }
}


[1] "../../../analysis/statistics/linear_regression/metabolomics/linear_regression.cVSpos.metabolomics.tsv"
[1] 1061    2
[1] "---------------"
[1] "metabolomics: cVSpos"
[1] "up: 2"
[1] "down: 4"
[1] "---------------"
[1] "../../../analysis/statistics/linear_regression/metabolomics/linear_regression.cVSneg.metabolomics.tsv"
[1] 1061    2
[1] "---------------"
[1] "metabolomics: cVSneg"
[1] "up: 5"
[1] "down: 19"
[1] "---------------"


“ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps”


[1] "../../../analysis/statistics/linear_regression/proteomics/linear_regression.cVSpos.proteomics.tsv"
[1] 7273    2
[1] "---------------"
[1] "proteomics: cVSpos"
[1] "up: 15"
[1] "down: 3"
[1] "---------------"
[1] "../../../analysis/statistics/linear_regression/proteomics/linear_regression.cVSneg.proteomics.tsv"
[1] 7273    2
[1] "---------------"
[1] "proteomics: cVSneg"
[1] "up: 24"
[1] "down: 49"
[1] "---------------"


“ggrepel: 1 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
“ggrepel: 37 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
