In [1]:
suppressMessages(suppressWarnings(library(BiocManager)))
suppressMessages(suppressWarnings(library(GEOquery)))
suppressMessages(suppressWarnings(library(limma)))
suppressMessages(suppressWarnings(library(umap)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(DESeq2)))
suppressMessages(suppressWarnings(library(biomaRt)))
suppressMessages(suppressWarnings(library(stringi)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(msigdbr)))
suppressMessages(suppressWarnings(library(data.table)))
suppressMessages(suppressWarnings(library(org.Hs.eg.db)))
suppressMessages(suppressWarnings(library("tidyverse")))
suppressMessages(suppressWarnings(library("readxl")))
suppressMessages(suppressWarnings(library(tximportData)))

In [3]:
# ensembl <- useMart("ensembl")
# human <- useDataset(dataset="hsapiens_gene_ensembl", mart=ensembl)
# dmelanogaster <- useMart("ensembl", dataset="dmelanogaster_gene_ensembl")
# drerio <- useMart("ensembl", dataset="drerio_gene_ensembl")
# mmusculus <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
# rnorvegicus <- useMart("ensembl", dataset="rnorvegicus_gene_ensembl")
# ggallus <- useMart("ensembl", dataset="ggallus_gene_ensembl")

# saveRDS(human,"/data/timonaj/biomart_orthologs/human.RDS")
# saveRDS(dmelanogaster,"/data/timonaj/biomart_orthologs/dmelanogaster.RDS")
# saveRDS(drerio,"/data/timonaj/biomart_orthologs/drerio.RDS")
# saveRDS(mmusculus,"/data/timonaj/biomart_orthologs/mmusculus.RDS")
# saveRDS(rnorvegicus,"/data/timonaj/biomart_orthologs/rnorvegicus.RDS")
# saveRDS(ggallus,"/data/timonaj/biomart_orthologs/ggallus.RDS")

In [2]:
human <- readRDS("../data/biomart_orthologs/human.RDS")
dmelanogaster <- readRDS("../data/biomart_orthologs/dmelanogaster.RDS")
drerio <- readRDS("../data/biomart_orthologs/drerio.RDS")
mmusculus <- readRDS("../data/biomart_orthologs/mmusculus.RDS")
rnorvegicus <- readRDS("../data/biomart_orthologs/rnorvegicus.RDS")
ggallus <- readRDS("../data/biomart_orthologs/ggallus.RDS")

In [3]:
# Background set of genes
background_set <- fread("../data/background_set.txt")
#human <- readRDS("/data/timonaj/biomart_orthologs/human.RDS")
#dmelanogaster <- readRDS("/data/timonaj/biomart_orthologs/dmelanogaster.RDS")

# Functions

In [4]:
download_data <- function(geo_code) {
    test_suppl <- getGEOSuppFiles(geo_code)
    print(test_suppl)
    directory <- rownames(test_suppl)[1]
    read_count_mat <- (as.matrix(fread(directory),rownames=1))
    print(head(read_count_mat))
    
    return((read_count_mat))
}
           
get_deseq <- function(countData, condition) {
    dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), ~ condition)
    dds <- DESeq(dds)
    dds <- estimateSizeFactors(dds)
    #ddsCounts <- counts(dds, normalized=TRUE)
    #colnames(ddsCounts) <- dds$samples
    return(dds)   
}

In [5]:
getDataRanges <- function(qval_object, column) {
    print(paste("adj.P.Val range: ",range(na.omit(qval_object$padj))[1],
                " - ", range(na.omit(qval_object$padj))[2], sep =""))
    print(paste("adj.P.Val neg logfc range: ",range(na.omit(qval_object$log2FoldChange[qval_object$log2FoldChange < 0]))[1],
                " - ", range(na.omit(qval_object$log2FoldChange[qval_object$log2FoldChange < 0]))[2], sep=""))
    print(paste("adj.P.Val pos logfc range: ",range(na.omit(qval_object$log2FoldChange[qval_object$log2FoldChange > 0]))[1],
                " - ", range(na.omit(qval_object$log2FoldChange[qval_object$log2FoldChange > 0]))[2], sep=""))
}

In [6]:
filter_transcripts <- function(deseq_results,organism_name,biomart_file, attrbts,
                               geo_code, experiment, pval, human_orthologs, homology_confidence){
    
    # Examine gene variability
    total_up <- character(0)
    total_down <- character(0)
    recurrent_switchers <- list()
    
    if(homology_confidence) {
        attrbts_homolog <- c(attrbts, "hsapiens_homolog_orthology_confidence", 
                     "hsapiens_homolog_goc_score", "hsapiens_homolog_wga_coverage") 
    } else {
        attrbts_homolog <- attrbts
    }
    
    if (length(human_orthologs)==0) {
        
        suppressMessages(suppressWarnings(library(biomaRt)))
        human_orthologs <- getLDS(attributes=attrbts_homolog, filters=attrbts,
                              values=rownames(deseq_results),
                              mart=biomart_file,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                              martL=human)
        
    }
    
    # remove genes with no symbol
    print(paste("Total", organism_name, "transcripts with Available Human Orthologs:",
                nrow(human_orthologs),
                sep = " "))
    human_orthologs <- human_orthologs[human_orthologs$HGNC.symbol != "",]
    
    print(paste("Total", organism_name, "transcripts with Available Human Ortholog Gene Symbols:",
                length(unique(human_orthologs[,c(1)])),
                sep = " "))
    
    background_filtered_genes <- human_orthologs[human_orthologs$HGNC.symbol %in% background_set$gene,]

    print(paste("Total", organism_name, "genes with Available Human Ortholog Gene Symbols and in Background gene sets:",
                length(unique(background_filtered_genes[,c(1)])),
                sep = " "))

    # get transcript ID from gene
    unique_symbols <- unique(background_filtered_genes[,c(1)])

    genes_filtered <- unique(rownames(deseq_results)[rownames(deseq_results) %in% unique_symbols])


    print(paste("Total genes after orthologous mapping and filtering:",
                length(genes_filtered),
                sep = " "))
    
    # significant transcripts
    significant_genes <- deseq_results[(rownames(deseq_results) %in% genes_filtered) &
                        !is.na(deseq_results$padj) &
                        deseq_results$padj < pval,]
    print(paste("Total significant transcripts after orthologous mapping and filtering:",
                nrow(significant_genes),
                sep = " "))
    
    # separate into downregulated and upregulated with a cut off of logFC
    neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < -1,]
    pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 1,]
        
    if(nrow(significant_genes) > 8000) {
        neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < -1,]
        pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 1,]
    }

        #neg_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% neg_significant_transcripts$ID,]$transcriptGenes)
        #pos_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% pos_significant_transcripts$ID,]$transcriptGenes)

    print(paste("Total significant downregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(neg_significant_genes),
                sep = " "))

    print(paste("Total significant upregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(pos_significant_genes),
                sep = " "))

    final_neg_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% rownames(neg_significant_genes),]
    final_pos_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% rownames(pos_significant_genes),]

    print(paste("Total significant Final downregulated human othologs:",
                length(unique(final_neg_human_orthologs$HGNC.symbol)),
                sep = " "))

    print(paste("Total significant Final upregulated human othologs:",
                length(unique(final_pos_human_orthologs$HGNC.symbol)),
                sep = " "))

    # remove empty names and NAs
    # saveto the right  repository
    write.table(data.frame("HGNC" =final_neg_human_orthologs$HGNC.symbol),
                file = paste("./geo_degs/", organism_name, "_", experiment, "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
    write.table(data.frame("HGNC" =final_pos_human_orthologs$HGNC.symbol),
                file = paste("./geo_degs/", organism_name, "_", experiment, "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
        
    total_exp_list <- list("downregulated" = final_neg_human_orthologs,
                                            "upregulated" = final_pos_human_orthologs,
                                            "gene_qvals"=significant_genes$padj)
        
    down_genes <- unique(final_neg_human_orthologs$HGNC.symbol)
    up_genes <- unique(final_pos_human_orthologs$HGNC.symbol)

    intersection <- intersect(down_genes, up_genes)
    print(paste("current fraction of Intersection ",length(intersection)/sum(length(down_genes) + length(up_genes)),sep=""))
        
    for(gene in intersection) {
        if(gene %in% names(recurrent_switchers)) {
            recurrent_switchers[[gene]] = recurrent_switchers[[gene]] + 1
        } else {
            recurrent_switchers[[gene]] = 1
        }
    }
        
    total_up <- unique(append(total_up, up_genes, length(total_up)))
    total_down <- unique(append(total_down, down_genes, length(total_down)))
        
    getDataRanges(significant_genes, "Exp")
    
    
    total_intersection <- intersect(total_down, total_up)
    print(paste("Total fraction of gene intersection ",length(total_intersection)/sum(length(total_down) + length(total_up)), sep =""))
    
    return_list <- list("total_exp_list" = total_exp_list,
                        "recurrent_switchers" = recurrent_switchers)
    if(homology_confidence) {
        return_list[["homologs"]] <- background_filtered_genes
        return_list[["significant_homologs"]] <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% c(total_down, total_up),]
        return(return_list)
    }
    
    return(return_list)

}

In [7]:
filter_transcripts_nonhuman <- function(results,organism_name,biomart_file, attrbts,
                               geo_code, experiment, homology_confidence){
    
    # Examine gene variability
    total_up <- character(0)
    total_down <- character(0)
    recurrent_switchers <- list()
    
    if(homology_confidence) {
        attrbts_homolog <- c(attrbts, "hsapiens_homolog_orthology_confidence", 
                     "hsapiens_homolog_goc_score", "hsapiens_homolog_wga_coverage") 
    } else {
        attrbts_homolog <- attrbts
    }
    
    human_orthologs <- getLDS(attributes=attrbts_homolog, filters=attrbts,
                          values=results$genes,
                          mart=biomart_file,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                          martL=human)
    # remove genes with no symbol
    print(paste("Total", organism_name, "transcripts with Available Human Orthologs:",
                nrow(human_orthologs),
                sep = " "))

    human_orthologs <- human_orthologs[human_orthologs$HGNC.symbol != "",]

    print(paste("Total", organism_name, "transcripts with Available Human Ortholog Gene Symbols:",
                length(unique(human_orthologs[,c(1)])),
                sep = " "))
    
    background_filtered_genes <- human_orthologs[human_orthologs$HGNC.symbol %in% background_set$gene,]
    
    print(paste("Total", organism_name, "genes with Available Human Ortholog Gene Symbols and in Background gene sets:",
            length(unique(background_filtered_genes[,c(1)])),
            sep = " "))
    
    # get transcript ID from gene
    unique_symbols <- unique(background_filtered_genes[,c(1)])
    genes_filtered <- unique(rownames(deseq_results)[rownames(deseq_results) %in% unique_symbols])


    print(paste("Total genes after orthologous mapping and filtering:",
                length(genes_filtered),
                sep = " "))
    
    # significant transcripts
    significant_genes <- results[(rownames(deseq_results) %in% genes_filtered) &
                        !is.na(deseq_results$padj) &
                        deseq_results$padj < pval,]
    print(paste("Total significant transcripts after orthologous mapping and filtering:",
                nrow(significant_genes),
                sep = " "))
    
    # separate into downregulated and upregulated with a cut off of logFC
    neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < 0,]
    pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 0,]
        
    if(nrow(significant_genes) > 8000) {
        neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < -1,]
        pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 1,]
    }

        #neg_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% neg_significant_transcripts$ID,]$transcriptGenes)
        #pos_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% pos_significant_transcripts$ID,]$transcriptGenes)

    print(paste("Total significant downregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(neg_significant_genes),
                sep = " "))

    print(paste("Total significant upregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(pos_significant_genes),
                sep = " "))

    final_neg_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% rownames(neg_significant_genes),]
    final_pos_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% rownames(pos_significant_genes),]

    print(paste("Total significant Final downregulated human othologs:",
                length(unique(final_neg_human_orthologs$HGNC.symbol)),
                sep = " "))

    print(paste("Total significant Final upregulated human othologs:",
                length(unique(final_pos_human_orthologs$HGNC.symbol)),
                sep = " "))

    # remove empty names and NAs
    # saveto the right  repository
    write_out_files(final_neg_human_orthologs$HGNC.symbol, final_pos_human_orthologs$HGNC.symbol,
                organism_name, experiment)
        
    total_exp_list <- list("downregulated" = final_neg_human_orthologs,
                                            "upregulated" = final_pos_human_orthologs,
                                            "gene_qvals"=significant_genes$padj)
        
    down_genes <- unique(final_neg_human_orthologs$HGNC.symbol)
    up_genes <- unique(final_pos_human_orthologs$HGNC.symbol)

    intersection <- intersect(down_genes, up_genes)
    print(paste("current fraction of Intersection ",length(intersection)/sum(length(down_genes) + length(up_genes)),sep=""))
        
    for(gene in intersection) {
        if(gene %in% names(recurrent_switchers)) {
            recurrent_switchers[[gene]] = recurrent_switchers[[gene]] + 1
        } else {
            recurrent_switchers[[gene]] = 1
        }
    }
        
    total_up <- unique(append(total_up, up_genes, length(total_up)))
    total_down <- unique(append(total_down, down_genes, length(total_down)))
        
    getDataRanges(significant_genes, "Exp")
    
    
    total_intersection <- intersect(total_down, total_up)
    print(paste("Total fraction of gene intersection ",length(total_intersection)/sum(length(total_down) + length(total_up)), sep =""))
    
    return_list <- list("total_exp_list" = total_exp_list,
                        "recurrent_switchers" = recurrent_switchers)
    if(homology_confidence) {
        return_list[["homologs"]] <- background_filtered_genes
        return_list[["significant_homologs"]] <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% c(total_down, total_up),]
        return(return_list)
    }
    
    return(return_list)

}

In [8]:
write_out_files <- function(neg_genes, pos_genes, label, experiment) {
    write.table(data.frame("HGNC" =neg_genes),
                file = paste("./geo_degs/", label, "_", experiment, "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE)
    write.table(data.frame("HGNC" =pos_genes),
                file = paste("./geo_degs/", label, "_", experiment, "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE)
}

# DESeq code

## Wound healing

### GSE137897 - Hsapiens - Skin

In [13]:
# read_counts_GSE137897 <- download_data("GSE137897")
# saveRDS(read_counts_GSE137897, "./GSE137897/read_counts_GSE137897.RDS")
read_counts_GSE137897 <- readRDS("./GSE137897/read_counts_GSE137897.RDS")

aw_samples <- colnames(read_counts_GSE137897)[(grep("H.*\\_.*",colnames(read_counts_GSE137897)))]
pu_samples <- colnames(read_counts_GSE137897)[(grep("P.*\\_.*",colnames(read_counts_GSE137897)))]

GSE137897_countData <- read_counts_GSE137897
GSE137897_condition <- factor(c(rep("AW", length(aw_samples)), rep("PU",length(pu_samples))))

In [14]:
results_GSE137897 <- get_deseq(GSE137897_countData, GSE137897_condition)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

-- replacing outliers and refitting for 10122 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

estimating dispersions

fitting model and testing



In [15]:
results_GSE137897

class: DESeqDataSet 
dim: 23878 1170 
metadata(1): version
assays(6): counts mu ... replaceCounts replaceCooks
rownames(23878): LEF1 FOXJ2 ... RAI1-AS1 TMEM74B
rowData names(23): baseMean baseVar ... maxCooks replace
colnames(1170): H1_0001 H1_0002 ... PU5_1169 PU5_1170
colData names(3): condition sizeFactor replaceable

In [16]:
results(results_GSE137897, contrast=c("condition","PU", "AW"))

log2 fold change (MLE): condition PU vs AW 
Wald test p-value: condition PU vs AW 
DataFrame with 23878 rows and 6 columns
             baseMean log2FoldChange     lfcSE        stat      pvalue
            <numeric>      <numeric> <numeric>   <numeric>   <numeric>
LEF1          5.82505      -0.737541  0.580669    -1.27016 2.04028e-01
FOXJ2        17.62348      -0.364760  0.269190    -1.35503 1.75408e-01
ZNF654        7.30848       0.195039  0.311275     0.62658 5.30934e-01
TAL1          1.27040       3.820803  0.925940     4.12640 3.68481e-05
ZMYM2        40.98124      -0.455910  0.181253    -2.51532 1.18924e-02
...               ...            ...       ...         ...         ...
NUDT15    2.66934e+01    -0.18021500  0.207885 -0.86689703    0.385998
LINC00564 1.03683e-04    -0.04662163  2.952534 -0.01579038    0.987402
LINC00557 1.44105e-02     0.00993735  2.952512  0.00336573    0.997315
RAI1-AS1  1.90346e-02    -0.08068537  2.952503 -0.02732779    0.978198
TMEM74B   1.13588e-01    

In [21]:
GSE137897_files <- filter_transcripts(deseq_results = data.frame(results(results_GSE137897, contrast=c("condition","PU", "AW"))),
                                      organism_name ="hsapiens_wound_GSE137897",
                                      biomart_file = human,
                                      attrbts = "hgnc_symbol",
                                      geo_code = "GSE137897",
                                      experiment = "AW_vs_PU",
                                      pval = 0.05,
                                      human_ortholog=c(),
                                      homology_confidence = F)

[1] "Total hsapiens_wound_GSE137897 transcripts with Available Human Orthologs: 20423"
[1] "Total hsapiens_wound_GSE137897 transcripts with Available Human Ortholog Gene Symbols: 19949"
[1] "Total hsapiens_wound_GSE137897 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 19933"
[1] "Total genes after orthologous mapping and filtering: 19933"
[1] "Total significant transcripts after orthologous mapping and filtering: 4887"
[1] "Total significant downregulated hsapiens_wound_GSE137897 genes after orthologous mapping and filtering: 1252"
[1] "Total significant upregulated hsapiens_wound_GSE137897 genes after orthologous mapping and filtering: 1029"
[1] "Total significant Final downregulated human othologs: 1252"
[1] "Total significant Final upregulated human othologs: 1029"
[1] "current fraction of Intersection 0"
[1] "adj.P.Val range: 0 - 0.0499821748515364"
[1] "adj.P.Val neg logfc range: -5.99526116267939 - -0.0920834325700806"
[1] "adj.P.Val pos logfc range

### GSE166920 - Hsapiens - Skin

In [22]:
# read_counts_GSE166920 <- download_data("GSE166920")
# saveRDS(read_counts_GSE166920, "./GSE166920/read_counts_GSE166920.RDS")
read_counts_GSE166920 <- readRDS("./GSE166920/read_counts_GSE166920.RDS")

GSE166920_control_samples <- colnames(read_counts_GSE166920)[seq(2,28,7)]
GSE166920_exp_samples <- colnames(read_counts_GSE166920)[seq(4,28,7)]

GSE166920_countData <- read_counts_GSE166920[,c(seq(2,28,7), seq(4,28,7))]
GSE166920_condition <- factor(c(rep("control", length(GSE166920_control_samples)), rep("exp",length(GSE166920_exp_samples))))

                                                                                               size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE166920/GSE166920_Raw_gene_count_matrix.txt.gz 1034108
                                                                                            isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE166920/GSE166920_Raw_gene_count_matrix.txt.gz FALSE
                                                                                            mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE166920/GSE166920_Raw_gene_count_matrix.txt.gz  640
                                                                                                          mtime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE166920/GSE166920_Raw_gene_count_matrix.txt.gz 2022-04-21 19:28:26
                                                                                                          ctime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE166920/GSE166920_Raw_gene_co

In [23]:
head(GSE166920_countData)

Unnamed: 0,GTG-224-02,GTG-224-09,GTG-224-16,GTG-224-23,GTG-224-04,GTG-224-11,GTG-224-18,GTG-224-25
MT-TF,170,598,351,131,46,202,163,162
MT-RNR1,128005,245022,99690,58859,79522,118689,89271,105188
MT-TV,400,811,487,262,168,272,278,440
MT-RNR2,344894,800691,414192,227106,235369,393746,447495,382869
MT-TL1,478,1205,631,321,136,464,380,480
MT-ND1,7553,17671,10488,6141,7059,7205,16594,11922


In [24]:
results_GSE166920 <- get_deseq(GSE166920_countData, GSE166920_condition)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [25]:
results_GSE166920

class: DESeqDataSet 
dim: 36995 8 
metadata(1): version
assays(4): counts mu H cooks
rownames(36995): MT-TF MT-RNR1 ... RN7SL285P RNU1-5P
rowData names(22): baseMean baseVar ... deviance maxCooks
colnames(8): GTG-224-02 GTG-224-09 ... GTG-224-18 GTG-224-25
colData names(2): condition sizeFactor

In [26]:
GSE166920_files <- filter_transcripts(deseq_results = data.frame(results(results_GSE166920, contrast=c("condition","exp", "control"))),
                                      organism_name ="hsapiens_wound_GSE166920",
                                      biomart_file = human,
                                      attrbts = "hgnc_symbol",
                                      geo_code = "GSE166920",
                                      experiment = "control_vs_exp",
                                      pval = 0.05,
                                      human_orthologs = c(),
                                      homology_confidence = F)

[1] "Total hsapiens_wound_GSE166920 transcripts with Available Human Orthologs: 40216"
[1] "Total hsapiens_wound_GSE166920 transcripts with Available Human Ortholog Gene Symbols: 35983"
[1] "Total hsapiens_wound_GSE166920 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 35971"
[1] "Total genes after orthologous mapping and filtering: 35971"
[1] "Total significant transcripts after orthologous mapping and filtering: 288"
[1] "Total significant downregulated hsapiens_wound_GSE166920 genes after orthologous mapping and filtering: 81"
[1] "Total significant upregulated hsapiens_wound_GSE166920 genes after orthologous mapping and filtering: 108"
[1] "Total significant Final downregulated human othologs: 81"
[1] "Total significant Final upregulated human othologs: 108"
[1] "current fraction of Intersection 0"
[1] "adj.P.Val range: 1.37689906112184e-13 - 0.0496795034547142"
[1] "adj.P.Val neg logfc range: -3.65661819536489 - -0.487100918767246"
[1] "adj.P.Val pos 

### GSE111523 - Hsapiens - Gingivia

In [27]:
# test_suppl_test <- getGEOSuppFiles("GSE111523")
# saveRDS(test_suppl_test, "./GSE111523/test_suppl_test.RDS")
test_suppl_test <- readRDS("./GSE111523/test_suppl_test.RDS")
print(test_suppl_test)
directory <- rownames(test_suppl_test)[1]

                                                                                                                  size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE111523/GSE111523_Tatakis_RNAseq_analysis_UCSC_reference_.xlsx.gz 7213701
                                                                                                               isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE111523/GSE111523_Tatakis_RNAseq_analysis_UCSC_reference_.xlsx.gz FALSE
                                                                                                               mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE111523/GSE111523_Tatakis_RNAseq_analysis_UCSC_reference_.xlsx.gz  640
                                                                                                                             mtime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE111523/GSE111523_Tatakis_RNAseq_analysis_UCSC_reference_.xlsx.gz 2022-04-21 19:30:55
                                  

In [28]:
excel_sheets(gsub(".gz", "", directory))

In [29]:
all_quant_GSE111523 <- read_excel(gsub(".gz", "", directory), sheet = 'Qunatification ALl')
all_fc_GSE111523 <- read_excel(gsub(".gz", "", directory), sheet = 'Sign. Fold Change All')
all_pval_GSE111523 <- read_excel(gsub(".gz", "", directory), sheet = 'Paired Ttest all (p corr)')

In [30]:
head(all_fc_GSE111523)

Gene ID,FC ([pw] vs [control]),Log FC ([pw] vs [control]),FC (abs) ([pw] vs [control]),Regulation ([pw] vs [control]),Gene Symbol,Aliases,Entrez ID,Ensembl ID,GO Accession
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
4312,264.532,8.047298,264.532,up,MMP1,CLG|CLGN,4312,ENSG00000196611,GO:0004222|GO:0005509|GO:0005576|GO:0005578|GO:0006508|GO:0007596|GO:0008270|GO:0019048|GO:0022617|GO:0030198|GO:0030574|GO:0044267|GO:0050900
6362,203.45349,7.668555,203.45349,up,CCL18,AMAC-1|AMAC1|CKb7|DC-CK1|DCCK1|MIP-4|PARC|SCYA18,6362,,GO:0005515|GO:0005615|GO:0006935|GO:0006954|GO:0006955|GO:0007154|GO:0007165|GO:0007267|GO:0008009|GO:0009607
6696,60.71231,5.923917,60.71231,up,SPP1,BNSP|BSPI|ETA-1|OPN,6696,ENSG00000118785,GO:0001649|GO:0005125|GO:0005576|GO:0005615|GO:0006954|GO:0007155|GO:0007566|GO:0010811|GO:0030198|GO:0030593|GO:0031214|GO:0031988|GO:0033280|GO:0042995|GO:0045177|GO:0045780|GO:0046697|GO:0048471|GO:0048545|GO:0048685|GO:0050840
394263,58.25351,5.864273,58.25351,up,MUC21,C6orf205|KMQK697|MUC-21|bCX31G15.2,394263,ENSG00000204544,GO:0005796|GO:0005886|GO:0016021|GO:0016266|GO:0043687|GO:0044267
115908,40.05882,5.324048,40.05882,up,CTHRC1,,115908,ENSG00000164932,GO:0005109|GO:0005581|GO:0005615|GO:0005737|GO:0016477|GO:0017147|GO:0032092|GO:0033690|GO:0043932|GO:0045669|GO:0060071|GO:0060122|GO:0090090|GO:0090103|GO:0090177
4319,38.63843,5.271965,38.63843,up,MMP10,SL-2|STMY2,4319,ENSG00000166670,GO:0004222|GO:0005509|GO:0005576|GO:0005578|GO:0005615|GO:0006508|GO:0008270|GO:0022617|GO:0030198|GO:0030334|GO:0030574


In [31]:
# this data set uses the cut off p(corr) < .05 and a LFC cutoff of 1 and -1
range(all_pval_GSE111523[all_pval_GSE111523$'Gene ID' %in%
                   all_fc_GSE111523[all_fc_GSE111523$'Regulation ([pw] vs [control])' == 'down',]$'Gene ID' &
                  all_pval_GSE111523$'Regulation' == 'down',]$'p (Corr)')
range(all_pval_GSE111523[all_pval_GSE111523$'Gene ID' %in%
                   all_fc_GSE111523[all_fc_GSE111523$'Regulation ([pw] vs [control])' == 'down',]$'Gene ID' &
                  all_pval_GSE111523$'Regulation' == 'down',]$'Log FC')

range(all_pval_GSE111523[all_pval_GSE111523$'Gene ID' %in%
                   all_fc_GSE111523[all_fc_GSE111523$'Regulation ([pw] vs [control])' == 'up',]$'Gene ID' &
                  all_pval_GSE111523$'Regulation' == 'up',]$'p (Corr)')
range(all_pval_GSE111523[all_pval_GSE111523$'Gene ID' %in%
                   all_fc_GSE111523[all_fc_GSE111523$'Regulation ([pw] vs [control])' == 'up',]$'Gene ID' &
                  all_pval_GSE111523$'Regulation' == 'up',]$'Log FC')

In [32]:
GSE111523_down_genes <- unique(all_fc_GSE111523[all_fc_GSE111523$`Regulation ([pw] vs [control])` == 'down',]$'Gene Symbol')
GSE111523_up_genes <- unique(all_fc_GSE111523[all_fc_GSE111523$`Regulation ([pw] vs [control])` == 'up',]$'Gene Symbol')
print(paste("Total significant Final downregulated genes:",
            length(GSE111523_down_genes),
            sep = " "))
print(paste("Total significant Final upregulated genes:",
            length(GSE111523_up_genes),
            sep = " "))

GSE111523_down_genes_filtered <- GSE111523_down_genes[GSE111523_down_genes %in% background_set$gene]
GSE111523_up_genes_filtered <- GSE111523_up_genes[GSE111523_up_genes %in% background_set$gene]

print(paste("Total filtered significant Final downregulated genes:",
            length(GSE111523_down_genes_filtered),
            sep = " "))
print(paste("Total filtered significant Final upregulated genes:",
            length(GSE111523_up_genes_filtered),
            sep = " "))

# remove empty names and NAs
# saveto the right  repository
write.table(data.frame("HGNC" =GSE111523_down_genes_filtered),
            file = paste("./geo_degs/", "hsapiens_wound_GSE111523", "_", "pw.vs.control", "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
write.table(data.frame("HGNC" =GSE111523_up_genes_filtered),
            file = paste("./geo_degs/", "hsapiens_wound_GSE111523", "_", "pw.vs.control", "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)

[1] "Total significant Final downregulated genes: 88"
[1] "Total significant Final upregulated genes: 399"
[1] "Total filtered significant Final downregulated genes: 78"
[1] "Total filtered significant Final upregulated genes: 384"


### GSE97615 - Hsapiens - Oral

In [33]:
GSE97615_file <- read_excel("./GSE97615/aap8798_Table_S1.xlsx")

In [34]:
print(paste("Total Differentially expressed genes :", nrow(GSE97615_file)))
print(paste("Total significant DEGs :", nrow(GSE97615_file[GSE97615_file$'qvalue(p-value)' < .05,])))
sig_GSE97615_file <- GSE97615_file[GSE97615_file$'qvalue(p-value)' < .05,]

sig_GSE97615_file_pos <- sig_GSE97615_file[sig_GSE97615_file$'FoldChange(Oral/Skin)' > 1,]
sig_GSE97615_file_neg <- sig_GSE97615_file[sig_GSE97615_file$'FoldChange(Oral/Skin)' < -1,]

print(paste("Total significant Final downregulated genes:",
            nrow(sig_GSE97615_file_neg),
            sep = " "))
print(paste("Total significant Final upregulated genes:",
            nrow(sig_GSE97615_file_pos),
            sep = " "))

sig_GSE97615_file_neg_filtered <- sig_GSE97615_file_neg[sig_GSE97615_file_neg$'Gene Symbol' %in% background_set$gene,]$'Gene Symbol'
sig_GSE97615_file_pos_filtered <- sig_GSE97615_file_pos[sig_GSE97615_file_pos$'Gene Symbol' %in% background_set$gene,]$'Gene Symbol'

print(paste("Total filtered significant Final downregulated genes:",
            length(sig_GSE97615_file_neg_filtered),
            sep = " "))
print(paste("Total filtered significant Final upregulated genes:",
            length(sig_GSE97615_file_pos_filtered),
            sep = " "))

# remove empty names and NAs
# saveto the right  repository
write.table(data.frame("HGNC" =sig_GSE97615_file_neg_filtered),
            file = paste("./geo_degs/", "hsapiens_wound_GSE97615", "_", "oral.vs.skin", "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
write.table(data.frame("HGNC" =sig_GSE97615_file_pos_filtered),
            file = paste("./geo_degs/", "hsapiens_wound_GSE97615", "_", "oral.vs.skin", "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)

[1] "Total Differentially expressed genes : 2591"
[1] "Total significant DEGs : 2591"
[1] "Total significant Final downregulated genes: 1831"
[1] "Total significant Final upregulated genes: 760"
[1] "Total filtered significant Final downregulated genes: 1667"
[1] "Total filtered significant Final upregulated genes: 701"


### GSE116678 - Xlaevis - Embryonic

In [9]:
# test_suppl <- getGEOSuppFiles("GSE116678")
# saveRDS(test_suppl,"./GSE116678/test_suppl.RDS")
test_suppl <- readRDS("./GSE116678/test_suppl.RDS")
print(test_suppl)
directory <- rownames(test_suppl)[1]

                                                                          size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar 9318400
                                                                       isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar FALSE
                                                                       mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar  640
                                                                                     mtime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar 2022-04-21 19:32:34
                                                                                     ctime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar 2022-04-21 19:32:34
                                                                                     atime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE116678/GSE116678_RAW.tar 2022-04-19

In [10]:
count_files <- list.files("./GSE116678/", "*.gz")

for(i in 1:length(count_files)) {
    name <- gsub("GSM[0-9].*_|\\.count.txt.gz$","",count_files[i])
    current_file <- fread(paste("./GSE116678/",count_files[i],sep=""))
    
    if(i == 1) {
        colnames(current_file) <- c("gene", name)
        GSE116678_count_mat <- current_file
        
    } else {
        GSE116678_count_mat <- cbind(GSE116678_count_mat, current_file$V2)
        colnames(GSE116678_count_mat)[length(GSE116678_count_mat)] <- name
    }
    
}
genes <- GSE116678_count_mat$gene
GSE116678_count_mat <- as.matrix(GSE116678_count_mat[,2:length(GSE116678_count_mat)])
rownames(GSE116678_count_mat) <- genes
GSE116678_condition <- factor(gsub("GSM[0-9].*_|-mpw.*\\.count.txt.gz$","",count_files))
GSE116678_condition <- gsub("-","_",GSE116678_condition)

In [11]:
results_GSE116678 <- get_deseq(GSE116678_count_mat, GSE116678_condition)

“some variables in design formula are characters, converting to factors”
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [12]:
experiments_GSE116678 <- list("GSE116678_WT_30_WT_0" = results(results_GSE116678, contrast=c("condition","WT_30", "WT_0")),
                              "GSE116678_WT_60_WT_0" = results(results_GSE116678, contrast=c("condition","WT_60", "WT_0")),
                              "GSE116678_WT_90_WT_0" = results(results_GSE116678, contrast=c("condition","WT_90", "WT_0")),
                              "GSE116678_TRIM_0_WT_0" = results(results_GSE116678, contrast=c("condition","TRIM_0", "WT_0")),
                              "GSE116678_enNOS_0_WT_0" = results(results_GSE116678, contrast=c("condition","enNOS_0", "WT_0")),
                              "GSE116678_WT_60_WT_30" = results(results_GSE116678, contrast=c("condition","WT_60", "WT_30")),
                              "GSE116678_WT_90_WT_30" = results(results_GSE116678, contrast=c("condition","WT_90", "WT_30")),
                              "GSE116678_WT_90_WT_60" = results(results_GSE116678, contrast=c("condition","WT_90", "WT_60")),
                              "GSE116678_TRIM_30_WT_30" = results(results_GSE116678, contrast=c("condition","TRIM_30", "WT_30")),
                              "GSE116678_enNOS_30_WT_30" = results(results_GSE116678, contrast=c("condition","enNOS_30", "WT_30")),
                              "GSE116678_TRIM_60_WT_60" = results(results_GSE116678, contrast=c("condition","TRIM_60", "WT_60")),
                              "GSE116678_enNOS_60_WT_60" = results(results_GSE116678, contrast=c("condition","enNOS_60", "WT_60")),
                              "GSE116678_TRIM_90_WT_90" = results(results_GSE116678, contrast=c("condition","TRIM_90", "WT_90")),
                              "GSE116678_enNOS_90_WT_90" = results(results_GSE116678, contrast=c("condition","enNOS_90", "WT_90")),
                              "GSE116678_TRIM_30_TRIM_0" = results(results_GSE116678, contrast=c("condition","TRIM_30", "TRIM_0")),
                              "GSE116678_TRIM_60_TRIM_0" = results(results_GSE116678, contrast=c("condition","TRIM_60", "TRIM_0")),
                              "GSE116678_TRIM_90_TRIM_0" = results(results_GSE116678, contrast=c("condition","TRIM_90", "TRIM_0")),
                              "GSE116678_enNOS_0_TRIM_0" = results(results_GSE116678, contrast=c("condition","enNOS_0", "TRIM_0")),
                              "GSE116678_TRIM_60_TRIM_30" = results(results_GSE116678, contrast=c("condition","TRIM_60", "TRIM_30")),
                              "GSE116678_TRIM_90_TRIM_30" = results(results_GSE116678, contrast=c("condition","TRIM_90", "TRIM_30")),
                              "GSE116678_TRIM_90_TRIM_60" = results(results_GSE116678, contrast=c("condition","TRIM_90", "TRIM_60")),
                              "GSE116678_enNOS_30_TRIM_30" = results(results_GSE116678, contrast=c("condition","enNOS_0", "TRIM_30")),
                              "GSE116678_enNOS_60_TRIM_60" = results(results_GSE116678, contrast=c("condition","enNOS_0", "TRIM_60")),
                              "GSE116678_enNOS_90_TRIM_90" = results(results_GSE116678, contrast=c("condition","enNOS_0", "TRIM_90")),
                              "GSE116678_enNOS_30_enNOS_0" = results(results_GSE116678, contrast=c("condition","enNOS_30", "enNOS_0")),
                              "GSE116678_enNOS_60_enNOS_0" = results(results_GSE116678, contrast=c("condition","enNOS_60", "enNOS_0")),
                              "GSE116678_enNOS_90_enNOS_0" = results(results_GSE116678, contrast=c("condition","enNOS_90", "enNOS_0")),
                              "GSE116678_enNOS_60_enNOS_30" = results(results_GSE116678, contrast=c("condition","enNOS_60", "enNOS_30")),
                              "GSE116678_enNOS_90_enNOS_30" = results(results_GSE116678, contrast=c("condition","enNOS_90", "enNOS_30")),
                              "GSE116678_enNOS_90_enNOS_60" = results(results_GSE116678, contrast=c("condition","enNOS_90", "enNOS_60")))

In [13]:
GSE116678_files <- list()

first_experiment <- data.frame(experiments_GSE116678[[1]])
first_experiment <- first_experiment[toupper(gsub("\\.L|\\.S", "", rownames(first_experiment))) %in% background_set$gene,]
gene_names <- toupper(gsub("\\.L|\\.S", "", rownames(first_experiment)))

suppressMessages(suppressWarnings(library(biomaRt)))
human_orthologs <- getLDS(attributes="hgnc_symbol", filters="hgnc_symbol",
                          values=gene_names,
                          mart=human,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                          martL=human)

for (i in 1:length(experiments_GSE116678)){    
    
    current_experiment <- data.frame(experiments_GSE116678[[i]])
    
    current_experiment <- current_experiment[toupper(gsub("\\.L|\\.S", "", rownames(current_experiment))) %in% background_set$gene,]
    current_experiment <- cbind(current_experiment, "gene_names" = toupper(gsub("\\.L|\\.S", "", rownames(current_experiment))))
    
    deseq_results = current_experiment
    organism_name ="xlaevis_wound_GSE116678"
    biomart_file = human
    attrbts = "hgnc_symbol"
    geo_code = "GSE116678"
    experiment = names(experiments_GSE116678)[i]
    pval = 0.05
    human_orthologs = human_orthologs
    homology_confidence = F
    
    
    # remove genes with no symbol
    print(paste("Total", organism_name, "transcripts with Available Human Orthologs:",
                nrow(human_orthologs),
                sep = " "))
    human_orthologs <- human_orthologs[human_orthologs$HGNC.symbol != "",]
    
    print(paste("Total", organism_name, "transcripts with Available Human Ortholog Gene Symbols:",
                length(unique(human_orthologs[,c(1)])),
                sep = " "))
    
    background_filtered_genes <- human_orthologs[human_orthologs$HGNC.symbol %in% background_set$gene,]

    print(paste("Total", organism_name, "genes with Available Human Ortholog Gene Symbols and in Background gene sets:",
                length(unique(background_filtered_genes[,c(1)])),
                sep = " "))

    # get transcript ID from gene
    unique_symbols <- unique(background_filtered_genes[,c(1)])

    genes_filtered <- unique(deseq_results$gene_names[deseq_results$gene_names %in% unique_symbols])


    print(paste("Total genes after orthologous mapping and filtering:",
                length(genes_filtered),
                sep = " "))
    
    # significant transcripts
    significant_genes <- deseq_results[(deseq_results$gene_names %in% genes_filtered) &
                        !is.na(deseq_results$padj) &
                        deseq_results$padj < pval,]
    print(paste("Total significant transcripts after orthologous mapping and filtering:",
                nrow(significant_genes),
                sep = " "))
    
    # separate into downregulated and upregulated with a cut off of logFC
    neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < -1,]
    pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 1,]
        
    if(nrow(significant_genes) > 8000) {
        neg_significant_genes <- significant_genes[significant_genes$log2FoldChange < -1,]
        pos_significant_genes <- significant_genes[significant_genes$log2FoldChange > 1,]
    }

        #neg_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% neg_significant_transcripts$ID,]$transcriptGenes)
        #pos_significant_genes <- unique(transcript_to_gene_dict[transcript_to_gene_dict$transcriptIDS %in% pos_significant_transcripts$ID,]$transcriptGenes)

    print(paste("Total significant downregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(neg_significant_genes),
                sep = " "))

    print(paste("Total significant upregulated", organism_name, "genes after orthologous mapping and filtering:",
                nrow(pos_significant_genes),
                sep = " "))

    final_neg_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% neg_significant_genes$gene_names,]
    final_pos_human_orthologs <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% pos_significant_genes$gene_names,]

    print(paste("Total significant Final downregulated human othologs:",
                length(unique(final_neg_human_orthologs$HGNC.symbol)),
                sep = " "))

    print(paste("Total significant Final upregulated human othologs:",
                length(unique(final_pos_human_orthologs$HGNC.symbol)),
                sep = " "))

    # remove empty names and NAs
    # saveto the right  repository
    write.table(data.frame("HGNC" =final_neg_human_orthologs$HGNC.symbol),
                file = paste("./geo_degs/", organism_name, "_", experiment, "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
    write.table(data.frame("HGNC" =final_pos_human_orthologs$HGNC.symbol),
                file = paste("./geo_degs/", organism_name, "_", experiment, "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
               row.names = FALSE, col.names = FALSE)
        
    total_exp_list <- list("downregulated" = final_neg_human_orthologs,
                                            "upregulated" = final_pos_human_orthologs,
                                            "gene_qvals"=significant_genes$padj)
        
    down_genes <- unique(final_neg_human_orthologs$HGNC.symbol)
    up_genes <- unique(final_pos_human_orthologs$HGNC.symbol)
    
    # Examine gene variability
    total_up <- character(0)
    total_down <- character(0)
    recurrent_switchers <- list()

    intersection <- intersect(down_genes, up_genes)
    print(paste("current fraction of Intersection ",length(intersection)/sum(length(down_genes) + length(up_genes)),sep=""))
        
    for(gene in intersection) {
        if(gene %in% names(recurrent_switchers)) {
            recurrent_switchers[[gene]] = recurrent_switchers[[gene]] + 1
        } else {
            recurrent_switchers[[gene]] = 1
        }
    }
        
    total_up <- unique(append(total_up, up_genes, length(total_up)))
    total_down <- unique(append(total_down, down_genes, length(total_down)))
        
    getDataRanges(significant_genes, "Exp")
    
    
    total_intersection <- intersect(total_down, total_up)
    print(paste("Total fraction of gene intersection ",length(total_intersection)/sum(length(total_down) + length(total_up)), sep =""))
    
    return_list <- list("total_exp_list" = total_exp_list,
                        "recurrent_switchers" = recurrent_switchers)
    if(homology_confidence) {
        return_list[["homologs"]] <- background_filtered_genes
        return_list[["significant_homologs"]] <- background_filtered_genes[background_filtered_genes$HGNC.symbol %in% c(total_down, total_up),]
        return(return_list)
    }
    
    GSE116678_files[[names(experiments_GSE116678)[i]]] <- return_list
    
    print('######################################################')
}

[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Orthologs: 13597"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Ortholog Gene Symbols: 12489"
[1] "Total xlaevis_wound_GSE116678 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 12489"
[1] "Total genes after orthologous mapping and filtering: 12489"
[1] "Total significant transcripts after orthologous mapping and filtering: 382"
[1] "Total significant downregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 316"
[1] "Total significant upregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 44"
[1] "Total significant Final downregulated human othologs: 260"
[1] "Total significant Final upregulated human othologs: 34"
[1] "current fraction of Intersection 0"
[1] "adj.P.Val range: 2.85643162997504e-32 - 0.049617763617162"
[1] "adj.P.Val neg logfc range: -9.71834932489583 - -0.599789269932285"
[1] "adj.P.Val pos logfc 

“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”


[1] "adj.P.Val neg logfc range: Inf - -Inf"
[1] "adj.P.Val pos logfc range: 0.819723945194599 - 6.99875964635177"
[1] "Total fraction of gene intersection 0"
[1] "######################################################"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Orthologs: 13597"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Ortholog Gene Symbols: 12489"
[1] "Total xlaevis_wound_GSE116678 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 12489"
[1] "Total genes after orthologous mapping and filtering: 12489"
[1] "Total significant transcripts after orthologous mapping and filtering: 233"
[1] "Total significant downregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 84"
[1] "Total significant upregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 124"
[1] "Total significant Final downregulated human othologs: 74"
[1] "Total significant Final upregulated huma

“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”


[1] "adj.P.Val neg logfc range: Inf - -Inf"
[1] "adj.P.Val pos logfc range: 1.7492081580616 - 16.8394188527267"
[1] "Total fraction of gene intersection 0"
[1] "######################################################"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Orthologs: 13597"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Ortholog Gene Symbols: 12489"
[1] "Total xlaevis_wound_GSE116678 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 12489"
[1] "Total genes after orthologous mapping and filtering: 12489"
[1] "Total significant transcripts after orthologous mapping and filtering: 43"
[1] "Total significant downregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 12"
[1] "Total significant upregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 28"
[1] "Total significant Final downregulated human othologs: 10"
[1] "Total significant Final upregulated human ot

“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”


[1] "adj.P.Val pos logfc range: Inf - -Inf"
[1] "Total fraction of gene intersection 0"
[1] "######################################################"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Orthologs: 13597"
[1] "Total xlaevis_wound_GSE116678 transcripts with Available Human Ortholog Gene Symbols: 12489"
[1] "Total xlaevis_wound_GSE116678 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 12489"
[1] "Total genes after orthologous mapping and filtering: 12489"
[1] "Total significant transcripts after orthologous mapping and filtering: 23"
[1] "Total significant downregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 18"
[1] "Total significant upregulated xlaevis_wound_GSE116678 genes after orthologous mapping and filtering: 3"
[1] "Total significant Final downregulated human othologs: 14"
[1] "Total significant Final upregulated human othologs: 3"
[1] "current fraction of Intersection 0"
[1] "adj.P.Val ra

## Regen

### GSE116777 - Amexicanum - limbs

In [22]:
# test_suppl_test <- getGEOSuppFiles("GSE116777")
# saveRDS(test_suppl_test, "./GSE116777/test_suppl_test.RDS")
# test_suppl_test <- readRDS("./GSE116777/test_suppl_test.RDS")
# print(test_suppl_test)
GSE116777_directory_aandEXP <- "./GSE116777/GSE116777_AAND_RNASEQ_EXPCOUNT.txt.gz"
GSE116777_directory_amacEXP <- "./GSE116777/GSE116777_AMAC_RNASEQ_EXPCOUNT.txt.gz"
GSE116777_directory_amexEXP <- "./GSE116777/GSE116777_AMEX_RNASEQ_EXPCOUNT.txt.gz"

In [23]:
GSE116777_amexEXP<- fread(GSE116777_directory_amexEXP)


GSE116777_amex_countData <- GSE116777_amexEXP[,2:ncol(GSE116777_amexEXP)]
GSE116777_amex_countData <- apply(GSE116777_amex_countData,2,as.integer)
rownames(GSE116777_amex_countData) <- GSE116777_amexEXP$geneID
GSE116777_amex_condition <- factor(c(rep("Amex0H", 3), rep("Amex24H",3)))

In [24]:
head(GSE116777_amex_countData)

Unnamed: 0,SRV-0041,SRV-0042,SRV-0043,SRV-0044,SRV-0045,SRV-0046
Ax_TR426181_c0_g1,0,1,0,0,0,0
Ax_TR10879_c0_g1,0,0,0,0,0,0
Ax_TR632813_c0_g1,0,0,0,0,0,0
Ax_TR494631_c0_g1,1,0,0,0,0,3
Ax_TR245315_c0_g1,0,0,0,0,0,0
Ax_TR586350_c0_g1,3,13,1,0,16,1


In [25]:
GSE116777_aandEXP<- fread(GSE116777_directory_aandEXP)


GSE116777_aand_countData <- GSE116777_aandEXP[,2:ncol(GSE116777_aandEXP)]
GSE116777_aand_countData <- apply(GSE116777_aand_countData,2,as.integer)
rownames(GSE116777_aand_countData) <- GSE116777_aandEXP$geneID
GSE116777_aand_condition <- factor(c(rep("Aand0H", 3), rep("Aand24H",3)))

In [26]:
head(GSE116777_aand_countData)

Unnamed: 0,SRV-0023,SRV-0024,SRV-0025,SRV-0026,SRV-0027,SRV-0028
And_TR426181_c0_g1,0,0,0,0,0,0
And_TR93043_c15_g1,1,0,0,0,0,0
And_TR63775_c4_g2,2,7,4,5,3,1
And_TR80400_c1_g1,96,115,46,75,150,42
And_TR10879_c0_g1,0,0,0,0,0,1
And_TR632813_c0_g1,0,2,0,1,2,0


In [27]:
GSE116777_amacEXP<- fread(GSE116777_directory_amacEXP)


GSE116777_amac_countData <- GSE116777_amacEXP[,2:ncol(GSE116777_amacEXP)]
GSE116777_amac_countData <- apply(GSE116777_amac_countData,2,as.integer)
rownames(GSE116777_amac_countData) <- GSE116777_amacEXP$geneID
GSE116777_amac_condition <- factor(c(rep("Amac0H", 3), rep("Amac24H",3)))

In [28]:
head(GSE116777_amac_countData)

Unnamed: 0,SRV-0005,SRV-0006,SRV-0007,SRV-0008,SRV-0009,SRV-0010
Mac_TR303993_c0_g1,0,0,1,0,0,0
Mac_TR187550_c0_g1,0,1,0,0,0,1
Mac_TR187550_c0_g2,4,1,1,4,0,0
Mac_TR10879_c0_g1,0,0,1,0,0,0
Mac_TR318126_c0_g1,0,0,0,0,1,0
Mac_TR245315_c0_g1,0,1,0,0,0,0


In [29]:
results_GSE116777_amex <- results(get_deseq(GSE116777_amex_countData, GSE116777_amex_condition))
results_GSE116777_amac <- results(get_deseq(GSE116777_aand_countData, GSE116777_aand_condition))
results_GSE116777_amac <- results(get_deseq(GSE116777_amac_countData, GSE116777_amac_condition))

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [30]:
# A. mexicanum transcripts that were identified as commonly,
# differentially expressed using microarray and RNA-Seq, with RNA-Seq log2 fold changes.

# Additionally, A. andersoni and A. maculatum transcripts that significantly matched V5 contigs are reported,
# with RNA-Seq log 2 fold changes.

axolotl_degs <- fread("./GSE116777/amex_genEXP.txt")
print(paste("Total Amexicanum Genes", nrow(axolotl_degs)))
axolotl_degs$update <- toupper(axolotl_degs$update)

axolotl_degs <- axolotl_degs[axolotl_degs$update %in% background_set$gene,]
print(paste("Total filtered Amexicanum Genes", nrow(axolotl_degs)))

amex_degs <- axolotl_degs[,c("V5_contigID", "update", "A. mexicanum (RNA-Seq log2FC)")]
print(paste("Total final Amexicanum Genes", nrow(amex_degs)))

aand_degs <- axolotl_degs[axolotl_degs$"A.andersoni log2-fold change" != "NS" &
                          axolotl_degs$"A.andersoni log2-fold change" != "No Blast Hit",
                          c("A. andersoni transcript match (BLASTn)", "update", "A.andersoni log2-fold change")]
print(paste("Total final A andersoni Genes", nrow(aand_degs)))

amac_degs <- axolotl_degs[axolotl_degs$"A.maculatum log2-Fold change" != "NS" &
                          axolotl_degs$"A.maculatum log2-Fold change" != "No Blast Hit",
                          c("A. maculatum transcript match (tBLASTx)", "update", "A.maculatum log2-Fold change")]
print(paste("Total final A maculatum Genes", nrow(amac_degs)))

[1] "Total Amexicanum Genes 2360"
[1] "Total filtered Amexicanum Genes 1890"
[1] "Total final Amexicanum Genes 1890"
[1] "Total final A andersoni Genes 673"
[1] "Total final A maculatum Genes 873"


In [31]:
amex_neg_genes <- amex_degs[amex_degs$'A. mexicanum (RNA-Seq log2FC)' < -1,]$update
amex_pos_genes <- amex_degs[amex_degs$'A. mexicanum (RNA-Seq log2FC)' > 1,]$update
print(paste("Total final Amexicanum Down Genes", length(amex_neg_genes)))
print(paste("Total final Amexicanum Up Genes", length(amex_pos_genes)))
write_out_files(amex_neg_genes, amex_pos_genes,
                "amexicanum_regen_GSE116777", "0H_vs_24H")

aand_neg_genes <- aand_degs[aand_degs$'A.andersoni log2-fold change' < -1,]$update
aand_pos_genes <- aand_degs[aand_degs$'A.andersoni log2-fold change' > 1,]$update
print(paste("Total final A andersoni Down Genes", length(aand_neg_genes)))
print(paste("Total final A andersoni Up Genes", length(aand_pos_genes)))
write_out_files(aand_neg_genes, aand_pos_genes,
                "aandersoni_regen_GSE116777", "0H_vs_24H")

amac_neg_genes <- amac_degs[amac_degs$'A.maculatum log2-Fold change' < -1,]$update
amac_pos_genes <- amac_degs[amac_degs$'A.maculatum log2-Fold change' > 1,]$update
print(paste("Total final A maculatum Down Genes", length(amac_neg_genes)))
print(paste("Total final A maculatum Up Genes", length(amac_pos_genes)))
write_out_files(amac_neg_genes, amac_pos_genes,
                "amaculatum_regen_GSE116777", "0H_vs_24H")

[1] "Total final Amexicanum Down Genes 491"
[1] "Total final Amexicanum Up Genes 984"
[1] "Total final A andersoni Down Genes 117"
[1] "Total final A andersoni Up Genes 258"
[1] "Total final A maculatum Down Genes 101"
[1] "Total final A maculatum Up Genes 343"


In [32]:
human_feats <- listAttributes(human)
human_feats[grep("mexicanum",human_feats$name),]

name,description,page
<chr>,<chr>,<chr>


### GSE121737 - Amexicanum - limb

In [None]:
test_suppl_test <- getGEOSuppFiles("GSE121737")
print(test_suppl_test)

In [None]:
inst

In [10]:
library(dplyr)
library(Seurat)

Registered S3 method overwritten by 'spatstat.geom':
  method     from
  print.boxx cli 

Attaching SeuratObject


Attaching package: ‘Seurat’


The following object is masked from ‘package:SummarizedExperiment’:

    Assays




In [11]:
print(rownames(test_suppl_test)[6])

[1] "/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE121737/GSE121737_intact_contralateral_cell_by_gene.repGene.txt.gz"


In [12]:
#load in data
inDrops3.data = read.table(rownames(test_suppl_test)[6], header = T, row.names = 1, sep = '\t')
#pull out samples 1 and 2, which are the intact limb samples
inDrops3.intact = inDrops3.data[,grep('^S[12]_', colnames(inDrops3.data))]

In [24]:
#Remove data matrix with extra samples
rm(inDrops3.data) 
#Create Seurat object and make sparse
seurat_inDrops3.intact = CreateSeuratObject(as.sparse(inDrops3.intact), project = 'inDrops3.intact', min.cells = 8, min.genes = 200)
#seurat_inDrops3.intact = MakeSparse(seurat_inDrops3.intact)

“object 'inDrops3.data' not found”
“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“Feature names cannot have pipe characters ('|'), replacing with dashes ('-')”


In [25]:
seurat_inDrops3.intact@data

ERROR: Error in eval(expr, envir, enclos): no slot of name "data" for this object of class "Seurat"


While we did some filtering above, we need to perform further quality control to ensure that the cells we are working with aren't apoptotic #or have a dearth of genes. First, we need to identify the mitochondrial genes present in this matrix. The axolotl mitochondrial genome can #be found here: https://www.ncbi.nlm.nih.gov/nuccore/AJ584639. Remember that the genes are written as protein names when greping for #mitochondrial genes. 

In [19]:
#find mitochonrial genes in matrix. The protein name should be used and changed for each gene within the mitochondrial genome.
grep(pattern = "*CYB_*", x = rownames(x = seurat_inDrops3.intact@data), value = TRUE)
#list of all mitochondrial genes in this intact matrix
mito.genes.intact <- c("c1084180_g3_i1^sp|Q8LWP6|CYB_RANSI", "c1060846_g1_i1^sp|Q8WA47|CYB_MUSMA", "c1084180_g1_i1^sp|Q8LWP6|CYB_RANSI", "c1451851_g1_i1^sp|Q9ZXY2|COX1_PAPHA", "c220469_g1_i1^sp|P00397|COX1_MOUSE", "c1088733_g1_i1^sp|Q9ZZM6|COX1_SALSA", "c1083417_g1_i2^sp|P00419|COX3_XENLA", "c1049442_g1_i1^sp|Q96133|COX3_CARAU", "c934922_g1_i1^sp|Q9ZXX8|COX3_PAPHA", "c1083535_g6_i1^sp|Q4JQI7|NU1M_TETNG", "c1025234_g1_i1^sp|O63796|NU2M_ANACA", "c1068681_g4_i1^sp|Q9ZZM3|NU5M_SALSA^sp|P82013|VDAC2_MELGA^Porin_3", "c1027109_g1_i1^sp|Q35920|ATP6_SALSA")


ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grep': error in evaluating the argument 'x' in selecting a method for function 'rownames': no slot of name "data" for this object of class "Seurat"


In [None]:
#calculate the percentage mitochondrial RNA for each cell
percent.mito.intact <- Matrix::colSums(seurat_inDrops3.intact@raw.data[mito.genes.intact, ])/Matrix::colSums(seurat_inDrops3.intact@raw.data)
#add the percent mitochondrial content of each cell to the Seurat object
seurat_inDrops3.intact <- AddMetaData(object = seurat_inDrops3.intact, metadata = percent.mito.intact, col.name = "percent.mito")

In [26]:
#visualize number of genes, unique molecular identifiers (UMI), and percent mitochondrial RNA
VlnPlot(object = seurat_inDrops3.intact, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)

ERROR: Error in VlnPlot(object = seurat_inDrops3.intact, features.plot = c("nGene", : unused arguments (features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)


In [27]:
#filter out cells
seurat_inDrops3.intact <- FilterCells(object = seurat_inDrops3.intact, subset.names = c("nGene", "percent.mito"), low.thresholds = c(850, -Inf), high.thresholds = c(4000, 0.125))

#normalize data
seurat_inDrops3.intact <- NormalizeData(seurat_inDrops3.intact, normalization.method= "LogNormalize", scale.factor= 10000)

#find variable genes
seurat_inDrops3.intact <- FindVariableGenes(object = seurat_inDrops3.intact, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, do.plot = FALSE)

#scale data and regress out nUMI and percent.mito
seurat_inDrops3.intact <- ScaleData(seurat_inDrops3.intact, vars.to.regress = c('nUMI', 'percent.mito'))

ERROR: Error in FilterCells(object = seurat_inDrops3.intact, subset.names = c("nGene", : could not find function "FilterCells"


In [None]:
#Next, we perform linear dimensional reduction and visualize the results in a few different ways. 

seurat_inDrops3.intact <- RunPCA(object = seurat_inDrops3.intact, pc.genes = seurat_inDrops3.intact@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)

#visualize results
PrintPCA(object = seurat_inDrops3.intact, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
VizPCA(object = seurat_inDrops3.intact, pcs.use = 1:2)
PCAPlot(object = seurat_inDrops3.intact, dim.1 = 1, dim.2 = 2)
seurat_inDrops3.intact <- ProjectPCA(object = seurat_inDrops3.intact, do.print = FALSE)
PCHeatmap(object = seurat_inDrops3.intact, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)
PCHeatmap(object = seurat_inDrops3.intact, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, use.full = FALSE)
PCHeatmap(object = seurat_inDrops3.intact, pc.use = 13:20, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, use.full = FALSE)

#plot standard deviations to chose PCs to use in downstream analysis, here we chose 18
PCElbowPlot(object = seurat_inDrops3.intact)

In [None]:
#Now we can identify cell populations within the homeostatic limb, vizualize the resulting populations using tSNE, and subsequently find markers that define these different populations 

#find clusters using first 18 PCs
seurat_inDrops3.intact <- FindClusters(object = seurat_inDrops3.intact, reduction.type = "pca", dims.use = 1:18, resolution = 1.5, print.output = 0, save.SNN = TRUE)

#run non-linear dimensional reduction
seurat_inDrops3.intact <- RunTSNE(object = seurat_inDrops3.intact, dims.use = 1:18, do.fast = TRUE)

# Build a phylogenetic tree to see how cells are related while simultaneously renaming and reordering cluster names according to their #position on the tree. This will be important to determine when deciding whether similar populations should be merged. 
seurat_inDrops3.intact <- BuildClusterTree(seurat_inDrops3.intact, do.reorder=TRUE, reorder.numeric=TRUE)

In [None]:
#visualize tSNE 
set.seed(5)
TSNEPlot(object = seurat_inDrops3.intact, do.label = T)


In [None]:
#visulize tSNE based on sample to determine how similar the two samples are to one another
TSNEPlot(object = seurat_inDrops3.intact, group.by = 'orig.ident')

In [None]:
#assess nodes
node.scores <- AssessNodes(seurat_inDrops3.intact)
node.scores[order(node.scores$oobe, decreasing = TRUE), ] -> node.scores
node.scores

In [None]:
#merge first 7 nodes
#select nodes to merge
nodes.merge <- node.scores[1:7, ]
nodes.to.merge <- sort(x = nodes.merge$node)

#create a new Seurat object in which we will merge our selected nodes
merged <- seurat_inDrops3.intact
#merge nodes
for (n in nodes.to.merge) {merged <- MergeNode(object = merged, node.use = n)}

In [None]:
#re-visualize the tSNE after we have merged the non-distinct nodes
set.seed(5)
TSNEPlot(merged, do.label = TRUE)

In [None]:
#determine differentially expressed genes for each population

#find markers for each population
all.markers <- FindAllMarkers(merged, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)

In [None]:
#write DE results to table for inspection
#write.table(all.markers, 'intact.only.markers.txt', sep = '\t')

### GSE88975 - Xtropicalis - Tail

In [11]:
# test_suppl <- getGEOSuppFiles("GSE88975")
# saveRDS(test_suppl,"./GSE88975/test_suppl.RDS")
test_suppl <- readRDS("./GSE88975/test_suppl.RDS")
print(test_suppl)
directory <- rownames(test_suppl)[1]
read_counts_GSE88975 <- as.matrix(fread(directory)[,-c("Cluster", "DE")],rownames=1)
print(head(read_counts_GSE88975))


GSE88975_countData <- round(read_counts_GSE88975)
GSE88975_condition <- factor(gsub("\\..*", "",colnames(read_counts_GSE88975)))

                                                                                                            size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE88975/GSE88975_Tadpole.Tail.Regeneration_Count.Data.txt.gz 1144638
                                                                                                         isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE88975/GSE88975_Tadpole.Tail.Regeneration_Count.Data.txt.gz FALSE
                                                                                                         mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE88975/GSE88975_Tadpole.Tail.Regeneration_Count.Data.txt.gz  640
                                                                                                                       mtime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE88975/GSE88975_Tadpole.Tail.Regeneration_Count.Data.txt.gz 2022-04-21 19:41:46
                                                                                  

In [111]:
results_GSE88975 <- get_deseq(GSE88975_countData, GSE88975_condition)

converting counts to integer mode

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [112]:
experiments_GSE88975 <- list("GSE88975_0hpa_WT" = results(results_GSE88975, contrast=c("condition","0hpa", "WT")),
                             "GSE88975_6hpa_WT" = results(results_GSE88975, contrast=c("condition","6hpa", "WT")),
                             "GSE88975_15hpa_WT" = results(results_GSE88975, contrast=c("condition","15hpa", "WT")),
                             "GSE88975_24hpa_WT" = results(results_GSE88975, contrast=c("condition","24hpa", "WT")),
                             "GSE88975_72hpa_WT" = results(results_GSE88975, contrast=c("condition","72hpa", "WT")))

In [117]:
GSE88975_files <- list()

first_experiment <- data.frame(experiments_GSE88975[[1]])
first_experiment <- first_experiment[rownames(first_experiment) %in% background_set$gene,]

suppressMessages(suppressWarnings(library(biomaRt)))
human_orthologs <- getLDS(attributes="hgnc_symbol", filters="hgnc_symbol",
                          values=rownames(first_experiment),
                          mart=human,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                          martL=human)

for (i in 1:length(experiments_GSE88975)){
    
    
    current_experiment <- data.frame(experiments_GSE88975[[i]])
    
    current_experiment <- current_experiment[rownames(current_experiment) %in% background_set$gene,]
    
    GSE88975_files[[names(experiments_GSE88975)[i]]] <- filter_transcripts(deseq_results = current_experiment,
                                                                           organism_name ="xtropicalis_regen_GSE88975",
                                                                           biomart_file = human,
                                                                           attrbts = "hgnc_symbol",
                                                                           geo_code = "GSE88975",
                                                                           experiment = names(experiments_GSE88975)[i],
                                                                           pval = 0.05,
                                                                           human_orthologs = human_orthologs,
                                                                           homology_confidence = F)
    print('######################################################')
}

[1] "Total xtropicalis_regen_GSE88975 transcripts with Available Human Orthologs: 10918"
[1] "Total xtropicalis_regen_GSE88975 transcripts with Available Human Ortholog Gene Symbols: 10036"
[1] "Total xtropicalis_regen_GSE88975 genes with Available Human Ortholog Gene Symbols and in Background gene sets: 10036"
[1] "Total genes after orthologous mapping and filtering: 10036"
[1] "Total significant transcripts after orthologous mapping and filtering: 175"
[1] "Total significant downregulated xtropicalis_regen_GSE88975 genes after orthologous mapping and filtering: 69"
[1] "Total significant upregulated xtropicalis_regen_GSE88975 genes after orthologous mapping and filtering: 93"
[1] "Total significant Final downregulated human othologs: 69"
[1] "Total significant Final upregulated human othologs: 93"
[1] "current fraction of Intersection 0"
[1] "adj.P.Val range: 9.92168652564734e-21 - 0.0496129119371455"
[1] "adj.P.Val neg logfc range: -3.80039620989868 - -0.858804578610002"
[1] "adj.P.

## Stress

### GSE140211 - EColi

In [11]:
# test_suppl <- getGEOSuppFiles("GSE140211")
# saveRDS(test_suppl,"./GSE140211/test_suppl.RDS")
test_suppl <- readRDS("./GSE140211/test_suppl.RDS")
print(test_suppl)

                                                                                                                                size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_AllTimeDataTogether.xlsx                                       656764
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_MG1655_raw_read_counts_per_gene_Nate_BL_06022016_31124104.xlsx 303373
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t120.result.txt.gz                                       251088
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t30.result.txt.gz                                        251770
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t60.result.txt.gz                                        250489
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_readme.txt                                                        507
                                                                     

In [12]:
print(rownames(test_suppl)[3])
GSE140211_results_0min_120min <- fread(rownames(test_suppl)[3])
print(rownames(test_suppl)[4])
GSE140211_results_0min_30min <- fread(rownames(test_suppl)[4])
print(rownames(test_suppl)[5])
GSE140211_results_0min_60min <- fread(rownames(test_suppl)[5])

[1] "/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t120.result.txt.gz"
[1] "/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t30.result.txt.gz"
[1] "/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE140211/GSE140211_ann_t0t60.result.txt.gz"


In [13]:
symbol_ID <- fread('/data/timonaj/biomart_orthologs/ecoli_mg1655_gff_id_to_symbol.txt', header=F)
ID_ensembl <- fread('/data/timonaj/biomart_orthologs/oma_ecolik12_human_ortholog.txt', header=F)
get_ecoli_human_ortholog <- function(x) {
    cur_gene <- gsub("-[0-9].*$","",x[which(names(x) == "gene_symbol")])
    
    if(cur_gene %in% symbol_ID$V2) {
        bID<- symbol_ID$V1[symbol_ID$V2 == cur_gene]
        if(bID %in% ID_ensembl$V1) {
            return(ID_ensembl$V2[ID_ensembl$V1 == bID])
        } else {return("")}
    } else {return("")}
}
filter_ecoli <- function(deseq_results,organism_name,biomart_file, attrbts,
                         geo_code, experiment, pval){
    print(paste("Total", organism_name, "genes :",
                    nrow(deseq_results),
                    sep = " "))

    significant_genes <- deseq_results[deseq_results$padj != 'NA' &
                                       deseq_results$padj < pval,]

    print(paste("Total Significant", organism_name, "genes :",
                    nrow(significant_genes),
                    sep = " "))

    upregulated_genes <- significant_genes[!is.na(significant_genes$log2FoldChange) &
                                           significant_genes$log2FoldChange >1,]
    downregulated_genes <- significant_genes[!is.na(significant_genes$log2FoldChange) &
                                             significant_genes$log2FoldChange < -1,]

    print(paste("Total Significant Upregulated", organism_name, "genes :",
                    nrow(upregulated_genes),
                    sep = " "))

    print(paste("Total Significant Downregulated", organism_name, "genes :",
                    nrow(downregulated_genes),
                    sep = " "))

    up_human_genes <- unlist(apply(upregulated_genes,1, get_ecoli_human_ortholog))
    total_up_human_genes <- up_human_genes[up_human_genes != ""]

    down_human_genes <- unlist(apply(downregulated_genes,1, get_ecoli_human_ortholog))
    total_down_human_genes <- down_human_genes[down_human_genes != ""]

    print(paste("Total Upregulated", organism_name, "human ensembl orthologs:",
                    length(total_up_human_genes),
                    sep = " "))

    print(paste("Total Downregulated", organism_name, "human orthologs:",
                    length(total_down_human_genes),
                    sep = " "))

    if(length(total_up_human_genes) > 0 ) {
        human_orthologs_up <- getLDS(attributes=attrbts, filters=attrbts,
                                     values=gsub("\\.[0-9].*$","",total_up_human_genes),
                                     mart=human,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                                     martL=human)

        print(paste("Total Upregulated", organism_name, "human gene symbol orthologs:",
                    length(unique(human_orthologs_up$HGNC.symbol)),
                    sep = " "))

        write.table(data.frame("HGNC" =unique(human_orthologs_up$HGNC.symbol)),
                    file = paste("./geo_degs/", organism_name, "_", experiment,
                                 "_upregulated.txt", sep=""), quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE)

        }
    if(length(total_down_human_genes) > 0 ) {
        human_orthologs_down <- getLDS(attributes=attrbts, filters=attrbts,
                                  values=gsub("\\.[0-9].*$","",total_down_human_genes),
                                  mart=human,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                                  martL=human)

        print(paste("Total Downregulated", organism_name, "human gene symbol orthologs:",
                    length(unique(human_orthologs_down$HGNC.symbol)),
                    sep = " "))

        write.table(data.frame("HGNC" =unique(human_orthologs_down$HGNC.symbol)),
                    file = paste("./geo_degs/", organism_name, "_", experiment,
                                 "_downregulated.txt", sep=""), quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE)

        }
    return()
}

In [14]:
filter_ecoli(deseq_results= GSE140211_results_0min_30min,
             organism_name = "ecoli_stress_GSE140211",
             biomart_file = human,
             attrbts = "ensembl_gene_id",
             geo_code = "GSE140211",
             experiment = "GSE140211_0min_30min",
             pval = 0.05)

[1] "Total ecoli_stress_GSE140211 genes : 4564"
[1] "Total Significant ecoli_stress_GSE140211 genes : 741"
[1] "Total Significant Upregulated ecoli_stress_GSE140211 genes : 375"
[1] "Total Significant Downregulated ecoli_stress_GSE140211 genes : 316"
[1] "Total Upregulated ecoli_stress_GSE140211 human ensembl orthologs: 110"
[1] "Total Downregulated ecoli_stress_GSE140211 human orthologs: 86"
[1] "Total Upregulated ecoli_stress_GSE140211 human gene symbol orthologs: 101"
[1] "Total Downregulated ecoli_stress_GSE140211 human gene symbol orthologs: 85"


NULL

In [15]:
head(GSE140211_results_0min_30min)

gene_symbol,ID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
thrL-1,AAC73112-1,141.39066,0.36642982,0.2369992,1.5461226,0.122075,0.2777456
thrA-1,AAC73113-1,407.54002,0.02900726,0.6274375,0.04623131,0.9631259,0.9802569
thrB-1,AAC73114-1,138.27746,0.19116182,0.6951494,0.27499387,0.7833209,0.8824486
thrC-1,AAC73115-1,134.29806,0.40497325,0.6855792,0.59070237,0.5547199,0.7338566
yaaX-1,AAC73116-1,14.56248,-0.78048996,0.5216315,-1.49624786,0.1345891,0.2985752
yaaA-1,AAC73117-1,20.44944,0.14479231,0.3883685,0.37282199,0.7092809,0.8367534


In [16]:
filter_ecoli(deseq_results= GSE140211_results_0min_60min,
             organism_name = "ecoli_stress_GSE140211",
             biomart_file = human,
             attrbts = "ensembl_gene_id",
             geo_code = "GSE140211",
             experiment = "GSE140211_0min_60min",
             pval = 0.05)

[1] "Total ecoli_stress_GSE140211 genes : 4564"
[1] "Total Significant ecoli_stress_GSE140211 genes : 835"
[1] "Total Significant Upregulated ecoli_stress_GSE140211 genes : 411"
[1] "Total Significant Downregulated ecoli_stress_GSE140211 genes : 376"
[1] "Total Upregulated ecoli_stress_GSE140211 human ensembl orthologs: 154"
[1] "Total Downregulated ecoli_stress_GSE140211 human orthologs: 117"
[1] "Total Upregulated ecoli_stress_GSE140211 human gene symbol orthologs: 127"
[1] "Total Downregulated ecoli_stress_GSE140211 human gene symbol orthologs: 107"


NULL

In [17]:
filter_ecoli(deseq_results= GSE140211_results_0min_120min,
             organism_name = "ecoli_stress_GSE140211",
             biomart_file = human,
             attrbts = "ensembl_gene_id",
             geo_code = "GSE140211",
             experiment = "GSE140211_0min_120min",
             pval = 0.05)

[1] "Total ecoli_stress_GSE140211 genes : 4564"
[1] "Total Significant ecoli_stress_GSE140211 genes : 819"
[1] "Total Significant Upregulated ecoli_stress_GSE140211 genes : 419"
[1] "Total Significant Downregulated ecoli_stress_GSE140211 genes : 342"
[1] "Total Upregulated ecoli_stress_GSE140211 human ensembl orthologs: 141"
[1] "Total Downregulated ecoli_stress_GSE140211 human orthologs: 96"
[1] "Total Upregulated ecoli_stress_GSE140211 human gene symbol orthologs: 116"
[1] "Total Downregulated ecoli_stress_GSE140211 human gene symbol orthologs: 95"


NULL

### GSE124673 - Ecoli - K12

In [19]:
# test_suppl <- getGEOSuppFiles("GSE124673")
# saveRDS(test_suppl,"./GSE124673/test_suppl.RDS")
test_suppl <- readRDS("./GSE124673/test_suppl.RDS")
print(test_suppl)

                                                                                                          size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_RAW.tar                                 2693120
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_processed_data_file_ecoli_biocides.xlsx  904961
                                                                                                       isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_RAW.tar                                 FALSE
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_processed_data_file_ecoli_biocides.xlsx FALSE
                                                                                                       mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_RAW.tar                                  640
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE124673/GSE124673_processed_data_file_ecoli_biocides.xlsx  640
                

In [20]:
count_files <- list.files("./GSE124673/", "*.gz")

for(i in 1:length(count_files)) {
    name <- gsub("^GSM[0-9]*_|_raw\\.csv\\.gz","",count_files[i])
    current_file <- fread(paste("./GSE124673/",count_files[i],sep=""))
    
    if(i == 1) {
        colnames(current_file) <- c("gene", name)
        GSE124673_count_mat <- current_file
        
    } else {
        GSE124673_count_mat <- cbind(GSE124673_count_mat, current_file$x)
        colnames(GSE124673_count_mat)[length(GSE124673_count_mat)] <- name

    }
    
}
genes <- GSE124673_count_mat$gene
GSE124673_count_mat <- as.matrix(GSE124673_count_mat[,2:length(GSE124673_count_mat)])
rownames(GSE124673_count_mat) <- genes
GSE124673_condition <- factor(gsub("_[0-9]*$","",colnames(GSE124673_count_mat)))

In [21]:
GSE124673_condition
head(GSE124673_count_mat)

Unnamed: 0,BENZ_12H_1,BENZ_12H_2,BENZ_12H_3,ETOH_12H_1,ETOH_12H_2,ETOH_12H_3,GLUTA_12H_1,GLUTA_12H_2,GLUTA_12H_3,H2O2_12H_1,⋯,PHE_30Min_3,POV_30Min_1,POV_30Min_2,POV_30Min_3,CONTROL_12H_1,CONTROL_12H_2,CONTROL_12H_3,CONTROL_30Min_1,CONTROL_30Min_2,CONTROL_30Min_3
thrL,1009,838,711,581,384,452,662,686,726,637,⋯,230,358,335,366,575,460,461,361,313,332
thrA,29923,13186,17805,20375,13585,20720,18126,15625,15168,20857,⋯,9330,10891,9698,12836,11758,13114,11648,6778,8218,7077
thrB,9717,4024,5499,5776,3616,5745,4981,4497,4562,6088,⋯,3395,3411,2970,3984,3692,4227,3400,2137,2527,2060
thrC,11596,4357,6751,6364,4174,6458,5615,4549,5053,6779,⋯,4246,3956,3426,4621,3871,4834,4092,2425,2916,2361
yaaX,338,98,152,107,71,101,156,125,123,177,⋯,126,123,89,118,114,131,95,67,77,61
yaaA,424,267,288,271,153,209,262,235,232,280,⋯,118,155,169,189,161,200,192,167,176,140


In [22]:
results_GSE124673 <- get_deseq(GSE124673_count_mat, GSE124673_condition)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [23]:
levels(GSE124673_condition)

In [24]:
experiments_GSE124673 <- list("GSE124673_BENZ30Min_Con30Min" = results(results_GSE124673, contrast=c("condition","BENZ_30Min", "CONTROL_30Min")),
                              "GSE124673_PHE30Min_Con30Min" = results(results_GSE124673, contrast=c("condition","PHE_30Min", "CONTROL_30Min")),
                              "GSE124673_POV30Min_Con30Min" = results(results_GSE124673, contrast=c("condition","POV_30Min", "CONTROL_30Min")),
                              "GSE124673_BENZ12H_Con12H" = results(results_GSE124673, contrast=c("condition","BENZ_12H", "CONTROL_12H")),
                              "GSE124673_ETOH12H_Con12H" = results(results_GSE124673, contrast=c("condition","ETOH_12H", "CONTROL_12H")),
                              "GSE124673_GLUTA12H_Con12H" = results(results_GSE124673, contrast=c("condition","GLUTA_12H", "CONTROL_12H")),
                              "GSE124673_H2O212H_Con12H" = results(results_GSE124673, contrast=c("condition","H2O2_12H", "CONTROL_12H")),
                              "GSE124673_ISOP12H_Con12H" = results(results_GSE124673, contrast=c("condition","ISOP_12H", "CONTROL_12H")),
                              "GSE124673_PERA12H_Con12H" = results(results_GSE124673, contrast=c("condition","PERA_12H", "CONTROL_12H")),
                              "GSE124673_PHE12H_Con12H" = results(results_GSE124673, contrast=c("condition","PHE_12H", "CONTROL_12H")),
                              "GSE124673_POV12H_Con12H" = results(results_GSE124673, contrast=c("condition","POV_12H", "CONTROL_12H")),
                              "GSE124673_SOD12H_Con12H" = results(results_GSE124673, contrast=c("condition","SOD_12H", "CONTROL_12H")),
                              "GSE124673_XID12H_Con12H" = results(results_GSE124673, contrast=c("condition","XID_12H", "CONTROL_12H")))

In [25]:
for(i in 1:length(experiments_GSE124673)) {
    print(names(experiments_GSE124673)[i])
    current_file <- data.frame(experiments_GSE124673[i])
    current_file$gene_symbol <- rownames(current_file)
    colnames(current_file) <- gsub("GSE.*\\.","",colnames(current_file))
    
    filter_ecoli(deseq_results= current_file,
             organism_name = "ecoli_stress_GSE124673",
             biomart_file = human,
             attrbts = "ensembl_gene_id",
             geo_code = "GSE124673",
             experiment = names(experiments_GSE124673)[i],
             pval = 0.05)
}

[1] "GSE124673_BENZ30Min_Con30Min"
[1] "Total ecoli_stress_GSE124673 genes : 4452"
[1] "Total Significant ecoli_stress_GSE124673 genes : 1063"
[1] "Total Significant Upregulated ecoli_stress_GSE124673 genes : 77"
[1] "Total Significant Downregulated ecoli_stress_GSE124673 genes : 124"
[1] "Total Upregulated ecoli_stress_GSE124673 human ensembl orthologs: 19"
[1] "Total Downregulated ecoli_stress_GSE124673 human orthologs: 54"
[1] "Total Upregulated ecoli_stress_GSE124673 human gene symbol orthologs: 19"
[1] "Total Downregulated ecoli_stress_GSE124673 human gene symbol orthologs: 53"
[1] "GSE124673_PHE30Min_Con30Min"
[1] "Total ecoli_stress_GSE124673 genes : 4452"
[1] "Total Significant ecoli_stress_GSE124673 genes : 1347"
[1] "Total Significant Upregulated ecoli_stress_GSE124673 genes : 113"
[1] "Total Significant Downregulated ecoli_stress_GSE124673 genes : 132"
[1] "Total Upregulated ecoli_stress_GSE124673 human ensembl orthologs: 30"
[1] "Total Downregulated ecoli_stress_GSE124673 h

### GSE160082 - Ecoli

In [26]:
# test_suppl <- getGEOSuppFiles("GSE160082")
# saveRDS(test_suppl,"./GSE160082/test_suppl.RDS")
test_suppl <- readRDS("./GSE160082/test_suppl.RDS")
print(test_suppl)

                                                                                                   size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_normalized_counts.txt.gz 269743
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_raw_counts.txt.gz         81744
                                                                                                 isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_normalized_counts.txt.gz FALSE
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_raw_counts.txt.gz        FALSE
                                                                                                 mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_normalized_counts.txt.gz  640
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE160082/GSE160082_20201022_raw_counts.txt.gz         640
                                                                         

In [27]:
GSE160082_count_mat <- fread(rownames(test_suppl)[2])
genes <- GSE160082_count_mat$V1
GSE160082_count_mat <- as.matrix(GSE160082_count_mat[,2:length(GSE160082_count_mat)])
rownames(GSE160082_count_mat) <- genes
GSE160082_condition <- factor(rep(c("Untreated","Colistin","CSA13","CSA131","LL37"),3))

In [28]:
GSE160082_condition
head(GSE160082_count_mat)

Unnamed: 0,MG_R1B_1,MG_R1B_3,MG_R1B_5,MG_R1B_6,MG_R1B_8,MG_R2_1,MG_R2_3,MG_R2_5,MG_R2_6,MG_R2_8,MG_R3_1,MG_R3_3,MG_R3_5,MG_R3_6,MG_R3_8
thrL,46,90,46,29,17,48,55,45,31,37,27,28,33,25,41
thrA,318,635,143,121,408,341,448,182,115,528,273,462,168,278,537
thrB,165,211,47,52,133,133,135,32,38,180,85,151,44,83,153
thrC,160,252,36,53,166,171,180,47,73,242,149,245,44,131,210
yaaX,2,141,37,347,78,1,138,37,290,93,3,68,37,76,189
yaaA,39,26,9,14,18,30,23,15,13,30,39,26,10,38,35


In [29]:
results_GSE160082 <- get_deseq(GSE160082_count_mat, GSE160082_condition)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [30]:
levels(GSE160082_condition)

In [31]:
experiments_GSE160082 <- list("GSE160082_Colistin_Untreated" = results(results_GSE160082, contrast=c("condition","Colistin", "Untreated")),
                              "GSE160082_CSA13_Untreated" = results(results_GSE160082, contrast=c("condition","CSA13", "Untreated")),
                              "GSE160082_CSA131_Untreated" = results(results_GSE160082, contrast=c("condition","CSA131", "Untreated")),
                              "GSE160082_LL37_Untreated" = results(results_GSE160082, contrast=c("condition","LL37", "Untreated")))

In [32]:
for(i in 1:length(experiments_GSE160082)) {
    print(names(experiments_GSE160082)[i])
    current_file <- data.frame(experiments_GSE160082[i])
    current_file$gene_symbol <- rownames(current_file)
    colnames(current_file) <- gsub("GSE.*\\.","",colnames(current_file))
    
    filter_ecoli(deseq_results= current_file,
             organism_name = "ecoli_stress_GSE160082",
             biomart_file = human,
             attrbts = "ensembl_gene_id",
             geo_code = "GSE160082",
             experiment = names(experiments_GSE160082)[i],
             pval = 0.05)
}

[1] "GSE160082_Colistin_Untreated"
[1] "Total ecoli_stress_GSE160082 genes : 4099"
[1] "Total Significant ecoli_stress_GSE160082 genes : 1609"
[1] "Total Significant Upregulated ecoli_stress_GSE160082 genes : 342"
[1] "Total Significant Downregulated ecoli_stress_GSE160082 genes : 240"
[1] "Total Upregulated ecoli_stress_GSE160082 human ensembl orthologs: 88"
[1] "Total Downregulated ecoli_stress_GSE160082 human orthologs: 90"
[1] "Total Upregulated ecoli_stress_GSE160082 human gene symbol orthologs: 85"
[1] "Total Downregulated ecoli_stress_GSE160082 human gene symbol orthologs: 80"
[1] "GSE160082_CSA13_Untreated"
[1] "Total ecoli_stress_GSE160082 genes : 4099"
[1] "Total Significant ecoli_stress_GSE160082 genes : 1886"
[1] "Total Significant Upregulated ecoli_stress_GSE160082 genes : 632"
[1] "Total Significant Downregulated ecoli_stress_GSE160082 genes : 685"
[1] "Total Upregulated ecoli_stress_GSE160082 human ensembl orthologs: 120"
[1] "Total Downregulated ecoli_stress_GSE160082 h

### GSE153850 - DMelanogaster

In [11]:
# test_suppl <- getGEOSuppFiles("GSE153850")
# saveRDS(test_suppl,"./GSE153850/test_suppl.RDS")
test_suppl <- readRDS("./GSE153850/test_suppl.RDS")
print(test_suppl)

                                                                                    size
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE153850/GSE153850_gene_counts.txt.gz 918820
                                                                                  isdir
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE153850/GSE153850_gene_counts.txt.gz FALSE
                                                                                  mode
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE153850/GSE153850_gene_counts.txt.gz  640
                                                                                                mtime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE153850/GSE153850_gene_counts.txt.gz 2022-04-21 19:50:15
                                                                                                ctime
/gpfs/gsfs11/users/timonaj/cancer_as_wound/GSE153850/GSE153850_gene_counts.txt.gz 2022-04-21 19:50:15
                                                                

In [12]:
fly_df<-listAttributes(dmelanogaster)
fly_df[grep("Fly",fly_df$description),]

Unnamed: 0_level_0,name,description,page
Unnamed: 0_level_1,<chr>,<chr>,<chr>
38,flybase_annotation_id,FlyBase annotation ID,feature_page
39,flybase_gene_id,FlyBase gene ID,feature_page
40,flybasename_gene,FlyBase gene name ID,feature_page
41,flybase_transcript_id,FlyBase transcript ID,feature_page
42,flybasename_transcript,FlyBase transcript name ID,feature_page
43,flybase_translation_id,FlyBase translation ID,feature_page
44,flybasename_translation,FlyBase translation name ID,feature_page
45,flyreactome,"FlyReactome, a Curated Knowledgebase of Drosophila Melanogaster Pathways ID",feature_page


In [13]:
GSE153850_count_mat <- fread(rownames(test_suppl)[1])
genes <- GSE153850_count_mat$Geneid
GSE153850_count_mat <- as.matrix(GSE153850_count_mat[,2:length(GSE153850_count_mat)])
rownames(GSE153850_count_mat) <- genes
GSE153850_condition <- factor(gsub("[0-9]$","",colnames(GSE153850_count_mat)))

In [14]:
GSE153850_condition
head(GSE153850_count_mat)

Unnamed: 0,G2C3,G3C2,G3T2,COT4,MUT2,SWC2,G3C3,TOC3,TOT2,COC2,⋯,COC4,MUC3,G2C2,COT2,G3T1,MUC2,COT1,SWT1,SWT2,MUC4
FBgn0085737,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FBgn0267594,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FBgn0085807,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FBgn0085746,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FBgn0085751,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FBgn0085748,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [15]:
results_GSE153850 <- get_deseq(GSE153850_count_mat, GSE153850_condition)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [16]:
levels(GSE153850_condition)

In [17]:
experiments_GSE153850 <- list("GSE153850_COT_COC" = results(results_GSE153850, contrast=c("condition","COT", "COC")),
                              "GSE153850_G2T_G2C" = results(results_GSE153850, contrast=c("condition","G2T", "G2C")),
                              "GSE153850_G3T_G3C" = results(results_GSE153850, contrast=c("condition","G3T", "G3C")),
                              "GSE153850_MUT_MUC" = results(results_GSE153850, contrast=c("condition","MUT", "MUC")),
                              "GSE153850_SWT_SWC" = results(results_GSE153850, contrast=c("condition","SWT", "SWC")),
                              "GSE153850_TOT_TOC" = results(results_GSE153850, contrast=c("condition","TOT", "TOC")))

In [19]:
head(first_experiment)

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
FBgn0085737,0,,,,,
FBgn0267594,0,,,,,
FBgn0085807,0,,,,,
FBgn0085746,0,,,,,
FBgn0085751,0,,,,,
FBgn0085748,0,,,,,


In [22]:
"HGNC.symbol" = available_orthologs$`Gene name`, ""

Gene stable ID,Transcript stable ID,Gene name,Drosophila melanogaster gene name,Drosophila melanogaster gene stable ID,Drosophila melanogaster protein or transcript stable ID,"Drosophila melanogaster orthology confidence [0 low, 1 high]"
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
ENSG00000198888,ENST00000361390,MT-ND1,mt:ND1,FBgn0013679,FBpp0390631,1
ENSG00000198763,ENST00000361453,MT-ND2,mt:ND2,FBgn0013680,FBpp0100175,1
ENSG00000198804,ENST00000361624,MT-CO1,mt:CoI,FBgn0013674,FBpp0100176,1
ENSG00000198712,ENST00000361739,MT-CO2,mt:CoII,FBgn0013675,FBpp0100177,1
ENSG00000198899,ENST00000361899,MT-ATP6,mt:ATPase6,FBgn0013672,FBpp0390630,1
ENSG00000198938,ENST00000362079,MT-CO3,mt:CoIII,FBgn0013676,FBpp0100180,1
ENSG00000198840,ENST00000361227,MT-ND3,mt:ND3,FBgn0013681,FBpp0100181,1
ENSG00000198886,ENST00000361381,MT-ND4,mt:ND4,FBgn0262952,FBpp0390632,1
ENSG00000198786,ENST00000361567,MT-ND5,mt:ND5,FBgn0013684,FBpp0390633,1
ENSG00000198727,ENST00000361789,MT-CYB,mt:Cyt-b,FBgn0013678,FBpp0390634,1


In [None]:
head(human_orthologs)

In [None]:
GSE153850_files <- list()

first_experiment <- data.frame(experiments_GSE153850[[1]])

dmelanogaster <- fread("../data/biomart_orthologs/human_dmelanogaster.txt.gz")
available_orthologs <- dmelanogaster[dmelanogaster$`Drosophila melanogaster gene stable ID`%in% rownames(first_experiment),]
human_orthologs <- getLDS(attributes="hgnc_symbol", filters="hgnc_symbol",
                          values=rownames(first_experiment),
                          mart=human,attributesL=c("hgnc_symbol","ensembl_gene_id"),
                          martL=human)

for (i in 1:length(experiments_GSE88975)){
    
    
    current_experiment <- data.frame(experiments_GSE88975[[i]])
    
    current_experiment <- current_experiment[rownames(current_experiment) %in% background_set$gene,]
    
    GSE88975_files[[names(experiments_GSE88975)[i]]] <- filter_transcripts(deseq_results = current_experiment,
                                                                           organism_name ="xtropicalis_regen_GSE88975",
                                                                           biomart_file = human,
                                                                           attrbts = "hgnc_symbol",
                                                                           geo_code = "GSE88975",
                                                                           experiment = names(experiments_GSE88975)[i],
                                                                           pval = 0.05,
                                                                           human_orthologs = human_orthologs,
                                                                           homology_confidence = F)
    print('######################################################')
}

In [30]:
head(experiments_GSE153850[[1]])

log2 fold change (MLE): condition COT vs COC 
Wald test p-value: condition COT vs COC 
DataFrame with 6 rows and 6 columns
             baseMean log2FoldChange     lfcSE      stat    pvalue      padj
            <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
FBgn0085737         0             NA        NA        NA        NA        NA
FBgn0267594         0             NA        NA        NA        NA        NA
FBgn0085807         0             NA        NA        NA        NA        NA
FBgn0085746         0             NA        NA        NA        NA        NA
FBgn0085751         0             NA        NA        NA        NA        NA
FBgn0085748         0             NA        NA        NA        NA        NA

In [28]:
test_df <- data.frame(experiments_GSE153850[1])
rownames(test_df)

## Test Code - RSEM

# temp code

In [None]:
read_counts_ <- download_data("")

_samples <- colnames(read_counts_GSE137897)[(grep("H.*\\_.*",colnames(read_counts_GSE137897)))]
_samples <- colnames(read_counts_GSE137897)[(grep("P.*\\_.*",colnames(read_counts_GSE137897)))]

_countData <- read_counts_
_condition <- factor(c(rep("", length(_samples)), rep("",length(_samples))))

In [None]:
results_ <- get_deseq(_countData, _condition)

In [None]:
_files <- filter_transcripts(deseq_results = data.frame(results_),
                            organism_name ="",
                            biomart_file = human,
                            attrbts = "hgnc_symbol",
                            geo_code = "",
                            experiment = "",
                            pval = 0.05,
                            homology_confidence = F)