In [22]:
library(data.table)
library(DescTools)
library(ggplot2)
library(devtools)
library(pheatmap)
library(tidyverse)

Loading required package: usethis



Error in get(genname, envir = envir) : object 'testthat_print' not found


── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m:

In [2]:
### files to be downloaded

# read in the .txt file for tissue type as well as the larger gene expression file\
ptm <- proc.time()
sample_attributes <- fread(file = "/data/timonaj/gene_variability/GTEx_v7_Annotations_SampleAttributesDS.txt")
gene_tpm <- fread(file = "/data/timonaj/gene_variability/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct")
gene_tpm_copy <- as.data.frame(gene_tpm)
print("download completed in")
print(proc.time() - ptm)

### commonly used variables

# length of the numerical tpm values since the first 2 columns are characters
tpm_length <- 3:length(gene_tpm_copy)

tissue_types <- unique(sample_attributes$SMTS)

gtex_project_files <- list()
for(i in 1:length(tissue_types)) {
  project <- tissue_types[i]
  sampids<-sample_attributes[sample_attributes$SMTS == project,]$SAMPID
  gtex_project_files[[project]] <- cbind(gene_tpm_copy[,1:2], gene_tpm_copy[,colnames(gene_tpm_copy) %in% sampids])
}

[1] "download completed in"
   user  system elapsed 
 19.225   8.894  36.187 


In [3]:
summary(gtex_project_files)

                Length Class      Mode
Blood            539   data.frame list
Adipose Tissue   799   data.frame list
Muscle           566   data.frame list
Blood Vessel     915   data.frame list
Heart            602   data.frame list
Ovary            135   data.frame list
Uterus           113   data.frame list
Vagina           117   data.frame list
Breast           292   data.frame list
Skin            1205   data.frame list
Salivary Gland    99   data.frame list
Brain           1673   data.frame list
Adrenal Gland    192   data.frame list
Thyroid          448   data.frame list
Lung             429   data.frame list
Spleen           164   data.frame list
Pancreas         250   data.frame list
Esophagus       1023   data.frame list
Stomach          264   data.frame list
Colon            509   data.frame list
Small Intestine  139   data.frame list
Prostate         154   data.frame list
Testis           261   data.frame list
Nerve            416   data.frame list
Pituitary        185   da

In [21]:
get_reccurent_genes <- function(duplicated_list,species_exptype,total_exps,title) {
    print(paste(species_exptype,title, sep =" "))
    
    if(total_exps == 1) {
        print(paste(species_exptype, "total_datasets :", total_exps, sep =" "))
        print("################################################")
        return(duplicated_list)
    }
    
    duplicated_list <- sort(table(duplicated_list), decreasing = TRUE)
    
    if(length(duplicated_list) > 1000) {
        duplicated_list <- duplicated_list[1:1000]
    }
    
    duplicated_list <- duplicated_list[duplicated_list > 2]
    
    print(paste(species_exptype, "total_datasets :", total_exps, sep =" "))
    print(paste(species_exptype, "mean reccurence :", mean(duplicated_list), sep =" "))
    print(paste(species_exptype, "min reccurence :", min(duplicated_list), sep =" "))
    print(paste(species_exptype, "max reccurence :", max(duplicated_list), sep =" "))
    print("################################################")
    return(names(duplicated_list))
}

wrs_list <- list("upregulated" = list(),
                 "downregulated" = list())
recurrent_wrs_list <- list("upregulated" = list(),
                           "downregulated" = list())


wrs_files <-list.files('./geo_degs/')
wrs_foi <- wrs_files[grep("^[a-z].*", wrs_files)]
species_exptype <- unique(sub('_[A-Z].*$', '',wrs_foi))

wrs_foi_up <- wrs_files[grep("^[a-z].*upregulated.*", wrs_files)]
wrs_foi_down <- wrs_files[grep("^[a-z].*downregulated.*", wrs_files)]

for(i in 1:length(species_exptype)) {
    
    current_species_up <- wrs_foi_up[grep(species_exptype[i], wrs_foi_up)]
    current_species_down <- wrs_foi_down[grep(species_exptype[i], wrs_foi_down)]
    total_spec_exp_up <- character(0)
    total_spec_exp_down <- character(0)
    
    if(length(current_species_up) == length(current_species_down)) {
        for(j in 1:length(current_species_up)) {
            current_file_path_up <- paste('./geo_degs/', current_species_up[j], sep="")
            current_file_path_down <- paste('./geo_degs/', current_species_down[j], sep="")
            
            total_genes_up <- fread(current_file_path_up, header=FALSE)$V1
            total_genes_down <- fread(current_file_path_down, header=FALSE)$V1
            
            total_spec_exp_up <- append(total_spec_exp_up,
                                        unique(total_genes_up),
                                        length(total_spec_exp_up))
            total_spec_exp_down <- append(total_spec_exp_down,
                                          unique(total_genes_down),
                                          length(total_spec_exp_down))
        }
    }
    # union of all genes
    wrs_list[["upregulated"]][[species_exptype[i]]] <- unique(total_spec_exp_up)
    wrs_list[["downregulated"]][[species_exptype[i]]] <- unique(total_spec_exp_down)
    
    # top recurrent genes
    recurrent_wrs_list[["upregulated"]][[species_exptype[i]]] <- get_reccurent_genes(total_spec_exp_up,
                                                                                     species_exptype[i],
                                                                                     length(current_species_up),
                                                                                     "upregulated")
    recurrent_wrs_list[["downregulated"]][[species_exptype[i]]] <- get_reccurent_genes(total_spec_exp_down,
                                                                                       species_exptype[i],
                                                                                       length(current_species_down),
                                                                                     "downregulated")
    
}

[1] "amexicanum_regen upregulated"
[1] "amexicanum_regen total_datasets : 46"
[1] "amexicanum_regen mean reccurence : 8.323"
[1] "amexicanum_regen min reccurence : 5"
[1] "amexicanum_regen max reccurence : 26"
[1] "################################################"
[1] "amexicanum_regen downregulated"
[1] "amexicanum_regen total_datasets : 46"
[1] "amexicanum_regen mean reccurence : 9.158"
[1] "amexicanum_regen min reccurence : 5"
[1] "amexicanum_regen max reccurence : 27"
[1] "################################################"
[1] "celegans_stress upregulated"
[1] "celegans_stress total_datasets : 53"
[1] "celegans_stress mean reccurence : 44.082"
[1] "celegans_stress min reccurence : 38"
[1] "celegans_stress max reccurence : 53"
[1] "################################################"
[1] "celegans_stress downregulated"
[1] "celegans_stress total_datasets : 53"
[1] "celegans_stress mean reccurence : 43.088"
[1] "celegans_stress min reccurence : 36"
[1] "celegans_stress max reccurence : 5

In [13]:
gene_tpms <- data.frame("geneNames" = gtex_project_files[[1]]$Description)
for(i in 1:length(gtex_project_files)) {
    cur_files <- gtex_project_files[[i]]
    mean_tpm <- apply(cur_files[,3:length(cur_files)],1,mean)
    gene_tpms <- cbind(gene_tpms, mean_tpm)
    colnames(gene_tpms)[i+1] <- names(gtex_project_files)[i]
}

ERROR: Error in `[.data.frame`(cur_files, , 3:length(cur_files)): undefined columns selected


In [17]:
ineq <- apply(gene_tpms[,2:length(gene_tpms)],
      1,
      function(x) {Gini(x,  n = rep(1, length(x)))})

In [20]:
tissue_speceficity <- cbind("genes"= gene_tpms$geneNames,
                            "giniSpecificity" = ineq)

In [23]:
total_scores <- numeric()
spec_exptype <- character()
for(i in 1:length(recurrent_wrs_list[["upregulated"]])) {
    current_list <- recurrent_wrs_list[["upregulated"]][[i]]
    scores <-tissue_speceficity[tissue_speceficity$genes %in% current_list,]$giniSpecificity
    type <- names(recurrent_wrs_list[["upregulated"]])[i]
    spec_exptype <- append(spec_exptype, rep(type,length(scores)), length(spec_exptype))
    total_scores <- append(total_scores, scores, length(total_scores))
}

ERROR: Error: $ operator is invalid for atomic vectors


In [24]:
head(gene_tpms)

Unnamed: 0_level_0,geneNames,Blood,Adipose Tissue,Muscle,Blood Vessel,Heart,Ovary,Uterus,Vagina,Breast,⋯,Small Intestine,Prostate,Testis,Nerve,Pituitary,Liver,Kidney,Fallopian Tube,Bladder,Cervix Uteri
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,DDX11L1,0.16160946,0.0712865,0.08507681,0.05952639,0.09960527,0.04778271,0.05215514,0.04717713,0.07792321,⋯,0.07471299,0.07388638,1.79809305,0.070477,0.05508787,0.08680291,0.07558178,0.04401714,0.05492455,0.05390182
2,WASH7P,9.71775978,11.590399,6.95123936,12.76671084,5.15145833,21.77142105,23.56507207,17.1621913,13.86089655,⋯,13.69509489,23.54824342,17.92720849,20.48243237,16.85879235,5.94239429,11.94446667,18.29714286,15.92909091,17.45054545
3,MIR1302-11,0.04362047,0.0895695,0.11462793,0.06773352,0.13767725,0.04629594,0.04648856,0.04995313,0.08464124,⋯,0.08320533,0.07748046,0.08397089,0.08764242,0.05387372,0.1064416,0.10659178,0.05733571,0.05916636,0.04672545
4,FAM138A,0.03001177,0.05236898,0.06270426,0.04245461,0.07623223,0.02860692,0.03460964,0.02749096,0.05363517,⋯,0.04851949,0.04858039,0.04194907,0.04572097,0.0361276,0.05712463,0.05535089,0.04442286,0.03293364,0.04083909
5,OR4G4P,0.01800994,0.03783211,0.04583262,0.02991457,0.05633497,0.01776165,0.02352423,0.02718826,0.04259466,⋯,0.0367735,0.03632553,0.03234367,0.03593203,0.02602055,0.04110366,0.043032,0.0,0.03350636,0.03863909
6,OR4G11P,0.02715438,0.05627418,0.0734834,0.04631588,0.0875047,0.02784083,0.03651315,0.0327293,0.05658034,⋯,0.05316737,0.05122507,0.06487189,0.04791353,0.03921514,0.06607046,0.06330889,0.04937429,0.02537091,0.03379273
