In [37]:
library("data.table")
library(pheatmap)
library(ggpubr)
library(ggfortify)
library(reshape)
library(dplyr)

In [38]:
cosmic_genes<-fread("cosmic_hallmarks.txt",header=F)
cosmic_genes<-as.data.frame(cosmic_genes)
colnames(cosmic_genes)<-c("gene")

cosmic_matched_genes<-fread("cosmic_driver_matched.txt",header=F)
cosmic_matched_genes<-as.data.frame(cosmic_matched_genes)
colnames(cosmic_matched_genes)<-c("gene")

In [39]:
for(file in list.files("/data/timonaj/cancer_as_wound/ppi_analysis/fractions_data/","^fractions_.*.csv")) {print(file)}

[1] "fractions_aandersoni_regen_down.csv"
[1] "fractions_aandersoni_regen_up.csv"
[1] "fractions_amaculatum_regen_down.csv"
[1] "fractions_amaculatum_regen_up.csv"
[1] "fractions_amexicanum_regen_down.csv"
[1] "fractions_amexicanum_regen_up.csv"
[1] "fractions_celegans_stress_down.csv"
[1] "fractions_celegans_stress_up.csv"
[1] "fractions_clupisFamiliaris_wound_down.csv"
[1] "fractions_clupisFamiliaris_wound_up.csv"
[1] "fractions_dmelanogaster_stress_down.csv"
[1] "fractions_dmelanogaster_stress_up.csv"
[1] "fractions_dmelanogaster_wound_down.csv"
[1] "fractions_dmelanogaster_wound_up.csv"
[1] "fractions_downregulated.regen.csv"
[1] "fractions_downregulated.stress.csv"
[1] "fractions_downregulated.wound.csv"
[1] "fractions_drerio_regen_down.csv"
[1] "fractions_drerio_regen_up.csv"
[1] "fractions_ecoli_stress_down.csv"
[1] "fractions_ecoli_stress_up.csv"
[1] "fractions_hsapiens_regen_down.csv"
[1] "fractions_hsapiens_regen_up.csv"
[1] "fractions_hsapiens_stress_down.csv"
[1] "fractions

In [40]:
fractions_list <- list()
for(file in list.files("/data/timonaj/cancer_as_wound/ppi_analysis/fractions_data/","^fractions_.*.csv")) {
    fractions_list[[gsub(".csv", "",file)]] <- fread(paste("/data/timonaj/cancer_as_wound/ppi_analysis/fractions_data/",
                                                                     file, sep = ""))
}

In [41]:
names(fractions_list)

In [42]:
fractions_list_exp <- fractions_list[grep("upregulated|downregulated|msigdb",names(fractions_list))]
fractions_list_spec_exp <- fractions_list[grep("up$|down$",names(fractions_list))]

In [43]:
names(fractions_list_exp)
names(fractions_list_spec_exp)

# create combined DF

In [46]:
get_combined_DF <- function(final_fractions_list, ppi_name){
    
    fractions_initial_combinedDF <- data.frame("gene_list" = final_fractions_list[[1]][[1]])
    for(i in 1:length(final_fractions_list)) {
        print(names(final_fractions_list)[i])
        temp <- cbind(fractions_initial_combinedDF[fractions_initial_combinedDF$gene_list %in% final_fractions_list[[i]][[1]],],
                                                            final_fractions_list[[i]][final_fractions_list[[i]][[1]] %in% fractions_initial_combinedDF$gene_list,])
        colnames(temp)[1] <- "gene_list"
        print(nrow(temp))
        print(ncol(temp))
        fractions_initial_combinedDF <- temp
    }


    #oncogenic_status<-gsub(" ", "", combinedDF$gene_regen) %in% as.character(cosmic_genes$gene)
    oncogenic_status<-as.character(fractions_initial_combinedDF$gene_list) %in% as.character(cosmic_genes$gene)
    fractions_initial_combinedDF<- cbind(fractions_initial_combinedDF,"oncogenic_status" =as.character(oncogenic_status))


    oncogenic_status_degree_matched<-as.character(fractions_initial_combinedDF$gene_list) %in% as.character(cosmic_matched_genes$gene)
    fractions_initial_combinedDF<- cbind(fractions_initial_combinedDF,"degree_matched_oncogenic_status" =as.character(oncogenic_status_degree_matched))

    current_oncogenic_status <- as.factor(fractions_initial_combinedDF$oncogenic_status)
    levels(current_oncogenic_status) <- c("Non-Cancer-Driver", "COSMIC-Driver")
    current_oncogenic_status <- as.character(current_oncogenic_status)
    current_oncogenic_status[fractions_initial_combinedDF$degree_matched_oncogenic_status == 'TRUE'] <- 'Matched-Controls'

    fractions_initial_combinedDF<- cbind(fractions_initial_combinedDF,"total_oncogenic_status" =current_oncogenic_status)  
    
    hppin_degree <- fread("hPPIN_degree_list.txt", header=F)
    overlapping_degree_genes <- fractions_initial_combinedDF[fractions_initial_combinedDF$gene_list %in% hppin_degree$V1,]
    nrow(overlapping_degree_genes)
    degree_list <- hppin_degree[hppin_degree$V1 %in% overlapping_degree_genes$gene_list]$V2
    length(degree_list)
    current_oncogenic_status <- as.factor(overlapping_degree_genes$oncogenic_status)
    levels(current_oncogenic_status) <- c(0, 1)
    
    up_fractions<- overlapping_degree_genes[,
                                            colnames(overlapping_degree_genes)[grep("(up_Fraction_sp_less4$|gene_list)",
                                                                                    colnames(overlapping_degree_genes))]]
    up_fraction_input_total <- (cbind("degree" = degree_list, up_fractions, "label" = current_oncogenic_status))
    up_fraction_input_total <- up_fraction_input_total %>% relocate(gene_list, .after = last_col())

    up_cd<- up_fraction_input_total[up_fraction_input_total$label == 1,]
    up_ncd<- up_fraction_input_total[up_fraction_input_total$label == 0,]

    sampled_up_cd <- up_cd[sample(nrow(up_cd)),]
    sampled_up_ncd <- up_ncd[sample(nrow(up_ncd)),]


    down_fractions <- overlapping_degree_genes[,
                                               colnames(overlapping_degree_genes)[grep("(gene_list|down_Fraction_sp_less4$)",
                                                                                       colnames(overlapping_degree_genes))]]

    down_fraction_input_total <- cbind("degree" = degree_list, down_fractions, "label" = current_oncogenic_status)
    down_fraction_input_total <- down_fraction_input_total %>% relocate(gene_list, .after = last_col())


    down_cd<- down_fraction_input_total[down_fraction_input_total$label == 1,]
    down_ncd<- down_fraction_input_total[down_fraction_input_total$label == 0,]

    sampled_down_cd <- down_cd[sample(nrow(down_cd)),]
    sampled_down_ncd <- down_ncd[sample(nrow(down_ncd)),]



    total_fractions<- overlapping_degree_genes[,
                                            colnames(overlapping_degree_genes)[grep("(Fraction_sp_less4$|gene_list)",
                                                                                    colnames(overlapping_degree_genes))]]
    total_fraction_input_total <- cbind("degree" = degree_list, total_fractions, "label" = current_oncogenic_status)
    total_fraction_input_total <- total_fraction_input_total %>% relocate(gene_list, .after = last_col())

    total_cd<- total_fraction_input_total[total_fraction_input_total$label == 1,]
    total_ncd<- total_fraction_input_total[total_fraction_input_total$label == 0,]

    sampled_total_cd <- total_cd[sample(nrow(total_cd)),]
    sampled_total_ncd <- total_ncd[sample(nrow(total_ncd)),]
    
    ###up
    write.table(up_fraction_input_total,
               paste("./machine_learning/",ppi_name, "_up_fraction_total.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(rbind(sampled_up_cd[1:round((nrow(sampled_up_cd)*.8)),1:length(up_fraction_input_total)-1], sampled_up_ncd[1:round(nrow(sampled_up_ncd)*.8),1:length(up_fraction_input_total) -1]),
                paste("./machine_learning/",ppi_name, "_up_fraction_train.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(rbind(sampled_up_cd[(round((nrow(sampled_up_cd)*.8))+1):nrow(sampled_up_cd),1:length(up_fraction_input_total)-1], sampled_up_ncd[(round(nrow(sampled_up_ncd)*.8)+1):nrow(sampled_up_ncd),1:length(up_fraction_input_total)-1]),
                paste("./machine_learning/",ppi_name, "_up_fraction_test.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)


    write.table(c(sampled_up_cd[1:round((nrow(sampled_up_cd)*.8)),]$gene_list, sampled_up_ncd[1:round(nrow(sampled_up_ncd)*.8),]$gene_list),
                paste("./machine_learning/",ppi_name, "_up_fraction_train_gene_list.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(c(sampled_up_cd[(round((nrow(sampled_up_cd)*.8))+1):nrow(sampled_up_cd),]$gene_list, sampled_up_ncd[(round(nrow(sampled_up_ncd)*.8)+1):nrow(sampled_up_ncd),]$gene_list),
                paste("./machine_learning/",ppi_name, "_up_fraction_test_gene_list.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)

    ### total
    write.table(total_fraction_input_total,
                paste("./machine_learning/",ppi_name, "_total_fraction_total.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(rbind(sampled_total_cd[1:round((nrow(sampled_up_cd)*.8)),1:length(total_fraction_input_total)-1], sampled_total_ncd[1:round(nrow(sampled_up_ncd)*.8),1:length(total_fraction_input_total) -1]),
                paste("./machine_learning/",ppi_name, "_total_fraction_train.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(rbind(sampled_total_cd[(round((nrow(sampled_up_cd)*.8))+1):nrow(sampled_total_cd),1:length(total_fraction_input_total)-1], sampled_total_ncd[(round(nrow(sampled_up_ncd)*.8)+1):nrow(sampled_total_ncd),1:length(total_fraction_input_total)-1]),
                paste("./machine_learning/",ppi_name, "_total_fraction_test.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)


    write.table(c(sampled_total_cd[1:round((nrow(sampled_up_cd)*.8)),]$gene_list, sampled_total_ncd[1:round(nrow(sampled_up_ncd)*.8),]$gene_list),
                paste("./machine_learning/",ppi_name, "_total_fraction_train_gene_list.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(c(sampled_total_cd[(round((nrow(sampled_up_cd)*.8))+1):nrow(sampled_total_cd),]$gene_list, sampled_total_ncd[(round(nrow(sampled_up_ncd)*.8)+1):nrow(sampled_total_ncd),]$gene_list),
                paste("./machine_learning/",ppi_name, "_total_fraction_test_gene_list.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)


    ###  down
    write.table(down_fraction_input_total,
                paste("./machine_learning/",ppi_name, "_down_fraction_total.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=FALSE)

    write.table(rbind(sampled_down_cd[1:round((nrow(sampled_up_cd)*.8)),], sampled_down_ncd[1:round(nrow(sampled_up_ncd)*.8),]),
                paste("./machine_learning/",ppi_name, "_down_fraction_train.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)
    write.table(rbind(sampled_down_cd[(round((nrow(sampled_up_cd)*.8))+1):nrow(sampled_down_cd),], sampled_down_ncd[(round(nrow(sampled_up_ncd)*.8)+1):nrow(sampled_down_ncd),]),
                paste("./machine_learning/",ppi_name, "_down_fraction_test.csv",sep=""),
                sep=",", quote=F, row.names = F, col.names=TRUE)

}

In [47]:
fractions_combinedDF_EXP <- get_combined_DF(fractions_list_exp)
#fractions_combinedDF_SPEC_EXP <- get_combined_DF(fractions_list_exp)

[1] "fractions_downregulated.regen"
[1] 17061
[1] 7
[1] "fractions_downregulated.stress"
[1] 17061
[1] 13
[1] "fractions_downregulated.wound"
[1] 17061
[1] 19
[1] "fractions_msigdb.regen"
[1] 17061
[1] 25
[1] "fractions_msigdb.stress"
[1] 17061
[1] 31
[1] "fractions_msigdb.wound"
[1] 17061
[1] 37
[1] "fractions_upregulated.regen"
[1] 17061
[1] 43
[1] "fractions_upregulated.stress"
[1] 17061
[1] 49
[1] "fractions_upregulated.wound"
[1] 17061
[1] 55


ERROR: Error: Can't subset columns that don't exist.
[31m✖[39m Column `gene_list` doesn't exist.


In [None]:
fractions_combinedDF_EXP