In [1]:
library(rcellminer) # Load the rcellminer library
library(rcellminerData) # Load the rcellminerData library
library(tidyverse)
library(data.table)

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    tapply, union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: rcellminerData

Consider citing this package: Luna A, et al. rcellminer: exploring molecular profiles and drug response of th

In [2]:
install.packages("~/Desktop/ctrpData_1.0.8.tar.gz", repos = NULL, type = "source")
install.packages("~/Desktop/gdsc1Data_0.1.1.tar.gz", repos = NULL, type = "source")
install.packages("~/Desktop/gdsc2Data_0.1.1.tar.gz", repos = NULL, type = "source")
install.packages("~/Desktop/gdscDataDec15_1.1.4.tar.gz", repos = NULL, type = "source")
install.packages("~/Downloads/ccleData_1.0.9.tar.gz", repos = NULL, type = "source")

# NCI60 data

In [6]:
expression <- getAllFeatureData(rcellminerData::molData)[["xai"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(expression) <- sapply(colnames(expression), function(x) names[names[,1] == x, 2])
expression <- expression[order(row.names(expression)),]
                               
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

setkey(expression, Gene)

n <- nrow(expression)
half <- ceiling(n / 2)

expression_part1 <- expression[1:half]
expression_part2 <- expression[(half + 1):n]

fwrite(expression_part1, '../data/nci_data/gene_exp_part1.csv.gz', 
       compress = "gzip", quote = TRUE)
fwrite(expression_part2, '../data/nci_data/gene_exp_part2.csv.gz',
       compress = "gzip", quote = TRUE)                        

mutation <- getAllFeatureData(rcellminerData::molData)[["mut"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(mutation) <- sapply(colnames(mutation), function(x) names[names[,1] == x, 2])
mutation <- mutation[order(row.names(mutation)),]

write.csv(mutation, '../data/nci_data/mut.csv') # Save as a CSV file
                             
methylation <- getAllFeatureData(rcellminerData::molData)[["met"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(methylation) <- sapply(colnames(methylation), function(x) names[names[,1] == x, 2])
methylation <- methylation[order(row.names(methylation)),]
write.csv(methylation, '../data/nci_data/met.csv') # Save as a CSV file

cop <- getAllFeatureData(rcellminerData::molData)[["cop"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(cop) <- sapply(colnames(cop), function(x) names[names[,1] == x, 2])
cop <- cop[order(row.names(cop)),]
write.csv(cop, '../data/nci_data/cop.csv') # Save as a CSV file

# CTRP data

Mutation data is the same with CCLE

In [15]:
expression <- getAllFeatureData(ccleData::molData)[["exp"]]
expression <- expression[order(row.names(expression)),]

expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

setkey(expression, Gene)

n <- nrow(expression)
third <- ceiling(n / 3)

expression_part1 <- expression[1:third]
expression_part2 <- expression[(third + 1):(2 * third)]
expression_part3 <- expression[(2 * third + 1):n]

fwrite(expression_part1, '../data/ctrp_data/gene_exp_part1.csv.gz', compress = "gzip")
fwrite(expression_part2, '../data/ctrp_data/gene_exp_part2.csv.gz', compress = "gzip")
fwrite(expression_part3, '../data/ctrp_data/gene_exp_part3.csv.gz', compress = "gzip")

mutation <- getAllFeatureData(ccleData::molData)[["mut"]]
mutation <- mutation[order(row.names(mutation)),]
write.csv(mutation, '../data/ctrp_data/mut.csv') # Save as a CSV file

methylation <- getAllFeatureData(ccleData::molData)[["rrb"]]
methylation <- methylation[order(row.names(methylation)),]
                               
methylation <- cbind(Gene = rownames(methylation), as.data.frame(methylation))
methylation <- as.data.table(methylation)

setkey(methylation, Gene)

n <- nrow(methylation)
half <- ceiling(n / 2)

methylation_part1 <- methylation[1:half]
methylation_part2 <- methylation[(half + 1):n]

fwrite(methylation_part1, '../data/ctrp_data/met_part1.csv.gz', compress = "gzip")
fwrite(methylation_part2, '../data/ctrp_data/met_part2.csv.gz', compress = "gzip")

cop <- getAllFeatureData(ccleData::molData)[["cop"]]
cop <- methylation[order(row.names(cop)),]
                               
cop <- cbind(Gene = rownames(cop), as.data.frame(cop))
cop <- as.data.table(cop)

setkey(cop, Gene)

n <- nrow(cop)
half <- ceiling(n / 2)

cop_part1 <- cop[1:half]
cop_part2 <- cop[(half + 1):n]

fwrite(cop_part1, '../data/ctrp_data/cop_part1.csv.gz', compress = "gzip")
fwrite(cop_part2, '../data/ctrp_data/cop_part2.csv.gz', compress = "gzip")

In [19]:
cop <- getAllFeatureData(ccleData::molData)[["cop"]]

# まず data.frame に変換
cop <- as.data.frame(cop)

# Gene列に rownames を追加
cop$Gene <- rownames(cop)

# 行名はもう不要
rownames(cop) <- NULL

# data.table に変換
cop <- as.data.table(cop)

# 不要な列があるなら削除（例: Gene.1）
cop[, Gene.1 := NULL]

# キー設定
setkey(cop, Gene)


# n <- nrow(cop)
# half <- ceiling(n / 2)

# cop_part1 <- cop[1:half]
# cop_part2 <- cop[(half + 1):n]

# fwrite(cop_part1, '../data/ctrp_data/cop_part1.csv.gz', compress = "gzip")
# fwrite(cop_part2, '../data/ctrp_data/cop_part2.csv.gz', compress = "gzip")

“Tried to assign NULL to column 'Gene.1', but this column does not exist to remove”


In [20]:
cop

1321N1,143B,22Rv1,23132/87,253J,253J-BV,42-MG-BA,5637,59M,639-V,⋯,TTC1240,TTC549,TTC642,UO31,RPMI-6666,DOV13,COLO699,OVCAR-5,TK10,Gene
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
-0.1544,-0.2107,-0.0762,-0.0331,,,0.0346,-0.0324,0.2090,0.0797,⋯,,,,,,,,,,A1BG
-0.1544,-0.2107,-0.0762,-0.0331,,,0.0346,-0.0324,0.2090,0.0797,⋯,,,,,,,,,,A1BG-AS1
-0.0985,0.1580,-0.0025,-0.0511,,,-0.0522,-0.4928,0.5006,0.0469,⋯,,,,,,,,,,A1CF
0.2942,-0.0193,-0.0428,-0.0209,,,-0.4863,0.3886,0.4266,0.0474,⋯,,,,,,,,,,A2LD1
-0.1725,-0.1753,0.4486,-0.1752,,,-0.3816,-0.0657,0.2536,0.2914,⋯,,,,,,,,,,A2M
-0.1725,-0.1753,0.4486,-0.1752,,,-0.3816,-0.0657,0.2536,0.2914,⋯,,,,,,,,,,A2M-AS1
-0.1725,-0.1753,0.4486,-0.1752,,,-0.3816,-0.0657,0.2536,0.2914,⋯,,,,,,,,,,A2ML1
-0.1725,-0.1753,0.4486,-0.1752,,,-0.3816,-0.0657,0.2536,0.2914,⋯,,,,,,,,,,A2MP1
-0.1726,-0.2536,-0.0574,-0.0512,,,-0.0155,-0.0390,-0.2246,0.3187,⋯,,,,,,,,,,A4GALT
0.2583,0.2097,-0.0303,-0.0443,,,-0.2675,0.0518,-0.1626,0.0863,⋯,,,,,,,,,,A4GNT


# GDSC1 & 2
GDSC's expression and mutation data are in gdscDataDec15

In [5]:
expression <- getAllFeatureData(gdscDataDec15::molData)[["exp"]]
expression <- expression[order(row.names(expression)),]
                               
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

setkey(expression, Gene)

n <- nrow(expression)
half <- ceiling(n / 2)

expression_part1 <- expression[1:half]
expression_part2 <- expression[(half + 1):n]

fwrite(expression_part1, '../data/gdsc1_data/gene_exp_part1.csv.gz', compress = "gzip")
fwrite(expression_part2, '../data/gdsc1_data/gene_exp_part2.csv.gz', compress = "gzip")

fwrite(expression_part1, '../data/gdsc2_data/gene_exp_part1.csv.gz', compress = "gzip")
fwrite(expression_part2, '../data/gdsc2_data/gene_exp_part2.csv.gz', compress = "gzip")

mutation <- getAllFeatureData(gdscDataDec15::molData)[["mut"]]
mutation <- mutation[order(row.names(mutation)),]
write.csv(mutation, '../data/gdsc1_data/mut.csv') # Save as a CSV file
write.csv(mutation, '../data/gdsc2_data/mut.csv') # Save as a CSV file

methylation <- getAllFeatureData(gdscDataDec15::molData)[["met"]]
methylation <- methylation[order(row.names(methylation)),]
                               
methylation <- cbind(Gene = rownames(methylation), as.data.frame(methylation))
methylation <- as.data.table(methylation)

setkey(methylation, Gene)

n <- nrow(methylation)
half <- ceiling(n / 2)

methylation_part1 <- methylation[1:half]
methylation_part2 <- methylation[(half + 1):n]

fwrite(methylation_part1, '../data/gdsc1_data/met_part1.csv.gz', compress = "gzip")
fwrite(methylation_part2, '../data/gdsc1_data/met_part2.csv.gz', compress = "gzip")

fwrite(methylation_part1, '../data/gdsc2_data/met_part1.csv.gz', compress = "gzip")
fwrite(methylation_part2, '../data/gdsc2_data/met_part2.csv.gz', compress = "gzip")

cop <- getAllFeatureData(gdscDataDec15::molData)[["cop"]]
cop <- cop[order(row.names(cop)),]
                               
cop <- cbind(Gene = rownames(cop), as.data.frame(cop))
cop <- as.data.table(cop)

setkey(cop, Gene)

n <- nrow(cop)
half <- ceiling(n / 2)

cop_part1 <- cop[1:half]
cop_part2 <- cop[(half + 1):n]

fwrite(cop_part1, '../data/gdsc1_data/cop_part1.csv.gz', compress = "gzip")
fwrite(cop_part2, '../data/gdsc1_data/cop_part2.csv.gz', compress = "gzip")

fwrite(cop_part1, '../data/gdsc2_data/cop_part1.csv.gz', compress = "gzip")
fwrite(cop_part2, '../data/gdsc2_data/cop_part2.csv.gz', compress = "gzip")

In [14]:
count_na_or_zero_cols <- function(df) {
  sum(sapply(df, function(col) all(is.na(col) | col == 0)))
}

In [23]:
expression <- getAllFeatureData(rcellminerData::molData)[["xsq"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(expression) <- sapply(colnames(expression), function(x) names[names[,1] == x, 2])
expression <- expression[order(row.names(expression)),]
                               
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)
expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [55]:
expression <- getAllFeatureData(rcellminerData::molData)[["exp"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(expression) <- sapply(colnames(expression), function(x) names[names[,1] == x, 2])
expression <- expression[order(row.names(expression)),]
                               
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)
expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [61]:
expression <- getAllFeatureData(rcellminerData::molData)[["xai"]]
names <- read.csv("simplified_names.csv", stringsAsFactors = FALSE)
colnames(expression) <- sapply(colnames(expression), function(x) names[names[,1] == x, 2])
expression <- expression[order(row.names(expression)),]
                               
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)
expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [63]:
na_or_zero_cols <- colnames(expression[, -1, with = FALSE])[sapply(expression[, -1, with = FALSE], function(col) all(is.na(col) | col == 0))]

# 結果の確認
print(na_or_zero_cols)

[1] "SF_539"


In [64]:
expression[, -1, with = FALSE]

MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,⋯,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6.963,6.418,7.593,7.444,6.669,6.610,6.432,,6.219,5.694,⋯,5.653,6.229,5.681,5.684,5.725,6.371,6.698,6.973,5.858,6.024
6.230,5.271,5.994,6.434,6.129,5.536,5.803,,5.230,4.972,⋯,4.654,5.542,4.819,5.014,5.058,5.554,5.373,5.091,4.886,5.202
3.605,3.588,3.779,3.608,3.613,3.419,3.537,,3.524,3.500,⋯,3.504,3.435,3.600,3.638,3.608,3.636,3.538,3.553,3.607,3.530
4.266,4.294,4.534,4.267,4.263,4.219,4.403,,4.679,11.206,⋯,4.196,4.291,4.243,4.294,4.269,4.270,4.249,4.242,4.317,4.242
3.702,3.148,3.213,2.982,3.491,3.117,3.269,,3.019,3.266,⋯,3.418,4.045,2.952,2.883,3.712,3.240,3.206,3.067,3.166,3.380
4.152,4.118,4.109,4.113,4.094,4.039,4.227,,4.106,4.056,⋯,3.950,4.161,4.086,4.088,4.154,4.138,4.089,4.094,4.137,4.099
4.756,4.923,4.907,4.739,4.411,4.268,4.409,,4.626,4.481,⋯,4.331,4.875,4.712,4.610,4.475,4.801,4.914,4.734,4.746,4.757
7.912,6.971,6.134,6.715,6.460,5.972,6.303,,6.942,7.578,⋯,6.265,6.445,7.275,6.812,6.711,7.734,7.518,6.835,6.276,7.222
9.356,8.434,8.284,8.596,9.516,9.042,9.000,,8.654,8.317,⋯,9.048,9.048,8.529,8.558,9.089,9.195,8.524,9.001,8.888,8.478
7.834,7.569,6.919,6.341,7.973,7.633,7.457,,7.430,6.840,⋯,7.541,7.619,7.174,7.854,8.046,7.571,7.342,7.512,7.494,7.726


In [62]:
numeric_data <- expression[, !'Gene']

# 各統計量を計算（全体ベース）
overall_mean <- mean(as.matrix(numeric_data), na.rm = TRUE)
overall_var <- var(as.numeric(as.matrix(numeric_data)), na.rm = TRUE)
overall_max <- max(as.matrix(numeric_data), na.rm = TRUE)
overall_min <- min(as.matrix(numeric_data), na.rm = TRUE)
overall_median <- median(as.matrix(numeric_data), na.rm = TRUE)

# 結果をまとめて表示
stats_overall <- data.table(
  Mean = overall_mean,
  Variance = overall_var,
  Max = overall_max,
  Min = overall_min,
  Median = overall_median
)

# 結果の確認
print(stats_overall)

       Mean Variance    Max   Min Median
      <num>    <num>  <num> <num>  <num>
1: 6.001992 3.730668 15.264 1.438  5.914


In [17]:
expression <- getAllFeatureData(ccleData::molData)[["xsq"]]
expression <- expression[order(row.names(expression)),]
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [57]:
expression <- getAllFeatureData(ccleData::molData)[["exp"]]
expression <- expression[order(row.names(expression)),]
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [58]:
numeric_data <- expression[, !'Gene']

# 各統計量を計算（全体ベース）
overall_mean <- mean(as.matrix(numeric_data), na.rm = TRUE)
overall_var <- var(as.numeric(as.matrix(numeric_data)), na.rm = TRUE)
overall_max <- max(as.matrix(numeric_data), na.rm = TRUE)
overall_min <- min(as.matrix(numeric_data), na.rm = TRUE)
overall_median <- median(as.matrix(numeric_data), na.rm = TRUE)

# 結果をまとめて表示
stats_overall <- data.table(
  Mean = overall_mean,
  Variance = overall_var,
  Max = overall_max,
  Min = overall_min,
  Median = overall_median
)

# 結果の確認
print(stats_overall)

       Mean Variance      Max      Min   Median
      <num>    <num>    <num>    <num>    <num>
1: 5.931517 4.022404 14.69479 2.643324 5.489619


In [31]:
expression <- getAllFeatureData(gdscDataDec15::molData)[["xsq"]]
expression <- expression[order(row.names(expression)),]
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [59]:
expression <- getAllFeatureData(gdscDataDec15::molData)[["exp"]]
expression <- expression[order(row.names(expression)),]
expression <- cbind(Gene = rownames(expression), as.data.frame(expression))
expression <- as.data.table(expression)

expr_na0_count <- count_na_or_zero_cols(expression[, -1, with = FALSE])
expr_na0_count

In [60]:
numeric_data <- expression[, !'Gene']

# 各統計量を計算（全体ベース）
overall_mean <- mean(as.matrix(numeric_data), na.rm = TRUE)
overall_var <- var(as.numeric(as.matrix(numeric_data)), na.rm = TRUE)
overall_max <- max(as.matrix(numeric_data), na.rm = TRUE)
overall_min <- min(as.matrix(numeric_data), na.rm = TRUE)
overall_median <- median(as.matrix(numeric_data), na.rm = TRUE)

# 結果をまとめて表示
stats_overall <- data.table(
  Mean = overall_mean,
  Variance = overall_var,
  Max = overall_max,
  Min = overall_min,
  Median = overall_median
)

# 結果の確認
print(stats_overall)

       Mean Variance      Max      Min   Median
      <num>    <num>    <num>    <num>    <num>
1: 4.869682 4.687176 14.32768 2.098777 3.887526
