forked from cbroeckl/RAMClustR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rc.feature.replace.na.R
136 lines (119 loc) · 5.03 KB
/
rc.feature.replace.na.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#' rc.feature.replace.na
#'
#' replaces any NA, NaN or infinite (and optionally zero) values with a small signal (by default 10% of the minimum detected feature signal value, with 10% random noise)
#'
#' @param ramclustObj ramclustObj containing MSdata with optional MSMSdata (MSe, DIA, idMSMS)
#' @param replace.int default = 0.1. proportion of minimum feature value to replace NA (or zero) values with
#' @param replace.noise default = 0.1. proportion of the replace.int value by which noise is added via 'jitter'
#' @param replace.zero logical if TRUE, any zero values are replaced with noise as if they were NA values
#' @details noise is added by finding for each feature the minimum detected value, multiplying that value by replace.int, then adding (replace.int*replace.noise) noise. Values that are themselves flagged for replacement (NA, NaN, infinite, and zero when replace.zero = TRUE) are never used as the minimum detected value. When a feature has no detected value at all, the minimum detected value of the whole data set (MSdata or MSMSdata) is used instead. abs() is used to ensure no negative values result.
#' @return ramclustR object in which NA, NaN and infinite values (and zero values, when replace.zero = TRUE) of MSdata and MSMSdata have been replaced. The parameters used are stored in ramclustObj$params$rc.feature.replace.na and a text summary in ramclustObj$history$replace.na.
#'
#' @references Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014 Jul 15;86(14):6812-7. doi: 10.1021/ac501530d. Epub 2014 Jun 26. PubMed PMID: 24927477.
#' @concept ramclustR
#' @concept RAMClustR
#' @concept metabolomics
#' @concept mass spectrometry
#' @concept clustering
#' @concept feature
#' @concept MSFinder
#' @concept xcms
#' @author Corey Broeckling
#' @export
rc.feature.replace.na <- function(
    ramclustObj = NULL,
    replace.int = 0.1,
    replace.noise = 0.1,
    replace.zero = TRUE
) {

  if (is.null(ramclustObj)) {
    stop("please provide a ramclustR Object as input.", '\n')
  }
  if (!is.numeric(replace.int)) {
    stop("replace.int must be numeric", '\n')
  }
  if (!is.numeric(replace.noise)) {
    stop("replace.noise must be numeric", '\n')
  }
  if (!is.logical(replace.zero)) {
    stop("replace.zero must be logical", '\n')
  }

  params <- c(
    "replace.int" = replace.int,
    "replace.noise" = replace.noise,
    "replace.zero" = replace.zero
  )

  # TRUE for every element that has to be replaced: NA, NaN, +/-Inf and,
  # when replace.zero is TRUE, exact zeros.
  # '!is.finite()' covers NA/NaN/Inf in one test, and 'TRUE | NA' is TRUE,
  # so the zero comparison cannot re-introduce NA into the mask.
  needs.replacing <- function(v) {
    flagged <- !is.finite(v)
    if (replace.zero) {
      flagged <- flagged | (v == 0)
    }
    flagged
  }

  ########
  # ensure that we have all numeric values,
  # then optionally ensure we have all non-zero values in the dataset.
  # uses a noise addition 'jitter' around minimum values with missing data points.
  # this is mostly necessary for csv input, where other programs may not have used a 'fillPeaks' like step
  # it is important for clustering that variation is present for every feature and MS level.

  n.feat.replaced <- 0
  n.feat.total <- 0

  for (x in c("MSdata", "MSMSdata")) {

    data <- ramclustObj[[x]]

    if (is.null(data)) {
      # skip MSMS data if it isn't present
      if (x == "MSMSdata") {
        next
      }
      stop("ramclustObj does not contain MSdata", '\n')
    }

    # as.numeric() avoids integer overflow of nrow * ncol on very large data
    n.feat.total <- n.feat.total + (as.numeric(nrow(data)) * ncol(data))

    # define a global minimum for the data set to use when all values of a
    # feature are missing/zero. Only detected values take part: values that
    # are going to be replaced must not define the baseline, otherwise a
    # single zero (or -Inf) would drag the fill value to zero (or -Inf).
    all.values <- if (is.data.frame(data)) {
      unlist(data, use.names = FALSE)
    } else {
      as.vector(data)
    }
    detected.global <- all.values[!needs.replacing(all.values)]
    min.int.global <- if (length(detected.global) > 0) {
      min(detected.global)
    } else {
      NA_real_
    }

    for (i in seq_len(ncol(data))) {

      feature <- data[, i]
      flagged <- needs.replacing(feature)
      rpl <- which(flagged)

      if (length(rpl) == 0) {
        next
      }

      # per-feature minimum of the detected values; fall back to the data
      # set minimum only when this feature has no detected value at all
      detected.local <- feature[!flagged]
      min.int <- if (length(detected.local) > 0) {
        min(detected.local)
      } else {
        min.int.global
      }

      if (is.na(min.int)) {
        stop(
          x, " contains no detected (finite",
          if (replace.zero) {", non-zero"},
          ") values: unable to derive a replacement value", '\n'
        )
      }

      fill.base <- min.int * replace.int
      # abs() on the amount keeps the noise interval valid for jitter even
      # if the baseline or replace.noise is negative
      rpl.with <- abs(jitter(
        rep(fill.base, length(rpl)),
        amount = abs(fill.base * replace.noise)
      ))

      data[rpl, i] <- rpl.with
      n.feat.replaced <- n.feat.replaced + length(rpl)
    }

    ramclustObj[[x]] <- data
  }

  # guard against 0/0 when the data sets hold no values at all
  pct.replaced <- if (n.feat.total > 0) {
    round((100 * n.feat.replaced / n.feat.total))
  } else {
    0
  }
  result <- paste("replaced", n.feat.replaced, "of", n.feat.total, "total feature values (",
                  pct.replaced, "% )", '\n')
  cat(result)

  ramclustObj$history$replace.na <- {
    paste0(
      "Features with missing values were replaced with small values simulating noise. ",
      "For each feature, the minimum detected value was multiplied by ", replace.int, ". ",
      "Noise was then added using a factor of ", replace.noise, ". ",
      "The absolute value of this value was used as the filled value to ensure that only non-negative values carried forward. ",
      if (replace.zero) {
        "Zero values were treated as missing values."
      }
    )
  }

  # ## update msint and optionally msmsint
  # msint<-rep(0, length(ramclustObj$fmz))
  # for(i in 1:ncol(ramclustObj$MSdata)){
  #   msint[i]<-weighted.mean(ramclustObj$MSdata[,i], ramclustObj$MSdata[,i], na.rm = TRUE)
  # }
  # ramclustObj$msint <- msint
  #
  # if(!is.null(ramclustObj$MSMSdata)) {
  #   msmsint<-rep(0, length(ramclustObj$fmz))
  #   for(i in 1:ncol(ramclustObj$MSMSdata)){
  #     msmsint[i]<-weighted.mean(ramclustObj$MSMSdata[,i], ramclustObj$MSMSdata[,i], na.rm = TRUE)
  #   }
  #   ramclustObj$msmsint <- msmsint
  # }

  if (is.null(ramclustObj$params)) {
    ramclustObj$params <- list()
  }
  ramclustObj$params$rc.feature.replace.na <- params

  return(ramclustObj)
}