-
Notifications
You must be signed in to change notification settings - Fork 18
/
importNet.STRING.R
268 lines (225 loc) · 10.7 KB
/
importNet.STRING.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# importNet.STRING.R
#' Import network data from a STRING database file.
#'
#' \code{importNet.STRING} imports network edges from a STRING database file,
#' selects the highest confidence edges, maps ENSP IDs to HGNC gene symbols, and
#' returns a weighted, directed igraph graph object of a rete gG type.
#'
#' @section Selecting edges:
#' STRING scores are p-values * 1000, rounded to
#' integer. The function can retrieve the highest scored edges according to
#' three different cutoff type. Type "xN" (default: 10000) retrieves the xN
#' highest scored edges. Type xQ (default 0.9) retrieves the edges with scores
#' larger than the xQ quantile. Type "xS" (default 950) retrieves all edges
#' with scores larger or equal to xS. If different values are requested, they
#' are passed in the parameter val. To read all edges, cutoff type should be
#' (the default) xN, val = Inf.
#' @section Networks:
#' STRING "protein.links.detailed.v10.txt" files contain
#' several protein networks: neighborhood, fusion, cooccurence, coexpression,
#' experimental, database, textmining, and combined_score. However this
#' function is not restricted to these types, but will read one network for
#' which the column name is requested in the function's net parameter. This
#' allows users to define their own column.
#' @section Tax ID:
#' The taxID parameter is used as a sanity check on the file
#' contents. Currently only the first data record protein IDs are checked for
#' being prefixed with the tax ID. During processing all numbers and one
#' period prefixed to the ENSP ID are removed.
#' @section Console and log:
#' Progress is summarized to console and results are written to the log-file
#' unless writeLog is FALSE.
#' @section The gG object:
#' The function returns a rete gG object, a weighted,
#' directed, simple igraph graph in which HGNC gene symbols are vertex names,
#' and the edge attributes $weight hold the network scores. Metadata are
#' stored as object attributs: $type: "gG"; $version: the gG
#' object version; $UUID: the UUID which allows to retrieve detailed
#' process information from the log file with \code{\link{findUUID}}.
#'
#' @param fName Filename of a STRING protein.links.detailed.v10.txt file.
#' @param net The requested network. This must be a string that exists in the
#' header. The default is "combined_score".
#' @param cutoffType one of xN, xQ or xS (see Details).
#' @param val A number, quantile or score appropriate to the requested cutoff
#' type.
#' @param taxID The NCBI tax ID prefix of the protein1 and protein2 IDs.
#' Defaults to "9606" (homo sapiens)
#' @param dropUnmapped Controls whether to drop records in which at least one
#' interactor could not be mapped to HGNC gene symbol. TRUE by default.
#' by default.
#' @param silent Controls whether output to console should be suppressed. FALSE
#' by default.
#' @param writeLog Controls whether writing the result to the global logfile is
#' enabled. TRUE by default.
#' @return a weighted, directed, simple igraph graph which is a rete gG object.
#'
#' @family ImportNet.STRING, importNet.MultiNet, importNet.MITAB
#'
#' ## @seealso \code{\link{fastMap}} fastMap() is used internally to map ENSP
#' IDs to gene symbols.
#'
#' ## @examples ## \dontrun{ ## importNet.STRING(IN, OUT) ## }
#' @export
importNet.STRING <- function(fName,
net = "combined_score",
cutoffType = "xN",
val,
taxID = "9606",
dropUnmapped = TRUE,
silent = FALSE,
writeLog = TRUE) {
# ToDo: can we select the number of vertices ?
# ToDo: complete writing progress to console
# ToDo: Handle unmapped edges
# ToDo: Remove duplicate edges and remove loops before subsetting,
# since this changes the number of edges above cutoff.
# ToDo: write tests for that
# ToDo: fix the fastMap() call
# ToDo: stop on incomplete last line - suspect corrupt file
#
# ==== PARAMETERS ==========================================================
cutoffTypes <- c("xS", "xQ", "xN")
defaultValues <- c(xS = 950, xQ = 0.9, xN = 10000)
if (missing(val)) {
val <- defaultValues[cutoffType]
}
# ==== VALIDATIONS =========================================================
# General parameter checks
cR <- character()
cR <- c(cR, .checkArgs(fName, like = "FILE_E", checkSize = TRUE))
cR <- c(cR, .checkArgs(net, like = "a", checkSize = TRUE))
cR <- c(cR, .checkArgs(cutoffType, like = "a", checkSize = TRUE))
cR <- c(cR, .checkArgs(val, like = numeric(1), checkSize = TRUE))
cR <- c(cR, .checkArgs(taxID, like = "a", checkSize = TRUE))
cR <- c(cR, .checkArgs(dropUnmapped, like = logical(1), checkSize = TRUE))
cR <- c(cR, .checkArgs(silent, like = logical(1), checkSize = TRUE))
cR <- c(cR, .checkArgs(writeLog, like = logical(1), checkSize = TRUE))
if(length(cR) > 0) {
stop(cR)
}
# Validate the cutoffType parameter
if (!cutoffType %in% cutoffTypes) {
stop(paste0("Parameter error:", "\n",
"Valid cutoff types: ",
"\"", paste(cutoffTypes, collapse = "\" | \""), "\"", "\n",
"Found: \"", cutoffType, "\"", "\n"))
}
# Read header and one contents line
tmp <- readLines(fName, n = 2)
data <- unlist(strsplit(tmp[2], " "))
# Validate theInteractor ID format
patt <- paste(taxID, ".ENSP", sep = "")
if (length(grep(patt, data[1:2])) != 2) {
stop(paste0("ID error:", "\n",
"Expected format ", sprintf("<%s....>", patt), "\n",
"Found: <",paste(data[1:2], collapse = "> and <"), ">", "\n"))
}
# Parse for requested column. STRING data is " "-delimited.
header <- unlist(strsplit(tmp[1], " "))
iCol <- which(net == header) # column that contains the requested net
# Validate that the requested network exists in data
if (length(iCol) != 1) {
stop(paste0("Request error:", "\n",
"Requested network type is ", sprintf("\"%s\"", net), "\n",
"Found network type(s): <",
paste(header[3:length(header)], collapse = ", "),
">", "\n"))
}
# ==== READ DATA ===========================================================
# Create column mask for readr::read_delim()
readMask <- paste("cc",
paste(rep("_", length(header) - 2), collapse = ""),
sep = "")
substr(readMask, iCol, iCol) <- "i" # set target column to integer
# start reading
if (!silent) { cat("Reading", net, "from", fName, "...") }
# Read file into data frame, use only selected column
netDF <- readr::read_delim(file = fName,
delim = " ",
col_types = readMask,
n_max = Inf)
colnames(netDF) <- c("a", "b", "weight")
nIn <- nrow(netDF)
if (!silent) { cat("done. (", nIn, " edges)", "\n") }
# ==== SELECT EDGES ========================================================
# Remove values below cutoff
if (cutoffType == "xS") {
# select all rows with weight >= val
sel <- netDF$weight >= val
nSel <- sum(sel)
} else if (cutoffType == "xQ") {
# select all rows with weight >= the val-quantile
x <- stats::quantile(netDF$weight, probs = val)
sel <- netDF$weight >= x
nSel <- sum(sel)
} else if (cutoffType == "xN") {
# select the val highest scores, or all, whichever is fewer
x <- order(netDF$weight, decreasing = TRUE)
sel <- x[1:min(val, length(x))]
nSel <- min(val, length(x))
}
netDF <- netDF[sel, ] # Keep only selected edges
# ==== MAP IDS TO GENE SYMBOLS =============================================
# load fastMap hash table
fastMapENSP <- readRDS(system.file("extdata",
"fastMapENSP.rds",
package = "rete"))
# remove taxID prefix from STRING-format IDs
netDF$a <- gsub("^[0-9]*\\.{0,1}", "", netDF$a)
netDF$b <- gsub("^[0-9]*\\.{0,1}", "", netDF$b)
# map ID to gene names
mapA <- fastMap(netDF$a, fastMapENSP, type = "ENSP", dev = TRUE)
mapB <- fastMap(netDF$b, fastMapENSP, type = "ENSP", dev = TRUE)
if (dropUnmapped) {
netDF$a <- mapA
netDF$b <- mapB
# select edges where one or the other gene symbol is NA
sel <- is.na(netDF$a) | is.na(netDF$b)
nDrop <- sum(sel)
netDF <- netDF[-sel, ] # Drop unmapped edges
} else {
# replace mapped IDs with their gene symbol
netDF$a[! is.na(mapA)] <- mapA[! is.na(mapA)]
netDF$b[! is.na(mapB)] <- mapB[! is.na(mapB)]
}
# ==== CREATE GRAPH OBJECT =================================================
gG <- .df2gG(netDF)
# ==== WRITE LOG ===========================================================
if(writeLog) {
myTitle <- "importNet.STRING"
# Compile function call record
myCall <- character()
myCall[1] <- "importNet.STRING("
myCall[2] <- sprintf("fname = \"%s\", ", fName)
myCall[3] <- sprintf("net = \"%s\", ", net)
myCall[4] <- sprintf("cutoffType = \"%s\", ", cutoffType)
myCall[5] <- sprintf("val = %s, ", as.character(val))
myCall[6] <- sprintf("taxID = \"%s\", ", taxID)
myCall[7] <- sprintf("dropUnmapped = %s, ", as.character(dropUnmapped))
myCall[8] <- sprintf("silent = %s, ", as.character(silent))
myCall[9] <- sprintf("writeLog = %s)", as.character(writeLog))
myCall <- paste0(myCall, collapse = "")
# indicate input object name(s)
# (NA for this importNet.STRING())
# Record progress information
myNotes <- character()
myNotes <- c(myNotes, sprintf("Read %s edges from file.", nIn))
myNotes <- c(myNotes, sprintf("Selected %s edges via cutoff.", nSel))
if (dropUnmapped) {
myNotes <- c(myNotes, sprintf("Droppped %s unmapped edges.", nDrop))
}
myNotes <- c(myNotes, sprintf("gG object has %s vertices and %s edges.",
igraph::gorder(gG), igraph::gsize(gG)))
# indicate output object name(s)
myOutput = c("gG")
# send info to log file
logEvent(eventTitle = myTitle,
eventCall = myCall,
# input = character(),
notes = myNotes,
output = myOutput)
}
return(gG)
}
# [END]