@@ -11,18 +11,13 @@ var isUrl = require('is-url')
11
11
var isArray = Array . isArray
12
12
var fs = require ( 'fs' )
13
13
14
/**
 * Forward a callback failure onto a stream.
 *
 * Calls `fn` with a node-style callback. When that callback is later invoked
 * with a truthy first argument, the value is emitted on `stream` as an
 * 'error' event; a falsy first argument is ignored.
 *
 * @param {Object} stream object exposing `emit(event, payload)`
 * @param {Function} fn function that accepts a single callback argument
 */
function handleStreamError (stream, fn) {
  function onSettled (failure) {
    if (!failure) return
    stream.emit('error', failure)
  }

  fn(onSettled)
}
19
-
20
14
/**
21
15
* Locals
22
16
*/
23
17
24
18
var absolutes = require ( './lib/absolutes' )
25
19
var resolve = require ( './lib/resolve' )
20
+ var streamHelper = require ( './lib/stream' )
26
21
var params = require ( './lib/params' )
27
22
var walk = require ( './lib/walk' )
28
23
@@ -69,6 +64,9 @@ function Xray () {
69
64
var pages = [ ]
70
65
var stream
71
66
67
+ var walkHTML = WalkHTML ( xray , selector , scope )
68
+ var request = Request ( crawler )
69
+
72
70
function node ( source2 , fn ) {
73
71
if ( arguments . length === 1 ) {
74
72
fn = source2
@@ -84,10 +82,10 @@ function Xray () {
84
82
85
83
if ( isUrl ( source ) ) {
86
84
debug ( 'starting at: %s' , source )
87
- xray . request ( source , function ( err , html ) {
85
+ request ( source , function ( err , html ) {
88
86
if ( err ) return next ( err )
89
87
var $ = load ( html , source )
90
- node . html ( $ , next )
88
+ walkHTML ( $ , next )
91
89
} )
92
90
} else if ( scope && ~ scope . indexOf ( '@' ) ) {
93
91
debug ( 'resolving to a url: %s' , scope )
@@ -96,21 +94,21 @@ function Xray () {
96
94
// ensure that a@href is a URL
97
95
if ( ! isUrl ( url ) ) {
98
96
debug ( '%s is not a url. Skipping!' , url )
99
- return node . html ( load ( '' ) , next )
97
+ return walkHTML ( load ( '' ) , next )
100
98
}
101
99
102
100
debug ( 'resolved "%s" to a %s' , scope , url )
103
- xray . request ( url , function ( err , html ) {
101
+ request ( url , function ( err , html ) {
104
102
if ( err ) return next ( err )
105
103
var $ = load ( html , url )
106
- node . html ( $ , next )
104
+ walkHTML ( $ , next )
107
105
} )
108
106
} else if ( source ) {
109
107
var $ = load ( source )
110
- node . html ( $ , next )
108
+ walkHTML ( $ , next )
111
109
} else {
112
110
debug ( '%s is not a url or html. Skipping!' , source )
113
- return node . html ( load ( '' ) , next )
111
+ return walkHTML ( load ( '' ) , next )
114
112
}
115
113
116
114
function next ( err , obj , $ ) {
@@ -120,8 +118,8 @@ function Xray () {
120
118
121
119
// create the stream
122
120
if ( ! stream ) {
123
- if ( paginate ) stream = stream_array ( state . stream )
124
- else stream = stream_object ( state . stream )
121
+ if ( paginate ) stream = streamHelper . array ( state . stream )
122
+ else stream = streamHelper . object ( state . stream )
125
123
}
126
124
127
125
if ( paginate ) {
@@ -152,10 +150,10 @@ function Xray () {
152
150
debug ( 'paginating %j' , url )
153
151
isFinite ( limit ) && debug ( '%s page(s) left to crawl' , limit )
154
152
155
- xray . request ( url , function ( err , html ) {
153
+ request ( url , function ( err , html ) {
156
154
if ( err ) return next ( err )
157
155
var $ = load ( html , url )
158
- node . html ( $ , next )
156
+ walkHTML ( $ , next )
159
157
} )
160
158
} else {
161
159
stream ( obj , true )
@@ -166,54 +164,6 @@ function Xray () {
166
164
return node
167
165
}
168
166
169
/**
 * Turn a source into a queryable document.
 *
 * A falsy `html` is treated as the empty string. A value that already has an
 * `html` property is used as-is; anything else is passed to `cheerio.load`.
 * When `url` is truthy the document is additionally run through
 * `absolutes(url, $)` and that result is returned.
 *
 * @param {String|Object} html
 * @param {String} url (optional)
 * @return {Object}
 */
function load (html, url) {
  var markup = html || ''
  var doc = markup.html ? markup : cheerio.load(markup)
  return url ? absolutes(url, doc) : doc
}
175
-
176
/**
 * Walk the selector against a loaded document and collect the results.
 *
 * Each selector entry `v` is handled by type:
 *  - string: resolved with `resolve($, root(scope), v)`
 *  - function: called as `v($, callback)`; its result is passed through
 *  - array whose first item is a string: resolved like a string
 *  - array whose first item is an object: every element matched by `scope`
 *    is crawled with a child `xray(scope, v[0])` node and the results are
 *    gathered in order, then passed through `compact`
 * Anything else yields no value.
 *
 * @param {Object|Function} $ loaded document
 * @param {Function} fn called as `fn(err)` or `fn(null, obj, $)`
 */
node.html = function ($, fn) {
  walk(selector, function (v, k, next) {
    if (typeof v === 'string') {
      return next(null, resolve($, root(scope), v))
    }

    if (typeof v === 'function') {
      return v($, function (err, obj) {
        if (err) return next(err)
        return next(null, obj)
      })
    }

    if (isArray(v)) {
      if (typeof v[0] === 'string') {
        return next(null, resolve($, root(scope), v))
      }

      if (typeof v[0] === 'object') {
        var $scope = $.find ? $.find(scope) : $(scope)
        var pending = $scope.length
        var out = []
        var failed = false

        // Handle the empty result set (thanks @jenbennings!)
        if (!pending) return next(null, out)

        // Return here so control never reaches the trailing `next()` below;
        // otherwise `next` fires once with no value and again on completion.
        return $scope.each(function (i) {
          var $innerscope = $scope.eq(i)
          var child = xray(scope, v[0])
          child($innerscope, function (err, obj) {
            // Report only the first failure; later callbacks are ignored.
            if (failed) return
            if (err) {
              failed = true
              return next(err)
            }
            out[i] = obj
            if (!--pending) {
              return next(null, compact(out))
            }
          })
        })
      }
    }

    return next()
  }, function (err, obj) {
    if (err) return fn(err)
    fn(null, obj, $)
  })
}
216
-
217
167
node . paginate = function ( paginate ) {
218
168
if ( ! arguments . length ) return state . paginate
219
169
state . paginate = paginate
@@ -229,29 +179,20 @@ function Xray () {
229
179
node . stream = function ( ) {
230
180
state . stream = store . createWriteStream ( )
231
181
var rs = store . createReadStream ( )
232
- handleStreamError ( rs , node )
182
+ streamHelper . waitCb ( rs , node )
233
183
return rs
234
184
}
235
185
236
186
node . write = function ( path ) {
237
187
if ( ! arguments . length ) return node . stream ( )
238
188
state . stream = fs . createWriteStream ( path )
239
- handleStreamError ( state . stream , node )
189
+ streamHelper . waitCb ( state . stream , node )
240
190
return state . stream
241
191
}
242
192
243
193
return node
244
194
}
245
195
246
/**
 * Fetch a URL through the crawler.
 *
 * @param {String} url
 * @param {Function} fn called as `fn(err)` on failure, otherwise as
 *   `fn(null, ctx.body)` with the body of the crawler's response context
 */
xray.request = function (url, fn) {
  debug('fetching %s', url)

  crawler(url, function (err, ctx) {
    if (!err) {
      debug('got response for %s with status code: %s', url, ctx.status)
      return fn(null, ctx.body)
    }
    return fn(err)
  })
}
254
-
255
196
methods . forEach ( function ( method ) {
256
197
xray [ method ] = function ( ) {
257
198
if ( ! arguments . length ) return crawler [ method ] ( )
@@ -263,13 +204,29 @@ function Xray () {
263
204
return xray
264
205
}
265
206
207
/**
 * Create a request function bound to a crawler.
 *
 * The returned `request(url, fn)` logs the fetch, calls `crawler(url, cb)`
 * and then invokes `fn(err)` on failure or `fn(null, ctx.body)` on success.
 *
 * @param {Function} crawler called as `crawler(url, callback(err, ctx))`
 * @return {Function}
 */
function Request (crawler) {
  function request (url, fn) {
    debug('fetching %s', url)

    crawler(url, function (err, ctx) {
      if (!err) {
        debug('got response for %s with status code: %s', url, ctx.status)
        return fn(null, ctx.body)
      }
      return fn(err)
    })
  }

  return request
}
217
+
218
/**
 * Turn a source into a queryable document.
 *
 * A falsy `html` (for example a response with no body) is treated as the
 * empty string instead of throwing on the `html.html` property access.
 * A value that already has an `html` property is used as-is; anything else
 * is passed to `cheerio.load`. When `url` is truthy the document is run
 * through `absolutes(url, $)` and that result is returned.
 *
 * @param {String|Object} html
 * @param {String} url (optional)
 * @return {Object}
 */
function load (html, url) {
  // Guard against undefined/null input before touching `html.html`.
  html = html || ''
  var $ = html.html ? html : cheerio.load(html)
  if (url) $ = absolutes(url, $)
  return $
}
223
+
266
224
/**
267
225
* Get the root, if there is one.
268
226
*
269
227
* @param {Mixed }
270
228
* @return {Boolean|String }
271
229
*/
272
-
273
230
function root ( selector ) {
274
231
return ( typeof selector === 'string' || isArray ( selector ) ) &&
275
232
! ~ selector . indexOf ( '@' ) &&
@@ -294,53 +251,45 @@ function compact (arr) {
294
251
} )
295
252
}
296
253
297
/**
 * Streaming array helper
 *
 * Returns a writer that serialises successive pages into one JSON array.
 * The opening '[\n' is written on the first call. Array pages are spliced
 * in without their own brackets, any other value is written as one element,
 * and the closing ']' is sent with `stream.end` when `end` is truthy.
 *
 * A page that serialises to nothing (an empty array, or `undefined`) is
 * skipped, and the ',' separator is written before each element rather than
 * after it, so no dangling comma ends up in the output.
 *
 * @param {Stream} stream (optional) target exposing `write` and `end`
 * @return {Function} writer called as `writer(data, end)`; a no-op when no
 *   stream is given
 */
function stream_array (stream) {
  if (!stream) return function () {}
  var first = true
  var wroteItem = false

  return function _stream_array (data, end) {
    // `null` replacer: the previous `true` was not a valid replacer and was ignored.
    var json = JSON.stringify(data, null, 2)

    if (first) {
      stream.write('[\n')
      first = false
    }

    if (Array.isArray(data)) {
      // Strip the page's own brackets so its items join the outer array.
      json = json.slice(1, -1)
    }

    var chunk = ''
    if (json) {
      chunk = wroteItem ? ',' + json : json
      wroteItem = true
    }

    if (end) {
      stream.end(chunk + ']')
    } else if (chunk) {
      stream.write(chunk)
    }
  }
}
326
-
327
- /**
328
- * Streaming object helper
329
- *
330
- * @param {Stream } data (optional)
331
- * @return {Function }
332
- */
333
-
334
- function stream_object ( stream ) {
335
- if ( ! stream ) return function ( ) { }
336
-
337
- return function _stream_object ( data , end ) {
338
- var json = JSON . stringify ( data , true , 2 )
339
-
340
- if ( end ) {
341
- stream . end ( json )
342
- } else {
343
- stream . write ( json )
344
- }
254
/**
 * Build the HTML walker for one selector/scope pair.
 *
 * The returned `_walkHTML($, fn)` walks `selector` and handles each entry
 * `v` by type:
 *  - string: resolved with `resolve($, root(scope), v)`
 *  - function: called as `v($, callback)`; its result is passed through
 *  - array whose first item is a string: resolved like a string
 *  - array whose first item is an object: every element matched by `scope`
 *    is crawled with a child `xray(scope, v[0])` node and the results are
 *    gathered in order, then passed through `compact`
 * Anything else yields no value.
 *
 * @param {Function} xray factory called as `xray(scope, selector)`
 * @param {Mixed} selector
 * @param {String} scope
 * @return {Function} walker that calls `fn(err)` or `fn(null, obj, $)`
 */
function WalkHTML (xray, selector, scope) {
  return function _walkHTML ($, fn) {
    walk(selector, function (v, k, next) {
      if (typeof v === 'string') {
        return next(null, resolve($, root(scope), v))
      }

      if (typeof v === 'function') {
        return v($, function (err, obj) {
          if (err) return next(err)
          return next(null, obj)
        })
      }

      if (isArray(v)) {
        if (typeof v[0] === 'string') {
          return next(null, resolve($, root(scope), v))
        }

        if (typeof v[0] === 'object') {
          var $scope = $.find ? $.find(scope) : $(scope)
          var pending = $scope.length
          var out = []
          var failed = false

          // Handle the empty result set (thanks @jenbennings!)
          if (!pending) return next(null, out)

          // Return here so control never reaches the trailing `next()` below;
          // otherwise `next` fires once with no value and again on completion.
          return $scope.each(function (i) {
            var $innerscope = $scope.eq(i)
            var child = xray(scope, v[0])
            child($innerscope, function (err, obj) {
              // Report only the first failure; later callbacks are ignored.
              if (failed) return
              if (err) {
                failed = true
                return next(err)
              }
              out[i] = obj
              if (!--pending) {
                return next(null, compact(out))
              }
            })
          })
        }
      }

      return next()
    }, function (err, obj) {
      if (err) return fn(err)
      fn(null, obj, $)
    })
  }
}
0 commit comments