-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.js
84 lines (77 loc) · 2.38 KB
/
pipeline.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
//NOTE: When morph.io executes the scraper, it sets process.env.NODE_ENV
//to 'production'
const path = require('path')
//Project-local database helpers (rwv-sqlite) and the scraper entry point
const getDB = require('rwv-sqlite/lib/db')
const errorTable = require('rwv-sqlite/lib/error')
const InsertStream = require('rwv-sqlite/lib/stream')
const {JSONToString} = require('rwv-sqlite')
const scrape = require('./scrape')
//Stream used at the end of the pipe chain to render inserted rows as text
//on stdout (see pipeline below)
const toStringStream = new JSONToString()
//Module-level state shared across the pipeline's promise-chain steps
let insert    //InsertStream that writes scraped reports into the database
let source    //readable stream returned by scrape()
let dbHandle  //open DB handle, set once getDB resolves
//Setup the database
//Run the full scrape-and-store pipeline:
// 1. open the database and read the resume point (stopAtReportURI),
// 2. override / refine the start point from env vars or a recorded error,
// 3. scrape and pipe reports into the database, echoing progress to stdout.
//Resolves with 'ok' on success. On failure the error is recorded in the
//error table (when possible) and re-thrown so the caller sees it.
const pipeline = async () => getDB( path.resolve('./config/database.json'), false )
.then( ({DB, migrations}) => new Promise( (s,f) => {
  dbHandle = DB
  //Get the first report to stop at, if any
  //NOTE(review): the original comment said "newest" but ORDER BY
  //createdDate ASC LIMIT 1 selects the OLDEST row — confirm which is intended
  DB.db.get('SELECT uri FROM data ORDER BY createdDate ASC LIMIT 1', (err, row) => {
    if(err){
      f(err)
      return
    }
    insert = new InsertStream({}, DB)
    s( {stopAtReportURI: row ? row.uri : false} )
  })
})
)
.then( async (scraperOptions) => {
  //Parameters passed by env variables have priority
  if( process.env.MORPH_START_PAGE && process.env.MORPH_START_REPORT ){
    return Object.assign( scraperOptions, {
      startAtReportURI: process.env.MORPH_START_REPORT,
      startAtPageURL: process.env.MORPH_START_PAGE
    })
  }
  //Check if there is a recorded error from which to resume
  const error = await errorTable.get( dbHandle )
  //Consume the error so it is not replayed on the next run
  await errorTable.clear( dbHandle )
  if(!error){
    return scraperOptions
  }
  return Object.assign( scraperOptions, {
    //'NA' is the sentinel stored when no report URI was known (see catch below)
    startAtReportURI: error.reportURI !== 'NA' ? error.reportURI : false,
    startAtPageURL: error.pageURL
  })
})
.then( (scraperOptions) => scrape(scraperOptions, true) )
.then( (scraperStream) => {
  source = scraperStream
})
.then( () => new Promise( (s,f) => {
  //Catch errors from the streams
  source.on('error', f)
  insert.on('error', f)
  //done!
  insert.on('end', () => s('ok') )
  try{
    //Start scraping and inserting
    source.pipe(insert)
    .pipe(toStringStream)
    .pipe(process.stdout)
  }catch(e){
    f(e)
  }
}) )
.catch( (e) => {
  const reportURI = e.reportURI || 'NA'
  //BUGFIX: source (and dbHandle) are undefined when the failure happened
  //before that stage of the chain completed; guard so the original error is
  //not masked by a TypeError raised inside this handler.
  const pageURL = source ? source.currentPageURL : null
  const cause = JSON.stringify( e.message )
  if(!dbHandle){
    //No database connection, so nothing to record the error in — re-throw
    throw e
  }
  //save the error to the database so the next run can resume from it
  return errorTable.insert( reportURI, cause, pageURL, dbHandle )
  .then( () => {
    throw e
  })
})
module.exports = pipeline