diff --git a/.gitignore b/.gitignore index ad46b30..4db5660 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,6 @@ typings/ # next.js build output .next + +# sqlite db +*.sqlite diff --git a/jest.config.js b/jest.config.js index 125957b..82c3d11 100644 --- a/jest.config.js +++ b/jest.config.js @@ -3,5 +3,5 @@ const path = require('path') module.exports = { "verbose": false, - "rootDir": path.resolve(__dirname,'..'), + "rootDir": path.resolve(__dirname) } diff --git a/package-lock.json b/package-lock.json index 04432a1..4d69ebf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4740,7 +4740,7 @@ } }, "rwv-sqlite": { - "version": "git+https://github.com/kremio/rwv-sqlite-js.git#8428faca00d48366acf523a93898f1dafe111194", + "version": "git+https://github.com/kremio/rwv-sqlite-js.git#c7cb01b40e71718853d3831bf102afd49c3f5cf6", "from": "git+https://github.com/kremio/rwv-sqlite-js.git", "requires": { "db-migrate": "^0.11.3", diff --git a/scrape.js b/scrape.js index 6c078cb..d73f1e8 100644 --- a/scrape.js +++ b/scrape.js @@ -31,7 +31,9 @@ class ReportStream extends Readable{ return this.push(data) } this.queue.onEnd = (data) => { - this.push(data) + if(data){ + this.push(data) + } this.push(null) } } @@ -110,10 +112,8 @@ class RequestsQueue { this.done = true } - await Promise.all( nextBatch.map( (url) => scrapeReport(url) ) ) .then( async (reports) => { - if( this.isDone() ){ clearTimeout( this.timeout ) this.timeout = undefined @@ -142,7 +142,7 @@ class RequestsQueue { } } -const scrape = async (options) => { +const scrape = async (options, verbose = false) => { //Override defaults with given options const opts = Object.assign({ groupSize: 5, @@ -150,6 +150,10 @@ const scrape = async (options) => { stopAtReportURI: false }, options) + if(verbose){ + console.log("## Scraper setup:", options) + } + const {reportsURLs, pageCount} = await scrapeIndex( DEFAULT_INDEX_URL ) const queue = new RequestsQueue( pageCount, reportsURLs, opts.groupSize, opts.groupInterval, opts.stopAtReportURI ) diff --git a/scraper.js b/scraper.js index 2d331de..4893997 100644 --- a/scraper.js +++ b/scraper.js @@ -3,25 +3,35 @@ const path = require('path') const getDB = require('rwv-sqlite/lib/db') const InsertStream = require('rwv-sqlite/lib/stream') -//const {launch,JSONToString} = require('rwv-sqlite') +const {JSONToString} = require('rwv-sqlite') const scrape = require('./scrape') +const toStringStream = new JSONToString() + +let insert //Setup the database -getDB( path.resolve('./config/database.json') ) +getDB( path.resolve('./config/database.json'), false ) .then( ({DB, migrations}) => new Promise( (s,f) => { //Get the last inserted report, if any - DB.db.get('SELECT uri FROM data ORDER BY createdDate DESC LIMIT 1', (err, row) => { + DB.db.get('SELECT uri FROM data ORDER BY createdDate ASC LIMIT 1', (err, row) => { if(err){ f(err) return } - s(DB, row ? row.uri : false) + insert = new InsertStream({}, DB) + s( row ? row.uri : false ) }) }) - ).then( (DB, stopAtReportURI) => { - //Scrape and insert - const insertStream = new InsertStream({}, DB) - scrape({stopAtReportURI}).pipe(insertStream) + ).then( (stopAtReportURI) => scrape({stopAtReportURI}, true) ) + .then( (source) => { + try{ + source.pipe(insert) + .pipe(toStringStream) + .pipe(process.stdout) + }catch(e){ + console.log(e) + } }).catch( (e) => { console.error(e) + console.error(e.stack) }) diff --git a/tests/scrape.test.js b/tests/scrape.test.js index bcb6a19..77c1a14 100644 --- a/tests/scrape.test.js +++ b/tests/scrape.test.js @@ -218,4 +218,25 @@ describe('Scraper stream', () => { }) + test( 'Nothing to scrape', async (done) => { + scrapeIndex.mockImplementationOnce(() => ({ //page 1 + reportsURLs: [1], + pageCount: 3 + })) + + const reportStream = await scrape( { groupSize: 2, groupInterval: 1, stopAtReportURI: 1 } ) + + + reportStream.on('data', (chunk) => { + done.fail( new Error('No chunk should have been emitted by the stream') ) + }) + + reportStream.on('end', () => { + expect( scrapeIndex ).toHaveBeenCalledTimes(1) + expect( scrapeReport ).not.toHaveBeenCalled() + done() + }) + + }) + })