
Commit
Ensure that the scraper only scrapes up to the latest inserted report
kremio committed Oct 23, 2018
1 parent bf29aeb commit dd70063
Showing 6 changed files with 52 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -59,3 +59,6 @@ typings/
 
 # next.js build output
 .next
+
+# sqlite db
+*.sqlite
2 changes: 1 addition & 1 deletion jest.config.js
@@ -3,5 +3,5 @@ const path = require('path')
 
 module.exports = {
   "verbose": false,
-  "rootDir": path.resolve(__dirname,'..'),
+  "rootDir": path.resolve(__dirname)
 }
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default.

12 changes: 8 additions & 4 deletions scrape.js
@@ -31,7 +31,9 @@ class ReportStream extends Readable{
       return this.push(data)
     }
     this.queue.onEnd = (data) => {
-      this.push(data)
+      if(data){
+        this.push(data)
+      }
       this.push(null)
     }
   }
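
Why the guard matters: in a Node Readable, push(null) is the end-of-stream signal, and the queue may finish without a final batch, in which case onEnd receives no data worth forwarding. A minimal object-mode sketch of that contract (not the project's actual ReportStream):

const { Readable } = require('stream')

// push(value) emits a chunk; push(null) signals end-of-stream
const demo = new Readable({ objectMode: true, read() {} })

demo.on('data', (chunk) => console.log('chunk:', chunk))
demo.on('end', () => console.log('end of stream'))

demo.push({ report: 1 }) // delivered to consumers
demo.push(null)          // no more data; 'end' fires once the buffer drains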
@@ -110,10 +112,8 @@ class RequestsQueue {
       this.done = true
     }
 
-
     await Promise.all( nextBatch.map( (url) => scrapeReport(url) ) )
       .then( async (reports) => {
-
         if( this.isDone() ){
           clearTimeout( this.timeout )
           this.timeout = undefined
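
This hunk is the heart of the queue: each batch of report URLs is fetched concurrently with Promise.all, and a timer spaces the batches out. A standalone sketch of the same throttling pattern, assuming a scrapeReport(url) that resolves to one parsed report (the real class also tracks pagination and the stop URI):

const processInBatches = async (urls, groupSize, groupInterval) => {
  for(let i = 0; i < urls.length; i += groupSize){
    const batch = urls.slice(i, i + groupSize)
    // Fetch the whole group concurrently, as in the hunk above
    const reports = await Promise.all( batch.map( (url) => scrapeReport(url) ) )
    console.log(`scraped ${reports.length} reports`)
    // Pause before the next group to avoid hammering the server
    if(i + groupSize < urls.length){
      await new Promise( (resolve) => setTimeout(resolve, groupInterval) )
    }
  }
}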
@@ -142,14 +142,18 @@ class RequestsQueue {
   }
 }
 
-const scrape = async (options) => {
+const scrape = async (options, verbose = false) => {
   //Override defaults with given options
   const opts = Object.assign({
     groupSize: 5,
     groupInterval: 30000, //in ms
     stopAtReportURI: false
   }, options)
 
+  if(verbose){
+    console.log("## Scraper setup:", options)
+  }
+
   const {reportsURLs, pageCount} = await scrapeIndex( DEFAULT_INDEX_URL )
 
   const queue = new RequestsQueue( pageCount, reportsURLs, opts.groupSize, opts.groupInterval, opts.stopAtReportURI )
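
With the new signature, callers opt into setup logging via the second argument, and any option left out falls back to the Object.assign defaults above. A hypothetical invocation (the URI is made up for illustration):

const scrape = require('./scrape')

const run = async () => {
  const reportStream = await scrape({
    groupSize: 10, // overrides the default of 5
    stopAtReportURI: 'https://example.org/reports/123' // hypothetical URI
  }, true) // verbose: logs "## Scraper setup:" with the given options

  reportStream.on('data', (report) => console.log(report))
  reportStream.on('end', () => console.log('scrape complete'))
}

run().catch( (e) => console.error(e) )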
26 changes: 18 additions & 8 deletions scraper.js
@@ -3,25 +3,35 @@
 const path = require('path')
 const getDB = require('rwv-sqlite/lib/db')
 const InsertStream = require('rwv-sqlite/lib/stream')
-//const {launch,JSONToString} = require('rwv-sqlite')
+const {JSONToString} = require('rwv-sqlite')
 const scrape = require('./scrape')
 
+const toStringStream = new JSONToString()
+
+let insert
 //Setup the database
-getDB( path.resolve('./config/database.json') )
+getDB( path.resolve('./config/database.json'), false )
 .then( ({DB, migrations}) => new Promise( (s,f) => {
   //Get the last inserted report, if any
-  DB.db.get('SELECT uri FROM data ORDER BY createdDate DESC LIMIT 1', (err, row) => {
+  DB.db.get('SELECT uri FROM data ORDER BY createdDate ASC LIMIT 1', (err, row) => {
     if(err){
       f(err)
       return
     }
-    s(DB, row ? row.uri : false)
+    insert = new InsertStream({}, DB)
+    s( row ? row.uri : false )
   })
 })
-).then( (DB, stopAtReportURI) => {
-  //Scrape and insert
-  const insertStream = new InsertStream({}, DB)
-  scrape({stopAtReportURI}).pipe(insertStream)
+).then( (stopAtReportURI) => scrape({stopAtReportURI}, true) )
+.then( (source) => {
+  try{
+    source.pipe(insert)
+    .pipe(toStringStream)
+    .pipe(process.stdout)
+  }catch(e){
+    console.log(e)
+  }
 }).catch( (e) => {
   console.error(e)
+  console.error(e.stack)
 })
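
The reshaped chain also fixes a genuine pitfall in the removed code: a promise resolves with exactly one value, so the old .then( (DB, stopAtReportURI) => ... ) always saw undefined for its second parameter. Capturing insert in the outer scope lets the promise resolve with just the URI. A two-line demonstration (values are illustrative):

new Promise( (s) => s('DB', 'uri') )           // extra resolve arguments are dropped
  .then( (db, uri) => console.log(db, uri) )   // prints: DB undefined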
21 changes: 21 additions & 0 deletions tests/scrape.test.js
@@ -218,4 +218,25 @@ describe('Scraper stream', () => {
 
 })
 
+  test( 'Nothing to scrape', async (done) => {
+    scrapeIndex.mockImplementationOnce(() => ({ //page 1
+      reportsURLs: [1],
+      pageCount: 3
+    }))
+
+    const reportStream = await scrape( { groupSize: 2, groupInterval: 1, stopAtReportURI: 1 } )
+
+
+    reportStream.on('data', (chunk) => {
+      done.fail( new Error('No chunk should have been emitted by the stream') )
+    })
+
+    reportStream.on('end', () => {
+      expect( scrapeIndex ).toHaveBeenCalledTimes(1)
+      expect( scrapeReport ).not.toHaveBeenCalled()
+      done()
+    })
+
+  })
+
 })
