
Commit
Ensure that the scraper only scrapes up to the latest inserted report
kremio committed Oct 23, 2018
1 parent bf29aeb commit dd70063
Showing 6 changed files with 52 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -59,3 +59,6 @@ typings/
 
 # next.js build output
 .next
+
+# sqlite db
+*.sqlite
2 changes: 1 addition & 1 deletion jest.config.js
@@ -3,5 +3,5 @@ const path = require('path')
 
 module.exports = {
   "verbose": false,
-  "rootDir": path.resolve(__dirname,'..'),
+  "rootDir": path.resolve(__dirname)
 }
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default.

12 changes: 8 additions & 4 deletions scrape.js
@@ -31,7 +31,9 @@ class ReportStream extends Readable{
       return this.push(data)
     }
     this.queue.onEnd = (data) => {
-      this.push(data)
+      if(data){
+        this.push(data)
+      }
       this.push(null)
     }
   }
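
Why the guard matters: in a Node Readable, push(null) is the end-of-stream signal, and the queue may finish without a final batch, in which case onEnd receives no data worth forwarding. A minimal object-mode sketch of that contract (not the project's actual ReportStream):

const { Readable } = require('stream')

// push(value) emits a chunk; push(null) signals end-of-stream
const demo = new Readable({ objectMode: true, read() {} })

demo.on('data', (chunk) => console.log('chunk:', chunk))
demo.on('end', () => console.log('end of stream'))

demo.push({ report: 1 }) // delivered to consumers
demo.push(null)          // no more data; 'end' fires once the buffer drains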
@@ -110,10 +112,8 @@ class RequestsQueue {
       this.done = true
     }
 
-
     await Promise.all( nextBatch.map( (url) => scrapeReport(url) ) )
       .then( async (reports) => {
-
         if( this.isDone() ){
           clearTimeout( this.timeout )
           this.timeout = undefined
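
This hunk is the heart of the queue: each batch of report URLs is fetched concurrently with Promise.all, and a timer spaces the batches out. A standalone sketch of the same throttling pattern, assuming a scrapeReport(url) that resolves to one parsed report (the real class also tracks pagination and the stop URI):

const processInBatches = async (urls, groupSize, groupInterval) => {
  for(let i = 0; i < urls.length; i += groupSize){
    const batch = urls.slice(i, i + groupSize)
    // Fetch the whole group concurrently, as in the hunk above
    const reports = await Promise.all( batch.map( (url) => scrapeReport(url) ) )
    console.log(`scraped ${reports.length} reports`)
    // Pause before the next group to avoid hammering the server
    if(i + groupSize < urls.length){
      await new Promise( (resolve) => setTimeout(resolve, groupInterval) )
    }
  }
}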
@@ -142,14 +142,18 @@ class RequestsQueue {
   }
 }
 
-const scrape = async (options) => {
+const scrape = async (options, verbose = false) => {
   //Override defaults with given options
   const opts = Object.assign({
     groupSize: 5,
     groupInterval: 30000, //in ms
     stopAtReportURI: false
   }, options)
 
+  if(verbose){
+    console.log("## Scraper setup:", options)
+  }
+
   const {reportsURLs, pageCount} = await scrapeIndex( DEFAULT_INDEX_URL )
 
   const queue = new RequestsQueue( pageCount, reportsURLs, opts.groupSize, opts.groupInterval, opts.stopAtReportURI )
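
With the new signature, callers opt into setup logging via the second argument, and any option left out falls back to the Object.assign defaults above. A hypothetical invocation (the URI is made up for illustration):

const scrape = require('./scrape')

const run = async () => {
  const reportStream = await scrape({
    groupSize: 10, // overrides the default of 5
    stopAtReportURI: 'https://example.org/reports/123' // hypothetical URI
  }, true) // verbose: logs "## Scraper setup:" with the given options

  reportStream.on('data', (report) => console.log(report))
  reportStream.on('end', () => console.log('scrape complete'))
}

run().catch( (e) => console.error(e) )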
26 changes: 18 additions & 8 deletions scraper.js
@@ -3,25 +3,35 @@
 const path = require('path')
 const getDB = require('rwv-sqlite/lib/db')
 const InsertStream = require('rwv-sqlite/lib/stream')
-//const {launch,JSONToString} = require('rwv-sqlite')
+const {JSONToString} = require('rwv-sqlite')
 const scrape = require('./scrape')
 
+const toStringStream = new JSONToString()
+
+let insert
 //Setup the database
-getDB( path.resolve('./config/database.json') )
+getDB( path.resolve('./config/database.json'), false )
 .then( ({DB, migrations}) => new Promise( (s,f) => {
   //Get the last inserted report, if any
-  DB.db.get('SELECT uri FROM data ORDER BY createdDate DESC LIMIT 1', (err, row) => {
+  DB.db.get('SELECT uri FROM data ORDER BY createdDate ASC LIMIT 1', (err, row) => {
     if(err){
       f(err)
       return
     }
-    s(DB, row ? row.uri : false)
+    insert = new InsertStream({}, DB)
+    s( row ? row.uri : false )
   })
 })
-).then( (DB, stopAtReportURI) => {
-  //Scrape and insert
-  const insertStream = new InsertStream({}, DB)
-  scrape({stopAtReportURI}).pipe(insertStream)
+).then( (stopAtReportURI) => scrape({stopAtReportURI}, true) )
+.then( (source) => {
+  try{
+    source.pipe(insert)
+    .pipe(toStringStream)
+    .pipe(process.stdout)
+  }catch(e){
+    console.log(e)
+  }
 }).catch( (e) => {
   console.error(e)
+  console.error(e.stack)
 })
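
The reshaped chain also fixes a genuine pitfall in the removed code: a promise resolves with exactly one value, so the old .then( (DB, stopAtReportURI) => ... ) always saw undefined for its second parameter. Capturing insert in the outer scope lets the promise resolve with just the URI. A two-line demonstration (values are illustrative):

new Promise( (s) => s('DB', 'uri') )           // extra resolve arguments are dropped
  .then( (db, uri) => console.log(db, uri) )   // prints: DB undefined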
21 changes: 21 additions & 0 deletions tests/scrape.test.js
@@ -218,4 +218,25 @@ describe('Scraper stream', () => {
 
 })
 
+  test( 'Nothing to scrape', async (done) => {
+    scrapeIndex.mockImplementationOnce(() => ({ //page 1
+      reportsURLs: [1],
+      pageCount: 3
+    }))
+
+    const reportStream = await scrape( { groupSize: 2, groupInterval: 1, stopAtReportURI: 1 } )
+
+
+    reportStream.on('data', (chunk) => {
+      done.fail( new Error('No chunk should have been emitted by the stream') )
+    })
+
+    reportStream.on('end', () => {
+      expect( scrapeIndex ).toHaveBeenCalledTimes(1)
+      expect( scrapeReport ).not.toHaveBeenCalled()
+      done()
+    })
+
+  })
+
 })
