Skip to content
Browse files

FIX: Test scraping based on the Open Graph Protocol.

  • Loading branch information...
1 parent 9e67960 commit 7ed41ab9601522680320fe2ea6dbadf5322860ee @lbdremy committed
Showing with 106 additions and 10 deletions.
  1. +4 −4 lib/defaults/index.js
  2. +1 −1 lib/utils/index.js
  3. +46 −0 test/resources/page-open-graph.html
  4. +55 −5 test/use-defaults-test.js
View
8 lib/defaults/index.js
@@ -155,25 +155,25 @@ function scrapImage(window){
function scrapTitle(window){
var $ = window.$;
- var url = window.url;
+ var url = window.location.href;
// Tags or attributes that can contain a nice title for the page
var titleTag = $('title').text().trim();
var metaTitleTag = $('meta[name="title"]').attr('content');
+ var openGraphTitle = $('meta[property="og:title"]').attr('content');
var h1Tag = $('h1:first').text().trim();
var itempropNameTag = $('[itemprop="name"]').text().trim();
- var titles = [titleTag, metaTitleTag, h1Tag, itempropNameTag];
+ var titles = [titleTag, metaTitleTag, openGraphTitle, h1Tag, itempropNameTag];
// Regex of the web site name
var nameWebsite = utils.getWebsiteName(url);
var regex = new RegExp(nameWebsite,'i');
-
// Sort to find the best title
var titlesNotEmpty = titles.filter(function(value){
return !!value;
});
var titlesBest = titlesNotEmpty.filter(function(value){
- return regex.test(value);
+ return !regex.test(value);
});
var bestTitle = (titlesBest && titlesBest[0]) || (titlesNotEmpty && titlesNotEmpty[0]) || '';
return utils.inline(bestTitle);
View
2 lib/utils/index.js
@@ -31,7 +31,7 @@ exports.isURL = function isURL(path) {
exports.toURL = function toURL(path,uri){
var absolutePath = path;
- if(!isURL(path)){
+ if(!exports.isURL(path)){
var explodeURL = url.parse(uri);
// 2 cases: absolute path and relative path to the current pathname
if( path.charAt(0) === '/'){
View
46 test/resources/page-open-graph.html
@@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+ <title>localhost</title>
+ <meta name="description" content="">
+ <meta name="viewport" content="width=device-width">
+
+ <!-- Place favicon.ico and apple-touch-icon.png in the root directory -->
+
+ <link rel="stylesheet" href="css/normalize.css">
+ <link rel="stylesheet" href="css/main.css">
+ <script src="js/vendor/modernizr-2.6.2.min.js"></script>
+
+ <!-- Open graph tags -->
+ <meta property="og:title" content="The Rock" />
+ <meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
+ <meta property="og:description" content="Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond"/>
+ <meta property="og:video" content="http://example.com/awesome.flv" />
+ </head>
+ <body>
+ <!--[if lt IE 7]>
+ <p class="chromeframe">You are using an <strong>outdated</strong> browser. Please <a href="http://browsehappy.com/">upgrade your browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">activate Google Chrome Frame</a> to improve your experience.</p>
+ <![endif]-->
+
+ <!-- Add your site or application content here -->
+ <p>Hello world! This is HTML5 Boilerplate.</p>
+
+ <script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
+ <script>window.jQuery || document.write('<script src="js/vendor/jquery-1.9.0.min.js"><\/script>')</script>
+ <script src="js/plugins.js"></script>
+ <script src="js/main.js"></script>
+
+ <!-- Google Analytics: change UA-XXXXX-X to be your site's ID. -->
+ <script>
+ var _gaq=[['_setAccount','UA-XXXXX-X'],['_trackPageview']];
+ (function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
+ g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
+ s.parentNode.insertBefore(g,s)}(document,'script'));
+ </script>
+ </body>
+</html>
View
60 test/use-defaults-test.js
@@ -6,7 +6,26 @@ var mocha = require('mocha'),
assert = require('chai').assert,
libPath = process.env['SCRAPINODE_COV'] ? '../lib-cov' : '../lib',
scrapinode = require( libPath + '/scrapinode'),
- ScrapinodeError = require(libPath + '/error/scrapinode-error');
+ ScrapinodeError = require(libPath + '/error/scrapinode-error'),
+ express = require('express'),
+ filed = require('filed');
+
+// Web app
+var app = express();
+
+app.get('/page-open-graph.html',function(req,res){
+ req.pipe(filed(__dirname + '/resources/page-open-graph.html')).pipe(res);
+});
+
+app.get('/page-schema-org.html',function(req,res){
+ req.pipe(filed(__dirname + '/resources/page-schema-org.html')).pipe(res);
+});
+
+app.get('/page-generic-tags.html',function(req,res){
+ req.pipe(filed(__dirname + '/resources/page-generic-tags.html')).pipe(res);
+});
+
+app.listen(1102);
// Test suite
@@ -14,18 +33,49 @@ var mocha = require('mocha'),
function runTestSuite(engine){
describe('scraper using the default operators thanks to ' + engine,function(){
+ beforeEach(function(){
+ scrapinode.clearRouter();
+ });
describe('and scraping by following the Open Graph protocol',function(){
describe('#get("title")',function(){
- it('should retrieve the text representating the title');
+ it('should retrieve the text representating the title',function(done){
+ scrapinode.useAll(scrapinode.defaults());
+ scrapinode.createScraper('http://localhost:1102/page-open-graph.html',function(err,scraper){
+ assert.isNull(err);
+ assert.equal(scraper.get('title'),'The Rock');
+ done();
+ });
+ });
});
describe('#get("descriptions")',function(){
- it('should retrieve a list of text representating the descriptions');
+ it('should retrieve a list of text representating the descriptions',function(done){
+ scrapinode.useAll(scrapinode.defaults());
+ scrapinode.createScraper('http://localhost:1102/page-open-graph.html',function(err,scraper){
+ assert.isNull(err);
+ assert.deepEqual(scraper.get('descriptions'),['Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond']);
+ done();
+ });
+ });
});
describe('#get("images")',function(){
- it('should retrieve a list of images url');
+ it('should retrieve a list of images url',function(done){
+ scrapinode.useAll(scrapinode.defaults());
+ scrapinode.createScraper('http://localhost:1102/page-open-graph.html',function(err,scraper){
+ assert.isNull(err);
+ assert.deepEqual(scraper.get('images'),['http://ia.media-imdb.com/images/rock.jpg']);
+ done();
+ });
+ });
});
describe('#get("videos")',function(){
- it('should retrieve a list of videos [urls or html representations of the videos]');
+ it('should retrieve a list of videos [urls or html representations of the videos]',function(done){
+ scrapinode.useAll(scrapinode.defaults());
+ scrapinode.createScraper('http://localhost:1102/page-open-graph.html',function(err,scraper){
+ assert.isNull(err);
+ assert.deepEqual(scraper.get('videos'),['http://example.com/awesome.flv']);
+ done();
+ });
+ });
});
});
describe('and scraping by following the Schema.org specifications',function(){

0 comments on commit 7ed41ab

Please sign in to comment.
Something went wrong with that request. Please try again.