Browse files

GoogleAdSection

  • Loading branch information...
1 parent 9e8561d commit 127112ff7a3c51531f6fc9469600d7d536a5f740 @tarao tarao committed Sep 9, 2009
Showing with 112 additions and 29 deletions.
  1. +89 −16 lib/extract-content.js
  2. +1 −1 lib/scoring-words.js
  3. +21 −11 sketch/extract-content.test.js
  4. +1 −1 sketch/suggest-tag.test.js
View
105 lib/extract-content.js
@@ -7,7 +7,10 @@ if (typeof ExtractContentJS == 'undefined') {
var A = ns.Lib.A;
var DOM = ns.Lib.DOM;
- var Leaf = Util.inherit(function(node, depth, inside, limit) {
+ var Leaf = Util.inherit(function(node/*, depth, inside, limit*/) {
+ var depth = arguments[1] || 0;
+ var inside = arguments[2] || {};
+ var limit = arguments[3] || 1048576;
var leaf = { node: node, depth: depth, inside: inside };
leaf.statistics = function() {
@@ -44,6 +47,19 @@ if (typeof ExtractContentJS == 'undefined') {
}
});
+ var Block = function(leaves) { // Builds a block object ({ score, leaves, commonAncestor }) from a list of leaves.
+ leaves = A.filter(leaves, function(v) { // predicate is true only for leaves whose text has non-whitespace characters
+ var s = DOM.text(v.node) || ''; // fall back to '' when DOM.text yields a falsy value
+ s = s.replace(/\s+/g, ''); // remove every whitespace run before checking the length
+ return s.length != 0;
+ });
+ var block = { score: 0, leaves: leaves }; // score is initialised to 0; nothing in this function changes it
+ block.commonAncestor = function() { // delegates to Leaf.commonAncestor, spreading the leaves as separate arguments
+ return Leaf.commonAncestor.apply(null, block.leaves);
+ };
+ return block;
+ };
+
var Content = function(c) {
var self = { _content: c };
@@ -64,13 +80,16 @@ if (typeof ExtractContentJS == 'undefined') {
}, '');
return self._textFragment;
};
- self.toString = function() {
+ self.asText = function() {
if (self._text) return self._text;
// covering node
var node = self.asNode();
self._text = node ? DOM.text(node) : '';
return self._text;
};
+ self.toString = function() {
+ return self.asTextFragment();
+ };
return self;
};
@@ -83,6 +102,7 @@ if (typeof ExtractContentJS == 'undefined') {
if (typeof ns.LayeredExtractor.Handler != 'undefined') {
return new ns.LayeredExtractor.Handler[name];
}
+ return null;
}
};
@@ -126,6 +146,7 @@ if (typeof ExtractContentJS == 'undefined') {
ns.LayeredExtractor.Handler.Heuristics = function(/*option, pattern*/) {
var self = {
+ name: 'Heuristics',
content: [],
opt: Util.inherit(arguments[0], {
threshold: 60,
@@ -181,14 +202,8 @@ if (typeof ExtractContentJS == 'undefined') {
})
};
- var Block = Util.inherit(function(leaves) {
- leaves = A.filter(leaves, function(v) {
- var s = DOM.text(v.node) || '';
- s = s.replace(/\s+/g, '');
- return s.length != 0;
- });
- var n = leaves.length;
- var block = { leaves: leaves };
+ var MyBlock = Util.inherit(function(leaves) {
+ var block = new Block(leaves);
block.eliminateLinks = function() {
var st = A.map(block.leaves, function(v){
@@ -271,10 +286,6 @@ if (typeof ExtractContentJS == 'undefined') {
return block;
};
- block.commonAncestor = function() {
- return Leaf.commonAncestor.apply(null, block.leaves);
- };
-
return block;
}, {
split: function(node) {
@@ -285,7 +296,7 @@ if (typeof ExtractContentJS == 'undefined') {
var flush = function(flag) {
if (flag && buf.length) {
- r.push(new Block(buf));
+ r.push(new MyBlock(buf));
buf = [];
}
};
@@ -338,7 +349,7 @@ if (typeof ExtractContentJS == 'undefined') {
var score = 0;
var res = [];
- var blocks = Block.split(d.body);
+ var blocks = MyBlock.split(d.body);
var last;
var len = blocks.length;
@@ -380,5 +391,67 @@ if (typeof ExtractContentJS == 'undefined') {
return self;
};
+
+ ns.LayeredExtractor.Handler.GoogleAdSection = function(/*opt*/) { // Handler that collects childless nodes located between google_ad_section_start / _end comment markers.
+ var self = {
+ name: 'GoogleAdSection',
+ content: [], // Leaf objects collected by parse()
+ state: [], // stack of stIgnore / stSection entries, one per start marker not yet closed
+ opt: Util.inherit(arguments[0], { // presumably merges the caller's options over these defaults -- confirm against Util.inherit
+ limit: {
+ leaves: 800, // parse() stops descending once content holds this many leaves
+ recursion: 20 // parse() stops descending at this depth
+ },
+ debug: false // not read anywhere in this function
+ })
+ };
+
+ var pat = { // patterns tested against the nodeValue of comment nodes
+ ignore: /google_ad_section_start\(weight=ignore\)/i,
+ section: /google_ad_section_start/i, // also matches the ignore marker, so parse() tests `ignore` first
+ end: /google_ad_section_end/i
+ };
+ var stIgnore = 1;
+ var stSection = 2;
+
+ self.inSection = function(){return A.last(self.state)==stSection;}; // true when the most recent unclosed marker is a non-ignored section start (assuming A.last returns the final element)
+ self.ignore = function(){self.state.push(stIgnore);} // enter an ignored section
+ self.section = function(){self.state.push(stSection);} // enter a regular section
+ self.end = function(){ if (self.state.length) self.state.pop(); }; // leave the innermost section; an unmatched end marker is silently tolerated
+ self.parse = function(node/*, depth*/) { // Recursively walks `node`, updating the marker state and collecting leaves; optional second argument is the current depth.
+ var depth = arguments[1] || 0; // depth defaults to 0 for the top-level call
+ if (node.nodeName == '#comment') { // comment nodes only drive the state stack; they are never collected
+ if (pat.ignore.test(node.nodeValue)) {
+ self.ignore();
+ } else if (pat.section.test(node.nodeValue)) {
+ self.section();
+ } else if (pat.end.test(node.nodeValue)) {
+ self.end();
+ }
+ return;
+ }
+
+ if (self.content.length >= self.opt.limit.leaves) return; // leaf budget exhausted
+ if (depth >= self.opt.limit.recursion) return; // recursion budget exhausted
+ var children = node.childNodes;
+ var len = children.length;
+ for (var i=0; i < len; i++) {
+ var c = children[i];
+ self.parse(c, depth+1);
+ }
+ if (!len && self.inSection()) { // a node without children counts as a leaf; keep it only while inside a non-ignored section
+ self.content.push(new Leaf(node, depth));
+ }
+ return;
+ };
+
+ self.extract = function(d/*, url, res*/) { // Parses `d`, stores one Block built from all collected leaves in self.blocks, and returns the leaves.
+ self.parse(d);
+ self.blocks = [ new Block(self.content) ]; // a single block holding every collected leaf
+ return self.content; // content is never reset here, so a second extract() on the same handler appends to earlier results
+ };
+
+ return self;
+ };
})(ExtractContentJS);
View
2 lib/scoring-words.js
@@ -74,7 +74,7 @@ if (typeof ExtractContentJS == 'undefined') {
var w = t.toLowerCase();
tf += Util.countMatchTokenized(content, w);
tf += Util.countMatchTokenized(title, w);
- tf += Util.countMatchTokenized(url, w);
+// tf += Util.countMatchTokenized(url, w);
scores[t] = tf/idf;
if (scores[t] > max) max = scores[t];
View
32 sketch/extract-content.test.js
@@ -55,27 +55,37 @@
var ex = new ExtractContentJS.LayeredExtractor();
// ex.addHandler( ex.factory.getHandler('Description') );
// ex.addHandler( ex.factory.getHandler('Scraper'));
-// ex.addHandler( ex.factory.getHandler('GoogleAdsence') );
+// ex.addHandler( ex.factory.getHandler('GoogleAdSection') );
ex.addHandler( ex.factory.getHandler('Heuristics') );
timer.start('extract');
var res = ex.extract(d);
var time = timer.stop('extract').elapsed;
if (!res.isSuccess) {
return d.createTextNode('failed');
- } else if (!debug) {
+ }
+
+ var div = d.createElement('div');
+ var h1 = d.createElement('h1');
+ h1.appendChild(d.createTextNode(res.engine.name));
+ div.appendChild(h1);
+
+ if (!debug) {
if (l.ExtractContentTest.asText) {
- return d.createTextNode(res.content.toString());
+ var text = res.content.toString();
+ div.appendChild(d.createTextNode(text));
} else if (l.ExtractContentTest.asTextFragment) {
- return d.createTextNode(res.content.asTextFragment());
- }
- var node = res.content.asNode();
- if (node != d.body) {
- return node.cloneNode(true);
+ var text = res.content.asTextFragment();
+ div.appendChild(d.createTextNode(text));
+ } else {
+ var node = res.content.asNode();
+ if (node != d.body) {
+ div.appendChild(node.cloneNode(true));
+ }
}
+ return div;
} else { // debug
var blocks = res.engine.blocks || [ res.content.asLeaves() ];
- var div = d.createElement('div');
var pTimer = d.createElement('p');
pTimer.appendChild(d.createTextNode(time+'msec'));
@@ -99,9 +109,9 @@
ul.appendChild(li);
});
div.appendChild(ul);
-
- return div;
}
+
+ return div;
};
l.ExtractContentTest.doTest = function() {
View
2 sketch/suggest-tag.test.js
@@ -453,7 +453,7 @@
var ex = new ns.LayeredExtractor();
// ex.addHandler( ex.factory.getHandler('Description') );
// ex.addHandler( ex.factory.getHandler('Scraper'));
-// ex.addHandler( ex.factory.getHandler('GoogleAdsence') );
+// ex.addHandler( ex.factory.getHandler('GoogleAdSection') );
ex.addHandler( ex.factory.getHandler('Heuristics') );
timer.start('extract');
var res = ex.extract(d);

0 comments on commit 127112f

Please sign in to comment.