Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

GoogleAdSection

  • Loading branch information...
commit 127112ff7a3c51531f6fc9469600d7d536a5f740 1 parent 9e8561d
INA Lintaro tarao authored
105 lib/extract-content.js
@@ -7,7 +7,10 @@ if (typeof ExtractContentJS == 'undefined') {
7 7 var A = ns.Lib.A;
8 8 var DOM = ns.Lib.DOM;
9 9
10   - var Leaf = Util.inherit(function(node, depth, inside, limit) {
  10 + var Leaf = Util.inherit(function(node/*, depth, inside, limit*/) {
  11 + var depth = arguments[1] || 0;
  12 + var inside = arguments[2] || {};
  13 + var limit = arguments[3] || 1048576;
11 14 var leaf = { node: node, depth: depth, inside: inside };
12 15
13 16 leaf.statistics = function() {
@@ -44,6 +47,19 @@ if (typeof ExtractContentJS == 'undefined') {
44 47 }
45 48 });
46 49
  50 + var Block = function(leaves) {
  51 + leaves = A.filter(leaves, function(v) {
  52 + var s = DOM.text(v.node) || '';
  53 + s = s.replace(/\s+/g, '');
  54 + return s.length != 0;
  55 + });
  56 + var block = { score: 0, leaves: leaves };
  57 + block.commonAncestor = function() {
  58 + return Leaf.commonAncestor.apply(null, block.leaves);
  59 + };
  60 + return block;
  61 + };
  62 +
47 63 var Content = function(c) {
48 64 var self = { _content: c };
49 65
@@ -64,13 +80,16 @@ if (typeof ExtractContentJS == 'undefined') {
64 80 }, '');
65 81 return self._textFragment;
66 82 };
67   - self.toString = function() {
  83 + self.asText = function() {
68 84 if (self._text) return self._text;
69 85 // covering node
70 86 var node = self.asNode();
71 87 self._text = node ? DOM.text(node) : '';
72 88 return self._text;
73 89 };
  90 + self.toString = function() {
  91 + return self.asTextFragment();
  92 + };
74 93
75 94 return self;
76 95 };
@@ -83,6 +102,7 @@ if (typeof ExtractContentJS == 'undefined') {
83 102 if (typeof ns.LayeredExtractor.Handler != 'undefined') {
84 103 return new ns.LayeredExtractor.Handler[name];
85 104 }
  105 + return null;
86 106 }
87 107 };
88 108
@@ -126,6 +146,7 @@ if (typeof ExtractContentJS == 'undefined') {
126 146
127 147 ns.LayeredExtractor.Handler.Heuristics = function(/*option, pattern*/) {
128 148 var self = {
  149 + name: 'Heuristics',
129 150 content: [],
130 151 opt: Util.inherit(arguments[0], {
131 152 threshold: 60,
@@ -181,14 +202,8 @@ if (typeof ExtractContentJS == 'undefined') {
181 202 })
182 203 };
183 204
184   - var Block = Util.inherit(function(leaves) {
185   - leaves = A.filter(leaves, function(v) {
186   - var s = DOM.text(v.node) || '';
187   - s = s.replace(/\s+/g, '');
188   - return s.length != 0;
189   - });
190   - var n = leaves.length;
191   - var block = { leaves: leaves };
  205 + var MyBlock = Util.inherit(function(leaves) {
  206 + var block = new Block(leaves);
192 207
193 208 block.eliminateLinks = function() {
194 209 var st = A.map(block.leaves, function(v){
@@ -271,10 +286,6 @@ if (typeof ExtractContentJS == 'undefined') {
271 286 return block;
272 287 };
273 288
274   - block.commonAncestor = function() {
275   - return Leaf.commonAncestor.apply(null, block.leaves);
276   - };
277   -
278 289 return block;
279 290 }, {
280 291 split: function(node) {
@@ -285,7 +296,7 @@ if (typeof ExtractContentJS == 'undefined') {
285 296
286 297 var flush = function(flag) {
287 298 if (flag && buf.length) {
288   - r.push(new Block(buf));
  299 + r.push(new MyBlock(buf));
289 300 buf = [];
290 301 }
291 302 };
@@ -338,7 +349,7 @@ if (typeof ExtractContentJS == 'undefined') {
338 349 var score = 0;
339 350
340 351 var res = [];
341   - var blocks = Block.split(d.body);
  352 + var blocks = MyBlock.split(d.body);
342 353 var last;
343 354
344 355 var len = blocks.length;
@@ -380,5 +391,67 @@ if (typeof ExtractContentJS == 'undefined') {
380 391
381 392 return self;
382 393 };
  394 +
  395 + ns.LayeredExtractor.Handler.GoogleAdSection = function(/*opt*/) {
  396 + var self = {
  397 + name: 'GoogleAdSection',
  398 + content: [],
  399 + state: [],
  400 + opt: Util.inherit(arguments[0], {
  401 + limit: {
  402 + leaves: 800,
  403 + recursion: 20
  404 + },
  405 + debug: false
  406 + })
  407 + };
  408 +
  409 + var pat = {
  410 + ignore: /google_ad_section_start\(weight=ignore\)/i,
  411 + section: /google_ad_section_start/i,
  412 + end: /google_ad_section_end/i
  413 + };
  414 + var stIgnore = 1;
  415 + var stSection = 2;
  416 +
  417 + self.inSection = function(){return A.last(self.state)==stSection;};
  418 + self.ignore = function(){self.state.push(stIgnore);}
  419 + self.section = function(){self.state.push(stSection);}
  420 + self.end = function(){ if (self.state.length) self.state.pop(); };
  421 + self.parse = function(node/*, depth*/) {
  422 + var depth = arguments[1] || 0;
  423 + if (node.nodeName == '#comment') {
  424 + if (pat.ignore.test(node.nodeValue)) {
  425 + self.ignore();
  426 + } else if (pat.section.test(node.nodeValue)) {
  427 + self.section();
  428 + } else if (pat.end.test(node.nodeValue)) {
  429 + self.end();
  430 + }
  431 + return;
  432 + }
  433 +
  434 + if (self.content.length >= self.opt.limit.leaves) return;
  435 + if (depth >= self.opt.limit.recursion) return;
  436 + var children = node.childNodes;
  437 + var len = children.length;
  438 + for (var i=0; i < len; i++) {
  439 + var c = children[i];
  440 + self.parse(c, depth+1);
  441 + }
  442 + if (!len && self.inSection()) {
  443 + self.content.push(new Leaf(node, depth));
  444 + }
  445 + return;
  446 + };
  447 +
  448 + self.extract = function(d/*, url, res*/) {
  449 + self.parse(d);
  450 + self.blocks = [ new Block(self.content) ];
  451 + return self.content;
  452 + };
  453 +
  454 + return self;
  455 + };
383 456 })(ExtractContentJS);
384 457
2  lib/scoring-words.js
@@ -74,7 +74,7 @@ if (typeof ExtractContentJS == 'undefined') {
74 74 var w = t.toLowerCase();
75 75 tf += Util.countMatchTokenized(content, w);
76 76 tf += Util.countMatchTokenized(title, w);
77   - tf += Util.countMatchTokenized(url, w);
  77 +// tf += Util.countMatchTokenized(url, w);
78 78
79 79 scores[t] = tf/idf;
80 80 if (scores[t] > max) max = scores[t];
32 sketch/extract-content.test.js
@@ -55,7 +55,7 @@
55 55 var ex = new ExtractContentJS.LayeredExtractor();
56 56 // ex.addHandler( ex.factory.getHandler('Description') );
57 57 // ex.addHandler( ex.factory.getHandler('Scraper'));
58   -// ex.addHandler( ex.factory.getHandler('GoogleAdsence') );
  58 +// ex.addHandler( ex.factory.getHandler('GoogleAdSection') );
59 59 ex.addHandler( ex.factory.getHandler('Heuristics') );
60 60 timer.start('extract');
61 61 var res = ex.extract(d);
@@ -63,19 +63,29 @@
63 63
64 64 if (!res.isSuccess) {
65 65 return d.createTextNode('failed');
66   - } else if (!debug) {
  66 + }
  67 +
  68 + var div = d.createElement('div');
  69 + var h1 = d.createElement('h1');
  70 + h1.appendChild(d.createTextNode(res.engine.name));
  71 + div.appendChild(h1);
  72 +
  73 + if (!debug) {
67 74 if (l.ExtractContentTest.asText) {
68   - return d.createTextNode(res.content.toString());
  75 + var text = res.content.toString();
  76 + div.appendChild(d.createTextNode(text));
69 77 } else if (l.ExtractContentTest.asTextFragment) {
70   - return d.createTextNode(res.content.asTextFragment());
71   - }
72   - var node = res.content.asNode();
73   - if (node != d.body) {
74   - return node.cloneNode(true);
  78 + var text = res.content.asTextFragment();
  79 + div.appendChild(d.createTextNode(text));
  80 + } else {
  81 + var node = res.content.asNode();
  82 + if (node != d.body) {
  83 + div.appendChild(node.cloneNode(true));
  84 + }
75 85 }
  86 + return div;
76 87 } else { // debug
77 88 var blocks = res.engine.blocks || [ res.content.asLeaves() ];
78   - var div = d.createElement('div');
79 89
80 90 var pTimer = d.createElement('p');
81 91 pTimer.appendChild(d.createTextNode(time+'msec'));
@@ -99,9 +109,9 @@
99 109 ul.appendChild(li);
100 110 });
101 111 div.appendChild(ul);
102   -
103   - return div;
104 112 }
  113 +
  114 + return div;
105 115 };
106 116
107 117 l.ExtractContentTest.doTest = function() {
2  sketch/suggest-tag.test.js
@@ -453,7 +453,7 @@
453 453 var ex = new ns.LayeredExtractor();
454 454 // ex.addHandler( ex.factory.getHandler('Description') );
455 455 // ex.addHandler( ex.factory.getHandler('Scraper'));
456   -// ex.addHandler( ex.factory.getHandler('GoogleAdsence') );
  456 +// ex.addHandler( ex.factory.getHandler('GoogleAdSection') );
457 457 ex.addHandler( ex.factory.getHandler('Heuristics') );
458 458 timer.start('extract');
459 459 var res = ex.extract(d);

0 comments on commit 127112f

Please sign in to comment.
Something went wrong with that request. Please try again.