Skip to content

Commit

Permalink
Merge pull request #288 from netarchivesuite/inline-image-filter
Browse files (browse the repository at this point in the history)
Attempt to filter out embedded images.
  • Loading branch information
ato committed Jan 15, 2020
2 parents 3a8b589 + 204491c commit 8e875bb
Showing 1 changed file with 13 additions and 11 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -445,18 +445,20 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
} else if (attr.start(5) > -1) {
// SRC etc.
CharSequence context = elementContext(element, attr.group(5));

// true, if we expect another HTML page instead of an image etc.
final Hop hop;

if(!framesAsEmbeds
&& (elementStr.equalsIgnoreCase(FRAME) || elementStr
.equalsIgnoreCase(IFRAME))) {
hop = Hop.NAVLINK;
} else {
hop = Hop.EMBED;
if (!context.toString().toLowerCase().startsWith("data:")) {

// NAVLINK if we expect another HTML page (FRAME/IFRAME not treated as embeds), EMBED for an image etc.
final Hop hop;

if (!framesAsEmbeds
&& (elementStr.equalsIgnoreCase(FRAME) || elementStr
.equalsIgnoreCase(IFRAME))) {
hop = Hop.NAVLINK;
} else {
hop = Hop.EMBED;
}
processEmbed(curi, value, context, hop);
}
processEmbed(curi, value, context, hop);
} else if (attr.start(6) > -1) {
// CODEBASE
codebase = (value instanceof String)?
Expand Down

0 comments on commit 8e875bb

Please sign in to comment.