Skip to content

Commit

Permalink
Use IO::HTML for <meta> encoding sniffing
Browse files Browse the repository at this point in the history
Replaces HTML::Parser, which requires a C compiler
To match the previous version, do not require http-equiv="Content-Type"
(which means the search is laxer than the current HTML5 spec)
  • Loading branch information
madsen authored and gisle committed May 27, 2012
1 parent 5b82832 commit 6a4293c
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 29 deletions.
2 changes: 1 addition & 1 deletion Makefile.PL
Expand Up @@ -17,7 +17,7 @@ WriteMakefile(
'HTTP::Date' => 6, 'HTTP::Date' => 6,
'MIME::Base64' => "2.1", 'MIME::Base64' => "2.1",
'MIME::QuotedPrint' => 0, 'MIME::QuotedPrint' => 0,
'HTML::Parser' => "3.33", 'IO::HTML' => 0,
'Encode' => "2.12", 'Encode' => "2.12",
'Encode::Locale' => 1, 'Encode::Locale' => 1,
'LWP::MediaTypes' => 6, 'LWP::MediaTypes' => 6,
Expand Down
33 changes: 5 additions & 28 deletions lib/HTTP/Message.pm
Expand Up @@ -236,34 +236,11 @@ sub content_charset
elsif ($self->content_is_html) { elsif ($self->content_is_html) {
# look for <META charset="..."> or <META content="..."> # look for <META charset="..."> or <META content="...">
# http://dev.w3.org/html5/spec/Overview.html#determining-the-character-encoding # http://dev.w3.org/html5/spec/Overview.html#determining-the-character-encoding
my $charset; require IO::HTML;
require HTML::Parser; # Use relaxed search to match previous versions of HTTP::Message:
my $p = HTML::Parser->new( my $encoding = IO::HTML::find_charset_in($$cref, { encoding => 1,
start_h => [sub { need_pragma => 0 });
my($tag, $attr, $self) = @_; return $encoding->mime_name if $encoding;
$charset = $attr->{charset};
unless ($charset) {
# look at $attr->{content} ...
if (my $c = $attr->{content}) {
require HTTP::Headers::Util;
my @v = HTTP::Headers::Util::split_header_words($c);
return unless @v;
my($ct, undef, %ct_param) = @{$v[0]};
$charset = $ct_param{charset};
}
return unless $charset;
}
if ($charset =~ /^utf-?16/i) {
# converted document, assume UTF-8
$charset = "UTF-8";
}
$self->eof;
}, "tagname, attr, self"],
report_tags => [qw(meta)],
utf8_mode => 1,
);
$p->parse($$cref);
return $charset if $charset;
} }
elsif ($self->content_type eq "application/json") { elsif ($self->content_type eq "application/json") {
for ($$cref) { for ($$cref) {
Expand Down

0 comments on commit 6a4293c

Please sign in to comment.