Browse files

Use IO::HTML for <meta> encoding sniffing

Replaces HTML::Parser, which requires a C compiler
To match the previous version, the <meta> search does not require http-equiv="Content-Type"
(which means the search is laxer than the current HTML5 spec)
  • Loading branch information...
1 parent 5b82832 commit 6a4293c9063f9866e63da368229a95559a41a7c3 @madsen madsen committed with gisle Jan 28, 2012
Showing with 6 additions and 29 deletions.
  1. +1 −1 Makefile.PL
  2. +5 −28 lib/HTTP/Message.pm
View
2 Makefile.PL
@@ -17,7 +17,7 @@ WriteMakefile(
'HTTP::Date' => 6,
'MIME::Base64' => "2.1",
'MIME::QuotedPrint' => 0,
- 'HTML::Parser' => "3.33",
+ 'IO::HTML' => 0,
'Encode' => "2.12",
'Encode::Locale' => 1,
'LWP::MediaTypes' => 6,
View
33 lib/HTTP/Message.pm
@@ -236,34 +236,11 @@ sub content_charset
elsif ($self->content_is_html) {
# look for <META charset="..."> or <META content="...">
# http://dev.w3.org/html5/spec/Overview.html#determining-the-character-encoding
- my $charset;
- require HTML::Parser;
- my $p = HTML::Parser->new(
- start_h => [sub {
- my($tag, $attr, $self) = @_;
- $charset = $attr->{charset};
- unless ($charset) {
- # look at $attr->{content} ...
- if (my $c = $attr->{content}) {
- require HTTP::Headers::Util;
- my @v = HTTP::Headers::Util::split_header_words($c);
- return unless @v;
- my($ct, undef, %ct_param) = @{$v[0]};
- $charset = $ct_param{charset};
- }
- return unless $charset;
- }
- if ($charset =~ /^utf-?16/i) {
- # converted document, assume UTF-8
- $charset = "UTF-8";
- }
- $self->eof;
- }, "tagname, attr, self"],
- report_tags => [qw(meta)],
- utf8_mode => 1,
- );
- $p->parse($$cref);
- return $charset if $charset;
+ require IO::HTML;
+ # Use relaxed search to match previous versions of HTTP::Message:
+ my $encoding = IO::HTML::find_charset_in($$cref, { encoding => 1,
+ need_pragma => 0 });
+ return $encoding->mime_name if $encoding;
}
elsif ($self->content_type eq "application/json") {
for ($$cref) {

0 comments on commit 6a4293c

Please sign in to comment.