Skip to content

Commit

Permalink
Use charlock to detect binary blobs
Browse files Browse the repository at this point in the history
  • Loading branch information
josh committed Sep 7, 2011
1 parent 753f880 commit 7ed26bf
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 12 deletions.
28 changes: 25 additions & 3 deletions lib/linguist/blob_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
require 'linguist/mime'
require 'linguist/pathname'

require 'charlock_holmes'
require 'escape_utils'
require 'pygments'
require 'yaml'
Expand Down Expand Up @@ -52,7 +53,7 @@ def mime_type
#
# Returns a content type String.
def content_type
# NOTE(review): this looks like the pre-change line carried over from the diff
# view; as written its result is discarded — confirm whether it is live code.
pathname.content_type
# Memoized in @content_type. When binary? is true the value is mime_type;
# otherwise it is "text/plain" with the downcased encoding as the charset.
# NOTE(review): encoding.downcase would raise NoMethodError if encoding is nil
# while binary? is false — confirm that combination cannot occur.
@content_type ||= binary? ? mime_type : "text/plain; charset=#{encoding.downcase}"
end

# Public: Get the Content-Disposition header value
Expand All @@ -71,11 +72,30 @@ def disposition
end
end

# Public: Get the detected encoding of the blob.
#
# Returns the value stored under :encoding in the detect_encoding result, or
# nil when detect_encoding yields nothing.
def encoding
  detected = detect_encoding
  detected ? detected[:encoding] : nil
end

# Try to guess the encoding
#
# Returns a Hash with :encoding, :confidence and :type keys, or nil when
# there is no data, an error occurred during detection, or no valid
# encoding could be found.
def detect_encoding
  # A plain `||=` never caches a nil result, so a failed detection would be
  # re-run on every call. Memoize on "detection has run" instead.
  return @detect_encoding if defined?(@detect_encoding)

  # Without data there is nothing to detect: answer nil and cache nothing.
  blob_data = data
  @detect_encoding = CharlockHolmes::EncodingDetector.new.detect(blob_data) if blob_data
end

# Public: Is the blob binary?
#
# Return true or false
def binary?
# NOTE(review): this looks like the pre-change line carried over from the diff
# view; as written its result is discarded — confirm whether it is live code.
pathname.binary?
# If Mime.lookup_mime_type_for returns a value for the path's extname, that
# value's binary? answer is used and the content is not inspected.
if mime_type = Mime.lookup_mime_type_for(pathname.extname)
mime_type.binary?
else
# Otherwise decide from detection: a nil result, or a result whose :type is
# :binary, counts as binary.
detect_encoding.nil? || detect_encoding[:type] == :binary
end
end

# Public: Is the blob text?
Expand Down Expand Up @@ -529,6 +549,8 @@ def shebang_language
# Returns html String
def colorize(options = {})
  # Nothing is highlighted (nil is returned) unless the blob is text and not large.
  return if !text? || large?

  # Build the lexer options on a copy so that neither the caller's hash nor
  # its nested :options hash is modified when the encoding default is added.
  lexer_options = (options[:options] || {}).dup
  lexer_options[:encoding] ||= encoding

  lexer.highlight(data, options.merge(:options => lexer_options))
end

Expand All @@ -540,7 +562,7 @@ def colorize(options = {})
# Returns html String
def colorize_without_wrapper(options = {})
return if !text? || large?
if text = lexer.highlight(data, options)
if text = colorize(options)
text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
else
''
Expand Down
22 changes: 13 additions & 9 deletions test/test_blob.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,11 @@ def test_content_type
assert_equal "application/octet-stream", blob("dog.o").content_type
assert_equal "application/pdf", blob("foo.pdf").content_type
assert_equal "image/png", blob("foo.png").content_type
assert_equal "text/plain; charset=utf8", blob("README").content_type
assert_equal "text/plain; charset=utf8", blob("foo.html").content_type
assert_equal "text/plain; charset=utf8", blob("foo.pl").content_type
assert_equal "text/plain; charset=utf8", blob("foo.py").content_type
assert_equal "text/plain; charset=utf8", blob("foo.rb").content_type
assert_equal "text/plain; charset=utf8", blob("foo.sh").content_type
assert_equal "text/plain; charset=utf8", blob("foo.xhtml").content_type
assert_equal "text/plain; charset=utf8", blob("foo.xml").content_type
assert_equal "text/plain; charset=iso-8859-2", blob("README").content_type
assert_equal "text/plain; charset=iso-8859-1", blob("script.pl").content_type
assert_equal "text/plain; charset=iso-8859-1", blob("script.py").content_type
assert_equal "text/plain; charset=iso-8859-1", blob("script.rb").content_type
assert_equal "text/plain; charset=iso-8859-1", blob("script.sh").content_type
end

def test_disposition
Expand Down Expand Up @@ -79,13 +76,21 @@ def test_sloc
assert_equal 2, blob("foo.rb").sloc
end

# Checks the encoding reported for each text fixture, and that the dog.o
# fixture reports no encoding.
def test_encoding
  expectations = [
    ["README",   "ISO-8859-2"],
    ["dump.sql", "ISO-8859-1"],
    ["file.txt", "UTF-8"]
  ]

  expectations.each do |fixture, expected|
    assert_equal expected, blob(fixture).encoding
  end

  assert_nil blob("dog.o").encoding
end

def test_binary
assert blob("git.deb").binary?
assert blob("git.exe").binary?
assert blob("hello.pbc").binary?
assert blob("linguist.gem").binary?
assert blob("octocat.ai").binary?
assert blob("octocat.png").binary?
assert blob("zip").binary?
assert !blob("README").binary?
assert !blob("file.txt").binary?
assert !blob("foo.rb").binary?
Expand Down Expand Up @@ -330,7 +335,6 @@ def test_language
assert_equal Language['Parrot Assembly'], blob("hello.pasm").language

# http://gosu-lang.org
assert_equal Language['Gosu'], blob("Hello.gs").language
assert_equal Language['Gosu'], blob("Hello.gsx").language
assert_equal Language['Gosu'], blob("hello.gsp").language
assert_equal Language['Gosu'], blob("Hello.gst").language
Expand Down

0 comments on commit 7ed26bf

Please sign in to comment.