diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fd1fcb3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Gemfile.lock +/pkg diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..680d34b --- /dev/null +++ b/.travis.yml @@ -0,0 +1,18 @@ +sudo: false +language: ruby + +script: bundle exec ruby spec/unicode_confusable_spec.rb + +rvm: +- 2.3.0 +- 2.2 +- ruby-head +- rbx-2 +- jruby-head +- jruby-9.0.5.0 + +cache: +- bundler + +# matrix: +# fast_finish: true diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e99a443 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,6 @@ +## CHANGELOG + +### 1.0.0 + +* Initial release + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..1e1a5b2 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,74 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of experience, +nationality, personal appearance, race, religion, or sexual identity and +orientation.
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at opensource@janlelis.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..c1a2bfe --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' + +gemspec + +gem 'minitest' diff --git a/MIT-LICENSE.txt b/MIT-LICENSE.txt new file mode 100644 index 0000000..5b3b8a8 --- /dev/null +++ b/MIT-LICENSE.txt @@ -0,0 +1,20 @@ +Copyright (c) 2016 Jan Lelis, mail@janlelis.de + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a0b39f3 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Unicode::Confusable [![[version]](https://badge.fury.io/rb/unicode-confusable.svg)](http://badge.fury.io/rb/unicode-confusable) [![[travis]](https://travis-ci.org/janlelis/unicode-confusable.png)](https://travis-ci.org/janlelis/unicode-confusable) + +Compares two strings if they are visually confusable as described in [Unicode® Technical Standard #39](http://www.unicode.org/reports/tr39/#Confusable_Detection): Both strings get transformed into a skeleton format before comparing them. The skeleton is generated by normalizing the string, replacing [confusable characters](ftp://ftp.unicode.org/Public/security/8.0.0/confusables.txt), and normalizing the string again. Please note: The skeleton is an intermediate representation, not meant for any other use than testing confusability. + +Unicode version: **8.0.0** + +Supported Rubies: **2.3**, **2.2** + +## `Gemfile` + +```ruby +gem "unicode-confusable" +``` + +## Usage + +```ruby +require "unicode/confusable" + +Unicode::Confusable.confusable? "a", "b" # => false +Unicode::Confusable.confusable? "ℜ𝘂ᖯʏ", "Ruby" # => true +Unicode::Confusable.confusable? "Michael", "Michae1" # => true +Unicode::Confusable.confusable? "⁇", "?" # => false +Unicode::Confusable.confusable? "⁇", "??" 
# => true
+```
+
+## No Advanced Detection
+
+TR 39 also describes mechanisms for a more exact recognition of confusables, also within the same string:
+
+- Single-script confusable
+- Mixed-script confusable
+- Whole-script confusable
+
+This is (currently) **not** supported by this gem.
+
+## MIT License
+
+- Copyright (C) 2016 Jan Lelis. Released under the MIT license.
+- Unicode data: http://www.unicode.org/copyright.html#Exhibit1
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..26ba90f
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,37 @@
+# # #
+# Get gemspec info
+
+gemspec_file = Dir['*.gemspec'].first
+gemspec = eval File.read(gemspec_file), binding, gemspec_file
+info = "#{gemspec.name} | #{gemspec.version} | " \
+  "#{gemspec.runtime_dependencies.size} dependencies | " \
+  "#{gemspec.files.size} files"
+
+# # #
+# Gem build and install task
+
+desc info
+task :gem do
+  puts info + "\n\n"
+  print " "; sh "gem build #{gemspec_file}"
+  FileUtils.mkdir_p 'pkg'
+  FileUtils.mv "#{gemspec.name}-#{gemspec.version}.gem", 'pkg'
+  puts; sh %{gem install --no-document pkg/#{gemspec.name}-#{gemspec.version}.gem}
+end
+
+# # #
+# Start an IRB session with the gem loaded
+
+desc "#{gemspec.name} | IRB"
+task :irb do
+  sh "irb -I ./lib -r #{gemspec.name.gsub '-','/'}"
+end
+
+# # #
+# Run Specs
+
+desc "#{gemspec.name} | Spec"
+task :spec do
+  sh "for file in spec/*.rb; do ruby $file; done"
+end
+task default: :spec
diff --git a/data/confusable.marshal.gz b/data/confusable.marshal.gz
new file mode 100644
index 0000000..44448d2
Binary files /dev/null and b/data/confusable.marshal.gz differ
diff --git a/lib/unicode/confusable.rb b/lib/unicode/confusable.rb
new file mode 100644
index 0000000..bd453b2
--- /dev/null
+++ b/lib/unicode/confusable.rb
@@ -0,0 +1,28 @@
+require_relative "confusable/constants"
+require_relative "confusable/index"
+
+require 'unicode_normalize/normalize'
+
+module Unicode
+  # Confusable detection by skeleton comparison, following
+  # Unicode Technical Standard #39.
+  module Confusable
+    # Returns true if both strings produce the same skeleton.
+    def self.confusable?(string1, string2)
+      skeleton(string1) == skeleton(string2)
+    end
+
+    # Builds the skeleton of a string: NFD-normalize it, replace every
+    # codepoint that has an INDEX entry with that entry, NFD-normalize again.
+    def self.skeleton(string)
+      # Load the index on demand if it has not been loaded yet
+      require_relative 'confusable/index' unless defined? ::Unicode::Confusable::INDEX
+      UnicodeNormalize.normalize(
+        UnicodeNormalize.normalize(string, :nfd).each_codepoint.map{ |codepoint|
+          INDEX[codepoint] || codepoint
+        }.flatten.pack("U*"), :nfd
+      )
+    end
+  end
+end
+
diff --git a/lib/unicode/confusable/constants.rb b/lib/unicode/confusable/constants.rb
new file mode 100644
index 0000000..9104317
--- /dev/null
+++ b/lib/unicode/confusable/constants.rb
@@ -0,0 +1,10 @@
+module Unicode
+  module Confusable
+    VERSION = "1.0.0".freeze
+    # Absolute path of the data directory at the gem root
+    DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + '/../../../data/').freeze
+    # Gzipped, marshalled index file that confusable/index loads
+    INDEX_FILENAME = (DATA_DIRECTORY + '/confusable.marshal.gz').freeze
+  end
+end
+
diff --git a/lib/unicode/confusable/index.rb b/lib/unicode/confusable/index.rb
new file mode 100644
index 0000000..7f99d96
--- /dev/null
+++ b/lib/unicode/confusable/index.rb
@@ -0,0 +1,9 @@
+require_relative 'constants'
+
+module Unicode
+  module Confusable
+    # Lookup table used by Unicode::Confusable.skeleton, keyed by codepoint.
+    # Marshal.load is only applied to the data file shipped with this gem.
+    INDEX = Marshal.load(Gem.gunzip(File.binread(INDEX_FILENAME)))
+  end
+end
diff --git a/lib/unicode/confusable/string_ext.rb b/lib/unicode/confusable/string_ext.rb
new file mode 100644
index 0000000..a378978
--- /dev/null
+++ b/lib/unicode/confusable/string_ext.rb
@@ -0,0 +1,9 @@
+require_relative "../confusable"
+
+class String
+  # Optional core extension for your convenience:
+  # returns true if this string and other have the same skeleton
+  def confusable?(other)
+    Unicode::Confusable.confusable?(self, other)
+  end
+end
diff --git a/spec/unicode_confusable_spec.rb b/spec/unicode_confusable_spec.rb
new file mode 100644
index 0000000..f22900d
--- /dev/null
+++ b/spec/unicode_confusable_spec.rb
@@ -0,0 +1,23 @@
+require_relative "../lib/unicode/confusable"
+require_relative "../lib/unicode/confusable/string_ext"
+require "minitest/autorun"
+
+describe Unicode::Confusable do
+  it "will detect official confusables" do
+    assert_equal true, Unicode::Confusable.confusable?("1", "l")
+    assert_equal true, Unicode::Confusable.confusable?("ℜ𝘂ᖯʏ", "Ruby")
+    assert_equal true, Unicode::Confusable.confusable?("Michael", "Michae1")
+    assert_equal true, Unicode::Confusable.confusable?("⁇", "??")
+  end
+
+  it "will return false for non-confusables" do
+    assert_equal false, Unicode::Confusable.confusable?("a", "b")
+    assert_equal false, Unicode::Confusable.confusable?("⁇", "?")
+  end
+
+  it "offers an optional String#confusable? core extension" do
+    assert_equal true, "Michael".confusable?("Michae1")
+    assert_equal false, "a".confusable?("b")
+  end
+end
+
diff --git a/unicode-confusable.gemspec b/unicode-confusable.gemspec
new file mode 100644
index 0000000..98a81f9
--- /dev/null
+++ b/unicode-confusable.gemspec
@@ -0,0 +1,21 @@
+# -*- encoding: utf-8 -*-
+
+require File.dirname(__FILE__) + "/lib/unicode/confusable/constants"
+
+Gem::Specification.new do |gem|
+  gem.name = "unicode-confusable"
+  gem.version = Unicode::Confusable::VERSION
+  gem.summary = "Detect characters that look visually similar."
+  gem.description = "Compares two strings if they are visually confusable as described in Unicode® Technical Standard #39: Both strings get transformed into a skeleton format before comparing them. The skeleton is generated by normalizing the string, replacing confusable characters, and normalizing the string again."
+  gem.authors = ["Jan Lelis"]
+  gem.email = ["mail@janlelis.de"]
+  gem.homepage = "https://github.com/janlelis/unicode-confusable"
+  gem.license = "MIT"
+
+  gem.files = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ }
+  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+
+  gem.required_ruby_version = "~> 2.2"
+end