Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tree: a567307a7f
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 64 lines (53 sloc) 1.839 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
module Bundler
  # Suggests words from a fixed corpus that are close to a given word,
  # using a weighted Levenshtein (edit) distance as the similarity measure.
  class SimilarityDetector
    # Pairs a corpus entry with its computed distance from the queried word.
    SimilarityScore = Struct.new(:string, :distance)

    # initialize with an array of words to be matched against
    def initialize(corpus)
      @corpus = corpus
    end

    # return an array of words similar to 'word' from the corpus
    #
    # word  - the string to look up
    # limit - the largest weighted distance still considered "similar"
    #
    # The result is ordered from closest to farthest. Entries whose distance
    # cannot be computed (a nil 'word' or a nil corpus entry) are skipped
    # rather than compared, so a nil 'word' yields an empty array.
    def similar_words(word, limit = 3)
      scores = @corpus.map { |w| SimilarityScore.new(w, levenshtein_distance(word, w)) }
      # levenshtein_distance returns nil for nil input; comparing nil with
      # 'limit' would raise, so drop those scores before filtering.
      comparable = scores.reject { |s| s.distance.nil? }
      comparable.select { |s| s.distance <= limit }.sort_by(&:distance).map(&:string)
    end

    # return the result of 'similar_words', concatenated into a list
    # (eg "a, b or c")
    #
    # Returns nil when no similar word is found, the word itself when there
    # is exactly one, and otherwise the words joined with ', ' except for the
    # final pair, which is joined with ' or '.
    def similar_word_list(word, limit = 3)
      words = similar_words(word, limit)
      return nil if words.empty?
      return words.first if words.length == 1

      [words[0..-2].join(', '), words.last].join(' or ')
    end


  protected
    # http://www.informit.com/articles/article.aspx?p=683059&seqNum=36
    #
    # Weighted edit distance between 'this' and 'that'.
    #
    # ins - cost charged for each character of 'this' that has no counterpart
    # del - cost charged for each character of 'that' that has no counterpart
    # sub - cost charged when aligned characters differ
    #
    # Returns nil if either string is nil, otherwise the minimal total cost.
    # Empty strings are valid input (the distance is then the other string's
    # length multiplied by the matching weight).
    def levenshtein_distance(this, that, ins=2, del=2, sub=1)
      return nil if this.nil? || that.nil?

      # Only the previous row of the distance matrix is needed to compute the
      # next one. 'previous[j]' is the cost of aligning the first j characters
      # of 'this' with the characters of 'that' consumed so far (none yet).
      previous = (0..this.length).map { |j| j * ins }

      (1..that.length).each do |i|
        # First column: i characters of 'that' against an empty prefix.
        current = [i * del]

        (1..this.length).each do |j|
          diagonal = previous[j - 1] + (this[j - 1] == that[i - 1] ? 0 : sub)
          current[j] = [diagonal, current[j - 1] + ins, previous[j] + del].min
        end

        previous = current
      end

      # The last cell of the final row is the distance between the full strings
      previous[this.length]
    end

  end
end
Something went wrong with that request. Please try again.