diff --git a/examples/pattern-discovery.rb b/examples/pattern-discovery.rb
index 0b4adda..01e10a9 100644
--- a/examples/pattern-discovery.rb
+++ b/examples/pattern-discovery.rb
@@ -2,7 +2,7 @@
 #
 
 require "rubygems"
-require "grok"
+require "grok-pure"
 require "pp"
 
 grok = Grok.new
@@ -10,16 +10,22 @@
 # Load some default patterns that ship with grok.
 # See also:
 # http://code.google.com/p/semicomplete/source/browse/grok/patterns/base
-grok.add_patterns_from_file("/usr/local/share/grok/patterns/base")
+grok.add_patterns_from_file("patterns/pure-ruby/base")
 
 # Using the patterns we know, try to build a grok pattern that best matches
-# a string we give. Let's try Time.now.to_s, which has this format;
-# => Fri Apr 16 19:15:27 -0700 2010
-input = "Time is #{Time.now}"
+# a string we give. The sample below mixes a URL, a MAC address and a
+# single-quoted phrase.
+input = "http://www.google.com/ and 00:de:ad:be:ef:00 with 'Something Nice'"
 pattern = grok.discover(input)
 
+#g = Grok.new
+#g.add_patterns_from_file("patterns/pure-ruby/base")
+#g.compile("%{MAC}")
+#p g.match("00:de:ad:be:ef:00").captures
+
 puts "Input: #{input}"
 puts "Pattern: #{pattern}"
+exit # NOTE(review): nothing below this line runs anymore; looks like a debugging leftover - confirm
 grok.compile(pattern)
 
 # Sleep to change time.
diff --git a/grok.gemspec b/grok.gemspec
index 718a17a..e235fed 100644
--- a/grok.gemspec
+++ b/grok.gemspec
@@ -5,7 +5,7 @@ Gem::Specification.new do |spec|
   #svnrev = %x{svn info}.split("\n").grep(/Revision:/).first.split(" ").last.to_i
 
   spec.name = "jls-grok"
-  spec.version = "0.9.6"
+  spec.version = "0.10.0"
   spec.summary = "grok bindings for ruby"
   spec.description = "Grok ruby bindings - pattern match/extraction tool"
 
diff --git a/lib/grok-pure.rb b/lib/grok-pure.rb
index c527c25..abb034f 100644
--- a/lib/grok-pure.rb
+++ b/lib/grok-pure.rb
@@ -4,9 +4,17 @@
 
 # TODO(sissel): Check if 'grok' c-ext has been loaded and abort?
class Grok + # The pattern input attr_accessor :pattern + + # The fully-expanded pattern (in regex form) attr_accessor :expanded_pattern + + # The logger attr_accessor :logger + + # The dictionary of pattern names to pattern expressions + attr_accessor :patterns PATTERN_RE = \ /%\{ # match '%{' not prefixed with '\' @@ -72,7 +80,7 @@ def compile(pattern) iterations_left = 1000 @pattern = pattern - @expanded_pattern = pattern + @expanded_pattern = pattern.clone index = 0 # Replace any instances of '%{FOO}' with that pattern. @@ -151,8 +159,9 @@ def discover(input) private def init_discover - @discover = GrokDiscover.new(self) - @discover.logmask = logmask + require "grok/pure/discovery" + @discover = Grok::Discovery.new(self) + @discover.logger = @logger end # def init_discover public diff --git a/lib/grok/pure/discovery.rb b/lib/grok/pure/discovery.rb new file mode 100644 index 0000000..a109eb2 --- /dev/null +++ b/lib/grok/pure/discovery.rb @@ -0,0 +1,78 @@ +require "grok-pure" +require "logger" + +class Grok::Discovery + attr_accessor :logger + + def initialize(grok) + @grok = grok + @logger = Cabin::Channel.new + @logger.subscribe(Logger.new(STDOUT)) + @logger.level = :warn + end # def initialize + + def discover(text) + text = text.clone + # TODO(sissel): Sort patterns by complexity, most complex first. + # - For each pattern, compile it in a grok by itself. + # - Make a dictionary of { "name" => Grok } for each pattern + # - Sort groks by complexity of the Grok#expanded_pattern + groks = {} + @grok.patterns.each do |name, expression| + grok = Grok.new + # Copy in the same grok patterns from the parent + grok.patterns.merge!(@grok.patterns) + grok.compile("%{#{name}}") + groks[name] = grok + end + + patterns = groks.sort { |a, b| compare(a, b) } + + done = false + while !done + done = true # will reset this if we are not done later. 
+ patterns.each do |name, grok| + # Skip patterns that lack complexity (SPACE, NOTSPACE, DATA, etc) + next if complexity(grok.expanded_pattern) < 20 + m = grok.match(text) + # Skip non-matches + next unless m + part = text[m.start ... m.end] + # Only include things that have word boundaries (not just words) + next if part !~ /.\b./ + # Skip over parts that appear to include %{pattern} already + next if part =~ /%{[^}+]}/ + acting = true + text[m.start ... m.end] = "%{#{name}}" + + # Start the loop over again + done = false + break + end + end + + return text + end # def discover + + private + def compare(a, b) + # a and be are each: [ name, grok ] + # sort highest complexity first + return complexity(b.last.expanded_pattern) <=> complexity(a.last.expanded_pattern) + end # def compare + + private + def complexity(expression) + score = expression.count("|") # number of branches in the pattern + score += expression.length # the length of the pattern + end # def complexity + +end # class Grok::Discovery + +#/* Compute the relative complexity of a pattern */ +#static int complexity(const grok_t *grok) { + #int score; + #score += string_count(grok->full_pattern, "|"); + #score += strlen(grok->full_pattern) / 2; + #return -score; /* Sort most-complex first */ +#}