Permalink
Browse files

- add Grok::Discovery in pure ruby grok

  • Loading branch information...
1 parent c74b2d4 commit 65ad1ccaae19710b50853008ddf65bcb7e9990f4 @jordansissel committed Feb 3, 2012
Showing with 100 additions and 7 deletions.
  1. +9 −3 examples/pattern-discovery.rb
  2. +1 −1 grok.gemspec
  3. +12 −3 lib/grok-pure.rb
  4. +78 −0 lib/grok/pure/discovery.rb
@@ -2,24 +2,30 @@
#
require "rubygems"
-require "grok"
+require "grok-pure"
require "pp"
grok = Grok.new
# Load some default patterns that ship with grok.
# See also:
# http://code.google.com/p/semicomplete/source/browse/grok/patterns/base
-grok.add_patterns_from_file("/usr/local/share/grok/patterns/base")
+grok.add_patterns_from_file("patterns/pure-ruby/base")
# Using the patterns we know, try to build a grok pattern that best matches
# a string we give. Let's try Time.now.to_s, which has this format;
# => Fri Apr 16 19:15:27 -0700 2010
-input = "Time is #{Time.now}"
+input = "http://www.google.com/ and 00:de:ad:be:ef:00 with 'Something Nice'"
pattern = grok.discover(input)
+#g = Grok.new
+#g.add_patterns_from_file("patterns/pure-ruby/base")
+#g.compile("%{MAC}")
+#p g.match("00:de:ad:be:ef:00").captures
+
puts "Input: #{input}"
puts "Pattern: #{pattern}"
+exit
grok.compile(pattern)
# Sleep to change time.
View
@@ -5,7 +5,7 @@ Gem::Specification.new do |spec|
#svnrev = %x{svn info}.split("\n").grep(/Revision:/).first.split(" ").last.to_i
spec.name = "jls-grok"
- spec.version = "0.9.6"
+ spec.version = "0.10.0"
spec.summary = "grok bindings for ruby"
spec.description = "Grok ruby bindings - pattern match/extraction tool"
View
@@ -4,9 +4,17 @@
# TODO(sissel): Check if 'grok' c-ext has been loaded and abort?
class Grok
+ # The pattern input
attr_accessor :pattern
+
+ # The fully-expanded pattern (in regex form)
attr_accessor :expanded_pattern
+
+ # The logger
attr_accessor :logger
+
+ # The dictionary of pattern names to pattern expressions
+ attr_accessor :patterns
PATTERN_RE = \
/%\{ # match '%{' not prefixed with '\'
@@ -72,7 +80,7 @@ def compile(pattern)
iterations_left = 1000
@pattern = pattern
- @expanded_pattern = pattern
+ @expanded_pattern = pattern.clone
index = 0
# Replace any instances of '%{FOO}' with that pattern.
@@ -151,8 +159,9 @@ def discover(input)
private
def init_discover
- @discover = GrokDiscover.new(self)
- @discover.logmask = logmask
+ require "grok/pure/discovery"
+ @discover = Grok::Discovery.new(self)
+ @discover.logger = @logger
end # def init_discover
public
View
@@ -0,0 +1,78 @@
+require "grok-pure"
+require "logger"
+
+class Grok::Discovery
+ attr_accessor :logger
+
+ def initialize(grok)
+ @grok = grok
+ @logger = Cabin::Channel.new
+ @logger.subscribe(Logger.new(STDOUT))
+ @logger.level = :warn
+ end # def initialize
+
+ def discover(text)
+ text = text.clone
+ # TODO(sissel): Sort patterns by complexity, most complex first.
+ # - For each pattern, compile it in a grok by itself.
+ # - Make a dictionary of { "name" => Grok } for each pattern
+ # - Sort groks by complexity of the Grok#expanded_pattern
+ groks = {}
+ @grok.patterns.each do |name, expression|
+ grok = Grok.new
+ # Copy in the same grok patterns from the parent
+ grok.patterns.merge!(@grok.patterns)
+ grok.compile("%{#{name}}")
+ groks[name] = grok
+ end
+
+ patterns = groks.sort { |a, b| compare(a, b) }
+
+ done = false
+ while !done
+ done = true # will reset this if we are not done later.
+ patterns.each do |name, grok|
+ # Skip patterns that lack complexity (SPACE, NOTSPACE, DATA, etc)
+ next if complexity(grok.expanded_pattern) < 20
+ m = grok.match(text)
+ # Skip non-matches
+ next unless m
+ part = text[m.start ... m.end]
+ # Only include things that have word boundaries (not just words)
+ next if part !~ /.\b./
+ # Skip over parts that appear to include %{pattern} already
+ next if part =~ /%{[^}+]}/
+ acting = true
+ text[m.start ... m.end] = "%{#{name}}"
+
+ # Start the loop over again
+ done = false
+ break
+ end
+ end
+
+ return text
+ end # def discover
+
+ private
+ def compare(a, b)
+ # a and be are each: [ name, grok ]
+ # sort highest complexity first
+ return complexity(b.last.expanded_pattern) <=> complexity(a.last.expanded_pattern)
+ end # def compare
+
+ private
+ def complexity(expression)
+ score = expression.count("|") # number of branches in the pattern
+ score += expression.length # the length of the pattern
+ end # def complexity
+
+end # class Grok::Discovery
+
+#/* Compute the relative complexity of a pattern */
+#static int complexity(const grok_t *grok) {
+ #int score;
+ #score += string_count(grok->full_pattern, "|");
+ #score += strlen(grok->full_pattern) / 2;
+ #return -score; /* Sort most-complex first */
+#}

0 comments on commit 65ad1cc

Please sign in to comment.