Skip to content

Commit

Permalink
- add Grok::Discovery in pure ruby grok
Browse files Browse the repository at this point in the history
  • Loading branch information
jordansissel committed Feb 3, 2012
1 parent c74b2d4 commit 65ad1cc
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 7 deletions.
12 changes: 9 additions & 3 deletions examples/pattern-discovery.rb
Expand Up @@ -2,24 +2,30 @@
#

require "rubygems"
require "grok"
require "grok-pure"
require "pp"

grok = Grok.new

# Load some default patterns that ship with grok.
# See also:
# http://code.google.com/p/semicomplete/source/browse/grok/patterns/base
grok.add_patterns_from_file("/usr/local/share/grok/patterns/base")
grok.add_patterns_from_file("patterns/pure-ruby/base")

# Using the patterns we know, try to build a grok pattern that best matches
# a string we give. Let's try Time.now.to_s, which has this format;
# => Fri Apr 16 19:15:27 -0700 2010
input = "Time is #{Time.now}"
input = "http://www.google.com/ and 00:de:ad:be:ef:00 with 'Something Nice'"
pattern = grok.discover(input)

#g = Grok.new
#g.add_patterns_from_file("patterns/pure-ruby/base")
#g.compile("%{MAC}")
#p g.match("00:de:ad:be:ef:00").captures

puts "Input: #{input}"
puts "Pattern: #{pattern}"
exit
grok.compile(pattern)

# Sleep to change time.
Expand Down
2 changes: 1 addition & 1 deletion grok.gemspec
Expand Up @@ -5,7 +5,7 @@ Gem::Specification.new do |spec|

#svnrev = %x{svn info}.split("\n").grep(/Revision:/).first.split(" ").last.to_i
spec.name = "jls-grok"
spec.version = "0.9.6"
spec.version = "0.10.0"

spec.summary = "grok bindings for ruby"
spec.description = "Grok ruby bindings - pattern match/extraction tool"
Expand Down
15 changes: 12 additions & 3 deletions lib/grok-pure.rb
Expand Up @@ -4,9 +4,17 @@

# TODO(sissel): Check if 'grok' c-ext has been loaded and abort?
class Grok
# The pattern input
attr_accessor :pattern

# The fully-expanded pattern (in regex form)
attr_accessor :expanded_pattern

# The logger
attr_accessor :logger

# The dictionary of pattern names to pattern expressions
attr_accessor :patterns

PATTERN_RE = \
/%\{ # match '%{' not prefixed with '\'
Expand Down Expand Up @@ -72,7 +80,7 @@ def compile(pattern)

iterations_left = 1000
@pattern = pattern
@expanded_pattern = pattern
@expanded_pattern = pattern.clone
index = 0

# Replace any instances of '%{FOO}' with that pattern.
Expand Down Expand Up @@ -151,8 +159,9 @@ def discover(input)

private
def init_discover
@discover = GrokDiscover.new(self)
@discover.logmask = logmask
require "grok/pure/discovery"
@discover = Grok::Discovery.new(self)
@discover.logger = @logger
end # def init_discover

public
Expand Down
78 changes: 78 additions & 0 deletions lib/grok/pure/discovery.rb
@@ -0,0 +1,78 @@
require "grok-pure"
require "logger"

# Builds a grok pattern for a piece of text by repeatedly replacing
# substrings that match known patterns with "%{NAME}" references.
class Grok::Discovery
  # Patterns whose expanded regex scores below this are skipped because they
  # lack complexity (SPACE, NOTSPACE, DATA, etc).
  MIN_COMPLEXITY = 20

  # Matches text that already contains a "%{NAME}" pattern reference.
  PATTERN_REFERENCE_RE = /%\{[^}]+\}/

  # The logger used by this discovery instance.
  attr_accessor :logger

  # grok - the parent Grok whose pattern dictionary is used for discovery.
  def initialize(grok)
    @grok = grok
    # NOTE(review): Cabin is not required by this file; presumably
    # "grok-pure" loads it -- confirm.
    @logger = Cabin::Channel.new
    @logger.subscribe(Logger.new(STDOUT))
    @logger.level = :warn
  end # def initialize

  # Replace the parts of 'text' that match known patterns with "%{NAME}".
  #
  # text - the input string; it is cloned, so the caller's string is not
  #        modified.
  #
  # Returns a new string with matched parts replaced by pattern references.
  def discover(text)
    text = text.clone
    candidates = candidate_groks

    # After every substitution start over from the most complex pattern;
    # stop once a full pass makes no substitution.
    loop do
      replaced = candidates.any? do |name, grok|
        replace_first_match(text, name, grok)
      end
      break unless replaced
    end

    return text
  end # def discover

  private

  # Compile every pattern of the parent grok in a Grok of its own and return
  # [name, grok] pairs, most complex first, without the patterns that score
  # below MIN_COMPLEXITY.
  def candidate_groks
    groks = {}
    @grok.patterns.each do |name, _expression|
      grok = Grok.new
      # Copy in the same grok patterns from the parent
      grok.patterns.merge!(@grok.patterns)
      grok.compile("%{#{name}}")
      groks[name] = grok
    end

    sorted = groks.sort { |a, b| compare(a, b) }
    # The score of a compiled pattern never changes, so filter once here
    # instead of on every pass over the text.
    return sorted.reject do |_name, grok|
      complexity(grok.expanded_pattern) < MIN_COMPLEXITY
    end
  end # def candidate_groks

  # Try to replace the match of 'grok' in 'text' with "%{name}", in place.
  #
  # Returns true when a replacement was made, false otherwise.
  def replace_first_match(text, name, grok)
    m = grok.match(text)
    # Skip non-matches
    return false unless m

    span = m.start...m.end
    part = text[span]
    # Only include things that have word boundaries (not just words)
    return false if part !~ /.\b./
    # Skip over parts that appear to include %{pattern} already
    return false if part =~ PATTERN_REFERENCE_RE

    text[span] = "%{#{name}}"
    return true
  end # def replace_first_match

  # a and b are each: [ name, grok ]
  # sort highest complexity first
  def compare(a, b)
    return complexity(b.last.expanded_pattern) <=> complexity(a.last.expanded_pattern)
  end # def compare

  # Score an expanded pattern: the number of branches ("|") plus its length.
  def complexity(expression)
    return expression.count("|") + expression.length
  end # def complexity
end # class Grok::Discovery

#/* Compute the relative complexity of a pattern */
#static int complexity(const grok_t *grok) {
#int score;
#score += string_count(grok->full_pattern, "|");
#score += strlen(grok->full_pattern) / 2;
#return -score; /* Sort most-complex first */
#}

0 comments on commit 65ad1cc

Please sign in to comment.