Permalink
Browse files

Constructing nook for high-speed replay/synthesis of HTTP requests --…

… abstracted apache log parser into own code, wrote a faraday middleware that dummies out requests
  • Loading branch information...
1 parent 0cc0f36 commit 5e1c547629a7f6b58778be2de417be09d5831915 Philip (flip) Kromer committed Feb 12, 2011
@@ -1,58 +1,18 @@
-#!/usr/bin/env ruby
+#!/usr/bin/env ruby -E ASCII-8BIT
require 'rubygems'
require 'wukong/script'
+$: << File.dirname(__FILE__)
+require 'logline'
-module ApacheLogParser
- class Mapper < Wukong::Streamer::LineStreamer
-
- #
- # Regular expression to parse an apache log line.
- #
- # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
- #
- LOG_RE = Regexp.compile(%r{\A
- (\S+) # ip 83.240.154.3
- \s(\S+) # j1 -
- \s(\S+) # j2 -
- \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
- :(\d+):(\d+):(\d+) # time part :20:37:11
- \s(\+.*)\] # timezone +0000]
- \s\"(?:(\S+) # http_method "GET
- \s(\S+) # path /faq
- \s(\S+)|-)" # protocol HTTP/1.1"
- \s(\d+) # response_code 200
- \s(\d+) # duration 569
- \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
- \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
- \z}x)
- MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
-
- # Use the regex to break line into fields
- # Emit each record as flat line
- def process line
- line.chomp
- m = LOG_RE.match(line)
- if m
- (ip, j1, j2,
- ts_day, ts_mo, ts_year,
- ts_hour, ts_min, ts_sec, tz,
- http_method, path, protocol,
- response_code, duration,
- referer, ua, *cruft) = m.captures
- date = [ts_year, MONTHS[ts_mo], ts_day].join("")
- time = [ts_hour, ts_min, ts_sec].join("")
- yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
- else
- yield [:unparseable, line]
- end
- end
+class ApacheLogParser < Wukong::Streamer::LineStreamer
+ # create a Logline object from each record and serialize it flat to disk
+ def process line
+ yield Logline.parse(line)
end
end
-Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
-
-# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
+Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
@@ -1,9 +1,6 @@
-#!/usr/bin/env ruby
-require 'rubygems'
-require 'wukong/script'
-
class Logline < Struct.new(
- :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
+ :ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
+ # 1 2 3 4 5 6 7 8 9 10 11
def page_type
case
@@ -17,11 +14,44 @@ def page_type
def is_page?
page_type == :page
end
-end
-class PageFilter < Wukong::Streamer::StructStreamer
- def process visit, *args
- yield visit.ua if visit.
+
+
+ #
+ # Regular expression to parse an apache log line.
+ #
+ # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+ #
+ LOG_RE = Regexp.compile(%r{\A
+ (\S+) # ip 83.240.154.3
+ \s(\S+) # j1 -
+ \s(\S+) # j2 -
+ \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
+ :(\d+):(\d+):(\d+) # time part :20:37:11
+ \s(\+.*)\] # timezone +0000]
+ \s\"(?:(\S+) # http_method "GET
+ \s(\S+) # path /faq
+ \s(\S+)|-)" # protocol HTTP/1.1"
+ \s(\d+) # response_code 200
+ \s(\d+) # size 569
+ \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
+ \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+ \z}x)
+ MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
+
+ # Use the regex to break the line into fields and return them as a Logline;
+ # a line that does not match the regex is returned wrapped in a BadRecord instead.
+ def self.parse line
+ m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
+ (ip, j1, j2,
+ ts_day, ts_mo, ts_year,
+ ts_hour, ts_min, ts_sec, tz,
+ http_method, path, protocol,
+ response_code, size,
+ referer, ua, *cruft) = m.captures
+ dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
+ tm = [ts_hour, ts_min, ts_sec].join("")
+ self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
end
+
end
-Wukong.run(PageFilter)
@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby -E BINARY
+require 'rubygems'
+require 'faraday'
+require 'wukong/script'
+require 'json'
+$: << File.dirname(__FILE__)
+require 'apache_log_parser'
+require 'nook/faraday_dummy_adapter'
+
+Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
+Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
+
+#
+# A Nook consumes its input stream and, for each input, generates an HTTP
+# request against a remote host. Please use it for good and never for evil.
+#
+# You can use it from your command line:
+# zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --target_host=my_own_host.com
+#
+#
+class NookMapper < ApacheLogParser
+
+ # parse each record into a Logline, issue a request through the fetcher, and emit the response's status, body and headers
+ def process line
+ super(line) do |logline|
+ # yield logline
+ resp = fetcher.get("/your/mom")
+ yield [resp.status, resp.body, resp.headers.inspect]
+ end
+ end
+
+ # a mock fetcher with a uniformly distributed variable delay
+ def fetcher
+ @fetcher ||= Faraday::Connection.new do |f|
+ f.use Faraday::Adapter::Dummy do |dummy|
+ dummy.delay = Proc.new{|env| 0.1 + 0.9 * rand() }
+ end
+ end
+ end
+end
+
+Wukong.run( NookMapper, nil, :sort_fields => 7 )
@@ -0,0 +1,94 @@
+
+module Faraday
+ class Adapter
+
+ # test = Faraday::Connection.new do |f|
+ # f.use Faraday::Adapter::Dummy do |dummy|
+ # dummy.status 404
+ # dummy.delay 1
+ # end
+ # end
+ #
+ # # this will delay 0.2s, returning 404 with the request echoed back as a JSON body
+ # resp = test.get("/your/mom", :dummy_delay => 0.2)
+ # resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
+ #
+ # More examples:
+ #
+ # test = Faraday::Connection.new do |f|
+ # f.use Faraday::Adapter::Dummy, :status => 503
+ # end
+ #
+ # test = Faraday::Connection.new do |f|
+ # f.use Faraday::Adapter::Dummy do |dummy|
+ # dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
+ # end
+ # end
+ #
+ class Dummy < Middleware
+ include Addressable
+ attr_reader :config
+ def self.loaded?() false end
+
+ # gets value from environment if set, configured instance variable otherwise
+ def value_for env, key
+ val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
+ if val.respond_to?(:call)
+ val = val.call(env)
+ end
+ val
+ end
+
+ # With an optional delay, sets the response status, headers and body, each taken from the first of:
+ # * the matching request header field (Dummy-Status, Dummy-Headers, Dummy-Body, Dummy-Delay)
+ # * the adapter's configuration
+ # If neither supplies a body, the body is a JSON string built from the request env.
+ #
+ def call(env)
+ status = value_for(env, :status)
+ headers = value_for(env, :headers)
+ headers = JSON.load(headers) if headers.is_a? String
+ body = value_for(env, :body) ||
+ env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
+ delay = value_for(env, :delay).to_f
+ sleep delay if delay > 0
+ headers[:dummy_delay] = delay
+ env.update(
+ :status => status,
+ :response_headers => headers,
+ :body => body)
+ @app.call(env)
+ end
+
+ class Configurator < Struct.new(:status, :headers, :delay, :body)
+ def status(val=nil) self.status = val if val ; super() end
+ def headers(val=nil) self.headers = val if val ; super() end
+ def body(val=nil) self.body = val if val ; super() end
+ def delay(val=nil) self.delay = val if val ; super() end
+ def self.from_hash hsh
+ new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
+ end
+ end
+
+ def initialize(app, defaults={}, &block)
+ super(app)
+ @config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
+ configure(&block) if block
+ end
+
+ def configure
+ yield config
+ end
+
+ # same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
+ def header_hash_key(str)
+ str.to_s.split('_').each{|w| w.capitalize! }.join('-')
+ end
+
+ def create_multipart(env, params, boundary = nil)
+ stream = super
+ stream.read
+ end
+ end
+ end
+end

0 comments on commit 5e1c547

Please sign in to comment.