Skip to content
This repository
Browse code

apache log parsing examples -- including histograms, sessionizing, pa…

…ge-page covisit graph
  • Loading branch information...
commit 37c692dfa3d2ff172ae426af4c4f89ff86c8111c 1 parent d1f15d8
Philip (flip) Kromer mrflip authored
16 examples/server_logs/apache_log_parser.rb
... ... @@ -1,19 +1,13 @@
1   -#!/usr/bin/env ruby -E ASCII-8BIT
2   -require 'rubygems'
3   -require 'wukong/script'
4   -$: << File.dirname(__FILE__)
5   -require 'logline'
  1 +#!/usr/bin/env ruby
  2 +$LOAD_PATH.unshift File.expand_path('../../lib', File.dirname(__FILE__))
  3 +require 'wukong/script'
  4 +require_relative './logline'
6 5
# Line streamer that converts raw Apache access-log lines into Logline records.
class ApacheLogParser < Wukong::Streamer::LineStreamer
  # Parse a single raw log line and emit whatever Logline.parse returns for it.
  # (Presumably the framework serializes the emitted record as a flat line --
  # confirm against the Wukong streamer.)
  def process(line)
    record = Logline.parse(line)
    yield record
  end
end
14 12
15   -Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
16   -
17   -
18   -
19   -
  13 +Wukong.run( ApacheLogParser )
40 examples/server_logs/breadcrumb_edges.rb
... ... @@ -0,0 +1,40 @@
  1 +#!/usr/bin/env ruby
  2 +$LOAD_PATH.unshift File.expand_path('../../lib', File.dirname(__FILE__))
  3 +require 'wukong/script'
  4 +require_relative './logline'
  5 +
# Mapper: for every parsed Logline, emit (ip, visit time as integer, path).
class BreadcrumbsMapper < Wukong::Streamer::ModelStreamer
  self.model_klass = Logline

  # Emits one [ip, integer timestamp, path] tuple per visit; extra args are ignored.
  def process(visit, *args)
    # return unless Settings.page_types.include?(visit.page_type)
    visited_at = visit.visit_time.to_i
    yield [visit.ip, visited_at, visit.path]
  end
end
  13 +
require 'set'

# Reducer: collects the distinct paths seen for one visitor IP and emits every
# pair of those paths as a directed edge, once in each direction.
class BreadcrumbEdgesReducer < Wukong::Streamer::Reducer
  # Records are grouped on the IP alone; the timestamp and path are ignored
  # for grouping purposes.
  def get_key(ip, itime, path)
    [ip]
  end

  # Reset the set of paths before the shared start-up logic runs.
  def start!(*args)
    @paths = Set.new
    super
  end

  # Remember each path once, however many times this IP requested it.
  def accumulate(ip, itime, path)
    @paths << path
  end

  # For each unordered pair of paths, emit the edge in both directions.
  #
  # Array#combination(2) walks the pairs in the same order as shifting the
  # head off the list and pairing it with each remaining element, but leaves
  # @paths untouched (still a Set) and needs nothing beyond core Ruby.
  def finalize
    @paths.to_a.combination(2) do |from, into|
      yield [key, from, into]
      yield [key, into, from]
    end
  end
end
  38 +
  39 +
  40 +Wukong.run( BreadcrumbsMapper, BreadcrumbEdgesReducer, :sort_fields => 2 )
56 examples/server_logs/breadcrumbs.rb
... ... @@ -1,27 +1,9 @@
1 1 #!/usr/bin/env ruby
2   -require 'rubygems'
3   -require 'wukong/script'
4   -
5   -class Logline < Struct.new(
6   - :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
7   -
8   - def page_type
9   - case
10   - when path =~ /\.(css|js)$/ then :asset
11   - when path =~ /\.(png|gif|ico)$/ then :image
12   - when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
13   - else :other
14   - end
15   - end
16   -
17   - def is_page?
18   - page_type == :page
19   - end
20   -
21   - def day_hr
22   - visit.date + visit.time[0..1]
23   - end
24   -end
  2 +$LOAD_PATH.unshift File.expand_path('../../lib', File.dirname(__FILE__))
  3 +require 'configliere'
  4 +Settings.define :page_types, type: Array, default: ['page', 'video'], description: "Acceptable page types"
  5 +require 'wukong/script'
  6 +require_relative './logline'
25 7
26 8
27 9 #
@@ -38,9 +20,11 @@ def day_hr
38 20 #
39 21 # where the partition key is visitor_id, and we sort by visitor_id and datetime.
40 22 #
41   -class VisitorDatePath < Wukong::Streamer::StructStreamer
  23 +class BreadcrumbsMapper < Wukong::Streamer::ModelStreamer
  24 + self.model_klass = Logline
42 25 def process visit, *args
43   - yield [visit.ip, visit.day_hr, visit.path]
  26 + # return unless Settings.page_types.include?(visit.page_type)
  27 + yield [visit.ip, visit.day_hr, visit.visit_time.to_i, visit.path]
44 28 end
45 29 end
46 30
@@ -65,11 +49,23 @@ def process visit, *args
65 49 # page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
66 50 #
67 51 # to discover all trails passing through a given page.
68   -class VisitorDatePath < Wukong::Streamer::Reducer
69   - def get_key ip, day_hr, path, *args
70   - [ip, day_hr]
  52 +class BreadcrumbsReducer < Wukong::Streamer::Reducer
  53 + def get_key ip, day_hr, itime, path, *args
  54 + [ip]
  55 + end
  56 + def start!(*args)
  57 + @path_times = []
  58 + super
71 59 end
72   - def process_group visit, *args
73   - yield [visit.ip, visit.day_hr, visit.path]
  60 + def accumulate ip, day_hr, itime, path, *args
  61 + # @path_times << "(#{itime},#{path})"
  62 + @path_times << "#{itime}:#{path}"
  63 + end
  64 + def finalize
  65 + # yield [key, "{" << @path_times.join(",") << "}"]
  66 + yield [key, @path_times.join("|")]
74 67 end
75 68 end
  69 +
  70 +
  71 +Wukong.run( BreadcrumbsMapper, BreadcrumbsReducer, :sort_fields => 2 )
33 examples/server_logs/histograms.rb
... ... @@ -0,0 +1,33 @@
  1 +#!/usr/bin/env ruby
  2 +$LOAD_PATH.unshift File.expand_path('../../lib', File.dirname(__FILE__))
  3 +require 'wukong/script'
  4 +require_relative './logline'
  5 +
  6 +# cat data/swk-100.tsv | ./histograms.rb --map | sort > data/swk-hist-map.tsv
  7 +# cat data/swk-hist-map.tsv | ./histograms.rb --reduce > data/swk-hist.tsv
  8 +
# Mapper: for every parsed Logline, emit its (path, day_hr) pair.
class HistogramsMapper < Wukong::Streamer::ModelStreamer
  self.model_klass = Logline

  # Emits one [path, day_hr] tuple per visit.
  def process(visit)
    bucket = [visit.path, visit.day_hr]
    yield bucket
  end
end
  15 +
# Reducer: counts how many records share each (path, day_hr) key.
class HistogramsReducer < Wukong::Streamer::Reducer
  # The whole (path, day_hr) pair is the grouping key.
  def get_key(path, day_hr)
    [path, day_hr]
  end

  # Zero the counter, then hand off to the inherited start-up logic.
  def start!(*args)
    @count = 0
    super
  end

  # One more record seen for the current key; the field values are not needed.
  def accumulate(path, day_hr)
    @count = @count + 1
  end

  # Emit the key together with the number of records accumulated for it.
  def finalize
    yield [key, @count]
  end
end
  31 +
  32 +# Wukong.run( HistogramsMapper )
  33 +Wukong.run( HistogramsMapper, HistogramsReducer, :sort_fields => 3 )
126 examples/server_logs/logline.rb
... ... @@ -1,51 +1,103 @@
1   -class Logline < Struct.new(
2   - :ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
3   - # 1 2 3 4 5 6 7 8 9 10 11
4 1
5   - def page_type
6   - case
7   - when path =~ /\.(css|js)$/ then :asset
8   - when path =~ /\.(png|gif|ico)$/ then :image
9   - when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
10   - else :other
11   - end
12   - end
  2 +# # Parse logs to TSV
  3 +#
  4 +# bzcat data/star_wars_kid.log.bz2 | head -n 100200 | tail -n 100 > data/swk-100.log
  5 +# cat data/swk-100.tsv
  6 +# cat data/swk-100.log | ./apache_log_parser.rb --map | wu-lign | cutc 150
  7 +# cat data/swk-100.log | ./apache_log_parser.rb --map > data/swk-100.tsv
  8 +# ./histograms.rb --run data/star_wars_kid.log data/star_wars_kid.tsv
  9 +
  10 +# # Histograms
  11 +#
  12 +# cat data/swk-100.tsv | ./histograms.rb --map | wu-lign
  13 +# cat data/swk-hist-map.tsv | ./histograms.rb --reduce
  14 +# ./histograms.rb --run data/star_wars_kid.tsv data/star_wars_kid-pages_by_hour.tsv
  15 +
  16 +# # Sessionize
  17 +#
  18 +# cat data/swk-100.tsv | ./histograms.rb --map | wu-lign
  19 +# cat data/swk-hist-map.tsv | ./histograms.rb --reduce
  20 +# ./histograms.rb --run data/star_wars_kid.tsv data/star_wars_kid-pages_by_hour.tsv
  21 +
  22 +
  23 +class Logline
  24 + include Gorillib::Model
  25 + include Gorillib::Model::PositionalFields
  26 +
  27 + field :ip, String
  28 + field :junk1, String
  29 + field :junk2, String
  30 + #
  31 + field :visit_time, Time
  32 + field :http_method, String
  33 + field :path, String
  34 + field :protocol, String
  35 + field :response_code, Integer
  36 + field :size, Integer, blankish: ['', nil, '-']
  37 + field :referer, String
  38 + field :ua, String
  39 + field :cruft, String
13 40
14 41 #
15 42 # Regular expression to parse an apache log line.
16 43 #
17 44 # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
18 45 #
  46 + # fails if the referer string has a '"' in it.
  47 + #
19 48 LOG_RE = Regexp.compile(%r{\A
20   - (\S+) # ip 83.240.154.3
21   - \s(\S+) # j1 -
22   - \s(\S+) # j2 -
23   - \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
24   - :(\d+):(\d+):(\d+) # time part :20:37:11
25   - \s(\+.*)\] # timezone +0000]
26   - \s\"(?:(\S+) # http_method "GET
27   - \s(\S+) # path /faq
28   - \s(\S+)|-)" # protocol HTTP/1.1"
29   - \s(\d+) # response_code 200
30   - \s(\d+) # size 569
31   - \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
32   - \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
33   - \z}x)
34   - MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
  49 + ([\d\.]+) # ip 83.240.154.3
  50 + \s(\S+) # j1 -
  51 + \s(\S+) # j2 -
  52 + \s\[(\d+/\w+/\d+ # date part [07/Jun/2008
  53 + :\d+:\d+:\d+ # time part :20:37:11
  54 + \s[\+\-]\S*)\] # timezone +0000]
  55 + \s\"(?:(\S+) # http_method "GET
  56 + \s(\S+) # path /faq
  57 + \s+(HTTP/[\d\.]+)|-)\" # protocol HTTP/1.1"
  58 + \s(\d+) # response_code 200
  59 + \s(\d+|-) # size 569
  60 + \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
  61 + \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
  62 + \z}x)
  63 + MONTHS = { 'Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Dec' => 12, }
  64 +
  65 +
  66 +
  67 +
  68 + def receive_visit_time(val)
  69 + if %r{(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)\s([\+\-]\d\d)(\d\d)} === val
  70 + day, mo, yr, hour, min, sec, tz1, tz2 = [$1, $2, $3, $4, $5, $6, $7, $8]
  71 + val = Time.new(yr.to_i, MONTHS[mo], day.to_i,
  72 + hour.to_i, min.to_i, sec.to_i, "#{tz1}:#{tz2}")
  73 + end
  74 + super(val)
  75 + end
35 76
36 77 # Use the regex to break line into fields
37 78 # Emit each record as flat line
38   - def self.parse line
39   - m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
40   - (ip, j1, j2,
41   - ts_day, ts_mo, ts_year,
42   - ts_hour, ts_min, ts_sec, tz,
43   - http_method, path, protocol,
44   - response_code, size,
45   - referer, ua, *cruft) = m.captures
46   - dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
47   - tm = [ts_hour, ts_min, ts_sec].join("")
48   - self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
  79 + def self.parse(line)
  80 + match = LOG_RE.match(line.chomp)
  81 + unless match then warn(line) ; return BadRecord.new('no match', line) ; end
  82 + new(* match.captures)
49 83 end
50 84
  85 + FILE_EXT_RE = %r{\.[^/]+\z}
  86 + def page_type
  87 + file_ext = path[FILE_EXT_RE]
  88 + case file_ext
  89 + when nil then 'page'
  90 + when '.wmv' then 'video'
  91 + when '.html','.shtml' then 'page'
  92 + when '.css', '.js' then 'asset'
  93 + when '.png', '.gif', '.ico' then 'image'
  94 + when '.wmv' then 'image'
  95 + when '.pl','.asp','.jsp','.cgi' then 'page'
  96 + else 'other'
  97 + end
  98 + end
  99 +
  100 + def day_hr
  101 + [visit_time.year, visit_time.month, visit_time.day, visit_time.hour].join
  102 + end
51 103 end
48 examples/server_logs/page_counts.pig
... ... @@ -0,0 +1,48 @@
  1 +
  2 +
-- One record per (visitor ip, page, other page visited by the same ip).
common_pages = LOAD 'data/common_pages' AS (ip:chararray, from_path:chararray, into_path:chararray);

--
-- Build adjacency list <A pr B,C,D> from edges (<A B>, <A C>, <A D>)
--

adj_list_j = GROUP common_pages BY from_path;
adj_list = FOREACH adj_list_j GENERATE
  group                  AS from_path,
  1.0F                   AS pagerank:float,
  common_pages.into_path AS into_paths
  ;
STORE adj_list INTO 'data/pagerank/pr_iter_00';


--
-- Iterate pagerank <A pr_00 B,C,D> to become <A pr_01 B,C,D>
--

-- find partial share: A.rank / A.into_paths.length
-- dispatch <into_path partial_share> to each page
sent_shares = FOREACH adj_list GENERATE
  FLATTEN(into_paths) AS path,
  (float)(pagerank / (float)SIZE(into_paths)) AS share:float;

-- dispatch <from_path into_paths> to yourself, so you have the links still around
sent_edges = FOREACH adj_list GENERATE
  from_path AS path, into_paths;

-- assemble all the received shares, and the self-sent edge list;
rcvd_shares = COGROUP sent_edges BY path INNER, sent_shares BY path PARALLEL $PARALLEL;

-- calculate the new rank, and emit a record that looks just like the input.
next_iter = FOREACH rcvd_shares {
  raw_rank = (float)SUM(sent_shares.share);
  -- treat the case that a node has no in links
  damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*0.85f + 0.15f : 0.0f);
  GENERATE
    group       AS from_path,
    damped_rank AS pagerank,
    FLATTEN(sent_edges.into_paths)
    ; };

STORE next_iter INTO 'data/pagerank/pr_iter_01';
  47 +
  48 +

0 comments on commit 37c692d

Please sign in to comment.
Something went wrong with that request. Please try again.