simple word count

1 parent 2d1e4dc commit 4abef245e77c491ae6191e00f899acf551cb50d7 Philip (flip) Kromer committed Jan 29, 2011
Showing with 136 additions and 5 deletions.
  1. +53 −0 examples/corpus/sentence_bigrams.rb
  2. +1 −5 examples/corpus/sentence_coocurrence.rb
  3. +82 −0 examples/simple_word_count.rb
53 examples/corpus/sentence_bigrams.rb
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)
+require 'rubygems'
+require 'wukong/script'
+require 'bucket_counter'
+
+#
+# Sentence bigram counts
+#
+
+#
+# Input is a list of document-idx-sentences; each field is tab-separated:
+#   title idx word_a word_b word_c ...
+#
+# This emits each adjacent word pair (bigram) exactly once; for a three-word
+# sentence the output would be
+#
+#   word_a word_b
+#   word_b word_c
+#
+class SentenceBigrams < Wukong::Streamer::RecordStreamer
+ def process title, idx, *words
+ words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
+ yield [word_a, word_b]
+ end
+ end
+end
+
+#
+# Count the total number of occurrences of each bigram
+#
+class CombineBuckets < Wukong::Streamer::AccumulatingReducer
+ def get_key *fields
+ fields[0..1]
+ end
+ def start! *args
+ @total = 0
+ end
+ def accumulate *fields
+ @total += 1
+ end
+ def finalize
+ yield [@total, key].flatten
+ end
+end
+
+Wukong.run(
+ SentenceBigrams,
+ CombineBuckets,
+ :io_sort_record_percent => 0.3,
+ :io_sort_mb => 300
+ )
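For reference, here is a minimal standalone sketch (plain Ruby, no Wukong or Hadoop required) of what this mapper emits for a single record. The sample record is hypothetical; only the tab-separated field layout (title, idx, words...) and the zip idiom come from the code above.

  # Hypothetical input record: title, sentence index, then the sentence's words.
  record = "Some_Title\t0\tword_a\tword_b\tword_c"
  title, idx, *words = record.split("\t")

  # Same pairing as SentenceBigrams#process: zip each word with its successor.
  words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
    puts [word_a, word_b].join("\t")
  end
  # Emits:
  #   word_a  word_b
  #   word_b  word_c

CombineBuckets then groups on the (word_a, word_b) key and emits [count, word_a, word_b] for each distinct pair.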
6 examples/corpus/sentence_coocurrence.rb
@@ -26,11 +26,7 @@ def initialize *args
end
def process title, idx, *words
- words.each_with_index do |word_a, idx|
- words[(idx+1) .. -1].each do |word_b|
- @bucket << [word_a, word_b]
- end
- end
+ @bucket << words[0..-2].zip(words[1..-1])
dump_bucket if @bucket.full?
end
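The change above swaps the all-pairs loop for adjacent pairs, which changes what gets counted. A small sketch of the difference on a hypothetical three-word sentence (both snippets are lifted from the diff):

  words = %w[word_a word_b word_c]

  # Removed behaviour: every co-occurring pair in the sentence.
  all_pairs = []
  words.each_with_index do |word_a, idx|
    words[(idx + 1)..-1].each { |word_b| all_pairs << [word_a, word_b] }
  end
  # all_pairs => [["word_a", "word_b"], ["word_a", "word_c"], ["word_b", "word_c"]]

  # New behaviour: adjacent pairs (bigrams) only.
  adjacent = words[0..-2].zip(words[1..-1])
  # adjacent => [["word_a", "word_b"], ["word_b", "word_c"]]

Note also that the new line pushes the whole array of pairs into @bucket as one element, where the old loop pushed one pair at a time; whether BucketCounter#<< accepts a batch like that isn't visible in this diff.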
82 examples/simple_word_count.rb
@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong/script'
+
+module WordCount
+ class Mapper < Wukong::Streamer::LineStreamer
+ #
+ # Emit each word in each line.
+ #
+ def process line
+ tokenize(line).each{|word| yield [word, 1] }
+ end
+
+ #
+ # Split a string into its constituent words.
+ #
+ # This is pretty simpleminded:
+ # * downcase the string
+ # * split at any non-alphanumeric boundary, including '_'
+ # * however, preserve the special cases of 's, 'd or 't at the end of a
+ # word.
+ #
+ # tokenize("Ability is a poor man's wealth #johnwoodenquote")
+ # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
+ #
+ def tokenize str
+ return [] if str.blank?
+ str = str.downcase
+ # kill off all punctuation except [stuff]'s, [stuff]'d or [stuff]'t;
+ # this includes hyphens (words are split at them)
+ str = str.
+ gsub(/[^a-zA-Z0-9\']+/, ' ').
+ gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+ # Busticate at whitespace
+ words = str.split(/\s+/)
+ words.reject!{|w| w.blank? }
+ words
+ end
+ end
+
+ #
+ # A bit kinder to your memory manager: accumulate the sum record-by-record:
+ #
+ class Reducer2 < Wukong::Streamer::AccumulatingReducer
+
+ def start!(*args)
+ @key_count = 0
+ end
+
+ def accumulate(*args)
+ @key_count += 1
+ end
+
+ def finalize
+ yield [ key, @key_count ]
+ end
+ end
+
+ #
+ # You can stack up all the values in a list, then sum them all at once.
+ #
+ # This isn't good style, as it means the whole list is held in memory.
+ #
+ class Reducer1 < Wukong::Streamer::ListReducer
+ def finalize
+ yield [ key, values.map(&:last).map(&:to_i).inject(0){|sum, cnt| sum + cnt } ]
+ end
+ end
+
+ #
+ # ... easiest of all, though: this is common enough that it's already included
+ #
+ require 'wukong/streamer/count_keys'
+ class Reducer3 < Wukong::Streamer::CountKeys
+ end
+end
+
+# Execute the script
+Wukong.run(
+ WordCount::Mapper,
+ WordCount::Reducer2
+ )
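As a sanity check, the same computation can be run without the Wukong/Hadoop plumbing: apply Mapper#tokenize's regexes to the docstring's sample line and tally with a plain Hash, which is what each of the three reducers produces per key. This sketch just inlines the code above; only the Hash tally is new.

  line = "Ability is a poor man's wealth #johnwoodenquote"
  str  = line.downcase.
           gsub(/[^a-zA-Z0-9\']+/, ' ').
           gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")

  counts = Hash.new(0)
  str.split(/\s+/).reject { |w| w.empty? }.each { |word| counts[word] += 1 }
  counts.each { |word, n| puts [word, n].join("\t") }
  # => ability 1, is 1, a 1, poor 1, man's 1, wealth 1, johnwoodenquote 1

Running the script under Wukong itself (e.g. via its --run mode) aggregates the same per-word counts across a whole corpus; that invocation isn't shown in this commit.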
