Skip to content
Browse files

! Fixes issue #38: IO is now read using gets(nil, len)

This switches from byte based parsing to character based parsing, as defined
by the IO that feeds the Parslet::Source. This only works properly on Ruby 1.9;
Ruby 1.8 users fall back to the old behaviour.
  • Loading branch information...
1 parent 7e02f6e commit 3b4f00011a595659577dda34ad33ac157fc78680 @kschiess committed Jun 5, 2011
Showing with 236 additions and 311 deletions.
  1. +36 −0 example/sentence.rb
  2. +8 −52 lib/parslet/slice.rb
  3. +20 −61 lib/parslet/source.rb
  4. +24 −0 spec/acceptance/regression_spec.rb
  5. +8 −84 spec/parslet/slice_spec.rb
  6. +134 −114 spec/parslet/source_spec.rb
  7. +6 −0 spec/spec_helper.rb
View
36 example/sentence.rb
@@ -0,0 +1,36 @@
+# encoding: UTF-8
+
+# A small example contributed by John Mettraux (jmettraux) that demonstrates
+# working with Unicode. This only works on Ruby 1.9.
+
+$:.unshift File.dirname(__FILE__) + "/../lib"
+
+require 'parslet'
+
+class Parser < Parslet::Parser
+ rule(:sentence) { (match('[^。]').repeat(1) >> str("。")).as(:sentence) }
+ rule(:sentences) { sentence.repeat }
+ root(:sentences)
+end
+
+class Transformer < Parslet::Transform
+ rule(:sentence => simple(:sen)) { sen.to_s }
+end
+
+string =
+ "RubyKaigi2009のテーマは、「変わる/変える」です。 前回の" +
+ "RubyKaigi2008のテーマであった「多様性」の言葉の通り、 " +
+ "2008年はRubyそのものに関しても、またRubyの活躍する舞台に関しても、 " +
+ "ますます多様化が進みつつあります。RubyKaigi2008は、そのような " +
+ "Rubyの生態系をあらためて認識する場となりました。 しかし、" +
+ "こうした多様化が進む中、異なる者同士が単純に距離を 置いたままでは、" +
+ "その違いを認識したところであまり意味がありません。 異なる実装、" +
+ "異なる思想、異なる背景といった、様々な多様性を理解しつつ、 " +
+ "すり合わせるべきものをすり合わせ、変えていくべきところを " +
+ "変えていくことが、豊かな未来へとつながる道に違いありません。"
+
+parser = Parser.new
+transformer = Transformer.new
+
+tree = parser.parse(string)
+p transformer.apply(tree)
View
60 lib/parslet/slice.rb
@@ -48,13 +48,11 @@
#
class Parslet::Slice
attr_reader :str, :offset
- attr_reader :parent
attr_reader :source
- def initialize(string, offset, source=nil, parent=nil)
+ def initialize(string, offset, source=nil)
@str, @offset = string, offset
@source = source
- @parent = parent
end
# Compares slices to other slices or strings.
@@ -69,60 +67,18 @@ def match(regexp)
str.match(regexp)
end
- # Returns a slice that starts at offset start and that has length characters.
- # Whenever possible, return parts of the parent buffer that this slice was
- # cut out of.
+ # Returns the slice's size in characters.
#
- def slice(start, length)
- # NOTE: At a later stage, we might not want to create huge trees of slices.
- # The fact that the root of the tree creates slices that link to it makes
- # the tree already rather flat.
-
- if parent
- parent.slice(offset - parent.offset + start, length)
- else
- self.class.new(str.slice(start, length), offset+start, source, self)
- end
- end
-
- # Returns a slice that starts at file offset start and that has length
- # characters in it.
- #
- def abs_slice(start, length)
- slice(start-offset, length)
- end
-
- # True if this slice can satisfy an original input request to the
- # range ofs, len.
- #
- def satisfies?(ofs, len)
- ofs >= offset && (ofs-offset+len-1)<str.size
- end
-
def size
str.size
end
+
+ # Concatenate two slices; it is assumed that the second slice begins
+ # where the first one ends. The offset of the resulting slice is the same
+ # as the one of this slice.
+ #
def +(other)
- raise ArgumentError,
- "Cannot concat something other than a slice to a slice." \
- unless other.respond_to?(:to_slice)
-
- raise Parslet::InvalidSliceOperation,
- "Cannot join slices that aren't adjacent."+
- " (#{self.inspect} + #{other.inspect})" \
- if offset+size != other.offset
-
- raise Parslet::InvalidSliceOperation, "Not from the same source." \
- if source != other.source
-
- # If both slices stem from the same bigger buffer, we can reslice that
- # buffer to (probably) avoid a buffer copy, as long as the strings are
- # not modified.
- if parent && parent == other.parent
- return parent.abs_slice(offset, size+other.size)
- end
-
- self.class.new(str + other.str, offset, source)
+ self.class.new(str + other.to_s, offset, source)
end
# Returns a <line, column> tuple referring to the original input.
View
81 lib/parslet/source.rb
@@ -14,32 +14,25 @@ def initialize(io)
end
@io = io
- @virtual_position = @io.pos
- @eof_position = nil
-
@line_cache = LineCache.new
-
- # Stores an array of <offset, buffer> tuples.
- @slices = []
end
# Reads n chars from the input and returns a Range instance.
#
def read(n)
- slice = read_from_cache(@virtual_position, n)
- @virtual_position += slice.size
-
- slice
+ raise ArgumentError, "Cannot read <= 1 characters at a time." \
+ if n < 1
+ read_slice(n)
end
def eof?
- @eof_position && @virtual_position >= @eof_position
+ @io.eof?
end
def pos
- @virtual_position
+ @io.pos
end
def pos=(new_pos)
- @virtual_position = new_pos
+ @io.pos = new_pos
end
# Returns a <line, column> tuple for the given position. If no position is
@@ -51,59 +44,25 @@ def line_and_column(position=nil)
end
private
- # Minimal size of a single read
- MIN_READ_SIZE = 10 * 1024
- # Number of slices to keep
- BUFFER_CACHE_SIZE = 10
-
- # Reads and returns a piece of the input that contains length chars starting
- # at offset.
- #
- def read_from_cache(offset, length)
- # Do we already have a buffer that contains the given range?
- # Return that.
- slice = @slices.find { |slice|
- slice.satisfies?(offset, length) }
- return slice.abs_slice(offset, length) if slice
-
- # Read a new buffer: Can the demand be satisfied by sequentially reading
- # from the current position?
- needed = offset-@io.pos+length
- if @io.pos <= offset && needed<MIN_READ_SIZE
- # read the slice
- slice = read_slice(needed)
- return slice.abs_slice(offset, length)
- end
-
- # Otherwise seek and read enough so that we can satisfy the demand.
- @io.pos = offset
-
- slice = read_slice(needed)
- return slice.abs_slice(offset, length)
- end
-
def read_slice(needed)
start = @io.pos
- request = [MIN_READ_SIZE, needed].max
- buf = @io.read(request)
-
- # remember eof position
- if !buf || buf.size<request
- @eof_position = @io.pos
- end
+ buf = @io.gets(nil, needed)
# cache line ends
@line_cache.scan_for_line_endings(start, buf)
- slice = Parslet::Slice.new(buf || '', start, self)
-
- # Don't cache empty slices.
- return slice unless buf
-
- # cache the buffer (and eject old entries)
- @slices << slice
- @slices.shift if @slices.size > BUFFER_CACHE_SIZE
-
- slice
+ Parslet::Slice.new(buf || '', start, self)
+ end
+
+ if RUBY_VERSION !~ /^1.9/
+ def read_slice(needed)
+ start = @io.pos
+ buf = @io.read(needed)
+
+ # cache line ends
+ @line_cache.scan_for_line_endings(start, buf)
+
+ Parslet::Slice.new(buf || '', start, self)
+ end
end
end
View
24 spec/acceptance/regression_spec.rb
@@ -180,4 +180,28 @@ class UnicodeLanguage < Parslet::Parser
subject.should parse('RubyKaigi2009のテーマは、「変わる/変える」です。 前回の').as('RubyKaigi2009のテーマは、「変わる/変える」です。 前回の')
end
end
+
+ class UnicodeSentenceLanguage < Parslet::Parser
+ rule(:sentence) { (match('[^。]').repeat(1) >> str("。")).as(:sentence) }
+ rule(:sentences) { sentence.repeat }
+ root(:sentences)
+ end
+ describe UnicodeSentenceLanguage, :ruby => 1.9 do
+ let(:string) {
+ "RubyKaigi2009のテーマは、「変わる/変える」です。 前回の" +
+ "RubyKaigi2008のテーマであった「多様性」の言葉の通り、 " +
+ "2008年はRubyそのものに関しても、またRubyの活躍する舞台に関しても、 " +
+ "ますます多様化が進みつつあります。RubyKaigi2008は、そのような " +
+ "Rubyの生態系をあらためて認識する場となりました。 しかし、" +
+ "こうした多様化が進む中、異なる者同士が単純に距離を 置いたままでは、" +
+ "その違いを認識したところであまり意味がありません。 異なる実装、" +
+ "異なる思想、異なる背景といった、様々な多様性を理解しつつ、 " +
+ "すり合わせるべきものをすり合わせ、変えていくべきところを " +
+ "変えていくことが、豊かな未来へとつながる道に違いありません。"
+ }
+
+ it "should parse sentences" do
+ subject.should parse(string)
+ end
+ end
end
View
92 spec/parslet/slice_spec.rb
@@ -5,10 +5,6 @@
it "should construct from an offset and a string" do
described_class.new('foobar', 40)
end
- it "should construct from offset, string and parent slice" do
- parent = described_class.new('foobarfoobar', 40)
- described_class.new('foobarfoobar'.slice(0,5), 40, parent)
- end
end
context "('foobar', 40)" do
let(:slice) { described_class.new('foobar', 40) }
@@ -62,60 +58,6 @@
end
end
end
- describe "slices" do
- describe "<- #slice(start, length)" do
- context "when a common parent is available" do
- before(:each) {
- flexmock(slice, :source => :correct_parent)
- }
- let(:small) { slice.slice(1,3) }
-
- it "should copy the parents source" do
- small.source.should == :correct_parent
- end
- it "should reslice its parent if available" do
- small.should == 'oob'
- small.parent.should == slice
-
- # Mocks parent.slice to return its arguments as a tuple
- flexmock(small.parent).should_receive(:slice).and_return { |*args| args }
-
- small.slice(0,1).should == [1, 1]
- small.slice(2,1).should == [3, 1]
- small.slice(1,2).should == [2, 2]
- end
- it "should reslice correctly (regression from issue 34)" do
- buffer = described_class.new('"foo"', 0)
- foo = buffer.slice(0, buffer.size)
- foo.should == %("foo")
-
- foo.slice(1, 3).should == 'foo'
- end
- end
- it "should return slices that have a correct offset" do
- as = slice.slice(4,1)
- as.offset.should == 44
- as.should == 'a'
- end
- end
- describe "<- #abs_slice(offset, length)" do
- it "should call relative slice with the correct offsets" do
- flexmock(slice).should_receive(:slice).with(1,1).once
- slice.abs_slice(41, 1)
- end
- end
- end
- describe "satisfies? test" do
- it "should answer true if offset/length is within the slice" do
- slice.satisfies?(40, 5).should == true
- slice.satisfies?(41, 1).should == true
- slice.satisfies?(45, 1).should == true
- end
- it "should answer false otherwise" do
- slice.satisfies?(39, 3).should == false
- slice.satisfies?(40, 10).should == false
- end
- end
describe "string methods" do
describe "matching" do
it "should match as a string would" do
@@ -130,33 +72,15 @@
subject { slice.size }
it { should == 6 }
end
- describe "<- #+(other)" do
- it "should check that sources are compatible" do
- a = slice.slice(0,1)
- b = slice.slice(1,2)
- flexmock(b, :source => :incompatible)
- lambda {
- a + b
- }.should raise_error(Parslet::InvalidSliceOperation)
+ describe "<- #+" do
+ let(:other) { described_class.new('baz', 10) }
+ subject { slice + other }
+
+ it "should concat like string does" do
+ subject.size.should == 9
+ subject.should == 'foobarbaz'
+ subject.offset.should == 40
end
- it "should return a slice that represents the extended range" do
- other = described_class.new('foobar', 46)
- (slice + other).should eq(described_class.new('foobarfoobar', 40))
- end
- it "should fail when adding slices that aren't adjacent" do
- other = described_class.new('foobar', 100)
- lambda { slice + other
- }.should raise_error(Parslet::InvalidSliceOperation)
- end
- context "when slices stem from a bigger buffer" do
- let(:buffer) { described_class.new('foobarfoobar', 10) }
- let!(:slice1) { buffer.slice(0,3) }
- let!(:slice2) { buffer.slice(3,3) }
- it "should reslice instead of concatenating" do
- flexmock(buffer).should_receive(:abs_slice).with(10,6).once
- slice1 + slice2
- end
- end
end
end
describe "conversion" do
View
248 spec/parslet/source_spec.rb
@@ -1,137 +1,157 @@
+# Encoding: UTF-8
+
require 'spec_helper'
describe Parslet::Source do
- let(:io) { StringIO.new("a"*100 + "\n" + "a"*100 + "\n") }
- let(:source) { described_class.new(io) }
-
- describe "<- #initialize" do
- it "should turn a string into an IO" do
- source = described_class.new("foo")
- source.read(1).to_s.should == 'f'
- end
- end
- describe "<- #read(n)" do
- it "should not raise nil error when retval is nil" do
- described_class.new('').read(1)
- end
- it "should return 100 'a's when reading a kilobyte" do
- source.read(100).should == 'a'*100
- end
- end
- describe "<- #eof?" do
- subject { source.eof? }
-
- it { should be_false }
- context "after depleting the source" do
- before(:each) { source.read(10000) }
-
- it { should be_true }
+ describe "using simple input" do
+ let(:io) { StringIO.new("a"*100 + "\n" + "a"*100 + "\n") }
+ let(:source) { described_class.new(io) }
+
+ describe "<- #initialize" do
+ it "should turn a string into an IO" do
+ source = described_class.new("foo")
+ source.read(1).to_s.should == 'f'
+ end
end
- end
- describe "<- #pos" do
- subject { source.pos }
-
- it { should == 0 }
- context "after reading a few bytes" do
- it "should still be correct" do
- pos = 0
- 10.times do
- pos += n = rand(10)
- source.read(n)
-
- source.pos.should == pos
- end
+ describe "<- #read(n)" do
+ it "should not raise error when the return value is nil" do
+ described_class.new('').read(1)
end
- end
- end
- describe "<- #pos=(n)" do
- subject { source.pos }
- 10.times do
- pos = rand(200)
- context "setting position #{pos}" do
- before(:each) { source.pos = pos }
-
- it { should == pos }
+ it "should return 100 'a's when reading 100 chars" do
+ source.read(100).should == 'a'*100
end
- end
- end
- describe "<- #column & #line" do
- subject { source.line_and_column }
-
- it { should == [1,1] }
-
- context "on the first line" do
- it "should increase column with every read" do
- 10.times do |i|
- source.line_and_column.last.should == 1+i
- source.read(1)
- end
+ it "should raise ArgumentError when reading <= 1 chars" do
+ expect {
+ source.read(0)
+ }.to raise_error(ArgumentError)
end
end
- context "on the second line" do
- before(:each) { source.read(101) }
- it { should == [2, 1]}
- end
- context "after reading everything" do
- before(:each) { source.read(10000) }
+ describe "<- #eof?" do
+ subject { source.eof? }
- context "when seeking to 9" do
- before(:each) { source.pos = 9 }
- it { should == [1, 10] }
- end
- context "when seeking to 100" do
- before(:each) { source.pos = 100 }
- it { should == [1, 101] }
- end
- context "when seeking to 101" do
- before(:each) { source.pos = 101 }
- it { should == [2, 1] }
- end
- context "when seeking to 102" do
- before(:each) { source.pos = 102 }
- it { should == [2, 2] }
+ it { should be_false }
+ context "after depleting the source" do
+ before(:each) { source.read(10000) }
+
+ it { should be_true }
end
- context "when seeking beyond eof" do
- it "should not throw an error" do
- source.pos = 1000
+ end
+ describe "<- #pos" do
+ subject { source.pos }
+
+ it { should == 0 }
+ context "after reading a few bytes" do
+ it "should still be correct" do
+ pos = 0
+ 10.times do
+ pos += (n = rand(10)+1)
+ source.read(n)
+
+ source.pos.should == pos
+ end
end
end
end
- context "reading char by char, storing the results" do
- attr_reader :results
- before(:each) {
- @results = {}
- while not source.eof?
- pos = source.pos
- @results[pos] = source.line_and_column
- source.read(1)
+ describe "<- #pos=(n)" do
+ subject { source.pos }
+ 10.times do
+ pos = rand(200)
+ context "setting position #{pos}" do
+ before(:each) { source.pos = pos }
+
+ it { should == pos }
end
-
- @results.should have(202).entries
- @results
- }
-
- context "when using pos argument" do
- it "should return the same results" do
- results.each do |pos, result|
- source.line_and_column(pos).should == result
+ end
+ end
+ describe "<- #column & #line" do
+ subject { source.line_and_column }
+
+ it { should == [1,1] }
+
+ context "on the first line" do
+ it "should increase column with every read" do
+ 10.times do |i|
+ source.line_and_column.last.should == 1+i
+ source.read(1)
end
end
end
- it "should give the same results when seeking" do
- results.each do |pos, result|
- source.pos = pos
- source.line_and_column.should == result
+ context "on the second line" do
+ before(:each) { source.read(101) }
+ it { should == [2, 1]}
+ end
+ context "after reading everything" do
+ before(:each) { source.read(10000) }
+
+ context "when seeking to 9" do
+ before(:each) { source.pos = 9 }
+ it { should == [1, 10] }
+ end
+ context "when seeking to 100" do
+ before(:each) { source.pos = 100 }
+ it { should == [1, 101] }
+ end
+ context "when seeking to 101" do
+ before(:each) { source.pos = 101 }
+ it { should == [2, 1] }
+ end
+ context "when seeking to 102" do
+ before(:each) { source.pos = 102 }
+ it { should == [2, 2] }
+ end
+ context "when seeking beyond eof" do
+ it "should not throw an error" do
+ source.pos = 1000
+ end
end
end
- it "should give the same results when reading" do
- cur = source.pos = 0
- while not source.eof?
- source.line_and_column.should == results[cur]
- cur += 1
- source.read(1)
+ context "reading char by char, storing the results" do
+ attr_reader :results
+ before(:each) {
+ @results = {}
+ while not source.eof?
+ pos = source.pos
+ @results[pos] = source.line_and_column
+ source.read(1)
+ end
+
+ @results.should have(202).entries
+ @results
+ }
+
+ context "when using pos argument" do
+ it "should return the same results" do
+ results.each do |pos, result|
+ source.line_and_column(pos).should == result
+ end
+ end
end
- end
+ it "should give the same results when seeking" do
+ results.each do |pos, result|
+ source.pos = pos
+ source.line_and_column.should == result
+ end
+ end
+ it "should give the same results when reading" do
+ cur = source.pos = 0
+ while not source.eof?
+ source.line_and_column.should == results[cur]
+ cur += 1
+ source.read(1)
+ end
+ end
+ end
end
+
+ end
+
+ describe "reading encoded input", :ruby => 1.9 do
+ let(:source) { described_class.new("éö変わる") }
+
+ it "should read characters, not bytes" do
+ source.read(1).should == "é"
+ source.read(1).should == "ö"
+ source.read(1).should == "変"
+ end
end
end
View
6 spec/spec_helper.rb
@@ -7,4 +7,10 @@
RSpec.configure do |config|
config.mock_with :flexmock
+
+ # Exclude other ruby versions by giving :ruby => 1.8 or :ruby => 1.9
+ #
+ config.filter_run_excluding :ruby => lambda { |version|
+ RUBY_VERSION.to_s !~ /^#{Regexp.escape(version.to_s)}/
+ }
end

0 comments on commit 3b4f000

Please sign in to comment.
Something went wrong with that request. Please try again.