Rework iterating over byte sequences

This removes std::io::StringReader and std::io::ByteArrayReader, in favour of String and ByteArray providing a dedicated Bytes iterator that also implements Read. The Bytes iterator implements the new std::iter::Bytes trait. This trait is a regular iterator, but also provides the "next_byte" method to pull bytes out of the iterator without wrapping them in an Option type. This makes the Bytes iterator more efficient when dealing with large input streams.

Changelog: changed
inko-lang · Oct 5, 2022 · 6620c44 · 6620c44
1 parent 9d0884c
commit 6620c44
Show file tree

Hide file tree

Showing 7 changed files with 251 additions and 147 deletions.
diff --git a/libstd/src/std/byte_array.inko b/libstd/src/std/byte_array.inko
@@ -5,7 +5,8 @@ import std::drop::Drop
 import std::fmt::(Format, Formatter)
 import std::hash::(Hash, Hasher)
 import std::index::(bounds_check, Index, SetIndex)
-import std::iter::(Enum, Iter)
+import std::io::Read
+import std::iter::(Bytes as BytesTrait, EOF, Enum, Iter)
 import std::option::Option
 import std::string::(IntoString, ToString)
 
@@ -263,9 +264,9 @@ class builtin ByteArray {
     iter.to_array
   }
 
-  # Returns an `Iter` that iterates over all values in `self`.
-  fn pub iter -> Iter[Int, Never] {
-    Enum.indexed(length) fn (i) { _INKO.byte_array_get(self, i) }
+  # Returns an iterator over the bytes in `self`.
+  #
+  # The returned `Bytes` refers to `self` and starts at the first byte
+  # (index 0).
+  fn pub iter -> Bytes {
+    Bytes { @bytes = self, @index = 0 }
   }
 }
 
@@ -412,3 +413,53 @@ impl Format for ByteArray {
     formatter.write(']')
   }
 }
+
+# An iterator over the bytes in a `ByteArray`.
+#
+# This iterator supports mutating the underlying `ByteArray` during iteration,
+# though it's recommended not to do so as the results may be confusing. The
+# length of the `ByteArray` is checked every time a byte is requested, so bytes
+# added or removed during iteration change what the iterator produces.
+class pub Bytes {
+  # The `ByteArray` to read bytes from.
+  let @bytes: ref ByteArray
+  # The index of the next byte to produce.
+  let @index: Int
+}
+
+impl Iter[Int, Never] for Bytes {
+  # Returns the next byte wrapped in an `Option`.
+  #
+  # This advances the iterator using `next_byte`. Once `next_byte` returns
+  # `EOF`, an `Option.None` is returned instead.
+  fn pub mut next -> Option[Int] {
+    match next_byte {
+      case EOF -> Option.None
+      case byte -> Option.Some(byte)
+    }
+  }
+}
+
+impl BytesTrait[Never] for Bytes {
+  # Returns the next byte without wrapping it in an `Option`.
+  #
+  # If the current index is at or past the end of the `ByteArray`, `EOF` is
+  # returned and the index is left unchanged.
+  fn pub mut next_byte -> Int {
+    if @index < @bytes.length {
+      # The index is incremented as part of the lookup. The byte that is read
+      # is the one at the index _before_ the increment (the first call
+      # produces the first byte).
+      _INKO.byte_array_get(@bytes, @index := @index + 1)
+    } else {
+      EOF
+    }
+  }
+}
+
+impl Read for Bytes {
+  # Reads up to `size` bytes from the iterator, pushing them onto the end of
+  # `into`.
+  #
+  # Reading stops early when `next_byte` returns `EOF`. The return value is the
+  # number of bytes that were pushed, which is zero if the iterator is already
+  # exhausted or if `size` is zero or negative.
+  fn pub mut read(into: mut ByteArray, size: Int) -> Int {
+    let mut read = 0
+
+    while read < size {
+      match next_byte {
+        case EOF -> break
+        case byte -> {
+          into.push(byte)
+          read += 1
+        }
+      }
+    }
+
+    read
+  }
+
+  # Reads all the remaining bytes into `bytes`, returning the number of bytes
+  # read.
+  #
+  # The number of remaining bytes is the current length of the `ByteArray`
+  # minus the current index. If the `ByteArray` shrunk below the index, this
+  # value is negative and nothing is read.
+  fn pub mut read_all(bytes: mut ByteArray) -> Int {
+    read(into: bytes, size: @bytes.length - @index)
+  }
+}
diff --git a/libstd/src/std/io.inko b/libstd/src/std/io.inko
@@ -262,81 +262,3 @@ trait pub Seek {
   # the end.
   fn pub mut seek(position: Int) !! Error -> Int
 }
-
-# A type that allows reading of a String.
-#
-# A `StringReader` is useful when you have a `String` and want to use it where
-# a `Read` type is expected.
-#
-# # Error handling
-#
-# The implementations of `read` and `read_all` never throw an error, so no error
-# handling is needed when using a value typed as `StringReader`.
-class pub StringReader {
-  let @string: ref String
-  let @index: Int
-
-  fn pub static new(string: ref String) -> Self {
-    Self { @string = string, @index = 0 }
-  }
-}
-
-impl Read for StringReader {
-  fn pub mut read(into: mut ByteArray, size: Int) -> Int {
-    let mut read = 0
-    let max = @string.size
-
-    while read < size and @index < max {
-      into.push(@string.byte(@index := @index + 1))
-      read += 1
-    }
-
-    read
-  }
-
-  fn pub mut read_all(bytes: mut ByteArray) -> Int {
-    read(into: bytes, size: @string.size - @index)
-  }
-}
-
-# A type that allows reading of a `ByteArray`.
-#
-# A `ByteArrayReader` is useful when you have a `ByteArray` and want to use it
-# where a `Read` type is expected.
-#
-# Since a `ByteArray` is mutable and this reader takes an immutable reference to
-# a `ByteArray`, it's possible for the `ByteArray` to be mutated while this
-# reader exists. This reader supports this without issue, though it's
-# recommended to not mutate the `ByteArray` while reading from it, as this may
-# lead to confusing results.
-#
-# # Error handling
-#
-# The implementations of `read` and `read_all` never throw an error, so no error
-# handling is needed when using a value typed as `ByteArrayReader`.
-class pub ByteArrayReader {
-  let @bytes: ref ByteArray
-  let @index: Int
-
-  fn pub static new(bytes: ref ByteArray) -> Self {
-    Self { @bytes = bytes, @index = 0 }
-  }
-}
-
-impl Read for ByteArrayReader {
-  fn pub mut read(into: mut ByteArray, size: Int) -> Int {
-    let mut read = 0
-    let max = @bytes.length
-
-    while read < size and @index < max {
-      into.push(@bytes[@index := @index + 1])
-      read += 1
-    }
-
-    read
-  }
-
-  fn pub mut read_all(bytes: mut ByteArray) -> Int {
-    read(into: bytes, size: @bytes.length - @index)
-  }
-}
diff --git a/libstd/src/std/iter.inko b/libstd/src/std/iter.inko
@@ -24,6 +24,9 @@ import std::cmp::Equal
 import std::option::Option
 import std::string::(ToString, StringBuffer)
 
+# The "byte" that signals the end in a `Bytes` iterator.
+#
+# `Bytes.next_byte` returns this value once all input is consumed.
+#
+# NOTE(review): presumably -1 is used because a real byte is never negative,
+# so it can't be confused with actual data; confirm.
+let pub EOF = -1
+
 # A generic iterator over a sequence of values of type `T`.
 #
 # The type parameter `T` is the type of values that is produced. The type
@@ -458,6 +461,42 @@ impl Iter[T, E] for Enum {
   }
 }
 
+# An iterator over a sequence of bytes.
+#
+# A `Bytes` is a regular iterator, but introduces the extra method `next_byte`.
+# This method is similar to `Iter.next` in that it advances the iterator, but
+# instead of returning an `Option[Int]` it returns an `Int`. This allows `Bytes`
+# to be used as both a regular iterator and a more specialised (and more
+# efficient) iterator over (large) sequences of bytes.
+#
+# When implementing `Bytes` for a type, you must also implement `Iter`. The
+# easiest way of doing this is to have `Iter.next` reuse the implementation of
+# `Bytes.next_byte` like so:
+#
+#     impl Iter[Int, Never] for MyType {
+#       fn pub mut next -> Option[Int] {
+#         match next_byte {
+#           case EOF -> Option.None
+#           case byte -> Option.Some(byte)
+#         }
+#       }
+#     }
+#
+#     impl Bytes[Never] for MyType {
+#       fn pub mut next_byte -> Int {
+#         # ...
+#       }
+#     }
+#
+# The type parameter `E` specifies the error that `next` and `next_byte` may
+# throw. If the iterator can't throw, this parameter should be assigned to
+# `Never`.
+trait pub Bytes[E]: Iter[Int, E] {
+  # Returns the next byte in the iterator.
+  #
+  # If all input is consumed, this method must return `std::iter::EOF`.
+  fn pub mut next_byte !! E -> Int
+}
+
 # Joins the values of an iterator together using a separator.
 #
 # # Examples

diff --git a/libstd/src/std/string.inko b/libstd/src/std/string.inko
@@ -13,7 +13,8 @@ import std::fmt::(Format, Formatter)
 import std::fs::path::(IntoPath, Path, ToPath)
 import std::hash::(Hash, Hasher)
 import std::index::(bounds_check)
-import std::iter::(Enum, Iter)
+import std::io::Read
+import std::iter::(Bytes as BytesTrait, EOF, Enum, Iter)
 import std::ops::Add
 
 let TAB_BYTE = 9
@@ -236,8 +237,8 @@ class builtin String {
   }
 
   # Returns an iterator over the bytes in `self`.
+  #
+  # The returned `Bytes` refers to `self` and starts at the first byte
+  # (index 0).
-  fn pub bytes -> Enum[Int, Never] {
-    Enum.indexed(size) fn (index) { _INKO.string_byte(self, index) }
+  fn pub bytes -> Bytes {
+    Bytes { @string = self, @index = 0 }
   }
 
   # Splits `self` into an iterator of `Strings`, each separated by the given
@@ -582,6 +583,53 @@ impl Drop for Characters {
   }
 }
 
+# An iterator over the bytes in a `String`.
+#
+# Besides being a regular iterator, this type also implements `Read`, so it
+# can be used where a `Read` type is expected.
+class pub Bytes {
+  # The `String` to read bytes from.
+  let @string: ref String
+  # The index of the next byte to produce.
+  let @index: Int
+}
+
+impl Iter[Int, Never] for Bytes {
+  # Returns the next byte wrapped in an `Option`.
+  #
+  # This advances the iterator using `next_byte`. Once `next_byte` returns
+  # `EOF`, an `Option.None` is returned instead.
+  fn pub mut next -> Option[Int] {
+    match next_byte {
+      case EOF -> Option.None
+      case byte -> Option.Some(byte)
+    }
+  }
+}
+
+impl BytesTrait[Never] for Bytes {
+  # Returns the next byte without wrapping it in an `Option`.
+  #
+  # If the current index is at or past the size of the `String`, `EOF` is
+  # returned and the index is left unchanged.
+  fn pub mut next_byte -> Int {
+    if @index < @string.size {
+      # The index is incremented as part of the lookup.
+      # NOTE(review): presumably `:=` evaluates to the old value of `@index`,
+      # so the byte at the index _before_ the increment is read; confirm.
+      _INKO.string_byte(@string, @index := @index + 1)
+    } else {
+      EOF
+    }
+  }
+}
+
+impl Read for Bytes {
+  # Reads up to `size` bytes from the iterator, pushing them onto the end of
+  # `into`.
+  #
+  # Reading stops early when `next_byte` returns `EOF`. The return value is the
+  # number of bytes that were pushed, which is zero if the iterator is already
+  # exhausted or if `size` is zero or negative.
+  fn pub mut read(into: mut ByteArray, size: Int) -> Int {
+    let mut read = 0
+
+    while read < size {
+      match next_byte {
+        case EOF -> break
+        case byte -> {
+          into.push(byte)
+          read += 1
+        }
+      }
+    }
+
+    read
+  }
+
+  # Reads all the remaining bytes into `bytes`, returning the number of bytes
+  # read.
+  #
+  # The number of remaining bytes is the size of the `String` minus the current
+  # index.
+  fn pub mut read_all(bytes: mut ByteArray) -> Int {
+    read(into: bytes, size: @string.size - @index)
+  }
+}
+
 # A buffer for efficiently concatenating `String` objects together.
 #
 # When concatenating multiple `String` objects together, intermediate `String`

diff --git a/libstd/test/std/test_byte_array.inko b/libstd/test/std/test_byte_array.inko
@@ -1,4 +1,5 @@
 import helpers::(fmt, hash)
+import std::iter::EOF
 import std::test::Tests
 
 fn pub tests(t: mut Tests) {
@@ -165,4 +166,64 @@ fn pub tests(t: mut Tests) {
     t.equal(fmt(ByteArray.from_array([10])), '[10]')
     t.equal(fmt(ByteArray.from_array([10, 20])), '[10, 20]')
   }
+
+  # `next` produces each byte as an `Option.Some`, followed by an
+  # `Option.None` once the input is consumed.
+  t.test('Bytes.next') fn (t) {
+    let bytes = 'abc'.to_byte_array
+    let iter = bytes.iter
+
+    # 97, 98 and 99 are the bytes of 'a', 'b' and 'c'.
+    t.equal(iter.next, Option.Some(97))
+    t.equal(iter.next, Option.Some(98))
+    t.equal(iter.next, Option.Some(99))
+    t.equal(iter.next, Option.None)
+  }
+
+  # `next_byte` produces each byte as a plain `Int`, followed by `EOF` once
+  # the input is consumed.
+  t.test('Bytes.next_byte') fn (t) {
+    let bytes = 'abc'.to_byte_array
+    let iter = bytes.iter
+
+    t.equal(iter.next_byte, 97)
+    t.equal(iter.next_byte, 98)
+    t.equal(iter.next_byte, 99)
+    t.equal(iter.next_byte, EOF)
+  }
+
+  # `read` appends to the buffer, returns the number of bytes actually read,
+  # and returns zero once the input is consumed.
+  t.test('Bytes.read') fn (t) {
+    let input = 'foo'.to_byte_array
+    let iter = input.iter
+    let buff = ByteArray.new
+
+    t.equal(iter.read(into: buff, size: 2), 2)
+    t.equal(buff.to_string, 'fo')
+    # Only one byte is left, so fewer bytes than requested are read.
+    t.equal(iter.read(into: buff, size: 2), 1)
+    t.equal(buff.to_string, 'foo')
+    # Nothing is left, so the buffer remains as-is.
+    t.equal(iter.read(into: buff, size: 2), 0)
+    t.equal(buff.to_string, 'foo')
+  }
+
+  # Mutating the `ByteArray` while an iterator exists is supported: the
+  # iterator observes the new contents on the next read.
+  t.test('Bytes.read with a mutated ByteArray') fn (t) {
+    let input = 'foo'.to_byte_array
+    let iter = input.iter
+    let buff = ByteArray.new
+
+    t.equal(iter.read(into: buff, size: 1), 1)
+    t.equal(buff.to_string, 'f')
+
+    # After removing a byte from the input only one unread byte remains, so
+    # only one of the two requested bytes is read.
+    input.pop
+    t.equal(iter.read(into: buff, size: 2), 1)
+    t.equal(buff.to_string, 'fo')
+
+    # The iterator was exhausted, but pushing a new byte (111 is 'o') makes
+    # one more byte available to read.
+    input.push(111)
+    t.equal(iter.read(into: buff, size: 2), 1)
+    t.equal(buff.to_string, 'foo')
+  }
+
+  # `read_all` reads everything that remains, and reads nothing when called
+  # again on a consumed iterator.
+  t.test('Bytes.read_all') fn (t) {
+    let input = 'foo'.to_byte_array
+    let iter = input.iter
+    let buff = ByteArray.new
+
+    t.equal(iter.read_all(buff), 3)
+    t.equal(iter.read_all(buff), 0)
+    t.equal(buff.to_string, 'foo')
+  }
 }