Fix nim-lang#4305: Make split proc for set[char] consistent

jyapayne · Jun 17, 2016 · feb45fb · feb45fb
1 parent 9837b12
commit feb45fb
Showing 1 changed file with 63 additions and 70 deletions.
diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim
@@ -326,17 +326,49 @@ proc toOctal*(c: char): string {.noSideEffect, rtl, extern: "nsuToOctal".} =
     result[i] = chr(val mod 8 + ord('0'))
     val = val div 8
 
+proc substrEq(s: string, pos: int, substr: string): bool =
+  var i = 0
+  var length = substr.len
+  while i < length and s[pos+i] == substr[i]:
+    inc i
+
+  return i == length
+
+# --------- Private templates for different split separators -----------
+
+template stringHasSep(s: string, index: int, seps: set[char]): bool =
+  s[index] in seps
+
+template stringHasSep(s: string, index: int, sep: char): bool =
+  s[index] == sep
+
+template stringHasSep(s: string, index: int, sep: string): bool =
+  s.substrEq(index, sep)
+
+template splitCommon(s, sep, maxsplit, sepLen) =
+  ## Common code for split procedures
+  var last = 0
+  var splits = maxsplit
+
+  if len(s) > 0:
+    while last <= len(s):
+      var first = last
+      while last < len(s) and not stringHasSep(s, last, sep):
+        inc(last)
+      if splits == 0: last = len(s)
+      yield substr(s, first, last-1)
+      if splits == 0: break
+      dec(splits)
+      inc(last, sepLen)
+
 iterator split*(s: string, seps: set[char] = Whitespace,
                 maxsplit: int = -1): string =
   ## Splits the string `s` into substrings using a group of separators.
   ##
-  ## Substrings are separated by a substring containing only `seps`. Note
-  ## that whole sequences of characters found in ``seps`` will be counted as
-  ## a single split point and leading/trailing separators will be ignored.
-  ## The following example:
+  ## Substrings are separated by a substring containing only `seps`.
   ##
   ## .. code-block:: nim
-  ##   for word in split("  this is an  example  "):
+  ##   for word in split("this\lis an\texample"):
   ##     writeLine(stdout, word)
   ##
   ## ...generates this output:
@@ -350,7 +382,7 @@ iterator split*(s: string, seps: set[char] = Whitespace,
   ## And the following code:
   ##
   ## .. code-block:: nim
-  ##   for word in split(";;this;is;an;;example;;;", {';'}):
+  ##   for word in split("this:is;an$example", {';', ':', '$'}):
   ##     writeLine(stdout, word)
   ##
   ## ...produces the same output as the first example. The code:
@@ -371,26 +403,13 @@ iterator split*(s: string, seps: set[char] = Whitespace,
   ##   "08"
   ##   "08.398990"
   ##
-  var last = 0
-  var splits = maxsplit
-  assert(not ('\0' in seps))
-  while last < len(s):
-    while s[last] in seps: inc(last)
-    var first = last
-    while last < len(s) and s[last] notin seps: inc(last) # BUGFIX!
-    if first <= last-1:
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
+  splitCommon(s, seps, maxsplit, 1)
 
 iterator split*(s: string, sep: char, maxsplit: int = -1): string =
   ## Splits the string `s` into substrings using a single separator.
   ##
   ## Substrings are separated by the character `sep`.
-  ## Unlike the version of the iterator which accepts a set of separator
-  ## characters, this proc will not coalesce groups of the
-  ## separator, returning a string for each found character. The code:
+  ## The code:
   ##
   ## .. code-block:: nim
   ##   for word in split(";;this;is;an;;example;;;", ';'):
@@ -410,56 +429,27 @@ iterator split*(s: string, sep: char, maxsplit: int = -1): string =
   ##   ""
   ##   ""
   ##
-  var last = 0
-  var splits = maxsplit
-  assert('\0' != sep)
-  if len(s) > 0:
-    # `<=` is correct here for the edge cases!
-    while last <= len(s):
-      var first = last
-      while last < len(s) and s[last] != sep: inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last)
-
-proc substrEq(s: string, pos: int, substr: string): bool =
-  var i = 0
-  var length = substr.len
-  while i < length and s[pos+i] == substr[i]:
-    inc i
-
-  return i == length
+  splitCommon(s, sep, maxsplit, 1)
 
 iterator split*(s: string, sep: string, maxsplit: int = -1): string =
   ## Splits the string `s` into substrings using a string separator.
   ##
   ## Substrings are separated by the string `sep`.
-  var last = 0
-  var splits = maxsplit
-
-  if len(s) > 0:
-    while last <= len(s):
-      var first = last
-      while last < len(s) and not s.substrEq(last, sep):
-        inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last, sep.len)
-
-# --------- Private templates for different rsplit separators -----------
-
-template stringHasSep(s: string, index: int, seps: set[char]): bool =
-  s[index] in seps
-
-template stringHasSep(s: string, index: int, sep: char): bool =
-  s[index] == sep
+  ## The code:
+  ##
+  ## .. code-block:: nim
+  ##   for word in split("thisDATAisDATAcorrupted", "DATA"):
+  ##     writeLine(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block::
+  ##   "this"
+  ##   "is"
+  ##   "corrupted"
+  ##
 
-template stringHasSep(s: string, index: int, sep: string): bool =
-  s.substrEq(index, sep)
+  splitCommon(s, sep, maxsplit, sep.len)
 
 template rsplitCommon(s, sep, maxsplit, sepLen) =
   ## Common code for rsplit functions
@@ -2218,11 +2208,14 @@ bar
     bar
   """.unindent() == "foo\nfoo\nbar\n"
 
-  let s = " this   is     an example   "
-  doAssert s.split() == @["this", "is", "an", "example"]
-  doAssert s.split(maxsplit=4) == @["this", "is", "an", "example"]
-  doAssert s.split(' ', maxsplit=4) == @["", "this", "", "", "is     an example   "]
-  doAssert s.split(" ", maxsplit=4) == @["", "this", "", "", "is     an example   "]
+  let s = " this is an example  "
+  let s2 = ":this;is;an:example;;"
+
+  doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
+  doAssert s2.split(seps={':', ';'}) == @["", "this", "is", "an", "example", "", ""]
+  doAssert s.split(maxsplit=4) == @["", "this", "is", "an", "example  "]
+  doAssert s.split(' ', maxsplit=1) == @["", "this is an example  "]
+  doAssert s.split(" ", maxsplit=4) == @["", "this", "is", "an", "example  "]
 
   block: # formatEng tests
     doAssert formatEng(0, 2, trim=false) == "0.00"