Permalink
Browse files

Add PostgreSQL array parser

This adds a C-based PostgreSQL array parser.  The original
implementation is from the pg_array_parser library, but I've
heavily modified it.  This C-based parser is 5-500 times faster
than the pure-Ruby parser that Sequel uses by default: 5 times
faster for an empty array, and 500 times faster for an array
with a single 10MB string.

Because the pg_array extension can be loaded before or after
sequel_pg, handle the case where pg_array is already loaded when
sequel_pg is required by switching the Creator class to use the
C-based parser instead of the pure-Ruby parser.
  • Loading branch information...
jeremyevans committed Jun 25, 2012
1 parent d2ff76c commit dc584223802327a2d8f69b3f27c4ffb1a1976627
Showing with 158 additions and 1 deletion.
  1. +4 −0 CHANGELOG
  2. +29 −1 MIT-LICENSE
  3. +111 −0 ext/sequel_pg/sequel_pg.c
  4. +14 −0 lib/sequel_pg/sequel_pg.rb
View
@@ -1,3 +1,7 @@
+=== HEAD
+
+* Add C-based PostgreSQL array parser, for major speedup in parsing arrays (Dan McClain, jeremyevans)
+
=== 1.4.0 (2012-06-01)
* Add support for streaming on PostgreSQL 9.2 using PQsetRowProcessor (jeremyevans)
View
@@ -1,4 +1,4 @@
-Copyright (c) 2010-2011 Jeremy Evans
+Copyright (c) 2010-2012 Jeremy Evans
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -17,3 +17,31 @@ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+The original array parsing code (parse_pg_array, read_array) was taken from
+the pg_array_parser library (https://github.com/dockyard/pg_array_parser)
+and has the following license:
+
+Copyright (c) 2012 Dan McClain
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
View
@@ -114,6 +114,115 @@ static int enc_get_index(VALUE val)
}
#endif
+/* Parse one nesting level of a PostgreSQL array literal into a Ruby array.
+ *
+ * index:     in/out cursor into c_pg_array_string.  On entry it points at the
+ *            first character after an opening '{'.  On return it points at
+ *            the '}' that closed this level, or equals array_string_length
+ *            if the input ended first.
+ * word:      scratch buffer, shared with every recursive call, in which the
+ *            bytes of the element currently being read are accumulated.
+ * converter: when truthy, every non-NULL element string is passed to it via
+ *            spg_id_call and the result is stored instead of the string.
+ *
+ * An unquoted NULL becomes nil and a nested '{' produces a nested array. */
+static VALUE read_array(int *index, char *c_pg_array_string, int array_string_length, char *word, VALUE converter)
+{
+  /* Number of bytes collected in word for the current element. */
+  int word_len = 0;
+
+  /*  1: inside a quoted string
+   *  0: outside quotes, current element has never been quoted
+   * -1: outside quotes, current element was quoted earlier */
+  int quote_state = 0;
+
+  /* Dual-purpose flag.
+   * Inside quotes: the previous byte was a backslash, so the next byte is
+   * copied verbatim.
+   * Outside quotes: the last entry was a subarray (already appended), so
+   * word must not be appended at the next delimiter. */
+  int pending = 0;
+
+  VALUE result = rb_ary_new();
+  char ch;
+
+  /* An array closed immediately is empty; handling it here keeps the
+   * loop below free of that special case. */
+  if(*index < array_string_length && c_pg_array_string[*index] == '}')
+  {
+    return result;
+  }
+
+  for(; *index < array_string_length; ++(*index))
+  {
+    ch = c_pg_array_string[*index];
+
+    if(quote_state >= 1)
+    {
+      /* Quoted text: only a backslash and the closing quote are special. */
+      if(pending)
+      {
+        word[word_len++] = ch;
+        pending = 0;
+      }
+      else if(ch == '\\')
+      {
+        pending = 1;
+      }
+      else if(ch == '"')
+      {
+        quote_state = -1;
+      }
+      else
+      {
+        word[word_len++] = ch;
+      }
+      continue;
+    }
+
+    switch(ch)
+    {
+      case ',':
+      case '}':
+        /* End of an element: append it unless it was a subarray. */
+        if(!pending)
+        {
+          VALUE element;
+
+          /* Only a never-quoted NULL is the SQL NULL; "NULL" is a string. */
+          if(quote_state == 0 && word_len == 4 && strncmp(word, "NULL", 4) == 0)
+          {
+            element = Qnil;
+          }
+          else
+          {
+            element = rb_str_new(word, word_len);
+            if(RTEST(converter))
+            {
+              element = rb_funcall(converter, spg_id_call, 1, element);
+            }
+          }
+          rb_ary_push(result, element);
+        }
+        if(ch == '}')
+        {
+          /* Leave *index on the '}' so the caller's loop steps past it. */
+          return result;
+        }
+        pending = 0;
+        quote_state = 0;
+        word_len = 0;
+        break;
+
+      case '"':
+        quote_state = 1;
+        break;
+
+      case '{':
+        /* Step over the '{' and let the recursive call consume the subarray. */
+        ++(*index);
+        rb_ary_push(result, read_array(index, c_pg_array_string, array_string_length, word, converter));
+        pending = 1;
+        break;
+
+      default:
+        word[word_len++] = ch;
+        break;
+    }
+  }
+
+  /* Input ended without a closing '}'; return what was collected. */
+  return result;
+}
+
+/* Parse a PostgreSQL array literal held in a Ruby string into a (possibly
+ * nested) Ruby array.
+ *
+ * pg_array_string: the array text.  Parsing starts at offset 1, i.e. the
+ *                  first character is skipped (presumably the opening '{';
+ *                  it is not validated here).
+ * converter:       passed through to read_array; when truthy it is applied
+ *                  to every non-NULL element.
+ *
+ * Raises ArgumentError if the string is too long for the int-based parser. */
+static VALUE parse_pg_array(VALUE self, VALUE pg_array_string, VALUE converter) {
+
+  /* convert to c-string, create additional ruby string buffer of
+   * the same length, as that will be the worst case. */
+  char *c_pg_array_string = StringValueCStr(pg_array_string);
+  long full_length = RSTRING_LEN(pg_array_string);
+  int array_string_length = (int)full_length;
+  VALUE buf;
+  VALUE result;
+  char *word;
+  int index = 1;
+
+  /* read_array indexes with int; refuse input whose length would be
+   * silently truncated instead of parsing only part of it. */
+  if ((long)array_string_length != full_length) {
+    rb_raise(rb_eArgError, "PostgreSQL array string too long to parse");
+  }
+
+  buf = rb_str_buf_new(array_string_length);
+  word = RSTRING_PTR(buf);
+
+  result = read_array(&index, c_pg_array_string, array_string_length, word, converter);
+
+  /* Only raw pointers into these strings are used while read_array allocates
+   * objects and calls the converter, so keep both VALUEs visible to the
+   * garbage collector until parsing has finished. */
+  RB_GC_GUARD(buf);
+  RB_GC_GUARD(pg_array_string);
+
+  return result;
+}
+
static VALUE spg_time(const char *s) {
VALUE now;
int hour, minute, second, tokens;
@@ -973,5 +1082,7 @@ void Init_sequel_pg(void) {
rb_define_private_method(c, "with_row_processor", spg_with_row_processor, 3);
#endif
+ rb_define_singleton_method(spg_Postgres, "parse_pg_array", parse_pg_array, 2);
+
rb_require("sequel_pg/sequel_pg");
}
View
@@ -83,3 +83,17 @@ def optimize_model_load?
(rp = row_proc).is_a?(Class) && (rp < Sequel::Model) && optimize_model_load && !opts[:use_cursor] && !opts[:graph]
end
end
+
+if defined?(Sequel::Postgres::PGArray)
+ # pg_array extension previously loaded
+
+ class Sequel::Postgres::PGArray::Creator
+ # Override Creator to use sequel_pg's C-based parser instead of the pure ruby parser.
+ def call(string)
+ Sequel::Postgres::PGArray.new(Sequel::Postgres.parse_pg_array(string, @converter), @type)
+ end
+ end
+
+ # Remove the pure-ruby parser, no longer needed.
+ Sequel::Postgres::PGArray.send(:remove_const, :Parser)
+end

0 comments on commit dc58422

Please sign in to comment.