Bug #32738705: "OUT OF SORT MEMORY ERROR" HAS AN INCONSISTENT RELATIO…

…NSHIP WITH THE BUFFER SIZE Bug #33501541: Unmanageable Sort Buffer Behavior in 8.0.20+ Implement direct disk-to-disk copies of large packed addons during the filesort merge phase; if a single row is so large that its addons do not fit into its slice of the sort buffer during merging (even after emptying that slice of all other rows), but the sort key _does_ fit, simply sort the truncated row as usual, and then copy the rest of the addon incrementally from the input to the output, 4 kB at a time, when the row is to be written to the merge output. This is possible because the addon itself doesn't need to be in RAM for the row to be compared against other rows; only the sort key must. This greatly relaxes the sort buffer requirements for successful merging, especially when it comes to JSON rows or small blobs (which are typically used as packed addons, not sort keys). The rules used to be: 1. During initial chunk generation: The sort buffer must be at least as large as the largest row to be sorted. 2. During merging: Merging is guaranteed to pass if the sort buffer is at least 15 times as large as the largest row (sort key + addons), but one may be lucky and pass with only the demands from #1. Now, for sorts implemented using packed addons (which is the common case for small blobs and JSON), the new rules are: 1. Unchanged from #1 above. 2. During merging: Merging is guaranteed to pass if the sort buffer is at least 15 times are large as the largest _sort key_ (plus 4-byte length marker), but one may be lucky and pass with only the demands from #1. In practice, this means that filesort merging will almost never fail due to insufficient buffer space anymore; the query will either fail because a single row is too large in the sort step, or it will pass nearly all of the time. However, do note that while such merges will work, they will not always be very performant, as having lots of 1-row merge chunks will mean many merge passes and little work being done during the initial in-memory sort. Thus, the main use of this functionality is to be able to do sorts where there are a few rows with large JSON values or similar, but where most fit comfortably into the buffer. Also note that since requirement #1 is unchanged, one still cannot sort e.g. 500 kB JSON values using the default 256 kB sort buffer. Older recommendations to keep sort buffers small at nearly any cost are no longer valid, and have not been for a while. Sort buffers should be sized to as much RAM as one can afford without interfering with other tasks (such as the buffer pool, join buffers, or other concurrent sorts), and small sorts are not affected by the maximum sort buffer size being set to a larger value, as the sort buffer is incrementally allocated. Change-Id: I85745cd513402a42ed5fc4f5b7ddcf13c5793100
inikep · Nov 11, 2021 · 463d78d · 463d78d
1 parent 1de8c52
commit 463d78d
Show file tree

Hide file tree

Showing 4 changed files with 155 additions and 25 deletions.
diff --git a/mysql-test/r/filesort.result b/mysql-test/r/filesort.result
@@ -195,3 +195,24 @@ LIMIT 1
 );
 ERROR 22023: Invalid GIS data provided to function st_astext.
 DROP TABLE t;
+#
+# Bug #32738705: "OUT OF SORT MEMORY ERROR" HAS AN INCONSISTENT RELATIONSHIP WITH THE BUFFER SIZE
+#
+CREATE TABLE t1 ( a INTEGER, b TEXT );
+INSERT INTO t1 VALUES (1, REPEAT('x', 40001));
+INSERT INTO t1 VALUES (2, REPEAT('x', 40002));
+INSERT INTO t1 VALUES (3, REPEAT('x', 40003));
+INSERT INTO t1 VALUES (4, REPEAT('x', 40005));
+INSERT INTO t1 VALUES (5, REPEAT('x', 40008));
+INSERT INTO t1 VALUES (6, REPEAT('x', 40013));
+SET sort_buffer_size=65536;
+SELECT a, LENGTH(b) FROM t1 ORDER BY a DESC;
+a	LENGTH(b)
+6	40013
+5	40008
+4	40005
+3	40003
+2	40002
+1	40001
+SET sort_buffer_size=DEFAULT;
+DROP TABLE t1;
diff --git a/mysql-test/t/filesort.test b/mysql-test/t/filesort.test
@@ -189,3 +189,24 @@ SELECT
   LIMIT 1
 );
 DROP TABLE t;
+
+--echo #
+--echo # Bug #32738705: "OUT OF SORT MEMORY ERROR" HAS AN INCONSISTENT RELATIONSHIP WITH THE BUFFER SIZE
+--echo #
+
+CREATE TABLE t1 ( a INTEGER, b TEXT );
+INSERT INTO t1 VALUES (1, REPEAT('x', 40001));
+INSERT INTO t1 VALUES (2, REPEAT('x', 40002));
+INSERT INTO t1 VALUES (3, REPEAT('x', 40003));
+INSERT INTO t1 VALUES (4, REPEAT('x', 40005));
+INSERT INTO t1 VALUES (5, REPEAT('x', 40008));
+INSERT INTO t1 VALUES (6, REPEAT('x', 40013));
+
+# Set up a sort with large addons (since b is TEXT and not HUGETEXT,
+# it's treated as a packed addon). We can keep one row and then some
+# in the sort buffer, but not two, so we need special handling during merging.
+SET sort_buffer_size=65536;
+SELECT a, LENGTH(b) FROM t1 ORDER BY a DESC;
+SET sort_buffer_size=DEFAULT;
+
+DROP TABLE t1;
diff --git a/sql/filesort.cc b/sql/filesort.cc
@@ -1737,6 +1737,9 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
       bytes_to_read = rec_length * static_cast<size_t>(count);
       if (count == 0) {
         // Not even room for the first row.
+        // TODO(sgunders): Consider adopting the single-row
+        // fallback from packed addons below, if it becomes
+        // an issue.
         my_error(ER_OUT_OF_SORTMEMORY, ME_FATALERROR);
         LogErr(ERROR_LEVEL, ER_SERVER_OUT_OF_SORTMEMORY);
         return (uint)-1;
@@ -1750,6 +1753,8 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
     if (mysql_file_pread(fromfile->file, merge_chunk->buffer_start(),
                          bytes_to_read, merge_chunk->file_position(), MYF_RW))
       return (uint)-1; /* purecov: inspected */
+    merge_chunk->set_valid_buffer_end(merge_chunk->buffer_start() +
+                                      bytes_to_read);
 
     size_t num_bytes_read;
     if (packed_addon_fields || using_varlen_keys) {
@@ -1760,6 +1765,7 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
        */
       uchar *record = merge_chunk->buffer_start();
       uint ix = 0;
+      uint extra_bytes_to_advance = 0;
       for (; ix < count; ++ix) {
         if (using_varlen_keys &&
             (record + Sort_param::size_of_varlength_field) >=
@@ -1780,8 +1786,24 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
                 ? Addon_fields::read_addon_length(start_of_payload)
                 : param->fixed_res_length;
 
-        if (start_of_payload + res_length >= merge_chunk->buffer_end())
-          break;  // Incomplete record.
+        // NOTE: There are some dances with the arithmetic here to avoid
+        // forcing invalid pointers (start_of_payload + record may be
+        // outside the legal areas).
+        if (start_of_payload > merge_chunk->buffer_end() - res_length) {
+          // Incomplete record, but importantly, we're only missing the
+          // addon fields, so in a pinch, we can still merge the row
+          // and only stream the addon fields from disk to disk if needed.
+          // So we pretend we've read these bytes, and we'll stream
+          // the remaining ones from disk when we actually copy the row.
+          //
+          // We do this as a last-resort if we otherwise couldn't fit
+          // any rows at all, so that the merge doesn't fail.
+          if (ix == 0) {
+            ix = 1;
+            extra_bytes_to_advance = res_length + (start_of_payload - record);
+          }
+          break;
+        }
 
         assert(res_length > 0);
         record = start_of_payload + res_length;
@@ -1794,6 +1816,7 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
       }
       count = ix;
       num_bytes_read = record - merge_chunk->buffer_start();
+      num_bytes_read += extra_bytes_to_advance;
       DBUG_PRINT("info", ("read %llu bytes of complete records",
                           static_cast<ulonglong>(bytes_to_read)));
     } else
@@ -1809,6 +1832,61 @@ static uint read_to_buffer(IO_CACHE *fromfile, Merge_chunk *merge_chunk,
   return 0;
 } /* read_to_buffer */
 
+/**
+  Copy “count” bytes from one file, starting at offset “offset”, to the current
+  write position (usually the end) of the other.
+ */
+static int copy_bytes(IO_CACHE *to_file, IO_CACHE *from_file, size_t count,
+                      my_off_t offset) {
+  // TODO(sgunders): Consider reusing the merge buffer if it's larger
+  // than 4 kB. However, note that we can only use the payload part of it,
+  // since we may need the sort key for deduplication purposes.
+  uchar buf[4096];
+  while (count > 0) {
+    size_t bytes_to_copy = min(count, sizeof(buf));
+    if (mysql_file_pread(from_file->file, buf, bytes_to_copy, offset, MYF_RW)) {
+      return 1; /* purecov: inspected */
+    }
+    if (my_b_write(to_file, buf, bytes_to_copy)) {
+      return 1; /* purecov: inspected */
+    }
+    count -= bytes_to_copy;
+    offset += bytes_to_copy;
+  }
+  return 0;
+}
+
+/**
+  Copy a row from “from_file” to “to_file” (this is used during merging).
+  Most commonly, we'll already have all (or most) of it in memory,
+  as indicated by “merge_chunk”, which must be positioned on the row.
+  But in very tight circumstances (see read_to_buffer(), some of it
+  may still be on disk, and will need to be copied directly from file to file.
+ */
+static int copy_row(IO_CACHE *to_file, IO_CACHE *from_file,
+                    Merge_chunk *merge_chunk, uint offset,
+                    uint bytes_to_write) {
+  // NOTE: We need to use valid_buffer_end() and not buffer_end(), as the buffer
+  // may have grown since we read data into it.
+  uchar *row_start = merge_chunk->current_key() + offset;
+  size_t bytes_in_buffer =
+      min<size_t>(merge_chunk->valid_buffer_end() - row_start, bytes_to_write);
+  size_t remaining_bytes = bytes_to_write - bytes_in_buffer;
+
+  if (bytes_in_buffer > 0) {
+    if (my_b_write(to_file, row_start, bytes_in_buffer)) {
+      return 1; /* purecov: inspected */
+    }
+  }
+  if (remaining_bytes > 0) {
+    if (copy_bytes(to_file, from_file, remaining_bytes,
+                   merge_chunk->file_position() - remaining_bytes)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
 /**
   Merge buffers to one buffer.
 
@@ -1911,9 +1989,9 @@ static int merge_buffers(THD *thd, Sort_param *param, IO_CACHE *from_file,
         }
 
         if (!is_duplicate) {
-          if (my_b_write(to_file, merge_chunk->current_key() + offset,
-                         bytes_to_write)) {
-            return 1; /* purecov: inspected */
+          if (copy_row(to_file, from_file, merge_chunk, offset,
+                       bytes_to_write)) {
+            return 1;
           }
           if (!--max_rows) {
             error = 0; /* purecov: inspected */
@@ -1964,9 +2042,8 @@ static int merge_buffers(THD *thd, Sort_param *param, IO_CACHE *from_file,
            !mcl.key_is_greater_than(merge_chunk->current_key(),
                                     param->m_last_key_seen));
       if (!is_duplicate) {
-        if (my_b_write(to_file, merge_chunk->current_key() + offset,
-                       bytes_to_write)) {
-          return 1; /* purecov: inspected */
+        if (copy_row(to_file, from_file, merge_chunk, offset, bytes_to_write)) {
+          return 1;
         }
         if (!--max_rows) {
           error = 0; /* purecov: inspected */

diff --git a/sql/sql_sort.h b/sql/sql_sort.h
@@ -55,21 +55,13 @@ constexpr size_t VARLEN_PREFIX = 4;
  */
 struct Merge_chunk {
  public:
-  Merge_chunk()
-      : m_current_key(nullptr),
-        m_file_position(0),
-        m_buffer_start(nullptr),
-        m_buffer_end(nullptr),
-        m_rowcount(0),
-        m_mem_count(0),
-        m_max_keys(0) {}
-
   my_off_t file_position() const { return m_file_position; }
   void set_file_position(my_off_t val) { m_file_position = val; }
   void advance_file_position(my_off_t val) { m_file_position += val; }
 
   uchar *buffer_start() { return m_buffer_start; }
   const uchar *buffer_end() const { return m_buffer_end; }
+  const uchar *valid_buffer_end() const { return m_valid_buffer_end; }
 
   void set_buffer(uchar *start, uchar *end) {
     m_buffer_start = start;
@@ -80,6 +72,10 @@ struct Merge_chunk {
     assert(m_buffer_end == nullptr || end <= m_buffer_end);
     m_buffer_end = end;
   }
+  void set_valid_buffer_end(uchar *end) {
+    assert(end <= m_buffer_end);
+    m_valid_buffer_end = end;
+  }
 
   void init_current_key() { m_current_key = m_buffer_start; }
   uchar *current_key() const { return m_current_key; }
@@ -118,14 +114,29 @@ struct Merge_chunk {
   }
 
  private:
-  uchar *m_current_key;      ///< The current key for this chunk.
-  my_off_t m_file_position;  ///< Current position in the file to be sorted.
-  uchar *m_buffer_start;     ///< Start of main-memory buffer for this chunk.
-  uchar *m_buffer_end;       ///< End of main-memory buffer for this chunk.
-  ha_rows m_rowcount;        ///< Number of unread rows in this chunk.
-  ha_rows m_mem_count;       ///< Number of rows in the main-memory buffer.
-  ha_rows m_max_keys;        ///< If we have fixed-size rows:
-                             ///    max number of rows in buffer.
+  /// The current key for this chunk.
+  uchar *m_current_key = nullptr;
+
+  /// Current position in the file to be sorted.
+  my_off_t m_file_position = 0;
+
+  /// Start of main-memory buffer for this chunk.
+  uchar *m_buffer_start = nullptr;
+
+  /// End of main-memory buffer for this chunk.
+  uchar *m_buffer_end = nullptr;
+
+  /// End of actual, valid data for this chunk.
+  uchar *m_valid_buffer_end;
+
+  /// Number of unread rows in this chunk.
+  ha_rows m_rowcount = 0;
+
+  /// Number of rows in the main-memory buffer.
+  ha_rows m_mem_count = 0;
+
+  /// If we have fixed-size rows: max number of rows in buffer.
+  ha_rows m_max_keys = 0;
 };
 
 typedef Bounds_checked_array<Merge_chunk> Merge_chunk_array;