-
Notifications
You must be signed in to change notification settings - Fork 195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: update lance file format to support per-page encoding #1857
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,200 @@ | |
syntax = "proto3"; | ||
|
||
package lance.file; | ||
|
||
// Lance v2 File Format | ||
// | ||
// Note: the number of pages (PN) is independent of the | ||
// number of columns (CN) and each page/column can | ||
// have any number of buffers (Px_N)/(Cx_N) which | ||
// is determined by the encodings. | ||
// | ||
// ├────────────────────────────────┤ | ||
// | Data Pages | | ||
// | Page 0, Buffer 0 | | ||
// | ... | | ||
// | Page 0, Buffer P0_N | | ||
// | (optional padding) | | ||
// | Page 1, Buffer 0 | | ||
// | ... | | ||
// | Page PN, Buffer PN_N | | ||
// ├────────────────────────────────┤ | ||
// | Column Metadatas | | ||
// | Column 0 Metadata | | ||
// | Column 0 Meta Buffer 0 | | ||
// | ... | | ||
// | Column 0 Meta Buffer C0_N | | ||
// | (optional padding) | | ||
// | Column 1 Metadata | | ||
// | ... | | ||
// | Column CN Meta Buffer CN_N | | ||
// | (optional padding) | | ||
// ├────────────────────────────────┤ | ||
// | Column Metadata Offset Table | | ||
// | Column 0 Metadata Position | | ||
// | Column 0 Metadata Size | | ||
// | ... | | ||
// | Column CN Metadata Position | | ||
// | Column CN Metadata Size | | ||
// ├────────────────────────────────┤ | ||
// | FileDescriptor | | ||
// ├────────────────────────────────┤ | ||
// | Metadata | | ||
// ├────────────────────────────────┤ | ||
// | Footer | | ||
// | i64: Metadata position | | ||
// | u16: Major version | | ||
// | u16: Minor version | | ||
// | "LANC" | | ||
// ├────────────────────────────────┤ | ||
// | ||
// Note that (optional padding) indicates that a writer may insert padding bytes between | ||
// pages or column metadata blocks to align these to disk sector boundaries. Readers | ||
// should account for this (e.g. cannot assume pages are contiguous). | ||
// | ||
// This padding is normally not present in cloud storage where such padding is not helpful. | ||
|
||
|
||
// A file descriptor that describes the contents of a Lance file | ||
message FileDescriptor { | ||
// The schema of the file | ||
Schema schema = 1; | ||
// The number of rows in the file | ||
uint32 length = 2; | ||
} | ||
|
||
// A schema which describes the data type of each of the columns | ||
message Schema { | ||
// All fields in this file, including the nested fields. | ||
repeated lance.file.Field fields = 1; | ||
// Schema metadata. | ||
map<string, bytes> metadata = 5; | ||
} | ||
|
||
// Leaf encoding where all values in the array are the same value | ||
// | ||
// Buffers: | ||
// * There are no buffers | ||
// Children: | ||
// * There are no children | ||
message ConstantEncoding { | ||
// The constant value | ||
// If this is empty then the value is null | ||
bytes value = 1; | ||
} | ||
|
||
// Leaf encoding for fixed-width types | ||
// | ||
// This encoding does not store nulls, but it is commonly a | ||
// child of MaskedEncoding in which case some values may be | ||
// garbage. | ||
// | ||
// Buffers: | ||
// * Values - A contiguous buffer of fixed-size values | ||
message ValueEncoding { | ||
// The size of each item, in bits | ||
uint32 item_width = 1; | ||
} | ||
|
||
// Densely encodes nullable types using a boolean mask | ||
// | ||
// Children: | ||
// * Values - An array of values with the same logical length as this array | ||
// Null values are present but the bytes are garbage and should be | ||
// ignored. | ||
// * Validity - A boolean array representing whether each value is null or not | ||
message MaskedEncoding { | ||
// The encoding used to store the values | ||
EncodingType value_encoding = 1; | ||
// The encoding used to store the validity | ||
EncodingType validity_encoding = 2; | ||
} | ||
|
||
// Values encoded using a dictionary | ||
// | ||
// Buffers: | ||
// * There are no buffers | ||
// Children: | ||
// * An array of indices into the dictionary (which are unsigned integers) | ||
// * An array of values | ||
message DictionaryEncoding { | ||
// The number of bits in each key | ||
uint32 key_width = 1; | ||
// The encoding used to encode the keys | ||
EncodingType key_encoding = 2; | ||
// The encoding used to encode the values | ||
EncodingType value_encoding = 3; | ||
} | ||
|
||
// Encoding for variable-length lists of data | ||
// | ||
// Buffers: | ||
// * There are no buffers | ||
// Children: | ||
// * An array of offsets (which are unsigned integer values) | ||
// * An array of values | ||
message VariableLengthEncoding { | ||
// The number of bits in each offset | ||
uint32 offset_width = 1; | ||
// The encoding used to encode the offsets | ||
EncodingType offset_encoding = 3; | ||
// The encoding used to encode the values | ||
EncodingType value_encoding = 4; | ||
} | ||
|
||
// An encoding describes how an array is physically serialized into buffers | ||
// | ||
// Some encodings split an array into multiple arrays. For example, plain | ||
// encoding stores an array of values and an array for validity. Each of those | ||
// child arrays has its own encoding. This means that the encoding forms a | ||
// tree of encodings where the leaves are buffers. | ||
message EncodingType { | ||
oneof type { | ||
ConstantEncoding constant = 1; | ||
ValueEncoding value = 2; | ||
MaskedEncoding masked = 3; | ||
DictionaryEncoding dictionary = 4; | ||
VariableLengthEncoding variable_length = 5; | ||
} | ||
} | ||
|
||
message ColumnMetadata { | ||
message Page { | ||
// The file offsets of each of the page buffers | ||
// | ||
// These offsets might point to the column's data section or they | ||
// might point to the column's metadata section. | ||
// | ||
Comment on lines
+183
to
+185
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does in the data vs metadata section? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Data section is data that is unique to a page. Metadata section is data that is common across all pages in a column. E.g. if you wanted to have a single dictionary for all dictionary encoded pages in a column you could put it in the metadata section. |
||
// The number of pages is variable and depends on the encoding. There | ||
// may be zero buffers (e.g. constant encoded data) in which case this | ||
// could be empty. | ||
repeated uint64 buffer_offsets = 1; | ||
// The size (in bytes) of each of the page buffers | ||
// | ||
// This field will have the same length as `buffer_offsets` and | ||
// may be empty. | ||
repeated uint64 buffer_sizes = 2; | ||
// Logical length (e.g. # rows) of the page | ||
uint32 length = 3; | ||
// The top-level node of the encoding tree used to encode the page | ||
EncodingType encoding = 4; | ||
} | ||
repeated Page pages = 1; | ||
ColumnStatistics statistics = 2; | ||
|
||
// Statistics store the min/max/nulls/... statistics of a page. | ||
// There is an array for each statistic. The logical length of the | ||
// arrays is equal the the number of pages for the column. The number of | ||
// statistics pages may be smaller than the number of column pages (in most | ||
// cases there will only be one page of statistics) | ||
message ColumnStatistics { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where do these go? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The buffers will be in the metadata section. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realized I forgot an actual reference to this message. I've added that in
|
||
// The schema of statistics for this column | ||
repeated Field schema = 1; | ||
// Pages of statistics data | ||
repeated Page pages = 2; | ||
} | ||
} | ||
|
||
// Metadata of one Lance file. | ||
message Metadata { | ||
// 4 was used for StatisticsMetadata in the past, but has been moved to prevent | ||
|
@@ -28,10 +221,27 @@ message Metadata { | |
|
||
// Position of the manifest in the file. If it is zero, the manifest is stored | ||
// externally. | ||
// | ||
// In Lance version 1 this is a protobuf-encoded Manifest message from the table | ||
// format. | ||
// | ||
// In Lance version 2 this is a FileDescriptor message and, for consistency and | ||
// simplicity this message should always be present. If the schema is cached or | ||
// stored externally then the reader is free to skip reading this. | ||
// | ||
// This message is immediately followed by the metadata block. Therefore, the | ||
// size of this block can be calculated by subtracting this value from the | ||
// metadata offset (that was used to read this message) | ||
uint64 manifest_position = 1; | ||
|
||
// Logical offsets of each chunk group, i.e., number of the rows in each | ||
// chunk. | ||
// | ||
// This field is optional in Lance version 2. If it is non-empty then the | ||
// writer should guarantee that pages are cutoff to align with these boundaries. | ||
// | ||
// If this is a Lance version 2 file and this is empty then it means that | ||
// the writer did not write pages into row groups. | ||
repeated int32 batch_offsets = 2; | ||
|
||
// The file position that page table is stored. | ||
|
@@ -51,6 +261,10 @@ message Metadata { | |
// position = page_table[5][4][0]; | ||
// length = page_table[5][4][1]; | ||
// ``` | ||
// | ||
// This field is only used in Lance version 1. In a Lance version 2 file | ||
// the page table is replaced by detailed column metadata and this field will | ||
// always be 0. | ||
uint64 page_table_position = 3; | ||
|
||
message StatisticsMetadata { | ||
|
@@ -78,13 +292,49 @@ message Metadata { | |
// position = stats_page_table[5][0]; | ||
// length = stats_page_table[5][1]; | ||
// ``` | ||
uint64 page_table_position = 3; | ||
uint64 page_table_position = 3; | ||
} | ||
|
||
// File statistics. This field is only present in Lance version 1. | ||
// | ||
// In Lance version 2 statistics are stored per-column and this field | ||
// will not be present. | ||
StatisticsMetadata statistics = 5; | ||
|
||
// The number of columns in the file (including inner columns when there | ||
// are nested fields) | ||
// | ||
// This can be used to access the column metadata offsets array which is | ||
// stored immediately before manifest_position. | ||
// | ||
// Given N columns the column metadata positions and sizes are stored in a | ||
// contiguous buffer of 2*N uint64 values immediately preceding the file | ||
// descriptor (or the metadata if the file is not self describing). | ||
// | ||
// If we let `column_offsets_pos` be: | ||
// manifest_position - (16 * num_columns) | ||
Comment on lines
+314
to
+315
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we should just have a direct offset here. That would make it possible to have another optional section later. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As in More generally, just feeling a bit squeamish about having to make these kind of computations to get the offset and size of a section. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I get it now. I'll add that in. |
||
// | ||
// Then the metadata for column x starts at the uint64: | ||
// file[column_offsets_pos + (16 * x)] | ||
// The size of the metadata for column x is given by the uint64: | ||
// file[column_offsets_pos + (16 * x) + 8] | ||
// | ||
// This field is ignored in Lance version 1 files (the page table is used instead) | ||
uint32 num_columns = 6; | ||
Comment on lines
+304
to
+323
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this solves problem 1 described in #1809, but I'm not sure if it solves problem 2. Could you describe how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there's two parts:
I'm not sure yet why we needed to fill these gaps in the old approach. It's very possible I'm missing something. I was thinking, when we read the file, the column metadata at index X corresponds to the field at index X (in dfs-pre-order) of the schema in the file descriptor. Field IDs are a table concern and not something that the file format bothers with (other than storing and loading them faithfully)
It's probably true that lists will never have a validity buffer (since we can always use negative values as sentinels) but we still need a flag in the metadata telling us if we need to post-process the sentinels or not. Also, (non-fixed-size) lists do have an offsets buffer, and there are many different ways that buffer could be encoded. Struct might have a nullability bitmap or they might not. If the page doesn't have any nulls we can skip the buffer. If the page has nulls but doesn't have any all-null-structs then we can use a sentinel. If the page has nulls and all-null-structs then we need a bitmap. We could also encode structs using a sparse bitmap like they do in procella. In this case there is a one validity bitmap buffer AND an offsets buffer per child. The same goes with fixed-size-list. There may be a need for a validity buffer or we may choose to do some kind of sparse encoding of the FSL (basically turning it into a variable size list where zero-size lists represent null) if most of the data is null. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In the old way, the page table is represented in memory as a
This is fine, we just need to be really specific about what the dfs-pre-order is. Right now FSL doesn't have child field, and thus the current page table ignores them, even though it does have entries for structs. However, later we want to change schemas to include child fields of FSL, so they are aligned with Arrow schemas. We can either do that now, in our definition of the dfs-pre-order, or find some compatible way to make that change in the future. I think my main concern at this point is the transition plan for FSL children. The status quo that we have column metadata for lists and structs makes sense. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I follow your concern better now. The pragmatic answer to your question is that the only thing that matters for nesting is struct columns. FSL, regular lists, and maps do not have "depth" in the proposal (the protobuf encoding is recursive and does have depth but this is true even for primitive columns). The outside-the-box new-age answer is that the file format does not have any depth at all. There is just a flat list of columns. "nesting" is an in-memory / arrow concept and we have to map that concept onto the file format in the reader /writer. For example, we could choose to map arrow struct arrays into a single "packed" column. We could choose to map arrow struct arrays into a single "shredded" column (the list of children, instead of being at the "column" level, would be instead nested into the encoding). Both of these approaches are probably bad ideas since they prevent any kind of nested projection pushdown (though the "packed struct" may still be useful in certain cases). We could even nest other things. We could create a special kind of FSL, let's call it a tuple, where we encode each index into its own "lance column". This would allow us to do nested tuple projection. Although, again, being practical, it would be less work for everyone else (e.g. arrow, etc.) if we implemented such a thing as a struct. |
||
// The start of the column metadata section | ||
// | ||
// If column projection is not needed, or if the goal is to cache all column | ||
// metadata, then this field can be used to quickly load the entire column metadata | ||
// section in a single read without referring to the column metadata offsets array | ||
// | ||
// This field is ignored in Lance version 1 files | ||
uint64 column_metadata_start = 7; | ||
Comment on lines
+324
to
+331
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This gives us the start, how do we know where the end is? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose the intention was to have it end at |
||
} // Metadata | ||
|
||
// Supported encodings. | ||
// | ||
// Only used in Lance version 1. In Lance version 2 the | ||
// equivalent is EncodingType which is part of a page. | ||
enum Encoding { | ||
// Invalid encoding. | ||
NONE = 0; | ||
|
@@ -99,6 +349,8 @@ enum Encoding { | |
} | ||
|
||
// Dictionary field metadata | ||
// | ||
// Only used in Lance version 1 | ||
message Dictionary { | ||
/// The file offset for storing the dictionary value. | ||
/// It is only valid if encoding is DICTIONARY. | ||
|
@@ -119,7 +371,7 @@ message Field { | |
} | ||
Type type = 1; | ||
|
||
// Fully qualified name. | ||
// Fully qualified name. Lance requires unique column names. | ||
string name = 2; | ||
/// Field Id. | ||
/// | ||
|
@@ -159,12 +411,24 @@ message Field { | |
// If this field is nullable. | ||
bool nullable = 6; | ||
|
||
// The encoding of the Field | ||
// | ||
// This is ignored in Lance version 2 | ||
// A column may be stored in many different encodings (the writer can choose what is most appropriate | ||
// for any given page) | ||
// | ||
// When loading data in memory the user might specify a desired target encoding but this should be | ||
// unrelated to the encoding used in the file. | ||
Encoding encoding = 7; | ||
|
||
/// The file offset for storing the dictionary value. | ||
/// It is only valid if encoding is DICTIONARY. | ||
/// | ||
/// The logic type presents the value type of the column, i.e., string value. | ||
/// | ||
/// This is ignored in Lance version 2. If a dictionary is common throughout a file then the dictionary | ||
/// should be stored in the column metadata. If the dictionary changes from page to page then the dictionary | ||
/// should be stored within the page. | ||
Dictionary dictionary = 8; | ||
|
||
// Deprecated: optional extension type name, use metadata field ARROW:extension:name | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW are you going to mention Sentinel encoding?