feat: update lance file format to support per-page encoding #1857

Closed · wants to merge 3 commits
268 changes: 266 additions & 2 deletions protos/file.proto
@@ -19,7 +19,200 @@
syntax = "proto3";

package lance.file;

// Lance v2 File Format
//
// Note: the number of pages (PN) is independent of the
// number of columns (CN) and each page/column can
// have any number of buffers (Px_N)/(Cx_N) which
// is determined by the encodings.
//
// ├────────────────────────────────┤
// | Data Pages |
// | Page 0, Buffer 0 |
// | ... |
// | Page 0, Buffer P0_N |
// | (optional padding) |
// | Page 1, Buffer 0 |
// | ... |
// | Page PN, Buffer PN_N |
// ├────────────────────────────────┤
// | Column Metadatas |
// | Column 0 Metadata |
// | Column 0 Meta Buffer 0 |
// | ... |
// | Column 0 Meta Buffer C0_N |
// | (optional padding) |
// | Column 1 Metadata |
// | ... |
// | Column CN Meta Buffer CN_N |
// | (optional padding) |
// ├────────────────────────────────┤
// | Column Metadata Offset Table |
// | Column 0 Metadata Position |
// | Column 0 Metadata Size |
// | ... |
// | Column CN Metadata Position |
// | Column CN Metadata Size |
// ├────────────────────────────────┤
// | FileDescriptor |
// ├────────────────────────────────┤
// | Metadata |
// ├────────────────────────────────┤
// | Footer |
// | i64: Metadata position |
// | u16: Major version |
// | u16: Minor version |
// | "LANC" |
// ├────────────────────────────────┤
//
// Note that (optional padding) indicates that a writer may insert padding bytes between
// pages or column metadata blocks to align these to disk sector boundaries. Readers
// should account for this (e.g. they cannot assume pages are contiguous).
//
// This padding is normally omitted for files written to cloud storage, where such alignment is not helpful.
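
For orientation, here is a minimal sketch (not part of this PR) of how a reader might parse the 16-byte footer shown in the layout above; little-endian byte order is an assumption, since the diagram does not state it.

```rust
/// Parse the footer laid out above: i64 metadata position, u16 major version,
/// u16 minor version, 4-byte "LANC" magic. Byte order is assumed little-endian.
fn parse_footer(tail: &[u8; 16]) -> Result<(u64, u16, u16), String> {
    if &tail[12..16] != b"LANC" {
        return Err("not a Lance file: missing LANC magic".to_string());
    }
    let metadata_position = i64::from_le_bytes(tail[0..8].try_into().unwrap()) as u64;
    let major = u16::from_le_bytes(tail[8..10].try_into().unwrap());
    let minor = u16::from_le_bytes(tail[10..12].try_into().unwrap());
    Ok((metadata_position, major, minor))
}
```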


// A file descriptor that describes the contents of a Lance file
message FileDescriptor {
// The schema of the file
Schema schema = 1;
// The number of rows in the file
uint32 length = 2;
}

// A schema which describes the data type of each of the columns
message Schema {
// All fields in this file, including the nested fields.
repeated lance.file.Field fields = 1;
// Schema metadata.
map<string, bytes> metadata = 5;
}

// Leaf encoding where all values in the array are the same value
//
// Buffers:
// * There are no buffers
// Children:
// * There are no children
message ConstantEncoding {
// The constant value
// If this is empty then the value is null
bytes value = 1;
}

// Leaf encoding for fixed-width types
//
// This encoding does not store nulls, but it is commonly a
// child of MaskedEncoding in which case some values may be
// garbage.
//
// Buffers:
// * Values - A contiguous buffer of fixed-size values
message ValueEncoding {
// The size of each item, in bits
uint32 item_width = 1;
}
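
Because item_width is given in bits, the size of the Values buffer follows from the page's logical length. A small sketch of that arithmetic, assuming tight bit-packing with the final byte padded out (the message does not spell this out):

```rust
/// Bytes needed for a ValueEncoding buffer holding `length` items of
/// `item_width` bits each, assuming tight bit-packing (an assumption; the
/// message above does not define the packing).
fn value_buffer_size(item_width: u32, length: u64) -> u64 {
    (item_width as u64 * length + 7) / 8
}

fn main() {
    assert_eq!(value_buffer_size(32, 1000), 4000); // int32 page of 1000 rows
    assert_eq!(value_buffer_size(1, 1000), 125);   // 1-bit validity for 1000 rows
}
```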

// Densely encodes nullable types using a boolean mask
//
// Children:
// * Values - An array of values with the same logical length as this array
// Null values are present but the bytes are garbage and should be
// ignored.
// * Validity - A boolean array representing whether each value is null or not
message MaskedEncoding {
Contributor: BTW are you going to mention Sentinel encoding?

// The encoding used to store the values
EncodingType value_encoding = 1;
// The encoding used to store the validity
EncodingType validity_encoding = 2;
}

// Values encoded using a dictionary
//
// Buffers:
// * There are no buffers
// Children:
// * An array of indices into the dictionary (which are unsigned integers)
// * An array of values
message DictionaryEncoding {
// The number of bits in each key
uint32 key_width = 1;
// The encoding used to encode the keys
EncodingType key_encoding = 2;
// The encoding used to encode the values
EncodingType value_encoding = 3;
}

// Encoding for variable-length lists of data
//
// Buffers:
// * There are no buffers
// Children:
// * An array of offsets (which are unsigned integer values)
// * An array of values
message VariableLengthEncoding {
// The number of bits in each offset
uint32 offset_width = 1;
// The encoding used to encode the offsets
EncodingType offset_encoding = 3;
// The encoding used to encode the values
EncodingType value_encoding = 4;
}
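
To make the two children concrete: for a page of the strings ["a", "bc", ""], the values child holds the concatenated bytes and the offsets child records where each element ends. A small sketch, assuming an Arrow-style convention of N+1 offsets starting at zero (the message itself does not pin the convention down):

```rust
// Hypothetical decomposition of ["a", "bc", ""] into the offsets and values
// children named above; the N+1, zero-based offsets convention is an
// assumption borrowed from Arrow, not something this proto prescribes.
fn split_strings(values: &[&str]) -> (Vec<u32>, Vec<u8>) {
    let mut offsets = vec![0u32];
    let mut bytes = Vec::new();
    for v in values {
        bytes.extend_from_slice(v.as_bytes());
        offsets.push(bytes.len() as u32);
    }
    (offsets, bytes)
}

fn main() {
    let (offsets, bytes) = split_strings(&["a", "bc", ""]);
    assert_eq!(offsets, vec![0, 1, 3, 3]); // offset_width would be 32 here
    assert_eq!(bytes, b"abc".to_vec());
}
```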

// An encoding describes how an array is physically serialized into buffers
//
// Some encodings split an array into multiple arrays. For example, plain
// encoding stores an array of values and an array for validity. Each of those
// child arrays has its own encoding. This means that the encoding forms a
// tree of encodings where the leaves are buffers.
message EncodingType {
oneof type {
ConstantEncoding constant = 1;
ValueEncoding value = 2;
MaskedEncoding masked = 3;
DictionaryEncoding dictionary = 4;
VariableLengthEncoding variable_length = 5;
}
}
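
Because EncodingType is recursive, a page's encoding is really a small tree. The sketch below mirrors the messages above with a plain Rust enum (not the prost-generated types, which would have different names) and builds two plausible trees: a nullable int32 page and a dictionary-encoded string page. Which tree a real writer would pick is an assumption, not something the PR prescribes.

```rust
// Plain-Rust mirror of the messages above, just to make the tree shape concrete.
#[allow(dead_code)]
enum EncodingTree {
    Constant { value: Option<Vec<u8>> },
    Value { item_width_bits: u32 },
    Masked { values: Box<EncodingTree>, validity: Box<EncodingTree> },
    Dictionary { key_width_bits: u32, keys: Box<EncodingTree>, values: Box<EncodingTree> },
    VariableLength { offset_width_bits: u32, offsets: Box<EncodingTree>, values: Box<EncodingTree> },
}

fn main() {
    // One plausible tree for a nullable int32 page: garbage-filled fixed-width
    // values plus a 1-bit validity mask, per the MaskedEncoding description.
    let nullable_int32 = EncodingTree::Masked {
        values: Box::new(EncodingTree::Value { item_width_bits: 32 }),
        validity: Box::new(EncodingTree::Value { item_width_bits: 1 }),
    };

    // A plausible tree for a dictionary-encoded string page: 16-bit keys into
    // a variable-length values child (32-bit offsets over raw bytes).
    let dict_strings = EncodingTree::Dictionary {
        key_width_bits: 16,
        keys: Box::new(EncodingTree::Value { item_width_bits: 16 }),
        values: Box::new(EncodingTree::VariableLength {
            offset_width_bits: 32,
            offsets: Box::new(EncodingTree::Value { item_width_bits: 32 }),
            values: Box::new(EncodingTree::Value { item_width_bits: 8 }),
        }),
    };
    let _ = (nullable_int32, dict_strings);
}
```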

message ColumnMetadata {
message Page {
// The file offsets of each of the page buffers
//
// These offsets might point to the column's data section or they
// might point to the column's metadata section.
//
Comment on lines +183 to +185

Contributor: What goes in the data section vs. the metadata section?

Contributor Author: The data section holds data that is unique to a page. The metadata section holds data that is common across all pages in a column. E.g. if you wanted to have a single dictionary for all dictionary-encoded pages in a column, you could put it in the metadata section.

// The number of buffers is variable and depends on the encoding. There
// may be zero buffers (e.g. constant-encoded data), in which case this
// could be empty.
repeated uint64 buffer_offsets = 1;
// The size (in bytes) of each of the page buffers
//
// This field will have the same length as `buffer_offsets` and
// may be empty.
repeated uint64 buffer_sizes = 2;
// Logical length (e.g. # rows) of the page
uint32 length = 3;
// The top-level node of the encoding tree used to encode the page
EncodingType encoding = 4;
}
repeated Page pages = 1;
ColumnStatistics statistics = 2;

// Statistics store the min/max/nulls/... statistics of a page.
// There is an array for each statistic. The logical length of the
// arrays is equal to the number of pages for the column. The number of
// statistics pages may be smaller than the number of column pages (in most
// cases there will only be one page of statistics)
message ColumnStatistics {
Contributor: Where do these go?

Contributor Author: The buffers will be in the metadata section.

Contributor Author: I realized I forgot an actual reference to this message. I've added that in ColumnMetadata:

  repeated Page pages = 1;
  ColumnStatistics statistics = 2;

// The schema of statistics for this column
repeated Field schema = 1;
// Pages of statistics data
repeated Page pages = 2;
}
}
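
Because pages may be padded and are not guaranteed to be contiguous (see the layout notes at the top of the file), a reader fetches each buffer at its recorded offset. A minimal sketch, assuming a Unix positioned-read API and a hand-written stand-in for the generated Page struct:

```rust
use std::fs::File;
use std::io;
use std::os::unix::fs::FileExt;

// Hand-written stand-in for the generated `ColumnMetadata.Page` struct.
struct Page {
    buffer_offsets: Vec<u64>,
    buffer_sizes: Vec<u64>,
}

/// Fetch every buffer of one page with positioned reads. Each buffer is read
/// at its own recorded offset rather than assuming it follows the previous
/// one, since a writer may have inserted padding between buffers.
fn read_page_buffers(file: &File, page: &Page) -> io::Result<Vec<Vec<u8>>> {
    page.buffer_offsets
        .iter()
        .zip(&page.buffer_sizes)
        .map(|(&offset, &size)| {
            let mut buf = vec![0u8; size as usize];
            file.read_exact_at(&mut buf, offset)?;
            Ok(buf)
        })
        .collect()
}
```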

// Metadata of one Lance file.
message Metadata {
// 4 was used for StatisticsMetadata in the past, but has been moved to prevent
@@ -28,10 +221,27 @@ message Metadata

// Position of the manifest in the file. If it is zero, the manifest is stored
// externally.
//
// In Lance version 1 this is a protobuf-encoded Manifest message from the table
// format.
//
// In Lance version 2 this is a FileDescriptor message and, for consistency and
// simplicity, this message should always be present. If the schema is cached or
// stored externally then the reader is free to skip reading this.
//
// This message is immediately followed by the metadata block. Therefore, the
// size of this block can be calculated by subtracting this value from the
// metadata offset (that was used to read this message)
uint64 manifest_position = 1;

// Logical offsets of each chunk group, i.e., the number of rows in each
// chunk.
//
// This field is optional in Lance version 2. If it is non-empty then the
// writer should guarantee that pages are cut off to align with these boundaries.
//
// If this is a Lance version 2 file and this is empty then it means that
// the writer did not write pages into row groups.
repeated int32 batch_offsets = 2;

// The file position that page table is stored.
@@ -51,6 +261,10 @@ message Metadata {
// position = page_table[5][4][0];
// length = page_table[5][4][1];
// ```
//
// This field is only used in Lance version 1. In a Lance version 2 file
// the page table is replaced by detailed column metadata and this field will
// always be 0.
uint64 page_table_position = 3;

message StatisticsMetadata {
@@ -78,13 +292,49 @@ message Metadata {
// position = stats_page_table[5][0];
// length = stats_page_table[5][1];
// ```
uint64 page_table_position = 3;
}

// File statistics. This field is only present in Lance version 1.
//
// In Lance version 2 statistics are stored per-column and this field
// will not be present.
StatisticsMetadata statistics = 5;

// The number of columns in the file (including inner columns when there
// are nested fields)
//
// This can be used to access the column metadata offsets array which is
// stored immediately before manifest_position.
//
// Given N columns the column metadata positions and sizes are stored in a
// contiguous buffer of 2*N uint64 values immediately preceding the file
// descriptor (or the metadata if the file is not self-describing).
//
// If we let `column_offsets_pos` be:
// manifest_position - (16 * num_columns)
Comment on lines +314 to +315

Contributor: I wonder if we should just have a direct offset here. That would make it possible to have another optional section later.

Contributor Author: I'm not sure I understand.

Contributor (@wjones127, Jan 30, 2024): As in, column_offsets_pos should be an actual field, and not just an implicitly computed one. Then we could add a new section between the column metadata and the file metadata.

More generally, I'm just feeling a bit squeamish about having to make these kinds of computations to get the offset and size of a section.

Contributor Author: Ok, I get it now. I'll add that in.

//
// Then the metadata for column x starts at the uint64:
// file[column_offsets_pos + (16 * x)]
// The size of the metadata for column x is given by the uint64:
// file[column_offsets_pos + (16 * x) + 8]
//
// This field is ignored in Lance version 1 files (the page table is used instead)
uint32 num_columns = 6;
Comment on lines +304 to +323

Contributor: I think this solves problem 1 described in #1809, but I'm not sure if it solves problem 2. Could you describe how x is derived? Do we skip fields with children that have no buffers of their own (struct, FSL in the future)?

Contributor Author (@westonpace, Jan 30, 2024): I think there are two parts:

• Gaps in field ids

I'm not sure yet why we needed to fill these gaps in the old approach. It's very possible I'm missing something. I was thinking, when we read the file, the column metadata at index X corresponds to the field at index X (in dfs-pre-order) of the schema in the file descriptor. Field IDs are a table concern and not something the file format bothers with (other than storing and loading them faithfully).

• Transient columns (structs / lists)

It's probably true that lists will never have a validity buffer (since we can always use negative values as sentinels), but we still need a flag in the metadata telling us whether we need to post-process the sentinels or not. Also, (non-fixed-size) lists do have an offsets buffer, and there are many different ways that buffer could be encoded.

Structs might have a nullability bitmap or they might not. If the page doesn't have any nulls we can skip the buffer. If the page has nulls but doesn't have any all-null structs then we can use a sentinel. If the page has nulls and all-null structs then we need a bitmap.

We could also encode structs using a sparse bitmap like they do in Procella. In this case there is one validity bitmap buffer AND an offsets buffer per child.

The same goes for fixed-size-list. There may be a need for a validity buffer, or we may choose to do some kind of sparse encoding of the FSL (basically turning it into a variable-size list where zero-size lists represent null) if most of the data is null.

Contributor:

> I'm not sure yet why we needed to fill these gaps in the old approach.

In the old way, the page table is represented in memory as a BTreeMap<FieldId, (Offset, Length)>. That BTreeMap is written to disk as an array, where the ith element has field id field_ids_offset + i. So, as-is, the page table is coupled to the field ids.

> I was thinking, when we read the file, the column metadata at index X corresponds to the field at index X (in dfs-pre-order) of the schema in the file descriptor.

This is fine, we just need to be really specific about what the dfs-pre-order is. Right now FSL doesn't have child fields, and thus the current page table ignores them, even though it does have entries for structs. However, later we want to change schemas to include child fields of FSL, so they are aligned with Arrow schemas. We can either do that now, in our definition of the dfs-pre-order, or find some compatible way to make that change in the future.

I think my main concern at this point is the transition plan for FSL children. The status quo that we have column metadata for lists and structs makes sense.

Contributor Author: I think I follow your concern better now.

The pragmatic answer to your question is that the only thing that matters for nesting is struct columns. FSL, regular lists, and maps do not have "depth" in the proposal (the protobuf encoding is recursive and does have depth, but this is true even for primitive columns).

The outside-the-box, new-age answer is that the file format does not have any depth at all. There is just a flat list of columns. "Nesting" is an in-memory / Arrow concept, and we have to map that concept onto the file format in the reader/writer. For example, we could choose to map Arrow struct arrays into a single "packed" column. We could choose to map Arrow struct arrays into a single "shredded" column (the list of children, instead of being at the "column" level, would instead be nested into the encoding). Both of these approaches are probably bad ideas since they prevent any kind of nested projection pushdown (though the "packed struct" may still be useful in certain cases).

We could even nest other things. We could create a special kind of FSL, let's call it a tuple, where we encode each index into its own "lance column". This would allow us to do nested tuple projection. Although, again, being practical, it would be less work for everyone else (e.g. Arrow, etc.) if we implemented such a thing as a struct.

// The start of the column metadata section
//
// If column projection is not needed, or if the goal is to cache all column
// metadata, then this field can be used to quickly load the entire column metadata
// section in a single read without referring to the column metadata offsets array
//
// This field is ignored in Lance version 1 files
uint64 column_metadata_start = 7;
Comment on lines +324 to +331

Contributor: This gives us the start; how do we know where the end is?

Contributor: I suppose the intention was to have it end at manifest_position - (16 * num_columns)
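
Putting the arithmetic from the num_columns comment, the manifest_position comment, and the reply above in one place, here is a minimal sketch of the footer-region bookkeeping. The struct and method names are invented for illustration (they are not part of the PR), and the assumption that the column metadata section ends where the offsets table begins comes from the reply above rather than from the proto itself.

```rust
// Invented helper for illustration; field values would come from the parsed
// footer (metadata_position) and the decoded Metadata message.
struct FooterInfo {
    metadata_position: u64,      // from the file footer
    manifest_position: u64,      // Metadata.manifest_position
    num_columns: u64,            // Metadata.num_columns
    column_metadata_start: u64,  // Metadata.column_metadata_start
}

impl FooterInfo {
    /// The FileDescriptor is immediately followed by the metadata block, so
    /// its size is the gap between the two positions.
    fn file_descriptor_size(&self) -> u64 {
        self.metadata_position - self.manifest_position
    }

    /// Start of the column metadata offsets table: 2 * num_columns uint64
    /// values (16 bytes per column) immediately preceding the file descriptor.
    fn column_offsets_pos(&self) -> u64 {
        self.manifest_position - 16 * self.num_columns
    }

    /// File positions of the (position, size) pair for column `x`.
    fn column_metadata_entry(&self, x: u64) -> (u64, u64) {
        let base = self.column_offsets_pos() + 16 * x;
        (base, base + 8)
    }

    /// Size of the whole column metadata section, assuming (per the comment
    /// thread above) that it ends where the offsets table begins.
    fn column_metadata_section_size(&self) -> u64 {
        self.column_offsets_pos() - self.column_metadata_start
    }
}
```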

} // Metadata

// Supported encodings.
//
// Only used in Lance version 1. In Lance version 2 the
// equivalent is EncodingType which is part of a page.
enum Encoding {
// Invalid encoding.
NONE = 0;
@@ -99,6 +349,8 @@ enum Encoding {
}

// Dictionary field metadata
//
// Only used in Lance version 1
message Dictionary {
/// The file offset for storing the dictionary value.
/// It is only valid if encoding is DICTIONARY.
@@ -119,7 +371,7 @@ message Field {
}
Type type = 1;

// Fully qualified name. Lance requires unique column names.
string name = 2;
/// Field Id.
///
@@ -159,12 +411,24 @@ message Field {
// If this field is nullable.
bool nullable = 6;

// The encoding of the Field
//
// This is ignored in Lance version 2
// A column may be stored in many different encodings (the writer can choose what is most appropriate
// for any given page)
//
// When loading data in memory the user might specify a desired target encoding but this should be
// unrelated to the encoding used in the file.
Encoding encoding = 7;

/// The file offset for storing the dictionary value.
/// It is only valid if encoding is DICTIONARY.
///
/// The logical type represents the value type of the column, i.e., string value.
///
/// This is ignored in Lance version 2. If a dictionary is common throughout a file then the dictionary
/// should be stored in the column metadata. If the dictionary changes from page to page then the dictionary
/// should be stored within the page.
Dictionary dictionary = 8;

// Deprecated: optional extension type name, use metadata field ARROW:extension:name
2 changes: 2 additions & 0 deletions rust/lance-file/src/format/metadata.rs
@@ -59,6 +59,8 @@ impl From<&Metadata> for pb::Metadata {
page_table_position: m.page_table_position as u64,
manifest_position: m.manifest_position.unwrap_or(0) as u64,
statistics,
num_columns: 0,
column_metadata_start: 0,
}
}
}