-
Notifications
You must be signed in to change notification settings - Fork 219
/
format.proto
206 lines (170 loc) · 5.48 KB
/
format.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = "proto3";
package lance.format.pb;
import "google/protobuf/timestamp.proto";
/*
Format:
+----------------------------------------+
| Encoded Column 0, Chunk 0 |
...
| Encoded Column M, Chunk N - 1 |
| Encoded Column M, Chunk N |
| Indices ... |
| Chunk Position (M x N x 8) |
| Manifest (Optional) |
| Metadata |
| i64: metadata position |
| MAJOR_VERSION | MINOR_VERSION | "LANC" |
+----------------------------------------+
*/
/// UUID type, encoded as 16 bytes.
message UUID {
// Raw bytes of the UUID. The `bytes` type does not enforce a length, so the
// 16-byte size stated above is a convention that writers must follow.
bytes uuid = 1;
}
// Manifest is a global section shared between all the files.
//
// It carries the schema (`fields`), the list of data fragments, the snapshot
// version, and file positions of optional auxiliary sections.
message Manifest {
// All fields of the dataset, including the nested fields.
repeated Field fields = 1;
// Fragments of the dataset.
repeated DataFragment fragments = 2;
// Snapshot version number.
uint64 version = 3;
// The file position of the version auxiliary data.
// * It is not inheritable between versions.
// * It is not loaded by default during query.
// NOTE(review): presumably the position of a serialized `VersionAuxData`
// message -- confirm against the reader.
uint64 version_aux_data = 4;
// Schema metadata, stored as key-value pairs. Map entries have no defined
// order on the wire.
map<string, bytes> metadata = 5;
// If present, the file position of the index metadata.
// Declared `optional` so that "no index section" can be told apart from a
// position of 0.
// NOTE(review): presumably the position of a serialized `IndexSection`
// message -- confirm against the reader.
optional uint64 index_section = 6;
// Version creation timestamp, in the UTC timezone.
google.protobuf.Timestamp timestamp = 7;
// Optional version tag. As a plain proto3 string, an unset tag and an empty
// tag are indistinguishable.
string tag = 8;
}
// Auxiliary data attached to a version.
// Only loaded on demand.
message VersionAuxData {
// Key-value metadata.
// NOTE(review): this is the only field and it uses number 3; numbers 1 and 2
// are not declared here. If they belonged to removed fields they should be
// marked `reserved` -- confirm against the schema history.
map<string, bytes> metadata = 3;
}
// Metadata describing the index.
message IndexMetadata {
// Unique ID of an index. It is unique across all the dataset versions.
UUID uuid = 1;
// The columns to build the index.
// NOTE(review): presumably these are `Field.id` values -- confirm.
repeated int32 fields = 2;
// Index name. Must be unique within one dataset version.
string name = 3;
// The version of the dataset this index was built from.
// NOTE(review): presumably matches `Manifest.version` -- confirm.
uint64 dataset_version = 4;
}
// Index section, containing a list of index metadata for one dataset version.
message IndexSection {
// Metadata of every index recorded in this section.
repeated IndexMetadata indices = 1;
}
// Data fragment. A fragment is a set of files which represent the
// different columns of the same rows.
// If a column exists in the schema, but the related file does not exist,
// treat this column as nulls.
message DataFragment {
// Unique ID of each DataFragment.
uint64 id = 1;
// The data files that make up this fragment. Each file lists the field ids
// it stores (see `DataFile.fields`).
repeated DataFile files = 2;
}
// Lance data file.
message DataFile {
// Relative path to the root.
// NOTE(review): "root" presumably means the dataset root directory --
// confirm.
string path = 1;
// The ids of the fields/columns in this file.
// NOTE(review): presumably `Field.id` values -- confirm.
repeated int32 fields = 2;
}
// Metadata of one Lance file.
message Metadata {
// Position of the manifest in the file. If it is zero, the manifest is stored
// externally.
uint64 manifest_position = 1;
// Logical offsets of each chunk group, i.e., number of the rows in each
// chunk.
// NOTE(review): the wording mixes "offsets" and "number of rows"; whether
// these are cumulative offsets or per-batch row counts is not stated here --
// confirm against the reader/writer.
repeated int32 batch_offsets = 2;
// The file position at which the page table is stored.
//
// A page table is a matrix of N x M x 2, where N = num_fields, and M =
// num_batches. Each cell in the table is a pair of <position:int64,
// length:int64> of the page. Both position and length are int64 values. The
// <position, length> of all the pages in the same column are then
// contiguously stored.
//
// For example, for the column 5 and batch 4, we have:
// ```text
// position = page_table[5][4][0];
// length = page_table[5][4][1];
// ```
uint64 page_table_position = 3;
}
// Supported encodings.
enum Encoding {
// Invalid encoding. This is the zero value, so a proto3 `Encoding` field
// that was never set reads as NONE.
NONE = 0;
// Plain encoding.
PLAIN = 1;
// Var-length binary encoding.
VAR_BINARY = 2;
// Dictionary encoding.
DICTIONARY = 3;
// Run-length encoding.
RLE = 4;
}
// Dictionary field metadata.
message Dictionary {
/// The file offset for storing the dictionary value.
/// It is only valid if encoding is DICTIONARY.
///
/// The logical type represents the value type of the column, i.e., string
/// value.
int64 offset = 1;
/// The length of dictionary values.
// NOTE(review): the unit (bytes vs. number of values) is not stated here --
// confirm against the reader/writer.
int64 length = 2;
}
// Field metadata for a column.
message Field {
// Structural kind of a field.
// NOTE(review): the value meanings are not documented in this file;
// presumably PARENT is a field with child fields, REPEATED a list-like field
// and LEAF a field holding actual column data -- confirm.
enum Type {
PARENT = 0;
REPEATED = 1;
LEAF = 2;
}
// Structural kind of this field. PARENT is the zero value, so an unset
// `type` reads as PARENT.
Type type = 1;
// Fully qualified name.
string name = 2;
/// Field Id.
int32 id = 3;
/// Parent Field ID. If not set, this is a top-level column.
// NOTE(review): a plain proto3 int32 has no presence, so "not set" reads as
// 0 and cannot be told apart from a parent whose id is 0 -- confirm which
// sentinel value writers actually use for top-level columns.
int32 parent_id = 4;
// Logical types, support parameterized Arrow Type.
string logical_type = 5;
// If this field is nullable.
bool nullable = 6;
// Encoding of this field. See `Encoding` for the supported values.
Encoding encoding = 7;
/// Dictionary metadata (file offset and length of the dictionary values).
/// It is only valid if encoding is DICTIONARY.
///
/// The logical type represents the value type of the column, i.e., string
/// value.
Dictionary dictionary = 8;
// Optional extension type name. As a plain proto3 string, unset and empty
// are indistinguishable.
string extension_name = 9;
}