From 19ae70d2a3fce75392ed71f7f437ba52e171c37c Mon Sep 17 00:00:00 2001 From: Assaf Vayner Date: Tue, 30 Sep 2025 08:22:34 -0700 Subject: [PATCH 1/2] xet protocol file and CI setup --- .../xet_protocol_build_documentation.yml | 20 + .../xet_protocol_build_pr_documentation.yml | 21 + .../xet_protocol_upload_pr_documentation.yml | 16 + docs/xet-protocol/_toctree.yml | 30 + docs/xet-protocol/api.md | 193 ++++++ docs/xet-protocol/auth.md | 151 +++++ docs/xet-protocol/chunking.md | 150 +++++ docs/xet-protocol/deduplication.md | 189 ++++++ docs/xet-protocol/download-protocol.md | 361 ++++++++++++ docs/xet-protocol/file-id.md | 33 ++ docs/xet-protocol/file-reconstruction.md | 133 +++++ docs/xet-protocol/hashing.md | 178 ++++++ docs/xet-protocol/index.md | 51 ++ docs/xet-protocol/shard.md | 549 ++++++++++++++++++ docs/xet-protocol/upload-protocol.md | 177 ++++++ docs/xet-protocol/xorb.md | 135 +++++ 16 files changed, 2387 insertions(+) create mode 100644 .github/workflows/xet_protocol_build_documentation.yml create mode 100644 .github/workflows/xet_protocol_build_pr_documentation.yml create mode 100644 .github/workflows/xet_protocol_upload_pr_documentation.yml create mode 100644 docs/xet-protocol/_toctree.yml create mode 100644 docs/xet-protocol/api.md create mode 100644 docs/xet-protocol/auth.md create mode 100644 docs/xet-protocol/chunking.md create mode 100644 docs/xet-protocol/deduplication.md create mode 100644 docs/xet-protocol/download-protocol.md create mode 100644 docs/xet-protocol/file-id.md create mode 100644 docs/xet-protocol/file-reconstruction.md create mode 100644 docs/xet-protocol/hashing.md create mode 100644 docs/xet-protocol/index.md create mode 100644 docs/xet-protocol/shard.md create mode 100644 docs/xet-protocol/upload-protocol.md create mode 100644 docs/xet-protocol/xorb.md diff --git a/.github/workflows/xet_protocol_build_documentation.yml b/.github/workflows/xet_protocol_build_documentation.yml new file mode 100644 index 000000000..97ad259a2 --- /dev/null +++ b/.github/workflows/xet_protocol_build_documentation.yml @@ -0,0 +1,20 @@ +name: Build Xet Protocol documentation + +on: + push: + paths: + - "docs/xet-protocol/**" + branches: + - main + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + package: hub-docs + package_name: xet-protocol + path_to_docs: hub-docs/docs/xet-protocol/ + additional_args: --not_python_module + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/xet_protocol_build_pr_documentation.yml b/.github/workflows/xet_protocol_build_pr_documentation.yml new file mode 100644 index 000000000..50df7cd9b --- /dev/null +++ b/.github/workflows/xet_protocol_build_pr_documentation.yml @@ -0,0 +1,21 @@ +name: Build Xet Protocol PR Documentation + +on: + pull_request: + paths: + - "docs/xet-protocol/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: hub-docs + package_name: xet-protocol + path_to_docs: hub-docs/docs/xet-protocol/ + additional_args: --not_python_module diff --git a/.github/workflows/xet_protocol_upload_pr_documentation.yml b/.github/workflows/xet_protocol_upload_pr_documentation.yml new file mode 100644 index 000000000..269397216 --- /dev/null +++ 
b/.github/workflows/xet_protocol_upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload Xet Protocol PR Documentation + +on: + workflow_run: + workflows: ["Build Xet Protocol PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: xet-protocol + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} diff --git a/docs/xet-protocol/_toctree.yml b/docs/xet-protocol/_toctree.yml new file mode 100644 index 000000000..2f723cf2c --- /dev/null +++ b/docs/xet-protocol/_toctree.yml @@ -0,0 +1,30 @@ +- local: index + title: Xet Protocol Specification + +- title: Building a client library for xet storage + sections: + - local: upload-protocol + title: Upload Protocol + - local: download-protocol + title: Download Protocol + - local: api + title: CAS API + - local: auth + title: Authentication and Authorization + - local: file-id + title: Hugging Face Hub Files Conversion to Xet File ID's + +- title: Overall Xet architecture + sections: + - local: chunking + title: Content-Defined Chunking + - local: hashing + title: Hashing Methods + - local: file-reconstruction + title: File Reconstruction + - local: xorb + title: Xorb Format + - local: shard + title: Shard Format + - local: deduplication + title: Deduplication diff --git a/docs/xet-protocol/api.md b/docs/xet-protocol/api.md new file mode 100644 index 000000000..70c08a26b --- /dev/null +++ b/docs/xet-protocol/api.md @@ -0,0 +1,193 @@ +# CAS API Documentation + +This document describes the HTTP API endpoints used by the Content Addressable Storage (CAS) client to interact with the remote CAS server. + +## Authentication + +To authenticate, authorize, and obtain the API base URL, follow the instructions in [Authentication](./auth). + +## Converting Hashes to Strings + +Sometimes hashes are used in API paths as hexadecimal strings (reconstruction, xorb upload, global dedupe API). + +To convert a 32 hash to a 64 hexadecimal character string to be used as part of an API path there is a specific procedure, MUST NOT directly convert each byte. + +### Procedure + +For every 8 bytes in the hash (indices 0-7, 8-15, 16-23, 24-31) reverse the order of each byte in those regions then concatenate the regions back in order. + +Otherwise stated, consider each 8 byte part of a hash as a little endian 64 bit unsigned integer, then concatenate the hexadecimal representation of the 4 numbers in order (each padded with 0's to 16 characters). + +> [!NOTE] +> In all cases that a hash is represented as a string it is converted from a byte array to a string using this procedure. + +### Example + +Suppose a hash value is: +`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]` + +Then before converting to a string it will first have its bytes reordered to: +`[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24]` + +So the string value of the the provided hash [0..32] is **NOT** `000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f`. +It is: `07060504030201000f0e0d0c0b0a0908171615141312111f1e1d1c1b1a1918`. + +## Endpoints + +### 1. Get File Reconstruction + +- **Description**: Retrieves reconstruction information for a specific file, includes byte range support when `Range` header is set. 
+- **Path**: `/v1/reconstructions/{file_id}` +- **Method**: `GET` +- **Parameters**: + - `file_id`: File hash in hex format (64 lowercase hexadecimal characters). +See [file hashes](./hashing#file-hashes) for computing the file hash and [converting hashes to strings](./api#converting-hashes-to-strings). +- **Headers**: + - `Range`: OPTIONAL. Format: `bytes={start}-{end}` (end is inclusive). +- **Minimum Token Scope**: `read` +- **Body**: None. +- **Response**: JSON (`QueryReconstructionResponse`) + + ```json + { + "offset_into_first_range": 0, + "terms": [...], + "fetch_info": {...} + } + ``` + +- **Error Responses**: See [Error Cases](./api#error-cases) + - `400 Bad Request`: Malformed `file_id` in the path. Fix the path before retrying. + - `401 Unauthorized`: Refresh the token to continue making requests, or provide a token in the `Authorization` header. + - `404 Not Found`: The file does not exist. Not retryable. + - `416 Range Not Satisfiable`: The requested byte range start exceeds the end of the file. Not retryable. + +```txt +GET /v1/reconstructions/0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef +-H "Authorization: Bearer " +OPTIONAL: -H Range: "bytes=0-100000" +``` + +### Example File Reconstruction Response Body + +See [QueryReconstructionResponse](./download-protocol#queryreconstructionresponse-structure) for more details in the download protocol specification. + +### 2. Query Chunk Deduplication (Global Deduplication) + +- **Description**: Checks if a chunk exists in the CAS for deduplication purposes. +- **Path**: `/v1/chunks/{prefix}/{hash}` +- **Method**: `GET` +- **Parameters**: + - `prefix`: The only acceptable prefix for the Global Deduplication API is `default-merkledb`. + - `hash`: Chunk hash in hex format (64 lowercase hexadecimal characters). +See [Chunk Hashes](./hashing#chunk-hashes) to compute the chunk hash and [converting hashes to strings](./api#converting-hashes-to-strings). +- **Minimum Token Scope**: `read` +- **Body**: None. +- **Response**: Shard format bytes (`application/octet-stream`), deserialize as a [shard](./shard#global-deduplication). +- **Error Responses**: See [Error Cases](./api#error-cases) + - `400 Bad Request`: Malformed hash in the path. Fix the path before retrying. + - `401 Unauthorized`: Refresh the token to continue making requests, or provide a token in the `Authorization` header. + - `404 Not Found`: Chunk not already tracked by global deduplication. Not retryable. + +```txt +GET /v1/chunks/default-merkledb/0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef +-H "Authorization: Bearer " +``` + +#### Example Shard Response Body + +An example shard response body can be found in [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.shard.dedupe). + +### 3. Upload Xorb + +- **Description**: Uploads a serialized Xorb to the server; uploading real data in serialized format. +- **Path**: `/v1/xorbs/{prefix}/{hash}` +- **Method**: `POST` +- **Parameters**: + - `prefix`: The only acceptable prefix for the Xorb upload API is `default`. + - `hash`: Xorb hash in hex format (64 lowercase hexadecimal characters). +See [Xorb Hashes](./hashing#xorb-hashes) to compute the hash, and [converting hashes to strings](./api#converting-hashes-to-strings). +- **Minimum Token Scope**: `write` +- **Body**: Serialized Xorb bytes (`application/octet-stream`). +See [xorb format serialization](./xorb). 
+- **Response**: JSON (`UploadXorbResponse`) + +```json +{ + "was_inserted": true +} +``` + +- Note: `was_inserted` is `false` if the Xorb already exists; this is not an error. + +- **Error Responses**: See [Error Cases](./api#error-cases) + - `400 Bad Request`: Malformed hash in the path, Xorb hash does not match the body, or body is incorrectly serialized. + - `401 Unauthorized`: Refresh the token to continue making requests, or provide a token in the `Authorization` header. + - `403 Forbidden`: Token provided but does not have a wide enough scope (for example, a `read` token was provided). Clients MUST retry with a `write` scope token. + +```txt +POST /v1/xorbs/default/0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef +-H "Authorization: Bearer " +``` + +#### Example Xorb Request Body + +An example xorb request body can be found in [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb). + +### 4. Upload Shard + +- **Description**: Uploads a Shard to the CAS. +Uploads file reconstructions and new xorb listing, serialized into the shard format; marks the files as uploaded. +- **Path**: `/v1/shards` +- **Method**: `POST` +- **Minimum Token Scope**: `write` +- **Body**: Serialized Shard data as bytes (`application/octet-stream`). +See [Shard format guide](./shard#shard-upload). +- **Response**: JSON (`UploadShardResponse`) + +```json +{ + "result": 0 +} +``` + +- Where `result` is: + - `0`: The Shard already exists. + - `1`: `SyncPerformed` — the Shard was registered. + +The value of `result` does not carry any meaning, if the upload shard API returns a `200 OK` status code, the upload was successful and the files listed are considered uploaded. + +- **Error Responses**: See [Error Cases](./api#error-cases) + - `400 Bad Request`: Shard is incorrectly serialized or Shard contents failed verification. + - Can mean that a referenced Xorb doesn't exist or the shard is too large + - `401 Unauthorized`: Refresh the token to continue making requests, or provide a token in the `Authorization` header. + - `403 Forbidden`: Token provided but does not have a wide enough scope (for example, a `read` token was provided). + +```txt +POST /v1/shards +-H "Authorization: Bearer " +``` + +#### Example Shard Request Body + +An example shard request body can be found in [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.shard.verification-no-footer). + +## Error Cases + +### Non-Retryable Errors + +- **400 Bad Request**: Returned when the request parameters are invalid (for example, invalid Xorb/Shard on upload APIs). +- **401 Unauthorized**: Refresh the token to continue making requests, or provide a token in the `Authorization` header. +- **403 Forbidden**: Token provided but does not have a wide enough scope (for example, a `read` token was provided for an API requiring `write` scope). +- **404 Not Found**: Occurs on `GET` APIs where the resource (Xorb, file) does not exist. +- **416 Range Not Satisfiable**: Reconstruction API only; returned when byte range requests are invalid. Specifically, the requested start range is greater than or equal to the length of the file. + +### Retryable Errors + +- **Connection Errors**: Often caused by network issues. Retry if intermittent. +Clients SHOULD ensure no firewall blocks requests and SHOULD NOT use DNS overrides. 
+- **429 Rate Limiting**: Lower your request rate using a backoff strategy, then wait and retry. +Assume all APIs are rate limited. +- **500 Internal Server Error**: The server experienced an intermittent issue; clients SHOULD retry their requests. +- **503 Service Unavailable**: Service is temporarily unable to process requests; wait and retry. +- **504 Gateway Timeout**: Service took too long to respond; wait and retry. diff --git a/docs/xet-protocol/auth.md b/docs/xet-protocol/auth.md new file mode 100644 index 000000000..735b4b9be --- /dev/null +++ b/docs/xet-protocol/auth.md @@ -0,0 +1,151 @@ +# Authentication and Authorization + +To invoke any API's mentioned in this specification a client MUST first acquire a token (and the URL) to authenticate against the server which serves these API's. + +The Xet protocol server uses bearer authentication via a token generated by the Hugging Face Hub (). + +The following section explains how to acquire such a token. + +## Token Request API Endpoints + +**URL Pattern:** + +```txt +https://huggingface.co/api/{repo_type}s/{repo_id}/xet-{token_type}-token/{revision} +``` + +**Parameters:** + +All parameters are required to form the URL. + +- `repo_type`: Type of repository - `model`, `dataset`, or `space` +- `repo_id`: Repository identifier in format `namespace/repo-name` +- `token_type`: Either `read` or `write`. +- `revision`: Git revision (branch, tag, or commit hash; default to using `main` if no specific ref is required) + +To understand the distinction for between `token_type` values read onwards in this document to [Token Scope](./auth#token-scope). + +**Example URLs:** + +```txt +https://huggingface.co/api/models/sentence-transformers/all-MiniLM-L6-v2/xet-read-token/main +https://huggingface.co/api/datasets/HuggingFaceM4/the_cauldron/xet-write-token/v1.1 +https://huggingface.co/api/spaces/jsulz/ready-xet-go/xet-read-token/main +``` + +**HTTP Method:** GET + +**Required Headers:** + +- `Authorization`: Bearer token for Hugging Face Hub authentication + +### Response Format + +A JSON encoded object with the following format: + +```typescript +{ + "accessToken": string, + "exp": number, + "casUrl": string, +} +``` + +- accessToken is the token to be used when invoking API's on the Xet CAS service (any Xet API denoted in this specification) +- exp is the unix timestamp of when this token expires +- casUrl is the API service endpoint URL + +Users MAY assume the "accessToken" and "casUrl" fields lengths have an upper limit of 64000 characters. + +#### Example Response Object + +```json +{ + "accessToken": "xet_xxxxxxxxxxx", + "exp": 1848535668, + "casUrl": "https://cas-server.xethub.hf.co" +} +``` + +### Error Handling + +#### HTTP Errors + +- **401 Unauthorized**: Invalid or missing Hub authentication token +- **403 Forbidden**: Insufficient permissions for the requested token type +- **404 Not Found**: Repository or revision does not exist + +### Implementation Example + +Here's a basic implementation flow: + +1. **Make the request:** + + ```http + GET /api/models/black-forest-labs/FLUX.1-dev/xet-read-token/main + Host: huggingface.co + Authorization: Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + ``` + +2. **Parse the response** + + ```python + endpoint = response_json["casUrl"] + access_token = response_json["accessToken"] + expiration = response_json["exp"] + ``` + +3. **Use the token with Xet service:** + Use Bearer authentication with the value for the `accessToken` key to authenticate with the Xet service at `endpoint` until `expiration` time. 
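   For illustration, a minimal sketch of this step using the variables parsed above (the `requests` library and the reconstruction path are illustrative; any HTTP client and any CAS API call apply):

   ```python
   import requests

   file_id = "0123...abcdef"  # placeholder Xet file ID, see ./file-id

   # Bearer-authenticate against the CAS endpoint returned by the token API
   response = requests.get(
       f"{endpoint}/v1/reconstructions/{file_id}",
       headers={"Authorization": f"Bearer {access_token}"},
   )
   response.raise_for_status()
   reconstruction = response.json()
   ```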
+ +4. **Token refresh (when needed):** + Use the same API to generate a new token. + + > [!NOTE] + > In `xet-core` we SHOULD add 30 seconds of buffer time before the provided `expiration` time to refresh the token. + +## Token Scope + +Xet tokens can have either a `read` or a `write` scope. +`write` scope supersedes `read` scope and all `read` scope API's can be invoked when using a `write` scope token. +The type of token issued is determined on the `token_type` URI path component when requesting the token from the Hugging Face Hub (see above). + +Check API specification for what scope level is necessary to invoke each API (briefly, only `POST /shard` and `POST /xorb/*` API's require `write` scope). + +The scope of the Xet tokens is limited to the repository and ref for which they were issued. To upload or download from different repositories or refs (different branches) clients MUST be issued different tokens. + +## Token Scope Relative to Hugging Face Hub Authentication Token + +When requesting a Xet token from the Hugging Face Hub, you will only receive a Xet token matching the requested parameters if you actually have access to them, based on the access afforded to your Hub authentication token. + +If you require a `write` scope Xet token, then you MUST request it using a Hugging Face Hub token that has write access to the particular repository and ref that you want to access. + +If you request a `read` scope Xet token, then you MUST request it using a Hugging Face Hub token that has at least read access to the particular repository and ref you want to access. + +If you are using Fine-grained Hugging Face Hub Access Tokens, your tokens MUST have read or write access to the contents of repositories to be issues read or write Xet tokens respectively. + +## Security Considerations + +- Xet tokens are time-limited and SHOULD be refreshed/swapped before expiration +- Store tokens securely and SHOULD NOT log them (both Hub authentication tokens and Xet tokens) +- SHOULD use read tokens when possible; only request write tokens when necessary + +## Diagram + +```mermaid +sequenceDiagram + autonumber + actor C as Client + participant H as Hugging Face Hub (https://huggingface.co) + participant CAS as CAS API + + loop Repeat after token expiration + C->>H: https://huggingface.co/api/{repo_type}s/{repo_id}/xet-{token_type}-token/{revision} + H->>C: { casUrl, accessToken, exp } + + loop Client invoking CAS API's + C->>CAS: CAS API's using accessToken and casUrl
(Reconstruction, Xorb & Shard upload, Global Dedupe) + CAS->>C: API Responses, returns code 401 Unauthorized if token expired + end + end +``` diff --git a/docs/xet-protocol/chunking.md b/docs/xet-protocol/chunking.md new file mode 100644 index 000000000..1d501e044 --- /dev/null +++ b/docs/xet-protocol/chunking.md @@ -0,0 +1,150 @@ +# Content-Defined Chunking Algorithm + +The goal in chunking is to convert file data into smaller variable length chunks, approximately 64 KiB in length. +Chunks boundaries MUST be computed in a deterministic way such that chunking the same data in 2 different places yields chunks that can be deduplicated. + +```txt + +---------+---------+---------+---------+---------+---------+---------+-------------- +File -> | chunk 0 | chunk 1 | chunk 2 | chunk 3 | chunk 4 | chunk 5 | chunk 6 | chunk 7 | ... + +---------+---------+---------+---------+---------+---------+---------+-------------- +``` + +## Step-by-step Algorithm (Gearhash-based CDC) + +### Constant Parameters + +- target_chunk_size: `64 KiB` +- MIN_CHUNK_SIZE: `8 KiB` (minimum chunk size) +- MAX_CHUNK_SIZE: `128 KiB` (maximum chunk size) +- MASK: `0xFFFF000000000000` (16 one-bits → boundary probability 1/2^16 per byte) +- TABLE[256]: table of 256 64-bit constants ([rust-gearhash-table]) + +### State + +- h: 64-bit hash, initialized to 0 +- start_offset: start offset of the current chunk, initialized to 0 + +### Per-byte Update Rule (Gearhash) + +For each input byte `b`, update the hash with 64-bit wrapping arithmetic: + +```text +h = (h << 1) + TABLE[b] +``` + +### Boundary Test and Size Constraints + +At each position after updating `h`, let `size = current_offset - start_offset + 1`. + +- If `size < MIN_CHUNK_SIZE`: skip testing `MASK`; continue +- Else if `size >= MAX_CHUNK_SIZE`: force a boundary +- Else if `(h & MASK) == 0`: boundary at this position + +When a boundary found or taken: + +- Emit the chunk `[start_offset, current_offset + 1)` +- Set `start_offset = current_offset + 1` +- Reset `h = 0` + +At end-of-file, if `start_offset < len(data)`, emit the final chunk `[start_offset, len(data))`. + +### Pseudocode + +```text +Inputs: (See above for constant parameters) + data: byte array + +State: + h = 0 + start_offset = 0 // start of the "current chunk" + +if len(data) < MIN_CHUNK_SIZE: + emit chunk [0, len(data)) + done + +for i in range(0, len(data)): + b = data[i] + h = (h << 1) + TABLE[b] // 64-bit wrapping + size = i + 1 - start_offset + + if size < MIN_CHUNK_SIZE: + continue + + if size >= MAX_CHUNK_SIZE or (h & MASK) == 0: + emit chunk [start_offset, i + 1) + start_offset = i + 1 + h = 0 + +if start_offset < len(data): + emit chunk [start_offset, len(data)) +``` + +### Boundary probability and mask selection + +Given that MASK has 16 one-bits, for a random 64-bit hash `h`, the chance that all those 16 bits are zero is 1 / 2^16. On average, that means you’ll see a match about once every 64 KiB. + +### Properties + +- Deterministic boundaries: same content → same chunks +- Locality: small edits only affect nearby boundaries +- Linear time and constant memory: single 64-bit state and counters + +### Intuition and Rationale + +- The table `TABLE[256]` injects pseudo-randomness per byte value so that the evolving hash `h` behaves like a random 64-bit value with respect to the mask test. This makes boundaries content-defined yet statistically evenly spaced. +- The left shift `(h << 1)` amplifies recent bytes, helping small changes affect nearby positions without globally shifting all boundaries. 
+- Resetting `h` to 0 at each boundary prevents long-range carryover and keeps boundary decisions for each chunk statistically independent. + +### Implementation Notes + +- Only reset `h` when you emit a boundary. This ensures chunking is stable even when streaming input in pieces. +- Apply the mask test only once `size >= MIN_CHUNK_SIZE`. This reduces the frequency of tiny chunks and stabilizes average chunk sizes. +- MUST force a boundary at `MAX_CHUNK_SIZE` even if `(h & MASK) != 0`. This guarantees bounded chunk sizes and prevents pathological long chunks when matches are rare. +- Use 64-bit wrapping arithmetic for `(h << 1) + TABLE[b]`. This is the behavior in the reference implementation [rust-gearhash]. + +### Edge Cases + +- Tiny files: if `len(data) < MIN_CHUNK_SIZE`, the entire `data` is emitted as a single chunk. +- Long runs without a match: if no position matches `(h & MASK) == 0` before `MAX_CHUNK_SIZE`, a boundary is forced at `MAX_CHUNK_SIZE` to cap chunk size. + +### Portability and Determinism + +- With a fixed `T[256]` table and mask, the algorithm is deterministic across platforms: same input → same chunk boundaries. +- Endianness does not affect behavior because updates are byte-wise and use scalar 64-bit operations. +- SIMD-accelerated implementations (when available) are optimizations only; they produce the same boundaries as the scalar path [rust-gearhash]. + +## Minimum-size Skip-ahead (Cut-point Skipping Optimization) + +Computing and testing the rolling hash at every byte is expensive for large data, and early tests inside the first few bytes of a chunk are disallowed by the `MIN_CHUNK_SIZE` constraint anyway. +We are able to intentionally skip testing some data with cut-point skipping to accelerate scanning without affecting correctness. + +The hash function by virtue of the use of 64 byte integer length and the bit shift (`(h << 1) + TABLE[b]`) causes the hash at any byte offset to only depend on the last 64 bytes. +With a Gear rolling hash window of 64 bytes, the first boundary test is deferred until at least `MIN_CHUNK_SIZE - 64 - 1` bytes into the chunk. +This ensures that, by the time the first boundary can be considered (at offset `MIN_CHUNK_SIZE`), at least one full hash window of bytes from the current chunk has influenced the hash state. + +- Effect: + - Distribution quality is preserved because the first admissible test uses a well-mixed hash (full window), avoiding bias from the earliest bytes. + - Performance improves by avoiding per-byte hashing/judgment in the prefix where boundaries cannot be taken. + - Correctness is preserved because boundaries MUST NOT be set before `MIN_CHUNK_SIZE` and the hash produced at a testable offset is the same as the hash computed had we not skipped any bytes. +- Notes: + - This is an optimization of the search procedure only; it does not change the boundary condition, mask, or emitted chunk set compared to a byte-by-byte implementation that simply refrains from taking boundaries before `MIN_CHUNK_SIZE`. + - In the reference code, this appears as advancing the scan pointer by up to `MIN_CHUNK_SIZE - 64 - 1` before invoking the mask test loop. 
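For illustration, here is a minimal Python sketch of the byte-by-byte boundary scan described by the pseudocode above, without the cut-point skipping optimization. `GEAR_TABLE` stands in for the 256 64-bit constants from [rust-gearhash-table] and is not reproduced here.

```python
MIN_CHUNK_SIZE = 8 * 1024
MAX_CHUNK_SIZE = 128 * 1024
MASK = 0xFFFF000000000000
U64_MASK = 0xFFFFFFFFFFFFFFFF  # emulate 64-bit wrapping arithmetic

# Placeholder: replace with the 256 constants from [rust-gearhash-table].
GEAR_TABLE = [0] * 256

def chunk_boundaries(data: bytes):
    """Yield end-exclusive (start, end) chunk boundaries for `data`."""
    h = 0
    start = 0
    for i, b in enumerate(data):
        h = ((h << 1) + GEAR_TABLE[b]) & U64_MASK
        size = i + 1 - start
        if size < MIN_CHUNK_SIZE:
            continue  # boundaries are never taken below the minimum size
        if size >= MAX_CHUNK_SIZE or (h & MASK) == 0:
            yield (start, i + 1)
            start = i + 1
            h = 0
    if start < len(data):
        yield (start, len(data))  # final (possibly short) chunk
```

The cut-point skipping optimization described above would only skip hashing the early bytes of each chunk; it does not change the boundaries this scan emits.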
+ +## References + +- rust-gearhash: Fast, SIMD-accelerated GEAR hashing for CDC [rust-gearhash] +- FastCDC paper (background and design rationale of CDC) [fastcdc-paper] + +## Sample Reference + +The [xet-team/xet-spec-reference-files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files) repository contains the original file [Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv). + +In the same repository in file [Electric_Vehicle_Population_Data_20250917.csv.chunks](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.chunks) +the chunks produced out of [Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv) are listed. +Each line in the file is a 64 hexadecimal character string version of the hash of the chunk, followed by a space and then the number of bytes in that chunk. + +Implementors should use the chunk lengths to determine that they are producing the right chunk boundaries for this file with their chunking implementation. + +[rust-gearhash]: https://github.com/srijs/rust-gearhash +[rust-gearhash-table]: https://github.com/srijs/rust-gearhash/blob/adad44e7141cfd29d898cf6e0858f50b995db286/src/table.rs#L5 +[fastcdc-paper]: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia diff --git a/docs/xet-protocol/deduplication.md b/docs/xet-protocol/deduplication.md new file mode 100644 index 000000000..1ff30209d --- /dev/null +++ b/docs/xet-protocol/deduplication.md @@ -0,0 +1,189 @@ +# Xet Chunk-Level Deduplication Specification + +## Overview + +Chunk-level deduplication is a fundamental optimization technique in the Xet system that eliminates redundant data by identifying and sharing identical content blocks (chunks) across files and repositories. +This specification details the procedures, algorithms, and security mechanisms that enable efficient storage and transfer while maintaining data integrity and access control. + +Deduplication in Xet operates at the chunk level rather than the file level, providing fine-grained deduplication capabilities that can identify shared content even when files differ significantly. +This approach is particularly effective for scenarios common in machine learning and data science workflows, such as: + +- Multiple versions of datasets with incremental changes +- Model checkpoints that share common layers or parameters +- Documentation and configuration files with similar content +- Large files where only portions have changed between versions + +## Core Concepts + +### Chunks + +A **chunk** is a variable-sized content block derived from files using Content-Defined Chunking (CDC) with a rolling hash function. Chunks are the fundamental unit of deduplication in Xet. 
+ +- **Target size**: 64KB (configurable) +- **Size range**: 8KB to 128KB (minimum and maximum constraints) +- **Identification**: Each chunk is uniquely identified by its cryptographic hash (MerkleHash) + +[Detailed chunking description](./chunking) + +### Xorbs + +**Xorbs** are objects that aggregate multiple chunks for efficient storage and transfer: + +- **Maximum size**: 64MB +- **Maximum chunks**: 8,192 chunks per xorb +- **Purpose**: Batch multiple chunks together to reduce metadata and network overhead when uploading and downloading groups of chunks + +### Shards (Xorb Lists) + +**Shards** are objects that contain a list of xorbs that can be deduped against (for the context of deduplication, ignore the file info section of the shard format). + +- **Maximum size**: 64MB +- **Purpose**: Provide a format on a positive reply to a global deduplication request with information about xorbs that already exist in the CAS system. + +### CAS (Content Addressable Storage) + +The **CAS** system provides the underlying storage infrastructure: + +- **Content addressing**: All objects are stored and retrieved by their cryptographic hash +- **Immutability**: Once stored, file content cannot be modified +- **Deduplication**: Identical content is automatically deduplicated at the storage level + +## Deduplication Procedure + +### 1. File Processing and Chunking + +When a file is processed for upload, it undergoes the following steps: + +```mermaid +graph TD + A[File Input] --> B[Content-Defined Chunking] + B --> C[Hash Computation] + C --> D[Chunk Creation] + D --> E[Deduplication Query] +``` + +1. **Chunking**: Content-defined chunking using GearHash algorithm creates variable-sized chunks of file data +2. **Hash Computation**: Each chunk's content is hashed using a cryptographic hash function (Blake3-based MerkleHash) +3. **Chunk Object Creation**: Chunks are wrapped with metadata including hash, size, and data + +### 2. Multi-Level Deduplication Strategy + +Xet employs a three-tiered deduplication strategy to maximize efficiency while minimizing latency: + +#### Level 1: Local Session Deduplication + +**Scope**: Current upload session +**Mechanism**: In-memory hash lookup table +**Purpose**: Eliminate redundancy within the current file or session + +**Benefits**: + +- Fastest lookup (in-memory) +- Zero network overhead +- Immediate deduplication feedback + +#### Level 2: Cached Metadata Deduplication + +**Scope**: Previously uploaded files and sessions +**Mechanism**: Local shard file metadata cache +**Purpose**: Leverage deduplication against recently uploaded content + +**Benefits**: + +- Fast local disk access +- No network latency +- Persistent across sessions + +#### Level 3: Global Deduplication API + +**Scope**: Entire Xet system +**Mechanism**: Global deduplication service with HMAC protection +**Purpose**: Discover deduplication opportunities across all users and repositories + +### 3. Global Deduplication Process + +The global deduplication system provides deduplication capabilities across all data that is managed by the Xet system: + +#### Eligibility Criteria + +Not all chunks are eligible for global deduplication queries to manage system load: + +1. **First chunk**: The first chunk of every file is always eligible. +2. **Hash pattern matching**: Chunks are eligible if: the last 8 bytes of the hash interpreted as a little-endian 64 bit integer % 1024 == 0. 
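A sketch of this eligibility test, assuming `chunk_hash` is the raw 32-byte chunk hash:

```python
def eligible_for_global_dedup(chunk_hash: bytes, is_first_chunk: bool) -> bool:
    """Global deduplication query eligibility per the criteria above."""
    if is_first_chunk:
        return True
    # Interpret the last 8 bytes of the hash as a little-endian 64-bit integer.
    tail = int.from_bytes(chunk_hash[-8:], "little")
    return tail % 1024 == 0
```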
+ +**Recommendations:** +**Spacing constraints**: The global dedupe API is optimized to return information about nearby chunks when there is a match. Consider only issueing a request to an eligible chunk every ~4MB of data. + +#### Query Process + +1. **Background Query**: Global deduplication queries SHOULD run asynchronously to avoid blocking upload +2. **HMAC Protection**: Chunk hashes are protected using HMAC keys +3. **Shard Response**: When a match is found, the API returns a shard containing: + - **CAS Info Section**: Contains metadata about many xorbs that store chunks + - **HMAC Key**: Included in the shard metadata header used to encrypt chunk hashes +4. **Encrypted Chunk Matching**: All chunk hashes in the returned shard have been encrypted with the HMAC key +5. **Match Discovery Process**: To find matches, clients MUST: + - Encrypt their chunk hash using the provided HMAC key + - Search for the encrypted hash within the shard's chunk listings + - For subsequent chunks, repeat the encryption and search process + - Track the original (non-encrypted) chunk hash while noting which xorb contains that chunk +6. **Metadata Caching**: Client downloads and caches shard metadata for future deduplication + +#### HMAC Security Mechanism + +Global deduplication uses HMAC (Hash-based Message Authentication Code) to protect chunk hashes while enabling deduplication. + +**Security Properties**: + +Raw chunk hashes are never transmitted from servers to clients; a client has to encrypt their raw chunk hash and find a match to know a raw chunk hash exists in the system. +They MAY know this chunk hash because they own this data, the match has made them privy to know which xorb has this chunk hash and the position in the xorb, but has not revealed any other raw chunk hashes in that xorb or other xorbs. + +## Technical Implementation Details + +### Chunk Hash Computation + +Each chunk has its content hashed using a cryptographic hash function (Blake3-based MerkleHash) to create a unique identifier for content addressing. +[See section about hashing](./hashing#chunk-hashes). + +### Xorb Formation + +When new chunks need to be stored, they are aggregated into xorbs based on size and count limits. If adding a new chunk would exceed the maximum xorb size or chunk count, the current xorb is finalized and uploaded. [See section about xorb formation](./xorb) + +### File Reconstruction Information + +When chunks are deduplicated, the system creates file reconstruction information that includes: + +- Hash of the xorb containing the chunks +- Flags for the CAS block +- Total bytes in the segment +- Start and end indices within the xorb (start inclusive, end exclusive) + +This information allows the system to reconstruct files by: + +1. Identifying which xorbs contain the needed chunks +2. Extracting the specific chunk ranges from each xorb +3. Concatenating chunks in the correct order + +[See section about file reconstruction](./file-reconstruction). + +## Fragmentation Prevention + +While deduplication is valuable for saving space, doing it too aggressively can cause file fragmentation—meaning a file’s chunks end up scattered across many different xorbs. This can make reading files slower and less efficient. +To avoid this, in xet-core we aim (and encourage implementors) to keep long, continuous runs of chunks together in the same xorb whenever possible. Implementations SHOULD keep long, continuous runs together when feasible. 
+Instead of always deduplicating every possible chunk, the system sometimes chooses to reference a straight run of chunks in a single xorb, even if it means skipping deduplication for a few chunks. +This approach balances the benefits of deduplication with the need to keep files easy and fast to read. +Consider for example referencing a deduplicated chunks in a minimum run of chunks (e.g. at least 8 chunks) or targeting an average contiguous run of chunks totalling length >= 1MB. + +## Conclusion + +Xet's chunk-level deduplication system provides a comprehensive solution for efficient data storage and transfer in large-scale data workflows. +By combining local, cached, and global deduplication strategies with robust security mechanisms and fragmentation prevention, +the system achieves significant storage savings while maintaining performance and data integrity. + +The multi-tiered approach ensures that deduplication is both effective and efficient: + +- Local deduplication provides immediate benefits within sessions +- Cached deduplication leverages recent upload history +- Global deduplication enables cross-repository optimization while preserving security + +The system's design prioritizes both efficiency and safety, with comprehensive error handling, performance monitoring, and security measures that make it suitable for production use in data-intensive applications. diff --git a/docs/xet-protocol/download-protocol.md b/docs/xet-protocol/download-protocol.md new file mode 100644 index 000000000..4b0f85903 --- /dev/null +++ b/docs/xet-protocol/download-protocol.md @@ -0,0 +1,361 @@ +# Download Protocol + +This document describes the complete process of downloading a single file from the Xet protocol using the Content Addressable Storage (CAS) reconstruction API. + +## Overview + +File download in the Xet protocol is a two-stage process: + +1. **Reconstruction Query**: Query the CAS API to get file reconstruction metadata +2. **Data Fetching**: Download and reassemble the file using the reconstruction metadata + +## Stage 1: Calling the Reconstruction API + +### Single File Reconstruction + +To download a file given a file hash, first call the reconstruction API to get the file reconstruction. Follow the steps in [api](./api#1-get-file-reconstruction). + +Note that you will need at least a `read` scope auth token, [auth reference](./auth). + +> [!TIP] +> For large files it is RECOMMENDED to request the reconstruction in batches i.e. the first 10GB, download all the data, then the next 10GB and so on. Clients can use the `Range` header to specify a range of file data. + +## Stage 2: Understanding the Reconstruction Response + +The reconstruction API returns a `QueryReconstructionResponse` object with three key components: + +### QueryReconstructionResponse Structure + +```json +{ + "offset_into_first_range": 0, + "terms": [ + { + "hash": "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + "unpacked_length": 263873, + "range": { + "start": 0, + "end": 4 + } + }, + ... + ], + "fetch_info": { + "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456": [ + { + "range": { + "start": 0, + "end": 4 + }, + "url": "https://transfer.xethub.hf.co/xorb/default/a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + "url_range": { + "start": 0, + "end": 131071 + } + }, + ... + ], + ... 
+ } +} +``` + +### Fields + +#### offset_into_first_range + +- Type: `number` +- For a full file or when the specified range start is 0, then this is guaranteed to be `0` +- For range queries this is the byte offset into the first term (deserialized/chunks decompressed) to start to keep data from. + - since the requested range may start in the middle of a chunk, and data MUST be downloaded in full chunks (since they may need to be deserialized) then this offset tells a client how many bytes to skip in the first chunk (or possibly multiple chunks within the first term). + +#### terms + +- Type: `Array` +- Ordered list of reconstruction terms describing what chunks to download from which xorb +- Each `CASReconstructionTerm` contains: + - `hash`: The xorb hash (64-character lowercase hex string) + - `range`: Chunk index range`{ start: number, end: number }` within the xorb; end-exclusive `[start, end)` + - `unpacked_length`: Expected length after decompression (for validation) + +#### fetch_info + +- Type: `Map>` +- Maps xorb hashes to required information to download some of their chunks. + - The mapping is to an array of 1 or more `CASReconstructionFetchInfo` +- Each `CASReconstructionFetchInfo` contains: + - `url`: HTTP URL for downloading the xorb data, presigned URL containing authorization information + - `url_range` (bytes_start, bytes_end): Byte range `{ start: number, end: number }` for the Range header; end-inclusive `[start, end]` + - The `Range` header MUST be set as `Range: bytes=-` when downloading this chunk range + - `range` (index_start, index_end): Chunk index range `{ start: number, end: number }` that this URL provides; end-exclusive `[start, end)` + - This range indicates which range of chunk indices within this xorb that this fetch info term is describing + +## Stage 3: Downloading and Reconstructing the File + +### Process Overview + +1. Process each `CASReconstructionTerm` in order from the `terms` array +2. For each `CASReconstructionTerm`, find matching fetch info using the term's hash + + 1. get the list of fetch_info items under the xorb hash from the `CASReconstructionTerm`. The xorb hash is guaranteed to exist as a key in the fetch_info map. + 2. linearly iterate through the list of `CASReconstructionFetchInfo` and find one which refers to a chunk range that is equal or encompassing the term's chunk range. + - Such a fetch_info item is guaranteed to exist. If none exist the server is at fault. +3. Download the required data using HTTP `GET` request and MUST set the `Range` header +4. Deserialize the downloaded xorb data to extract chunks + + 1. This series of chunks contains chunks at indices specified by the `CASReconstructionFetchInfo`'s `range` field. Trim chunks at the beginning or end to match the chunks specified by the reconstruction term's `range` field. + 2. (for the first term only) skip `offset_into_first_range` bytes +5. 
Concatenate the results in term order to reconstruct the file + +### Detailed Download Process + +#### Download Reconstruction + +- use the reconstruction api to download the reconstruction object for a given file + +```python +file_id = "0123...abcdef" +api_endpoint, token = get_token() # follow auth instructions +url = api_endpoint + "/reconstructions/" + file_id +reconstruction = get(url, headers={"Authorization": "Bearer: " + token}) + +# break the reconstruction into components +terms = reconstruction["terms"] +fetch_info = reconstruction["fetch_info"] +offset_into_first_range = reconstruction["offset_into_first_range"] +``` + +#### Match Terms to Fetch Info + +For each `CASReconstructionTerm` in the `terms` array: + +- Look up the term's `hash` in the `fetch_info` map to get a list of `CASReconstructionFetchInfo` +- Find a `CASReconstructionFetchInfo` entry where the fetch info's `range` contains the term's `range` + - linearly search through the array of `CASReconstructionFetchInfo` and find the element where the range block (`{ "start": number, "end": number }`) of the `CASReconstructionFetchInfo` has start <= term's range start AND end >= term's range end. + - The server is meant to guarantee a match, if there isn't a match this download is considered failed and the server made an error. + +```python +for term in terms: + xorb_hash = term["hash"] + fetch_info_entries = fetch_info[xorb_hash] + fetch_info_entry = None + for entry in fetch_info_entries: + if entry["range"][start] <= term["range"]["start"] and entry["range"]["end"] >= term["range"]["end"]: + fetch_info_entry = entry + break + if fetch_info_entry is None: + # Error! +``` + +#### Step 2: Download Xorb Data + +For each matched fetch info: + +1. Make an HTTP GET request to the `url` in the fetch info entry +2. Include a `Range` header: `bytes={url_range.start}-{url_range.end}` + +```python +for term in terms: + ... + data_url = fetch_info_entry["url"] + range_header = "bytes=" + fetch_info_entry["url_range"]["start"] + "-" + fetch_info_entry["url_range"]["end"] + data = get(data_url, headers={"Range": range_header}) +``` + +#### Deserialize Downloaded Data + +The downloaded data is in xorb format and MUST be deserialized: + +1. **Parse xorb structure**: The data contains compressed chunks with headers +2. **Decompress chunks**: Each chunk has a header followed by compressed data +3. **Extract byte indices**: Track byte boundaries between chunks for range extraction +4. **Validate length**: Decompressed length MUST match `unpacked_length` from the term + +**Note**: The deserialization process depends on the [Xorb format](./xorb). + +```python +for term in terms: + ... + chunks = {} + for i in range(fetch_info_entry["range"]["start"], fetch_info_entry["range"]["end"]): + chunk = deserialize_chunk(data) # assume data is a reader that advances forwards + chunks[i] = chunk + # at this point data should be fully consumed +``` + +#### Step 4: Extract Term Data + +From the deserialized xorb data: + +1. Use the term's `range` to identify which chunks are needed +2. Extract only the chunks specified by `range.start` to `range.end-1` (end-exclusive) +3. Apply any range offsets if processing a partial file download + +```python +file_chunks = [] +for term in terms: + ... 
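    # "chunks" maps chunk index -> decompressed chunk bytes, built by the
    # deserialization step above for this term's matched fetch_info range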
+ for i in range(term["range"]["start"], term["range"]["end"]): + chunk = chunks[i] + # it is possible that the offset captures multiple chunks, so we may need to skip whole chunks + if offset_into_first_range > len(chunk): + offset_into_first_range -= len(chunk) + continue + if offset_info_first_range > 0: + chunk = chunk[offset_into_first_range:] + offset_info_first_range = 0 + + file_chunks.push(chunk) +``` + +#### Step 5: Stitch Results Together + +Write all of the chunks to the output file or buffer. + +If a range was specified then the total data will need to be truncated to the amount of bytes requested. +When a range is specified but the range does not end on a chunk boundary the last byte of the requested range will be in the middle of the last chunk. +A client knows the start of the data from `offset_into_first_range` and can then use the length of the specified range to know end end offset. + +```python +with open(file_path) as f: + for chunk in file_chunks: + f.write(chunk) +``` + +## Range Downloads + +For partial file downloads, the reconstruction API supports range queries: + +- Include `Range: bytes=start-end` header in reconstruction request +- The `offset_into_first_range` field indicates where your range starts within the first term +- The end of the content will need to be truncated to fit the requested range. + - Except if the requested range exceeds the total file length, then the returned content will be shorter and no truncation is necessary. + +When downloading individual term data: + +A client MUST include the `Range` header formed with the values from the `url_range` field to specify the exact range of data of a xorb that they are accessing. Not specifying this header will cause result in an authorization failure. + +Xet global deduplication requires that access to xorbs is only granted to authorized ranges. +Not specifying this header will result in an authorization failure. + +## Performance Considerations + +- **Range coalescing**: Multiple terms may share the same fetch info for efficiency, so a single fetch info may be larger than any 1 term and could be used to fulfil multiple terms. +Consider downloading such content only once and reusing the data. +- **Parallel downloads**: Terms can be downloaded in parallel, but MUST be assembled in order + - On file systems with fast seeking, it MAY be advantageous to open the output file in different threads and writing contents at different offsets +- **Caching**: Clients SHOULD consider caching downloaded xorb ranges to avoid redundant requests +- **Retry logic**: Implement exponential backoff for transient failures + +### Caching recommendations + +1. It can be ineffective to cache the reconstruction object + 1. The fetch_info section provides short-expiration pre-signed URL's hence Clients SHOULD NOT cache the urls beyond their short expiration + 2. To get those URL's to access the data you will need to call the reconstruction API again anyway +2. Cache chunks by range not just individually + 1. If you need a chunk from a xorb it is very likely that you will need another, so cache them close +3. Caching helps when downloading similar contents. 
May not be worth to cache data if you are always downloading different things + +## More complex QueryReconstruction Example + +Here's an example of a serialized `QueryReconstructionResponse` struct that shows how file reconstruction would work across multiple xorbs: + +```json +{ + "offset_into_first_range": 0, + "terms": [ + { + "hash": "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + "unpacked_length": 263873, + "range": { + "start": 1, + "end": 4 + } + }, + { + "hash": "fedcba0987654321098765432109876543210fedcba098765432109876543", + "unpacked_length": 143890, + "range": { + "start": 0, + "end": 3 + } + }, + { + "hash": "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + "unpacked_length": 3063572, + "range": { + "start": 3, + "end": 43 + } + }, + ], + "fetch_info": { + "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456": [ + { + "range": { + "start": 1, + "end": 43 + }, + "url": "https://transfer.xethub.hf.co/xorb/default/a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130721%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20130721T201207Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=d6796aa6097c82ba7e33b4725e8396f8a9638f7c3d4b5a6b7c8d9e0f1a2b3c4d", + "url_range": { + "start": 57980, + "end": 1433008 + } + } + ], + "fedcba0987654321098765432109876543210fedcba098765432109876543": [ + { + "range": { + "start": 0, + "end": 3 + }, + "url": "https://transfer.xethub.hf.co/xorb/default/fedcba0987654321098765432109876543210fedcba098765432109876543?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130721%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20130721T201207Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=d6796aa6097c82ba7e33b4725e8396f8a9638f7c3d4b5a6b7c8d9e0f1a2b3c4d", + "url_range": { + "start": 0, + "end": 65670 + } + } + ] + } +} +``` + +This example shows reconstruction of a file that requires: + +- Chunks `[1, 4)` from the first xorb (~264KB of unpacked data) +- Chunks `[0, 2)` from the second xorb (~144KB of unpacked data) +- Chunks `[3, 43)` from the same xorb from the first term (~3MB of unpacked data) + +The `fetch_info` provides the HTTP URLs and byte ranges needed to download the required chunk data from each xorb. The ranges provided within `fetch_info` and term sections are always end-exclusive i.e. `{ "start": 0, "end": 3 }` is a range of 3 chunks at indices 0, 1 and 2. +The ranges provided under a `fetch_info` items' `url_range` key are to be used to form the `Range` header when downloading the chunk range. +A `"url_range"` value of `{ "start": X, "end": Y }` creates a `Range` header value of `bytes=X-Y`. + +When downloading and deserializing the chunks from xorb `a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456` we will have the chunks at indices `[1, 43)`. +We will need to only use the chunks at `[1, 4)` to fulfill the first term and then chunks `[3, 43)` to fulfill the third term. +Note that in this example the chunk at index 3 is used twice! This is the benefit of deduplication; we only need to download the chunk content once. + +## Diagram + +```mermaid +sequenceDiagram + autonumber + actor client as Client + participant S as CAS API + participant Transfer as Transfer Service (Xet storage) + + client->>S: GET /reconstructions/{file_id}
Authorization: Bearer {token}
Range: bytes=start-end (optional) + S-->>client: 200 OK
QueryReconstructionResponse {offset_into_first_range, terms[], fetch_info{}} + + loop For each term in terms (ordered) + client->>client: Find fetch_info by xorb hash, entry whose range contains term.range + client->>Transfer: GET {url}
Range: bytes=url_range.start-url_range.end + Transfer-->>client: 206 Partial Content
xorb byte range + client->>client: Deserialize xorb → chunks for fetch_info.range + client->>client: Trim to term.range, apply offset for first term + client->>client: Append chunks to output + end + + alt Range requested + client->>client: Truncate output to requested length + end +``` diff --git a/docs/xet-protocol/file-id.md b/docs/xet-protocol/file-id.md new file mode 100644 index 000000000..9e4bae8af --- /dev/null +++ b/docs/xet-protocol/file-id.md @@ -0,0 +1,33 @@ +# Getting a Xet File ID from the Hugging Face Hub + +This section explains the Xet file ID used in the reconstruction API to download a file from the Hugging Face Hub using the xet protocol. + +Given a particular namespace, repository and branch or commit hash and file path from the root of the repository, build the "resolve" URL for the file following this format: + +```js +Define: +namespace: username/organization, e.g. Qwen +repository: the repository name e.g. Qwen-Image-Edit +branch: any git branch or commit hash e.g. main +filepath: filepath in repository e.g. transformer/diffusion_pytorch_model-00001-of-00009.safetensors + +resolve URL: + +https://huggingface.co/{namespace}/{repository}/resolve/{branch}/{filepath} + + +Example: + +https://huggingface.co/Qwen/Qwen-Image-Edit/resolve/main/transformer/diffusion_pytorch_model-00001-of-00009.safetensors +``` + +Then make a `GET` request to the resolve URL using your standard Hugging Face Hub credentials/token. + +If the file is stored on the xet system then a successful response will have a `X-Xet-Hash` header. + +The string value of this header is the Xet file ID and SHOULD be used in the path of the reconstruction API URL. +This is the string representation of the hash and can be used directly in the file reconstruction API on download. + +> [!NOTE] +> The resolve URL will return a 302 redirect http status code, following the redirect will download the content via the old LFS compatible route rather than through the Xet protocol. +In order to use the Xet protocol you MUST NOT follow this redirect. diff --git a/docs/xet-protocol/file-reconstruction.md b/docs/xet-protocol/file-reconstruction.md new file mode 100644 index 000000000..9e70f20f7 --- /dev/null +++ b/docs/xet-protocol/file-reconstruction.md @@ -0,0 +1,133 @@ +# File Reconstruction: Term-based Representation + +This document describes how a file can be represented and reconstructed from a compact, deduplicated form using a series of terms. Each term specifies where to source data (a content-addressed container called a xorb) and which chunk indices within that container are required. + +## Glossary + +- **Xorb**: A content-addressed container holding a sequence of chunks. It is identified by a cryptographic hash (the “xorb hash”). +- **Chunk**: A unit of data inside a xorb. Chunks are ordered and individually decompressible/decodable into raw bytes. +- **Chunk index**: The 0-based position of a chunk within a particular xorb. +- **Term**: A pair that identifies a xorb and a contiguous chunk index range within that xorb: `(xorb hash, chunk range [start, end))`. +- **Reconstruction**: A list of terms. + +## Core Idea + +After following the [chunking procedure](./chunking) a file can be represented as an ordering of chunks. +Those chunks are then packed into [xorbs](./xorb) and given the set of xorbs we convert the file representation to "reconstruction" made up of "terms". 
+When forming xorbs the ordering and grouping of chunks prioritizes contiguous runs of chunks that appear in a file such that when referencing a xorb we maximize the term range length. + +Any file’s raw bytes can be described as the concatenation of data produced by a sequence of terms. +Each term references a contiguous range of chunks within a particular xorb. +The file is reconstructed by retrieving those chunk ranges, decoding them to raw bytes, and concatenating in order. + +### Diagram + +A file with 4 terms. Each term is a pointer to chunk range within a xorb. + +```txt +File Reconstruction + + ┌----------------------------┬┬--------------------------┬┬---------------------------┬┬-------------------------┐ + ¦ X0 ¦¦ X1 ¦¦ X2 ¦¦ X3 ¦ + ¦ start: 0 ¦¦ start: 0 ¦¦ start: 300 ¦¦ start: 300 ¦ + ¦ end: 1024 ¦¦ end: 700 ¦¦ end: 1024 ¦¦ end: 700 ¦ + ├----------------------------++--------------------------++---------------------------++-------------------------┤ + ¦ /¦ /\ /\ / + ¦ / ¦ / \ ¦ \ / + ¦ / ¦ /---/ \----\ ¦ \----\ / + ¦ ¦ ¦ / \ ¦ \ / + ¦ ¦ ¦ / \ ¦ \ / + ┌-------------------------┐ ┌-------------------------┐ ┌-------------------------┐ ┌-------------------------┐ + ¦ X0 ¦ ¦ X1 ¦ ¦ X2 ¦ ¦ X3 ¦ + ¦ ¦ ¦ ¦ ¦ ¦ ¦ ¦ + ¦0 1024¦ ¦0 1000¦ ¦0 1090¦ ¦0 870 ¦ + └-------------------------┘ └-------------------------┘ └-------------------------┘ └-------------------------┘ +``` + +## Term Format + +Each term consists of: + +- **Xorb hash**: A 32 byte hash value that is the key to the xorb in the CAS (Content Addressed Store). +- **Chunk range**: A half-open interval [start, end) of chunk indices within that xorb. The range includes the chunk at index `start` and excludes the chunk at index `end`. + +## Reconstruction Rules + +Given an ordered list of terms describing a file: + +1. For each term, fetch the specified chunk range from the identified xorb. +2. Decode/decompress the chunks into raw bytes, preserving the original order. +3. If reconstructing the entire file, concatenate the decoded outputs of all terms in their listed order. +4. If reconstructing a byte sub-range of the file, the first and last terms may be partially used: + - Skip a prefix of bytes within the first term’s decoded output so the file-level range starts at the requested offset. + - Truncate the tail of the last term’s decoded output so the file-level range ends at the requested position. + +### Ordering and Coverage + +- Terms are ordered according to the file’s byte order. Concatenating their decoded outputs yields the requested file region. +- Gaps MUST NOT be present. If gaps exist, the reconstruction would not produce a contiguous byte stream. + +### Multiple Terms per Xorb and Coalescing + +- A file may contain multiple terms that reference the same xorb, potentially with disjoint chunk ranges. This enables deduplication across distant parts of the file. +- When multiple terms target overlapping or adjacent chunk ranges within the same xorb, implementations SHOULD coalesce these into a single retrieval to reduce I/O and request overhead, while preserving the term-level reconstruction semantics. + +### Chunk and Byte Boundaries + +- Chunk ranges are specified in chunk index space, not byte offsets. The decoded size of each chunk is not required to be uniform. +- The byte length contributed by a term equals the sum of the decoded sizes of its referenced chunks, minus any initial skip or final truncation when reconstructing sub-ranges. 
+- Slicing within a term (for sub-range reconstruction) happens at byte granularity in the decoded output of the addressed chunk sequence. + +### Determinism and Integrity + +- The xorb hash binds the identity of the underlying chunk set to its content. If the xorb hash is correct and the specified chunk range is retrieved and decoded as defined, the resulting bytes are deterministic. +- A file-level identity (e.g., a cryptographic hash of the reconstructed bytes) can be validated by reconstructing and hashing the result. + +### Example (Conceptual) + +Assume a file is represented by the following ordered terms: + +| Term | Xorb hash (conceptual) | Chunk range | +|------|-------------------------|-------------| +| 1 | X1 | [0, 5) | +| 2 | X2 | [3, 8) | +| 3 | X1 | [9, 12) | + +Reconstruction proceeds by obtaining chunks 0,1,2,3,4 from xorb X1, chunks 3,4,5,6,7 from xorb X2, and chunks 9,10,11 from xorb X1, decoding each contiguous range, and concatenating in the term order 1 → 2 → 3. + +## Serialization and Deserialization + +This section summarizes how the term-based reconstruction is persisted and exchanged. + +### Serialization into shards (file info section) + +A file’s reconstruction can be serialized into a shard as part of its file info section. +Conceptually, this section encodes the complete set of terms that describe the file. +When stored this way, the representation is canonical and sufficient to reconstruct the full file solely from its referenced xorb ranges. + +Reference: [shard format file info](./shard#2-file-info-section) + +### Deserialization from the reconstruction API (JSON) + +A reconstruction API can return a JSON object that carries the full reconstruction. +This response is represented by a structure named “QueryReconstructionResponse”, where the `terms` key enumerates the ordered list of terms required to reconstruct the entire file. +The `terms` list contains, for each term, the xorb identifier and the contiguous chunk index range to retrieve. +Other fields may provide auxiliary details (such as offsets or fetch hints) that optimize retrieval without altering the meaning of the `terms` sequence. + +Reference: [api](./api), [download protocol](./download-protocol) + +## Fragmentation and Why Longer Ranges Matter + +Fragmentation refers to representing a file with many very short, scattered ranges across many xorbs. While this can maximize deduplication opportunities, it often harms read performance and increases overhead. + +- Meaning of fragmentation: The file’s byte stream is assembled from numerous small term ranges, potentially spanning many xorbs. This implies more lookups, more range fetches, and poorer locality. +- Costs of fragmentation: + - Larger reconstruction objects + - Potentially more network requests (overhead per request) +- Why prioritize longer ranges: + - Fewer, longer ranges reduce round-trips and enable more sequential reads + - Simpler scheduling and write patterns during reconstruction + +In practice there is a balance: longer ranges improve reconstruction performance, while finer granularity can increase deduplication savings. +Favoring longer contiguous chunk ranges within the same xorb, and coalescing adjacent or overlapping ranges when feasible, helps maintain good read performance without sacrificing correctness. +In `xet-core` we use a fragmentation prevention mechanism that targets that the average term contains 8 chunks. 
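+
+To tie the sections above together, here is an informal Python sketch of reconstructing a byte range of a file from its terms. It is not part of the specification: `fetch_chunks` is a hypothetical helper that downloads and decodes chunks `[start, end)` of a xorb into raw bytes, and a real implementation would apply the prefix skip and tail truncation per term rather than buffering the whole file.
+
+```python
+def reconstruct(terms, fetch_chunks, file_offset=0, length=None):
+    # terms: ordered list of (xorb_hash, chunk_start, chunk_end) tuples,
+    # with chunk ranges interpreted as half-open intervals [start, end).
+    out = bytearray()
+    for xorb_hash, chunk_start, chunk_end in terms:
+        # Decode the referenced chunk range to raw bytes and append in order.
+        out.extend(fetch_chunks(xorb_hash, chunk_start, chunk_end))
+
+    # For a sub-range request, skip a prefix and truncate the tail.
+    data = bytes(out)
+    end = None if length is None else file_offset + length
+    return data[file_offset:end]
+```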
diff --git a/docs/xet-protocol/hashing.md b/docs/xet-protocol/hashing.md new file mode 100644 index 000000000..6b2c3ab7c --- /dev/null +++ b/docs/xet-protocol/hashing.md @@ -0,0 +1,178 @@ +# Hashing + +- [Chunk hashes](#chunk-hashes) - compute for each chunk from chunk data. +- [Xorb Hashes](#xorb-hashes) - compute for each xorb from its chunk hashes. +- [File Hashes](#file-hashes) - compute for each file from its chunk hashes. +- [Term Verification Hashes](#term-verification-hashes) - compute for each term in a reconstruction when serializing a shard from the chunk hashes in the xorb that is used in that term. + +The Xet protocol utilizes a few different hashing types. + +All hashes referenced are 32 bytes (256 bits) long. + +## Chunk Hashes + +After cutting a chunk of data, the chunk hash is computed via a blake3 keyed hash with the following key (DATA_KEY): + +### DATA_KEY + +```json +[ + 102, 151, 245, 119, 91, 149, 80, 222, 49, 53, 203, 172, 165, 151, 24, 28, 157, 228, 33, 16, 155, 235, 43, 88, 180, 208, 176, 75, 147, 173, 242, 41 +] +``` + +[reference implementation](https://github.com/huggingface/xet-core/blob/main/merklehash/src/data_hash.rs#L308-L311) + +## Xorb Hashes + +Xorbs are composed of a series of chunks; given the series of chunks that make up a xorb, to compute the hash or xorb hash we will compute a MerkleHash using a [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree) data structure with custom hashing functions. +**The xorb hash will be the root node hash of the MerkleTree.** + +The leaf node hashes are the chunk hashes as described in the previous section. + +The hash function used to compute internal node hashes is as follows: + +- concatenate the hashes together such that for each chunk there is a line in order formatted like `{chunk_hash:x} : {size}\n` + - the hash first in lowercase hex format (64 hex characters e.g. `a3f91d6e8b47c20ff9d84a1c77dcb8e5a91e6fbf2b2d483af6d3c1e90ac57843`) + - a space, a colon, a space (` : `) + - the chunk length number e.g. 64000 + - finally a newline `\n` character +- Then take the bytes from this string and compute a blake3 keyed hash with the following key (INTERNAL_NODE_KEY) + +[reference implementation](https://github.com/huggingface/xet-core/blob/main/merklehash/src/aggregated_hashes.rs#L103-L109) + +### INTERNAL_NODE_KEY + +```json +[ + 1, 126, 197, 199, 165, 71, 41, 150, 253, 148, 102, 102, 180, 138, 2, 230, 93, 221, 83, 111, 55, 199, 109, 210, 248, 99, 82, 230, 74, 83, 113, 63 +] +``` + +### Example of data for internal node + +Consider that a node were 4 chunks with the following pairs of hashes and lengths: + +```txt +hash,length (bytes) +1f6a2b8e9d3c4075a2e8c5fd4f0b763e6f3c1d7a9b2e6487de3f91ab7c6d5401,10000 +7c94fe2a38bdcf9b4d2a6f7e1e08ac35bc24a7903d6f5a0e7d1c2b93e5f748de,20000 +cfd18a92e0743bb09e56dbf76ea2c34d99b5a0cf271f8d429b6cd148203df061,25000 +e38d7c09a21b4cf8d0f92b3a85e6df19f7c20435e0b1c78a9d635f7b8c2e4da1,64000 +``` + +Then to form the buffer to compute the internal node hash we will create this string (note the `\n` newline at the end): + +```txt +"1f6a2b8e9d3c4075a2e8c5fd4f0b763e6f3c1d7a9b2e6487de3f91ab7c6d5401 : 10000 +7c94fe2a38bdcf9b4d2a6f7e1e08ac35bc24a7903d6f5a0e7d1c2b93e5f748de : 20000 +cfd18a92e0743bb09e56dbf76ea2c34d99b5a0cf271f8d429b6cd148203df061 : 25000 +e38d7c09a21b4cf8d0f92b3a85e6df19f7c20435e0b1c78a9d635f7b8c2e4da1 : 64000 +" +``` + +Then compute the blake3 keyed hash with INTERNAL_NODE_KEY to get the final hash. 
+
+### Example Python code for the internal hash function
+
+```python
+from blake3 import blake3
+
+def internal_hash_function(node):
+    # node is the ordered list of raw chunk data (bytes) under this node.
+    # compute_chunk_hash is assumed to return the chunk hash in its 64-character
+    # string form (see "Converting Hashes to Strings" in the CAS API document).
+    buffer = ""
+    for chunk in node:
+        size = len(chunk)
+        chunk_hash = compute_chunk_hash(chunk)
+        buffer += f"{chunk_hash} : {size}\n"
+
+    # INTERNAL_NODE_KEY is the 32-byte key listed above (a list of integers).
+    return blake3(buffer.encode(), key=bytes(INTERNAL_NODE_KEY)).digest()
+```
+
+## File Hashes
+
+After chunking a whole file, to compute the file hash, follow the same procedure used to compute the xorb hash and then take that final hash as data to compute a blake3 keyed hash with a key that is all 0's.
+
+This means creating a MerkleTree using the same hashing functions described in the previous section.
+Then take the root node's hash and compute a blake3 keyed hash of it with the key being 32 0-value bytes.
+
+[reference implementation](https://github.com/huggingface/xet-core/blob/main/merklehash/src/aggregated_hashes.rs#L123-L125)
+
+## Term Verification Hashes
+
+When uploading a shard, each term in each file info in the shard MUST have a matching FileVerificationEntry section that contains a hash.
+
+To generate this hash, take the chunk hashes for the specific range of chunks that make up the term and:
+
+1. **Concatenate the raw hash bytes**: Take all the chunk hashes in the range (from `chunk_index_start` to `chunk_index_end` in the xorb specified in the term) and concatenate their raw 32-byte representations together in order.
+
+2. **Apply keyed hash**: Compute a blake3 keyed hash of the concatenated bytes using the following verification key (VERIFICATION_KEY):
+
+### VERIFICATION_KEY
+
+```json
+[
+    127, 24, 87, 214, 206, 86, 237, 102, 18, 127, 249, 19, 231, 165, 195, 243, 164, 205, 38, 213, 181, 219, 73, 230, 65, 36, 152, 127, 40, 251, 148, 195
+]
+```
+
+The result of the blake3 keyed hash is the verification hash that MUST be used in the FileVerificationEntry for the term.
+
+[reference implementation](https://github.com/huggingface/xet-core/blob/main/mdb_shard/src/chunk_verification.rs#L4-L16)
+
+### Example Python code for the verification hash
+
+```python
+from blake3 import blake3
+
+def verification_hash_function(term):
+    buffer = bytearray()
+    # note chunk ranges are end exclusive
+    for chunk_hash in term.xorb.chunk_hashes[term.chunk_index_start : term.chunk_index_end]:
+        # chunk_hash is the raw 32-byte hash value of the chunk
+        buffer.extend(bytes(chunk_hash))
+    # VERIFICATION_KEY is the 32-byte key listed above (a list of integers).
+    return blake3(bytes(buffer), key=bytes(VERIFICATION_KEY)).digest()
+```
+
+## Reference Files
+
+Reference files are provided in the Hugging Face Dataset repository [xet-team/xet-spec-reference-files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files).
+
+This repository contains a number of samples that implementors can use to verify their hash computations.
+
+> Note that all hashes are represented as strings.
+> To get the raw value of these hashes you must reverse the byte order within each 8-byte group of the hash, inverting the procedure described in [api](./api#converting-hashes-to-strings).
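+
+Since the reference files identify hashes by their string form, the following sketch shows one way to convert between the raw 32-byte value and the string form. It is simply the procedure from [api](./api#converting-hashes-to-strings) and its inverse; the function names are illustrative, not part of the protocol.
+
+```python
+def hash_to_string(raw: bytes) -> str:
+    # Reverse the byte order within each 8-byte group, then hex-encode.
+    assert len(raw) == 32
+    return b"".join(raw[i:i + 8][::-1] for i in range(0, 32, 8)).hex()
+
+def string_to_hash(s: str) -> bytes:
+    # Inverse of hash_to_string: hex-decode, then reverse each 8-byte group again.
+    raw = bytes.fromhex(s)
+    assert len(raw) == 32
+    return b"".join(raw[i:i + 8][::-1] for i in range(0, 32, 8))
+```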
+ +### Chunk Hashes Sample + +There are 3 chunks files, for each file name, the first 64 characters are the string format of the chunk hash of the data in the file: + +- [b10aa1dc71c61661de92280c41a188aabc47981739b785724a099945d8dc5ce4.chunk](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/b10aa1dc71c61661de92280c41a188aabc47981739b785724a099945d8dc5ce4.chunk) +- [26255591fa803b6baf25d88c315b8a6f5153d5bcfdf18ec5ef526264e0ccc907.chunk](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/26255591fa803b6baf25d88c315b8a6f5153d5bcfdf18ec5ef526264e0ccc907.chunk) +- [099cb228194fe640e36a6c7d274ee5ed3a714ccd557a0951d9b6b43a7292b5d1.chunk](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/099cb228194fe640e36a6c7d274ee5ed3a714ccd557a0951d9b6b43a7292b5d1.chunk) + +### File Hash Sample + +The [xet-team/xet-spec-reference-files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files) repository contains the original file +[Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv). + +When processed through the Xet upload protocol the chunks that are produced for this file are listed (formatted ` `) in the file +[Electric_Vehicle_Population_Data_20250917.csv.chunks](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.chunks). + +Using these chunks to compute a file hash of the entire file the result is the hash stored in the file +[Electric_Vehicle_Population_Data_20250917.csv.xet-file-hash](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.xet-file-hash) or the raw value `118a53328412787fee04011dcf82fdc4acf3a4a1eddec341c910d30a306aaf97`. + +### Xorb Hash Sample + +All of the chunks of [Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv) can fit into 1 single xorb. + +The xorb produced with all of the chunks in order for this file can be found serialized in file [eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb). + +The hash of this xorb is `eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632`, the value in [Electric_Vehicle_Population_Data_20250917.csv.xet-xorb-hash](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.xet-xorb-hash). + +The chunks that make up this xorb are listed in a file [eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb.chunks](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb.chunks); +note this file is equivalent to [Electric_Vehicle_Population_Data_20250917.csv.chunks](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.chunks). 
+ +### Range Hash Sample + +In the reconstruction of [Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv) +with xorb [eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb) there is 1 range that contains all 796 chunks. + +The verification range hash for this range is the value in [eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb.range-hash](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb.range-hash) +which is `d81c11b1fc9bc2a25587108c675bbfe65ca2e5d350b0cd92c58329fcc8444178`. diff --git a/docs/xet-protocol/index.md b/docs/xet-protocol/index.md new file mode 100644 index 000000000..ab27c53ad --- /dev/null +++ b/docs/xet-protocol/index.md @@ -0,0 +1,51 @@ +# Xet Protocol Specification + +> [!NOTE] +> Version 0.1.0 (1.0.0 on release) +> The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 [RFC2119](https://www.ietf.org/rfc/rfc2119.txt) [RFC8174](https://www.ietf.org/rfc/rfc8174.txt) +when, and only when, they appear in all capitals, as shown here. + +This specification defines the end-to-end Xet protocol for content-addressed data: chunking and hashing rules, deduplication strategy, xorb and shard object formats, file reconstruction semantics, authentication, and the CAS APIs for upload and download. +Its goal is interoperability and determinism: independent implementations MUST produce the same hashes, objects, and API behavior so data written by one client can be read by another with integrity and performance. +Implementors can create their own clients, SDKs, and tools that speak the Xet protocol and interface with the CAS service, as long as they MUST adhere to the requirements defined here. + +## Building a Client Library for Xet Storage + +- [Upload Protocol](./upload-protocol): End-to-end top level description of the upload flow. +- [Download Protocol](./download-protocol): Instructions for the download procedure. +- [CAS API](./api): HTTP endpoints for reconstruction, global chunk dedupe, xorb upload, and shard upload, including error semantics. +- [Authentication and Authorization](./auth): How to obtain Xet tokens from the Hugging Face Hub, token scopes, and security considerations. +- [Converting Hugging Face Hub Files to Xet File ID's](./file-id): How to obtain a Xet file id from the Hugging Face Hub for a particular file in a model or dataset repository. + +## Overall Xet Architecture + +- [Content-Defined Chunking](./chunking): Gearhash-based CDC with parameters, boundary rules, and performance optimizations. +- [Hashing Methods](./hashing): Descriptions and definitions of the different hashing functions used for chunks, xorbs and term verification entries. +- [File Reconstruction](./file-reconstruction): Defining "term"-based representation of files using xorb hash + chunk ranges. +- [Xorb Format](./xorb): Explains grouping chunks into xorbs, 64 MiB limits, binary layout, and compression schemes. +- [Shard Format](./shard): Binary shard structure (header, file info, CAS info, footer), offsets, HMAC key usage, and bookends. 
+- [Deduplication](./deduplication): Explanation of chunk-level dedupe, including global system-wide chunk-level dedupe.
+
+## Reference Implementation
+
+### xet-core: hf-xet + git-xet
+
+The primary reference implementation of the protocol, written in Rust 🦀, lives in the [xet-core](https://github.com/huggingface/xet-core) repository under multiple crates:
+
+- [cas_types](https://github.com/huggingface/xet-core/tree/main/cas_types) - Common re-usable types for interacting with the CAS APIs.
+- [cas_client](https://github.com/huggingface/xet-core/tree/main/cas_client) - Client interface that calls the CAS APIs, including a comprehensive implementation of the download protocol.
+- [mdb_shard](https://github.com/huggingface/xet-core/tree/main/mdb_shard) - Library for interacting with shards and the shard binary format.
+- [deduplication](https://github.com/huggingface/xet-core/tree/main/deduplication) - Exposes interfaces to deduplicate chunks locally and using global deduplication.
+  - [deduplication/src/chunking.rs](https://github.com/huggingface/xet-core/blob/main/deduplication/src/chunking.rs) - The reference implementation of the chunking algorithm.
+- [merklehash](https://github.com/huggingface/xet-core/tree/main/merklehash) - Exports a `MerkleHash` type extensively used to represent hashes, and functions to compute the different hashes used to track chunks, xorbs and files.
+- [data](https://github.com/huggingface/xet-core/tree/main/data) - Comprehensive package exposing interfaces to upload and download content.
+- [hf_xet](https://github.com/huggingface/xet-core/tree/main/hf_xet) - Python bindings to use the Xet protocol for uploads and downloads with the Hugging Face Hub.
+- [git-xet](https://github.com/huggingface/xet-core/tree/main/git-xet) - Git LFS custom transfer agent that uploads files using the Xet protocol to the Hugging Face Hub.
+
+### huggingface.js
+
+There is also a second reference implementation in huggingface.js that can be used when downloading or uploading files with the `@huggingface/hub` library.
+
+- Download uses the `XetBlob` class found in [XetBlob.ts](https://github.com/huggingface/huggingface.js/blob/main/packages/hub/src/utils/XetBlob.ts).
+- The upload implementation is more comprehensive, but the root of it begins in [uploadShards](https://github.com/huggingface/huggingface.js/blob/main/packages/hub/src/utils/uploadShards.ts).
+  - The upload process uses xet-core constructs compiled from Rust to WebAssembly, particularly the functions exported from the [hf_xet_thin_wasm](https://github.com/huggingface/xet-core/tree/main/hf_xet_thin_wasm) crate.
diff --git a/docs/xet-protocol/shard.md b/docs/xet-protocol/shard.md
new file mode 100644
index 000000000..ab62b370e
--- /dev/null
+++ b/docs/xet-protocol/shard.md
@@ -0,0 +1,549 @@
+# MDB Shard File Format Specification
+
+A Shard is a serialized object containing file reconstruction information and xorb metadata for deduplication purposes.
+
+The shard format is the vehicle for uploading file reconstructions and for communicating information about the xorbs and chunks that clients can deduplicate their data against.
+
+## Overview
+
+The MDB (Merkle Database) shard file format is a binary format used to store file metadata and content-addressable storage (CAS) information for efficient deduplication and retrieval.
+This document describes the binary layout and deserialization process for the shard format.
+Implementors of the xet protocol MUST use the shard format when implementing the [upload protocol](./upload-protocol).
+The shard format is used by the shard upload API (to record files) and by the global deduplication API.
+
+## Use As API Request and Response Bodies
+
+The shard format is used in the shard upload API as the request payload and in the global deduplication/chunk query API as the response payload.
+
+### Shard Upload
+
+The shard in this case is a serialization format that allows clients to denote the files that they are uploading.
+Each file reconstruction maps to a File Info block in the File Info section.
+Additionally, all new xorbs that the client created are mapped to items (CAS Info blocks) in the CAS Info section so that they may be deduplicated against in the future.
+
+When uploading a shard the footer section MUST be omitted.
+
+An example of a shard that can be used for file upload can be found in the [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.shard.verification-no-footer).
+A version of this shard that also contains the footer is available in the [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.shard.verification) as well; see the README of the reference files dataset for more context.
+
+### Global Deduplication
+
+Shards returned by the Global Deduplication API have an empty File Info section and only contain relevant information in the CAS Info section.
+The CAS Info section returned by this API describes xorbs, one of which contains the chunk that was queried.
+Clients can deduplicate their content against any of the xorbs described in any CAS Info block of the returned shard.
+The other xorbs described in the returned shard are more likely than average to reference content that the client also has.
+
+An example of a shard that can be returned for a global deduplication query can be found in the [Xet reference files](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv.shard.dedupe).
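+
+As an informal illustration, the sketch below shows how a client might use a shard returned by the global deduplication API. The helper names are hypothetical: `parse_shard` stands for a deserializer implementing the layout described in the rest of this document, and `hmac_protect` applies the footer's `chunk_hash_hmac_key` to a raw chunk hash as described under [HMAC Key Protection](#hmac-key-protection).
+
+```python
+def find_dedupe_matches(shard_bytes, local_chunk_hashes):
+    shard = parse_shard(shard_bytes)        # hypothetical deserializer
+    key = shard.footer.chunk_hash_hmac_key  # 32 bytes, may be all zeros
+
+    # Index every chunk listed in the CAS Info section by its (possibly
+    # HMAC-protected) hash, remembering the owning xorb and chunk index.
+    known = {}
+    for cas_block in shard.cas_info_blocks:
+        for index, entry in enumerate(cas_block.chunks):
+            known[entry.chunk_hash] = (cas_block.cas_hash, index)
+
+    matches = {}
+    for chunk_hash in local_chunk_hashes:
+        lookup = hmac_protect(chunk_hash, key) if any(key) else chunk_hash
+        if lookup in known:
+            # The chunk already exists in CAS: reference it through the listed
+            # xorb and chunk index instead of re-uploading it.
+            matches[chunk_hash] = known[lookup]
+    return matches
+```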
+ +## File Structure + +A shard file consists of the following sections in order: + +```txt +┌─────────────────────┐ +│ Header │ +├─────────────────────┤ +│ File Info Section │ +├─────────────────────┤ +│ CAS Info Section │ +├─────────────────────┤ +│ Footer │ +└─────────────────────┘ +``` + +## Overall File Layout with Byte Offsets + +```txt +Offset 0: +┌───────────────────────────────────────────────────────┐ +│ Header (48 bytes) │ ← Fixed size +└───────────────────────────────────────────────────────┘ + +Offset footer.file_info_offset: +┌───────────────────────────────────────────────────────┐ +│ │ +│ File Info Section │ ← Variable size +│ (Multiple file blocks + │ +│ bookend entry) │ +│ │ +└───────────────────────────────────────────────────────┘ + +Offset footer.cas_info_offset: +┌───────────────────────────────────────────────────────┐ +│ │ +│ CAS Info Section │ ← Variable size +│ (Multiple CAS blocks + │ +│ bookend entry) │ +│ │ +└───────────────────────────────────────────────────────┘ + +Offset footer.footer_offset: +┌───────────────────────────────────────────────────────┐ +│ Footer (200 bytes, sometimes omitted) │ ← Fixed size +└───────────────────────────────────────────────────────┘ +``` + +## Constants + +- `MDB_SHARD_HEADER_VERSION`: 2 +- `MDB_SHARD_FOOTER_VERSION`: 1 +- `MDB_FILE_INFO_ENTRY_SIZE`: 48 bytes (size of each file info structure) +- `MDB_CAS_INFO_ENTRY_SIZE`: 48 bytes (size of each CAS info structure) +- `MDB_SHARD_HEADER_TAG`: 32-byte magic identifier + +## Data Types + +All multi-byte integers are stored in little-endian format. + +- `u8`: 8-bit unsigned integer +- `u32`: 32-bit unsigned integer +- `u64`: 64-bit unsigned integer +- Byte Array types are denoted like in rust as `[u8; N]` where `N` is the number of bytes in the array. +- Hash: 32-byte hash value, a special `[u8; 32]` + +## 1. Header (MDBShardFileHeader) + +**Location**: Start of file (offset 0) +**Size**: 48 bytes + +```rust +struct MDBShardFileHeader { + tag: [u8; 32], // Magic number identifier + version: u64, // Header version (must be 2) + footer_size: u64, // Size of footer in bytes, set to 0 if footer is omitted +} +``` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬───────────┬───────────┐ +│ tag (32 bytes) │ version │ footer_sz │ +│ Magic Number Identifier │ (8 bytes) │ (8 bytes) │ +└────────────────────────────────────────────────────────────────┴───────────┴───────────┘ +0 32 40 48 +``` + +**Deserialization steps**: + +1. Read 32 bytes for the magic tag +2. Verify tag matches `MDB_SHARD_HEADER_TAG` +3. Read 8 bytes for version (u64) +4. Verify version equals 2 +5. Read 8 bytes for footer_size (u64) + +> [!NOTE] +> When serializing, footer_size MUST be the number of bytes that make up the footer, or 0 if the footer is omitted. + +## 2. File Info Section + +**Location**: `footer.file_info_offset` to `footer.cas_info_offset` or directly after the header + +This section contains a sequence of 0 or more file information (File Info) blocks, each consisting at least a header and at least 1 data sequence entry, and OPTIONAL verification entries and metadata extension section. +The file info section ends when reaching the bookend entry. + +Each File Info block within the overall section is a serialization of a [file reconstruction](./file-reconstruction) into a binary format. 
+For each file, there is a `FileDataSequenceHeader` and for each term a `FileDataSequenceEntry` with OPTIONAL a matching `FileVerificationEntry` and also OPTIONAL at the end a `FileMetadataExt`. + +A shard File Info section can contain more than 1 File Info block in series, after completing reading all the content for 1 file description, the next one immediately begins. +If when reading the header of the next section a reader encounters the bookend entry that means the file info section is over; you have read the last file description in this shard. + +### File Info Section Layout + +**Without Optional Components**: + +```txt +┌─────────────────────┐ +│ FileDataSeqHeader │ ← File 1 +├─────────────────────┤ +│ FileDataSeqEntry │ +├─────────────────────┤ +│ FileDataSeqEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ FileDataSeqHeader │ ← File 2 +├─────────────────────┤ +│ FileDataSeqEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ Bookend Entry │ ← All 0xFF hash + zeros +└─────────────────────┘ +``` + +**With All Optional Components**: + +```txt +┌─────────────────────┐ +│ FileDataSeqHeader │ ← File 1 (flags indicate verification + metadata) +├─────────────────────┤ +│ FileDataSeqEntry │ +├─────────────────────┤ +│ FileDataSeqEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ FileVerifyEntry │ ← One per FileDataSeqEntry +├─────────────────────┤ +│ FileVerifyEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ FileMetadataExt │ ← One per file (if flag set) +├─────────────────────┤ +│ FileDataSeqHeader │ ← File 2 +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ Bookend Entry │ ← All 0xFF hash + zeros +└─────────────────────┘ +``` + +### FileDataSequenceHeader + +```rust +struct FileDataSequenceHeader { + file_hash: Hash, // 32-byte file hash + file_flags: u32, // Flags indicating conditional sections that follow + num_entries: u32, // Number of FileDataSequenceEntry structures + _unused: [u8; 8], // Reserved space 8 bytes +} +``` + +**File Flags**: + +- `MDB_FILE_FLAG_WITH_VERIFICATION` (0x80000000 or 1 << 31): Has verification entries +- `MDB_FILE_FLAG_WITH_METADATA_EXT` (0x40000000 or 1 << 30): Has metadata extension + +Given the `file_data_sequence_header.file_flags & MASK` (bitwise AND) operations, if the result != 0 then the effect is true. + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬──────────┬───────────┬────────────┐ +│ file_hash (32 bytes) │file_flags│num_entries│ _unused │ +│ File Hash Value │(4 bytes) │(4 bytes) │ (8 bytes) │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────┴────────────┘ +0 32 36 40 48 +``` + +### FileDataSequenceEntry + +Each `FileDataSequenceEntry` is 1 term is essentially the binary serialization of a [file reconstruction term](./file-reconstruction#term-format). + +```rust +struct FileDataSequenceEntry { + cas_hash: Hash, // 32-byte Xorb hash in the term + cas_flags: u32, // CAS flags (reserved for future, set to 0) + unpacked_segment_bytes: u32, // Term size when unpacked + chunk_index_start: u32, // Start chunk index within the Xorb for the term + chunk_index_end: u32, // End chunk index (exclusive) within the Xorb for the term +} +``` + +> [!NOTE] +> Note that when describing a chunk range in a `FileDataSequenceEntry` use ranges that are start-inclusive but end-exclusive i.e. 
`[chunk_index_start, chunk_index_end)` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐ +│ cas_hash (32 bytes) │cas_flags│unpacked │chunk_idx│chunk_idx│ +│ CAS Block Hash │(4 bytes)│seg_bytes│start │end │ +│ │ │(4 bytes)│(4 bytes)│(4 bytes)│ +└────────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────┘ +0 32 36 40 44 48 +``` + +### FileVerificationEntry (OPTIONAL) + +Verification Entries MUST be set for shard uploads. + +To generate verification hashes for shard upload read the section about [Verification Hashes](./hashing#Term-Verification-Hashes). + +```rust +struct FileVerificationEntry { + range_hash: Hash, // 32-byte verification hash + _unused: [u8; 16], // Reserved (16 bytes) +} +``` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬────────────────────────────────┐ +│ range_hash (32 bytes) │ _unused (16 bytes) │ +│ Verification Hash │ Reserved Space │ +└────────────────────────────────────────────────────────────────┴────────────────────────────────┘ +0 32 48 +``` + +When a shard has verification entries, all file info sections MUST have verification entries. +If only some subset of files in the shard have verification entries, the shard is considered invalid. +Every `FileDataSequenceEntry` will have a matching `FileVerificationEntry` in this case where the range_hash is computed with the chunk hashes for that range of chunks. + +For any file the nth `FileVerificationEntry` correlates to the nth `FileDataSequenceEntry`, and like `FileDataSequenceEntries` if there are verification entries there will be `file_data_sequence_header.num_entries` verification entries (following the num_entries data sequence entries). + +### FileMetadataExt (OPTIONAL) + +This section is REQUIRED per file for shards uploaded through the shard upload API. + +There is only 1 `FileMetadataExt` instance per file info block and it is the last component of that file info block when present. + +The sha256 field is the 32 byte SHA256 of the file contents of the file described. + +```rust +struct FileMetadataExt { + sha256: Hash, // 32-byte SHA256 hash + _unused: [u8; 16], // Reserved (16 bytes) +} +``` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬────────────────────────────────┐ +│ sha256 (32 bytes) │ _unused (16 bytes) │ +│ SHA256 Hash │ Reserved Space │ +└────────────────────────────────────────────────────────────────┴────────────────────────────────┘ +0 32 48 +``` + +### File Info Bookend + +The end of the file info sections is marked by a bookend entry. + +The bookend entry is 48 bytes long where the first 32 bytes are all `0xFF`, followed by 16 bytes of all `0x00`. + +Suppose you were attempting to deserialize a `FileDataSequenceHeader` and it's file hash was all 1 bits then this entry is a bookend entry and the next bytes start the next section. + +Since the file info section immediately follows the header, a client MAY skip deserializing the footer to know where it starts deserializing this section. +The file info section begins right after the header and ends when the bookend is reached. + +**Deserialization steps**: + +1. Seek to `footer.file_info_offset` +2. Read `FileDataSequenceHeader` +3. Check if `file_hash` is all `0xFF` (bookend marker) - if so, stop +4. Read `file_data_sequence_header.num_entries` × `FileDataSequenceEntry` structures +5. 
If `file_flags & MDB_FILE_FLAG_WITH_VERIFICATION != 0`: read `file_data_sequence_header.num_entries` × `FileVerificationEntry` +6. If `file_flags & MDB_FILE_FLAG_WITH_METADATA_EXT != 0`: read 1 × `FileMetadataExt` +7. Repeat from step 2 until bookend found + +## 3. CAS Info Section + +**Location**: `footer.cas_info_offset` to `footer.footer_offset` or directly after the file info section bookend + +This section contains CAS (Content Addressable Storage) block information. Each CAS Info block represents a xorb by first having a `CASChunkSequenceHeader` which contains the number of `CASChunkSequenceEntries` to follow that make up this block. The CAS Info section ends when reaching the bookend entry. + +### CAS Info Section Layout + +```txt +┌─────────────────────┐ +│ CASChunkSeqHeader │ ← CAS Block 1 +├─────────────────────┤ +│ CASChunkSeqEntry │ +├─────────────────────┤ +│ CASChunkSeqEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ CASChunkSeqHeader │ ← CAS Block 2 +├─────────────────────┤ +│ CASChunkSeqEntry │ +├─────────────────────┤ +│ ... │ +├─────────────────────┤ +│ Bookend Entry │ ← All 0xFF hash + zeros +└─────────────────────┘ +``` + +**Deserialization steps**: + +1. Seek to `footer.cas_info_offset` +2. Read `CASChunkSequenceHeader` +3. Check if `cas_hash` is all 0xFF (bookend marker) - if so, stop +4. Read `cas_chunk_sequence_header.num_entries` × `CASChunkSequenceEntry` structures +5. Repeat from step 2 until bookend found + +### CASChunkSequenceHeader + +```rust +struct CASChunkSequenceHeader { + cas_hash: Hash, // 32-byte Xorb hash + cas_flags: u32, // CAS flags (reserved for later, set to 0) + num_entries: u32, // Number of chunks in this Xorb + num_bytes_in_cas: u32, // Total size of all raw chunk bytes in this Xorb + num_bytes_on_disk: u32, // Length of the xorb after serialized when uploaded +} +``` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐ +│ cas_hash (32 bytes) │cas_flags│num_ │num_bytes│num_bytes│ +│ CAS Block Hash │(4 bytes)│entries │in_cas │on_disk │ +│ │ │(4 bytes)│(4 bytes)│(4 bytes)│ +└────────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────┘ +0 32 36 40 44 48 +``` + +### CASChunkSequenceEntry + +Every `CASChunkSequenceHeader` will have a `num_entries` number field. +This number is the number of `CASChunkSequenceEntry` items that should be deserialized that are associated with the xorb described by this CAS Info block. + +```rust +struct CASChunkSequenceEntry { + chunk_hash: Hash, // 32-byte chunk hash + chunk_byte_range_start: u32, // Start position in CAS block + unpacked_segment_bytes: u32, // Size when unpacked + _unused: [u8; 8], // Reserved space 8 bytes +} +``` + +**Memory Layout**: + +```txt +┌────────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────────────┐ +│ chunk_hash (32 bytes) │chunk_ │unpacked │ _unused │ +│ Chunk Hash │byte_ │segment_ │ (8 bytes) │ +│ │range_ │bytes │ │ +│ │start │(4 bytes)│ │ +│ │(4 bytes)│ │ │ +└────────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────────────┘ +0 32 36 40 48 +``` + +### CAS Info Bookend + +The end of the cas info sections is marked by a bookend entry. + +The bookend entry is 48 bytes long where the first 32 bytes are all `0xFF`, followed by 16 bytes of all `0x00`. 
+ +Suppose you were attempting to deserialize a `CASChunkSequenceHeader` and it's hash was all 1 bits then this entry is a bookend entry and the next bytes start the next section. + +Since the cas info section immediately follows the file info section bookend, a client MAY skip deserializing the footer to know where the cas info section starts starts deserialize this section, it begins right after the file info section bookend and ends when the next bookend is reached. + +## 4. Footer (MDBShardFileFooter) + +> [!NOTE] +> MUST NOT include the footer when serializing the shard as the body for the shard upload API. + +**Location**: End of file minus footer_size +**Size**: 200 bytes + +```rust +struct MDBShardFileFooter { + version: u64, // Footer version (must be 1) + file_info_offset: u64, // Offset to file info section + cas_info_offset: u64, // Offset to CAS info section + _buffer: [u8; 48], // Reserved space (48 bytes) + chunk_hash_hmac_key: Hash, // HMAC key for chunk hashes (32 bytes) + shard_creation_timestamp: u64, // Creation time (seconds since epoch) + shard_key_expiry: u64, // Expiry time (seconds since epoch) + _buffer2: [u8; 72], // Reserved space (72 bytes) + footer_offset: u64, // Offset where footer starts +} +``` + +**Memory Layout**: + +> [!NOTE] +> Fields are not exactly to scale + +```txt +┌─────────┬─────────┬─────────┬─────────────────────────────────────────────────────────────┬─────────────────────────────────────┐ +│ version │file_info│cas_info │ _buffer (reserved) │ chunk_hash_hmac_key │ +│(8 bytes)│offset │offset │ (48 bytes) │ (32 bytes) │ +│ │(8 bytes)│(8 bytes)│ │ │ +└─────────┴─────────┴─────────┴─────────────────────────────────────────────────────────────┴─────────────────────────────────────┘ +0 8 16 24 72 104 + +┌─────────┬──────────┬─────────────────────────────────────────────────────────────────────────────┬─────────┐ +│creation │shard_ │ _buffer (reserved) │footer_ │ +│timestamp│key_expiry│ (72 bytes) │offset │ +│(8 bytes)│ (8 bytes)│ │(8 bytes)│ +└─────────┴──────────┴─────────────────────────────────────────────────────────────────────────────┴─────────┘ +104 112 120 192 200 +``` + +**Deserialization steps**: + +1. Seek to `file_size - footer_size` +2. Read all fields sequentially as u64 values +3. Verify version equals 1 + +### Use of Footer Fields + +#### file_info_offset and cas_info_offset + +These offsets allow you to seek into the shard data buffer to reach these sections without deserializing linearly. + +#### HMAC Key Protection + +If `footer.chunk_hash_hmac_key` is non-zero (as a response shard from the global dedupe API), chunk hashes in the CAS Info section are protected with [HMAC](https://en.wikipedia.org/wiki/HMAC): + +- The stored chunk hashes are `HMAC(original_hash, footer.chunk_hash_hmac_key)` +- To check if a chunk of data that you have matches a chunk listed in the shard, compute `HMAC(chunk_hash, footer.chunk_hash_hmac_key)` for your chunk hash and search through the shard results. +If you find a match (matched_chunk) then you know the original chunk hash of your chunk and the matched_chunk is the same and you can deduplicate your chunk by referencing the xorb that matched_chunk belongs to. + +#### Shard Key Expiry + +The shard key expiry is a 64 bit unix timestamp of when the shard received is to be considered expired (usually in the order of days or weeks after the shard was sent back). + +After this expiry time has passed clients SHOULD consider this shard expired and SHOULD NOT use it to deduplicate data. 
+Uploads that reference xorbs that were referenced by this shard can be rejected at the server's discretion. + +## Complete Deserialization Algorithm + +```text +// ** option 1, read linearly, streaming ** +// assume shard is a read-able file-like object and the reader position is at start of shard +// 1. Read and validate header +header = read_header(shard) + +// 2. Read file info section +file_info = read_file_info_section(shard) // read through file info bookend + +// 3. Read CAS info section +cas_info = read_cas_info_section(shard) // read through cas info bookend + +// 4. Read footer +footer = read_footer(shard) + +// shard reader should now be at EOF + + +// ** option 2, read footer and seek ** +// assume shard is a read-able seek-able file-like object +// 1. Read and validate header +seek(start of shard) +header = read_header(shard) + +// 2. Read and validate footer (needed for offsets) +seek(end of shard minus header.footer_size) +footer = read_footer(shard) + +// 3. Read file info section +seek(footer.file_info_offset) +file_info = read_file_info_section(shard) // until footer.cas_info_offset + +// 4. Read CAS info section +seek(footer.cas_info_offset) +cas_info = read_cas_info_section(shard) // until footer.footer_offset +``` + +## Version Compatibility + +- Header version 2: Current format +- Footer version 1: Current format +- Shards with different versions will be rejected + +## Error Handling + +- Always verify magic numbers and versions +- Check that offsets are within file bounds +- Verify that bookend markers are present where expected diff --git a/docs/xet-protocol/upload-protocol.md b/docs/xet-protocol/upload-protocol.md new file mode 100644 index 000000000..f993eaeb1 --- /dev/null +++ b/docs/xet-protocol/upload-protocol.md @@ -0,0 +1,177 @@ +# Upload Protocol + +This document describes how files are uploaded in the Xet protocol to the Content Addressable Storage (CAS) service. +The flow converts input files into chunks, applies deduplication, groups chunks into xorbs, uploads xorbs, then forms and uploads shards that reference those xorbs. +The steps can be done all be done concurrently except that all xorbs MUST be uploaded before a shard referencing them is uploaded. +Content addressing uses hashes as stable keys for deduplication and integrity verification. + +## Xet Object Types + +### Chunk + +A chunk is a slice of data from a real file. + +A chunk has an associated hash computed through the [chunk hashing process](./hashing#chunk-hashes) and its data is determined by finding chunk boundaries following the chunking algorithm defined in [chunking](./chunking). + +A chunk is ~64KiB of data with a maximum of 128KiB and minimum of 8KiB. +However, the minimum chunk size limit is not enforced for the last chunk of a file or if the file is smaller than 8KiB. + +### Xorb + +A Xorb is composed of a sequence of chunks. + +Chunks in a xorb are not simply concatenated but instead compressed and appended after a header as described in [xorb](./xorb#xorb-format). +Chunks are collected in a xorb for more efficient upload and downloads of "ranges" of chunks. +Each chunk has an associated index (beginning at 0) and chunks may addressed from xorbs using through an end exclusive chunk index range i.e. [0, 100). + +Xorbs are created by grouping sequences of chunks from files and are referenced in file reconstructions to provide instructions to rebuild the file. 
+ +Xorbs have an associated hash computed according to the instructions for the [xorb hashing process](./hashing#xorb-hashes). + +Xorbs are always less than or equal to 64MiB in length and on average contain 1024 chunks, but this number is variable. + +### File Reconstruction + +A file reconstruction is a "recipe" to recreate a file using data from xorbs. + +Each file reconstruction is made of a series of "terms" where each term contains a xorb hash and a chunk index range. +To reconstruct a file, a user needs will need the chunks at the specified range for each term, deserialized and decompressed and concatenated in term order. + +### Shards + +Shards are serialized representations of file information and xorb metadata. + +A shard may contain multiple file reconstructions or none. +A shard may also contain information about xorbs, particularly what chunks are in a particular xorb. + +Shards are used to communicate a "file upload" or registering the file in the CAS (Content Addressed Store) as well as registering groups of xorbs associated with the same upload. + +Shards are also used to communicate xorb metadata that can be used for deduplication using the Global Deduplication API. + +The shard format is specified in [shard](./shard). + +> [!NOTE] +> In xet-core the shard format is used to keep a local cache with fast lookup of known chunks for deduplication, other implementors of the xet protocol may choose to reuse the shard format for that purpose as well, however that is not a requirement of the protocol. + +## Steps + +### 1. Chunking + +Using the chunking algorithm described in [chunking](./chunking) first split the file into variable sized chunks. +Each unique chunk MUST have a unique hash computed as described in the [Chunk Hashing section](./hashing#chunk-hashes). +This chunk hash will be used to attempt to deduplicate any chunk against other known chunks. + +### 2. Deduplication + +Given a chunk hash, attempt to find if the chunk already exists in the Xet system. + +To deduplicate a chunk is to find if the current chunk hash already exists, either in the current upload process, in a local cache of known chunks or using the [Global Deduplication API](./api#2-query-chunk-deduplication-global-deduplication). + +When a chunk is deduplicated it SHOULD NOT be re-uploaded to the CAS (by being included in a xorb in the next step), but when rebuilding the file, the chunk needs to be included by referencing the xorb that includes it and the specific chunk index. + +> [!NOTE] +> Note that Deduplication is considered an optimization and is an OPTIONAL component of the upload process, however it provides potential resource saving. + +For more detail visit the [deduplication document](./deduplication) + +### 3. Xorb Formation and Hashing + +Contiguous runs of chunks are collected into xorbs (roughly 64 MiB total length per xorb), preserving order within each run. See formation rules: [xorb](./xorb#collecting-chunks). +The xorb's content-addressed key is computed using the chunks in the xorb. See: [hashing](./hashing#xorb-hashes). + +Given the xorb hash chunks in the xorb can be referred in file reconstructions. + +### 4. Xorb Serialization and Upload + +Each xorb is serialized into its binary representation as defined by the xorb format. See: [xorb](./xorb). +The client uploads each new xorb via the [Xorb upload API](./api#3-upload-xorb). 
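+
+As a minimal, informal sketch (not part of the specification), the upload call can look roughly like the following, assuming `cas_base_url` and `token` were obtained through the [authentication flow](./auth) and `serialized_xorb` holds the byte serialization described in [xorb](./xorb):
+
+```python
+import requests
+
+def upload_xorb(cas_base_url, token, xorb_hash_string, serialized_xorb):
+    # xorb_hash_string is the 64-character string form of the xorb hash
+    # (see "Converting Hashes to Strings" in the CAS API document).
+    response = requests.post(
+        f"{cas_base_url}/v1/xorbs/default/{xorb_hash_string}",
+        headers={"Authorization": f"Bearer {token}"},
+        data=serialized_xorb,
+    )
+    response.raise_for_status()
+```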
+ +The serialization and upload steps are separated from collecting chunks and hashing as these steps can be done independently while still referencing the xorb in creating file reconstructions. +However a xorb MUST be uploaded before a file reconstruction that references it is uploaded in a shard. + +### 5. Shard Formation, Collect Required Components + +Map each file to a reconstruction using available xorbs, the file reconstruction MUST point to ranges of chunks within xorbs that refer to each chunk in the file. +Terms for chunks that are deduplicated using results from the Global Dedupe API will use xorb hashes that already exist in CAS. + +Then for each file: + +- Compute the file hash using the [file hashing process](./hashing#file-hashes). +- For each xorb range (a "term") compute a [verification hash](./hashing#term-verification-hashes) in order to upload it. + - These hashes are used to ensure that the client uploading the file in the shard authoritatively has access to the actual file data. +- Compute the sha256 for the file contents + +With these components it is now possible to completely serialize a [file info block](./shard#2-file-info-section) in the shard format. + +In addition to the file info information, it is also necessary to collect all metadata for new xorbs that were created. +This metadata is the xorb hash, the hash and length of each chunk, the serialized length of the xorb and the sum of the chunk lengths for a xorb. +With these components it is now possible to serialize for each xorb a [CAS Info block](./shard#3-cas-info-section). + +### 6. Shard Serialization and Upload + +Given the information collected in the previous section, serialize a shard for a batch of files following the format specified in the [shard spec](./shard). + +The client uploads the shard via the [shard upload](./api#4-upload-shard) endpoint on the CAS server. +For this to succeed, all xorbs referenced by the shard MUST have already completed uploading. + +This API registers files as uploaded. + +> [!NOTE] +> For a large batch of files or a batch of large files if the serialized shard will be greater than 64 MiB you MUST break up the content into multiple shards. + +### Done + +After all xorbs and all shards are successfully uploaded, the full upload is considered complete. +Files can then be downloaded by any client using the [download protocol](./download-protocol). + +> [!NOTE] +> If this file is being uploaded to the Hugging Face Hub, users will need to commit a git lfs pointer file using the sha256 of the file contents. + +## Ordering and Concurrency + +There are some natural ordering requirements in the upload process, e.g. you MUST have determined a chunk boundary before computing the chunk hash, and you MUST have collected a sequence of chunks to create a xorb to compute the xorb hash etc. + +However there is one additional enforced requirement about ordering: **all xorbs referenced by a shard MUST be uploaded before that shard is uploaded**. +If any xorb referenced by a shard is not already uploaded when the shard upload API is called, the server will reject the request. +All xorbs whose hash is used as an entry in the cas info section and in data entries of the file info section are considered "referenced" by a shard. + +## Integrity and Idempotency + +- Hashing of chunks, xorbs, and shards ensures integrity and enables deduplication across local and global scopes. See: [hashing](./hashing). 
+ - the same chunk data produces the same chunk hash + - the same set of chunks will produce the same xorb hash +- Consistent chunking algorithm yields that the same data will be split into the same chunks at the same boundaries, allowing those chunks to be matched to other data and deduplicated. +- Upload endpoints are idempotent with respect to content-addressed keys; re-sending an already-present xorb or shard is safe. + +## Diagram + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant S as CAS Server + + C->>C: Chunking: split file into chunks and compute chunk hashes + + Note right of C: 2) Local deduplication (OPTIONAL) + + loop For each chunk if chunk % 1024 == 0
(global dedupe eligible) + opt Global deduplication (OPTIONAL) + C->>S: GET /v1/chunks/default-merkledb/{chunk_hash} + S-->>C: 200 dedupe information or 404 not found + end + end + + C->>C: Xorb formation (group chunks ~64 MiB), hashing, serialization + + loop For each new Xorb + C->>S: POST /v1/xorbs/default/{xorb_hash} + S-->>C: 200 OK + end + + C->>C: Shard formation (files -> reconstructions) and serialization + C->>S: POST /v1/shards + S-->>C: 200 OK + + Note over C,S: All referenced Xorbs MUST be uploaded before Shard upload.
Endpoints are idempotent by content-addressed keys. +``` diff --git a/docs/xet-protocol/xorb.md b/docs/xet-protocol/xorb.md new file mode 100644 index 000000000..36ef1c04a --- /dev/null +++ b/docs/xet-protocol/xorb.md @@ -0,0 +1,135 @@ +# Xorb Formation & Serialization Format + +A "Xorb" (Xet Orb, pronounced like "zorb") is a sequence of chunks and a serialization format for a series of chunks. + +## Collecting Chunks + +Using the chunking algorithm a file is mapped to a series of chunks, once those chunks are found, they need to be collected into collections of Xorbs. + +It is advantageous to collect series of chunks in Xorbs such that they can be referred to as a whole range of chunks. + +Suppose a file is chunked into chunks A, B, C, D in the order ABCD. Then create a Xorb X1 with chunks A, B, C, D in this order (starting at chunk index 0), let's say this Xorb's hash is X1. Then to reconstruct the file we ask for Xorb X1 chunk range `[0, 4)`. + +While there's no explicit limit on the number of chunks in a Xorb, there is a limit of 64MiB on the total size of the Xorb as serialized. +Since some chunks will get compressed, it is generally advised to collect chunks until their total uncompressed length is near 64 MiB then serialize the struct. +Namely, Xorbs point to roughly 64 MiB worth of data. +(Recall that the target chunk size is 64 KiB so expect roughly ~1024 chunks per Xorb). + +The CAS server will reject Xorb uploads that exceed the 64 MiB serialized size limit. + +It is RECOMMENDED to pack chunks from multiple files into a Xorb if the size requirements allow, i.e. file X and Y both produced 10 new chunks each totalling a total of ~128000 bytes, then all those chunks can fit in a new Xorb. + +## Xorb Format + +A Xorb is a series of "Chunks" that is serialized according to a specific format that enables accessing chunks of ranges and builds in chunk level compression. + +```txt +┌─────────┬─────────────────────────────────┬─────────┬─────────────────────────────────┬─────────┬─────────────────────────────────┬────────── +│ Chunk │ │ Chunk │ │ Chunk │ │ +│ Header │ Compressed Chunk Data │ Header │ Compressed Chunk Data │ Header │ Compressed Chunk Data │ ... +│ │ │ │ │ │ │ +└─────────┴─────────────────────────────────┴─────────┴─────────────────────────────────┴─────────┴─────────────────────────────────┴─────────── +│ Chunk 0 │ Chunk 1 │ Chunk 2 │ ... +``` + +### Chunk Addressing + +Each chunk has an index within the Xorb it is in, starting at 0. +Chunks can be addressed individually by their index but are usually addressed or fetched in range. +Chunk ranges are always specified start inclusive and end exclusive i.e. `[start, end)`. + +## Chunk Format + +A chunk consists of a header followed by compressed data. The header contains metadata about the chunk, particularly the compression scheme required to know how to deserialize the chunk. + +### Chunk Header Structure + +The chunk header is serialized as follows: + +- **Version** (1 byte): Protocol version, currently `0` +- **Compressed Size** (3 bytes): Size of data after compression as a 3 byte little-endian unsigned integer. +- **Compression Type** (1 byte): Algorithm used for compression (See mapping below) +- **Uncompressed Size** (3 bytes): Size of raw chunk data (before compression) as a 3 byte little-endian unsigned integer. + +Both Compressed and Uncompressed Size can fit in a 3 byte integer, given that that a raw uncompressed chunk can be 128KiB at most, +requiring 18 binary digits to represent. 
+If utilizing the intended compression scheme results in a larger compressed chunk then the chunk SHOULD be stored uncompressed with then +the uncompressed size also being at a maximum of 128KiB. + +#### Chunk Header Layout + +```txt +┌─────────┬─────────────────────────────────┬──────────────┬─────────────────────────────────┐ +│ Version │ Compressed Size │ Compression │ Uncompressed Size │ +│ 1 byte │ 3 bytes │ Type │ 3 bytes │ +│ │ (little-endian) │ 1 byte │ (little-endian) │ +└─────────┴─────────────────────────────────┴──────────────┴─────────────────────────────────┘ +0 1 4 5 8 +``` + +### Chunk Compression Schemes + +| Value | Name | Description | +|-------|------|-------------| +| `0` | `None` | No compression - data is stored as-is | +| `1` | `LZ4` | Standard LZ4 compression | +| `2` | `ByteGrouping4LZ4` | Byte grouping with 4-byte groups followed by LZ4 compression. Optimized for floating-point and other structured data where grouping bytes by position improves compression ratios | + +#### Byte Grouping LZ4 Compression + +Byte grouping LZ4 compression is an optimization technique that improves compression ratios for structured data like floating-point numbers, integers, and other data types where values have similar byte patterns at specific positions. + +1. **Byte Grouping Phase**: The input data is reorganized by grouping bytes by their position within each 4-byte groups: + Create 4 buffers, for each 4 bytes of the chunk data (B1, B2, B3, B4) append each byte to their respective group i.e. in order from 1 to 4. Then concatenate the groups in order (1, 2, 3, 4). + + Example: + + - Original data: `[A1, A2, A3, A4, B1, B2, B3, B4, C1, C2, C3, C4, ...]` + - Grouped data: `[A1, B1, C1, ..., A2, B2, C2, ..., A3, B3, C3, ..., A4, B4, C4, ...]` + + If the total number of bytes in the chunk is not a multiple of 4, append the remaining bytes following the pattern (1 byte to each group) to the first 1-3 groups until there are no more bytes left in the chunk. + +2. **LZ4 Compression**: The grouped data is then compressed using standard LZ4 compression. + +#### Chunk Data + +Following the header is the compressed data block, exactly `compressed_size` bytes long. + +### Picking a Compression Scheme + +Picking the chunk compression scheme for the Xorb is a task left to the client when uploading the Xorb. +The goal is to minimize the overall size of the Xorb for faster transmission at the cost of resources to decompress a chunk on the receiving end. + +When picking a compression scheme for the chunk there are a number of strategies and implementors MAY make their decisions as to how to pick a compression scheme. +Note that a Xorb MAY contain chunks that utilize different compression schemes. + +1. **Brute Force** + + Try all possible compression schemes, pick the best one. + The best one MAY be the one producing the smallest compressed chunk or the fastest to decompress. + +2. **Best Effort Prediction** + + In `xet-core`, to predict if BG4 will be useful we maximum KL divergence between the distribution of per-byte pop-counts on a sample of each of the 4 groups that would be formed. + You can read more about it in [bg4_prediction.rs](./cas_object/src/byte_grouping/bg4_prediction.rs) and accompanying scripts. + + If the predictor does not show that BG4 will be better, we use Lz4 and in either case we will store the chunk as the uncompressed version if the compression scheme used does not show any benefit. 
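+
+#### Example Byte Grouping Code
+
+To make the byte grouping transform described above concrete, here is an informal Python sketch of the grouping step and its inverse; the LZ4 pass is applied to the grouped bytes afterwards. This is an illustration, not the reference implementation.
+
+```python
+def bg4_group(data: bytes) -> bytes:
+    # Group bytes by their position within each 4-byte word: group 0 holds
+    # bytes 0, 4, 8, ..., group 1 holds bytes 1, 5, 9, ..., and so on.
+    # Trailing bytes naturally fall into the first 1-3 groups.
+    return data[0::4] + data[1::4] + data[2::4] + data[3::4]
+
+def bg4_ungroup(grouped: bytes) -> bytes:
+    # Invert bg4_group: split back into the 4 groups (the first len % 4
+    # groups are one byte longer) and re-interleave byte by byte.
+    base, extra = divmod(len(grouped), 4)
+    sizes = [base + (1 if i < extra else 0) for i in range(4)]
+    groups, offset = [], 0
+    for size in sizes:
+        groups.append(grouped[offset:offset + size])
+        offset += size
+    out = bytearray()
+    for i in range(sizes[0]):
+        for group in groups:
+            if i < len(group):
+                out.append(group[i])
+    return bytes(out)
+```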
+
+#### Example Chunk Serialization
+
+```python
+VERSION = 0
+buffer = bytearray()
+
+for chunk in xorb.chunks:
+    uncompressed_length = len(chunk)
+    # pick_compression_scheme_and_compress applies one of the schemes above and returns
+    # the compressed bytes together with the integer identifier of the scheme actually used.
+    compressed, compression_scheme = pick_compression_scheme_and_compress(chunk)
+    # 8 byte header: version, compressed size (3 bytes LE), compression type, uncompressed size (3 bytes LE).
+    buffer += VERSION.to_bytes(1, "little")
+    buffer += len(compressed).to_bytes(3, "little")
+    buffer += compression_scheme.to_bytes(1, "little")
+    buffer += uncompressed_length.to_bytes(3, "little")
+    buffer += compressed
+```
+
+## Xorb Format Sample
+
+For a sample of a serialized Xorb object, see [eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb).
+The hash of this Xorb is `eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632` and it is composed of chunks from the file [Electric_Vehicle_Population_Data_20250917.csv](https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv).

From 35bd48fc707962b129b05a04415477d70a54f387 Mon Sep 17 00:00:00 2001
From: Assaf Vayner
Date: Tue, 30 Sep 2025 09:48:09 -0700
Subject: [PATCH 2/2] xet-protocol -> xet

---
 ...uild_documentation.yml => xet_build_documentation.yml} | 8 ++++----
 ...r_documentation.yml => xet_build_pr_documentation.yml} | 8 ++++----
 ..._documentation.yml => xet_upload_pr_documentation.yml} | 6 +++---
 docs/{xet-protocol => xet}/_toctree.yml | 0
 docs/{xet-protocol => xet}/api.md | 0
 docs/{xet-protocol => xet}/auth.md | 0
 docs/{xet-protocol => xet}/chunking.md | 0
 docs/{xet-protocol => xet}/deduplication.md | 0
 docs/{xet-protocol => xet}/download-protocol.md | 0
 docs/{xet-protocol => xet}/file-id.md | 0
 docs/{xet-protocol => xet}/file-reconstruction.md | 0
 docs/{xet-protocol => xet}/hashing.md | 0
 docs/{xet-protocol => xet}/index.md | 0
 docs/{xet-protocol => xet}/shard.md | 0
 docs/{xet-protocol => xet}/upload-protocol.md | 0
 docs/{xet-protocol => xet}/xorb.md | 0
 16 files changed, 11 insertions(+), 11 deletions(-)
 rename .github/workflows/{xet_protocol_build_documentation.yml => xet_build_documentation.yml} (68%)
 rename .github/workflows/{xet_protocol_build_pr_documentation.yml => xet_build_pr_documentation.yml} (73%)
 rename .github/workflows/{xet_protocol_upload_pr_documentation.yml => xet_upload_pr_documentation.yml} (68%)
 rename docs/{xet-protocol => xet}/_toctree.yml (100%)
 rename docs/{xet-protocol => xet}/api.md (100%)
 rename docs/{xet-protocol => xet}/auth.md (100%)
 rename docs/{xet-protocol => xet}/chunking.md (100%)
 rename docs/{xet-protocol => xet}/deduplication.md (100%)
 rename docs/{xet-protocol => xet}/download-protocol.md (100%)
 rename docs/{xet-protocol => xet}/file-id.md (100%)
 rename docs/{xet-protocol => xet}/file-reconstruction.md (100%)
 rename docs/{xet-protocol => xet}/hashing.md (100%)
 rename docs/{xet-protocol => xet}/index.md (100%)
 rename docs/{xet-protocol => xet}/shard.md (100%)
 rename docs/{xet-protocol => xet}/upload-protocol.md (100%)
 rename docs/{xet-protocol => xet}/xorb.md (100%)

diff --git a/.github/workflows/xet_protocol_build_documentation.yml b/.github/workflows/xet_build_documentation.yml
similarity index 68%
rename from .github/workflows/xet_protocol_build_documentation.yml
rename to .github/workflows/xet_build_documentation.yml
index 97ad259a2..fc7fc51ad 100644
--- a/.github/workflows/xet_protocol_build_documentation.yml
+++ b/.github/workflows/xet_build_documentation.yml
@@ -1,9 +1,9 @@
-name: Build Xet Protocol documentation
+name: Build Xet documentation
 
 on:
   push:
     paths:
-      - "docs/xet-protocol/**"
+      - "docs/xet/**"
     branches:
       -
main @@ -13,8 +13,8 @@ jobs: with: commit_sha: ${{ github.sha }} package: hub-docs - package_name: xet-protocol - path_to_docs: hub-docs/docs/xet-protocol/ + package_name: xet + path_to_docs: hub-docs/docs/xet/ additional_args: --not_python_module secrets: hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/xet_protocol_build_pr_documentation.yml b/.github/workflows/xet_build_pr_documentation.yml similarity index 73% rename from .github/workflows/xet_protocol_build_pr_documentation.yml rename to .github/workflows/xet_build_pr_documentation.yml index 50df7cd9b..bc2d676ce 100644 --- a/.github/workflows/xet_protocol_build_pr_documentation.yml +++ b/.github/workflows/xet_build_pr_documentation.yml @@ -1,9 +1,9 @@ -name: Build Xet Protocol PR Documentation +name: Build Xet PR Documentation on: pull_request: paths: - - "docs/xet-protocol/**" + - "docs/xet/**" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -16,6 +16,6 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: hub-docs - package_name: xet-protocol - path_to_docs: hub-docs/docs/xet-protocol/ + package_name: xet + path_to_docs: hub-docs/docs/xet/ additional_args: --not_python_module diff --git a/.github/workflows/xet_protocol_upload_pr_documentation.yml b/.github/workflows/xet_upload_pr_documentation.yml similarity index 68% rename from .github/workflows/xet_protocol_upload_pr_documentation.yml rename to .github/workflows/xet_upload_pr_documentation.yml index 269397216..0bdb1f02d 100644 --- a/.github/workflows/xet_protocol_upload_pr_documentation.yml +++ b/.github/workflows/xet_upload_pr_documentation.yml @@ -1,8 +1,8 @@ -name: Upload Xet Protocol PR Documentation +name: Upload Xet PR Documentation on: workflow_run: - workflows: ["Build Xet Protocol PR Documentation"] + workflows: ["Build Xet PR Documentation"] types: - completed @@ -10,7 +10,7 @@ jobs: build: uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main with: - package_name: xet-protocol + package_name: xet secrets: hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} diff --git a/docs/xet-protocol/_toctree.yml b/docs/xet/_toctree.yml similarity index 100% rename from docs/xet-protocol/_toctree.yml rename to docs/xet/_toctree.yml diff --git a/docs/xet-protocol/api.md b/docs/xet/api.md similarity index 100% rename from docs/xet-protocol/api.md rename to docs/xet/api.md diff --git a/docs/xet-protocol/auth.md b/docs/xet/auth.md similarity index 100% rename from docs/xet-protocol/auth.md rename to docs/xet/auth.md diff --git a/docs/xet-protocol/chunking.md b/docs/xet/chunking.md similarity index 100% rename from docs/xet-protocol/chunking.md rename to docs/xet/chunking.md diff --git a/docs/xet-protocol/deduplication.md b/docs/xet/deduplication.md similarity index 100% rename from docs/xet-protocol/deduplication.md rename to docs/xet/deduplication.md diff --git a/docs/xet-protocol/download-protocol.md b/docs/xet/download-protocol.md similarity index 100% rename from docs/xet-protocol/download-protocol.md rename to docs/xet/download-protocol.md diff --git a/docs/xet-protocol/file-id.md b/docs/xet/file-id.md similarity index 100% rename from docs/xet-protocol/file-id.md rename to docs/xet/file-id.md diff --git a/docs/xet-protocol/file-reconstruction.md b/docs/xet/file-reconstruction.md similarity index 100% rename from docs/xet-protocol/file-reconstruction.md rename to docs/xet/file-reconstruction.md 
diff --git a/docs/xet-protocol/hashing.md b/docs/xet/hashing.md similarity index 100% rename from docs/xet-protocol/hashing.md rename to docs/xet/hashing.md diff --git a/docs/xet-protocol/index.md b/docs/xet/index.md similarity index 100% rename from docs/xet-protocol/index.md rename to docs/xet/index.md diff --git a/docs/xet-protocol/shard.md b/docs/xet/shard.md similarity index 100% rename from docs/xet-protocol/shard.md rename to docs/xet/shard.md diff --git a/docs/xet-protocol/upload-protocol.md b/docs/xet/upload-protocol.md similarity index 100% rename from docs/xet-protocol/upload-protocol.md rename to docs/xet/upload-protocol.md diff --git a/docs/xet-protocol/xorb.md b/docs/xet/xorb.md similarity index 100% rename from docs/xet-protocol/xorb.md rename to docs/xet/xorb.md