diff --git a/docs/docs.json b/docs/docs.json index cf1fe6a..fab2330 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -275,7 +275,6 @@ { "group": "AI Platforms & Frameworks", "pages": [ - "integrations/ai/huggingface", "integrations/ai/agno", "integrations/ai/langchain", "integrations/ai/llamaIndex", @@ -284,6 +283,13 @@ "integrations/ai/prompttools", "integrations/ai/synthetic-data-kit" ] + }, + { + "group": "Hugging Face Hub", + "pages": [ + "huggingface/overview", + "huggingface/datasets" + ] } ] } @@ -414,6 +420,10 @@ { "source": "/enterprise/performance", "destination": "/enterprise/benchmarks" + }, + { + "source": "/integrations/ai/huggingface", + "destination": "/huggingface/overview" } ] } diff --git a/docs/huggingface/datasets.mdx b/docs/huggingface/datasets.mdx new file mode 100644 index 0000000..d0d42c9 --- /dev/null +++ b/docs/huggingface/datasets.mdx @@ -0,0 +1,179 @@ +--- +title: "Datasets" +sidebarTitle: "Datasets" +description: "Browse Lance-format datasets ready to query on the Hugging Face Hub." +--- + +The [`lance-format`](https://huggingface.co/lance-format) organization on Hugging Face publishes a growing +catalog of multimodal datasets in Lance format. Each one bundles the raw data (images, audio, video, or text), +pre-computed embeddings, and on-disk vector / full-text indices as first-class columns in the same dataset — +so vector search, full-text search, and filtered scans work directly via `hf://` URIs without downloading. + +This is powered under the hood by the [Lance format's native Hugging Face integration](https://lance.org/integrations/huggingface/) +(via the [`pylance`](https://pypi.org/project/pylance/) library). 
LanceDB sits on top of Lance and gives you a +convenient table-style interface to query these datasets straight from the Hub: + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/<dataset-name>/data") +tbl = db.open_table("train") + +# Vector search, full-text search, or filtered scans — directly on the Hub +results = tbl.search(query).limit(10).to_list() +``` + +Click any card below to view a dataset on the Hub. For a complete walkthrough, +see the [Hugging Face Hub overview](/huggingface/overview). + +## Image Classification + + + + `lance-format/mnist-lance` — 70,000 28×28 grayscale handwritten digits. The classic image-classification benchmark. + + + `lance-format/cifar10-lance` — 60,000 32×32 RGB images across 10 everyday object classes. + + + `lance-format/fashion-mnist-lance` — 70,000 28×28 grayscale clothing images across 10 categories. A harder drop-in replacement for MNIST. + + + `lance-format/food101-lance` — 101,000 food photographs across 101 dish classes. Fine-grained image classification. + + + `lance-format/oxford-pets-lance` — 7,390 cat and dog photos across 37 breeds, with an `is_dog` filter column. + + + `lance-format/stanford-cars-lance` — 8,144 images across 196 fine-grained car make / model / year classes, with BLIP captions. + + + `lance-format/imagenet-1k-val-lance` — The canonical 50,000-image ILSVRC2012 validation split. Research use only. + + + `lance-format/eurosat-lance` — 27,000 Sentinel-2 satellite tiles across 10 land-cover classes. The canonical remote-sensing classification benchmark. + + + +## Object Detection & Segmentation + + + + `lance-format/coco-detection-2017-lance` — 123,287 images with per-image bounding boxes and category labels across 80 classes. + + + `lance-format/pascal-voc-2012-segmentation-lance` — 2,913 image / mask pairs across 20 semantic classes. Small-scale semantic-segmentation benchmark. 
+ + + `lance-format/ade20k-lance` — 27,574 scene images with semantic and instance segmentation maps, scene labels, and per-object metadata. + + + `lance-format/kitti-2d-detection-lance` — 7,481 driving-scene images with 2D + 3D bounding-box annotations. The canonical autonomous-driving benchmark. + + + +## Image Retrieval + + + + `lance-format/coco-captions-2017-lance` — Each row is one image paired with 5–7 human-written captions. The standard image-captioning corpus. + + + `lance-format/flickr30k-lance` — 31,783 images each paired with 5 human-written captions. + + + `lance-format/laion-1m` — ~1 million image–caption pairs from the LAION-5B corpus with rich metadata (URL, similarity, NSFW flags). + + + +## Visual Question Answering + + + + `lance-format/chartqa-lance` — 2,500 VQA test questions over scientific and business charts that combine logical and visual reasoning. + + + `lance-format/docvqa-lance` — ~10.5K VQA pairs over document images: forms, receipts, scans, and multi-page reports. + + + `lance-format/textvqa-lance` — 39,602 VQA pairs where every answer requires reading text inside the image. Includes per-image OCR tokens. + + + `lance-format/vqav2-lance` — 214,354 (image, question, 10 answers) triples over COCO images. The standard VQA benchmark. + + + `lance-format/gqa-testdev-balanced-lance` — 12,578 compositional VQA questions over 398 scene-graph-grounded images, with reasoning-program tags. + + + +## Text QA + + + + `lance-format/squad-v2-lance` — 142,192 Wikipedia-grounded questions including unanswerable "is impossible" examples. Stanford's reading-comprehension benchmark. + + + `lance-format/trivia-qa-lance` — 156,328 trivia questions paired with canonical answers and all accepted answer aliases. + + + `lance-format/hotpotqa-distractor-lance` — 97,852 multi-hop QA examples where each answer requires combining facts from two Wikipedia paragraphs. 
+ + + `lance-format/natural-questions-val-lance` — 7,830 real Google search queries paired with the full Wikipedia article each answer lives in. + + + `lance-format/ms-marco-v2.1-lance` — ~910K real Bing user queries, each with 10 candidate passages and human-written reference answers. + + + +## Text Corpora + + + + `lance-format/fineweb-edu` — A multi-million-passage educational web corpus with cleaned text and rich metadata, designed for retrieval-heavy workloads. + + + +## Speech + + + + `lance-format/librispeech-clean-lance` — ~34K English audiobook utterances (FLAC) with reference transcripts. The standard ASR benchmark. + + + +## Video + + + + `lance-format/openvid-lance` — ~938K high-quality short videos with captions and per-clip aesthetic, motion, and camera-motion scores. + + + +## Robotics + + + + `lance-format/lerobot-pusht-lance` — The canonical PushT benchmark from the Diffusion Policy paper, published as three Lance tables (`frames`, `episodes`, `videos`). + + + `lance-format/lerobot-xvla-soft-fold` — 1,542 episodes / 2.85M frames at 20 FPS across 3 camera streams, with aligned robot state and action vectors. + + + +## Share your own dataset + +Got a multimodal dataset you want to publish? Convert it to Lance and push it to the Hub! +Anyone who opens it gets vector search, full-text search, and filtered scans on the data out of the box, +without recreating the embeddings or indexes on their end. + + +A step-by-step walkthrough on the LanceDB blog covering CLI setup, packaging your dataset, pushing to your namespace, and writing a dataset card. + + +Or browse the [latest trending Lance datasets](https://huggingface.co/datasets?format=format:lance&sort=trending) on Hugging Face. 
+ diff --git a/docs/integrations/ai/huggingface.mdx b/docs/huggingface/overview.mdx similarity index 99% rename from docs/integrations/ai/huggingface.mdx rename to docs/huggingface/overview.mdx index e934506..7350a3f 100644 --- a/docs/integrations/ai/huggingface.mdx +++ b/docs/huggingface/overview.mdx @@ -1,6 +1,6 @@ --- title: "Hugging Face Hub" -sidebarTitle: "Hugging Face Hub" +sidebarTitle: "Overview" description: "Use LanceDB directly on Lance datasets hosted on the Hugging Face Hub for multimodal search and retrieval." ---