diff --git a/docs/hub/_toctree.yml b/docs/hub/_toctree.yml
index ad3406ade..c9ba0ef02 100644
--- a/docs/hub/_toctree.yml
+++ b/docs/hub/_toctree.yml
@@ -108,6 +108,8 @@
       title: Widget Examples
     - local: models-inference
       title: Inference API docs
+    - local: models-download-stats
+      title: Models Download Stats
     - local: models-faq
       title: Frequently Asked Questions
     - local: models-advanced
@@ -149,6 +151,8 @@
       sections:
         - local: datasets-viewer-configure
           title: Configure the Dataset Viewer
+    - local: datasets-download-stats
+      title: Datasets Download Stats
     - local: datasets-data-files-configuration
       title: Data files Configuration
       sections:
diff --git a/docs/hub/datasets-download-stats.md b/docs/hub/datasets-download-stats.md
new file mode 100644
index 000000000..8f3c3a5db
--- /dev/null
+++ b/docs/hub/datasets-download-stats.md
@@ -0,0 +1,8 @@
+# Datasets Download Stats
+
+## How are download stats generated for datasets?
+
+The Hub provides download stats for all datasets loadable via the `datasets` library. To determine the number of downloads, the Hub counts every time `load_dataset` is called in Python, excluding Hugging Face's CI tooling on GitHub. No information is sent from the user, and no additional calls are made for this. The count is done server-side as we serve files for downloads. This means that:
+
+* The download count is the same regardless of whether the data is stored directly in the Hub repository or the repository has a script that loads it from an external source.
+* If a user manually downloads the data using tools like `wget` or the Hub's user interface (UI), those downloads will not be included in the download count.
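+
+As a concrete illustration, here is a minimal sketch, assuming the `datasets` library is installed and using a hypothetical dataset ID, of a load that would be counted:
+
+```python
+from datasets import load_dataset
+
+# `load_dataset` fetches the data files from the Hub's servers, so the Hub
+# counts this server-side as one download of the dataset. Manual downloads
+# of the same files (e.g., with `wget` or through the UI) are not counted.
+ds = load_dataset("username/my-dataset")  # hypothetical repository ID
+```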
diff --git a/docs/hub/index.md b/docs/hub/index.md
index ec13e8612..d7f0a8570 100644
--- a/docs/hub/index.md
+++ b/docs/hub/index.md
@@ -31,6 +31,7 @@ The Hugging Face Hub is a platform with over 350k models, 75k datasets, and 150k
 Tasks
 Widgets
 Inference API
+Download Stats
@@ -44,6 +45,7 @@ The Hugging Face Hub is a platform with over 350k models, 75k datasets, and 150k
 Downloading Datasets
 Libraries
 Dataset Viewer
+Download Stats
 Data files Configuration
diff --git a/docs/hub/models-download-stats.md b/docs/hub/models-download-stats.md
new file mode 100644
index 000000000..4acfa9785
--- /dev/null
+++ b/docs/hub/models-download-stats.md
@@ -0,0 +1,157 @@
+# Models Download Stats
+
+## How are download stats generated for models?
+
+Counting the number of downloads for models is not a trivial task, as a single model repository might contain multiple files, including several model weight files (e.g., sharded models) and different formats depending on the library. To avoid double counting (e.g., counting a single download of a model as multiple downloads), the Hub uses a set of query files for download counting. No information is sent from the user, and no additional calls are made for this. The count is done server-side as we serve files for downloads.
+
+Every HTTP request to these files, including `GET` and `HEAD`, will be counted as a download. When no library is specified, the Hub uses `config.json` as the default query file. Otherwise, the query file depends on the library, and the Hub might examine files such as `pytorch_model.bin` or `adapter_config.json`. See the sketch at the end of this page for an illustration.
+
+## Which are the query files for different libraries?
+
+By default, the Hub looks at `config.json`, `config.yaml`, `hyperparams.yaml`, and `meta.yaml`. For the following libraries, specific query files are used:
+
+```js
+{
+  "adapter-transformers": {
+    filter: [
+      {
+        term: { path: "adapter_config.json" },
+      },
+    ],
+  },
+  "asteroid": {
+    filter: [
+      {
+        term: { path: "pytorch_model.bin" },
+      },
+    ],
+  },
+  "flair": {
+    filter: [
+      {
+        term: { path: "pytorch_model.bin" },
+      },
+    ],
+  },
+  "keras": {
+    filter: [
+      {
+        term: { path: "saved_model.pb" },
+      },
+    ],
+  },
+  "ml-agents": {
+    filter: [
+      {
+        wildcard: { path: "*.onnx" },
+      },
+    ],
+  },
+  "nemo": {
+    filter: [
+      {
+        wildcard: { path: "*.nemo" },
+      },
+    ],
+  },
+  "open_clip": {
+    filter: [
+      {
+        wildcard: { path: "*pytorch_model.bin" },
+      },
+    ],
+  },
+  "sample-factory": {
+    filter: [
+      {
+        term: { path: "cfg.json" },
+      },
+    ],
+  },
+  "paddlenlp": {
+    filter: [
+      {
+        term: { path: "model_config.json" },
+      },
+    ],
+  },
+  "speechbrain": {
+    filter: [
+      {
+        term: { path: "hyperparams.yaml" },
+      },
+    ],
+  },
+  "sklearn": {
+    filter: [
+      {
+        term: { path: "sklearn_model.joblib" },
+      },
+    ],
+  },
+  "spacy": {
+    filter: [
+      {
+        wildcard: { path: "*.whl" },
+      },
+    ],
+  },
+  "stanza": {
+    filter: [
+      {
+        term: { path: "models/default.zip" },
+      },
+    ],
+  },
+  "stable-baselines3": {
+    filter: [
+      {
+        wildcard: { path: "*.zip" },
+      },
+    ],
+  },
+  "timm": {
+    filter: [
+      {
+        terms: { path: ["pytorch_model.bin", "model.safetensors"] },
+      },
+    ],
+  },
+  "diffusers": {
+    /// Filter out nested safetensors and pickle weights to avoid double counting downloads from the diffusers lib
+    must_not: [
+      {
+        wildcard: { path: "*/*.safetensors" },
+      },
+      {
+        wildcard: { path: "*/*.bin" },
+      },
+    ],
+    /// Include documents that match at least one of the following rules
+    should: [
+      /// Downloaded from diffusers lib
+      {
+        term: { path: "model_index.json" },
+      },
+      /// Direct downloads (LoRA, Auto1111 and others)
+      {
+        wildcard: { path: "*.safetensors" },
+      },
+      {
+        wildcard: { path: "*.ckpt" },
+      },
+      {
+        wildcard: { path: "*.bin" },
+      },
+    ],
+    minimum_should_match: 1,
+  },
+  "peft": {
+    filter: [
+      {
+        term: { path: "adapter_config.json" },
+      },
+    ],
+  }
+}
+```
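+
+To make the counting rule concrete, here is a minimal sketch, assuming the `huggingface_hub` library is installed and using a hypothetical repository ID, of a request that is counted and one that is not:
+
+```python
+from huggingface_hub import hf_hub_download
+
+# Fetching the default query file (`config.json`) issues an HTTP GET that
+# the Hub counts server-side as one download of the model.
+hf_hub_download(repo_id="username/my-model", filename="config.json")  # hypothetical repo
+
+# Fetching a file that is not a query file (e.g., the model card) does not
+# add to the download count.
+hf_hub_download(repo_id="username/my-model", filename="README.md")
+```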
diff --git a/docs/hub/models-faq.md b/docs/hub/models-faq.md
index 890da38a5..0ec34bf63 100644
--- a/docs/hub/models-faq.md
+++ b/docs/hub/models-faq.md
@@ -1,4 +1,4 @@
-# Frequently Asked Questions
+# Models Frequently Asked Questions
 
 ## How can I see what dataset was used to train the model?
 
@@ -42,4 +42,4 @@ If the model card includes a link to a paper on arXiv, the Hugging Face Hub will
 
-Read more about paper pages [here](./paper-pages).
\ No newline at end of file
+Read more about paper pages [here](./paper-pages).