# Get the number of rows and the size in bytes

This guide shows you how to use the dataset viewer's `/size` endpoint to retrieve a dataset's size programmatically. Feel free to also try it out with ReDoc.

The `/size` endpoint accepts the dataset name as its query parameter:

```python
import requests
headers = {"Authorization": f"Bearer {API_TOKEN}"}
API_URL = "https://datasets-server.huggingface.co/size?dataset=ibm/duorc"
def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()
```

```js
import fetch from "node-fetch";
async function query(data) {
    const response = await fetch(
        "https://datasets-server.huggingface.co/size?dataset=ibm/duorc",
        {
            headers: { Authorization: `Bearer ${API_TOKEN}` },
            method: "GET"
        }
    );
    const result = await response.json();
    return result;
}
query().then((response) => {
    console.log(JSON.stringify(response));
});
```

```curl
curl https://datasets-server.huggingface.co/size?dataset=ibm/duorc \
        -X GET \
        -H "Authorization: Bearer ${API_TOKEN}"
```

The endpoint response is a JSON containing the size of the dataset, as well as the size of each of its configurations and splits. It provides the number of rows, the number of columns (where applicable), and the size in bytes for the different forms of the data: original files, size in memory (RAM), and auto-converted Parquet files. For example, the ibm/duorc dataset has 187,213 rows across all its configurations and splits, for a total of about 59MB of Parquet files.

```json
{
   "size":{
      "dataset":{
         "dataset":"ibm/duorc",
         "num_bytes_original_files":58710973,
         "num_bytes_parquet_files":58710973,
         "num_bytes_memory":1060742354,
         "num_rows":187213
      },
      "configs":[
         {
            "dataset":"ibm/duorc",
            "config":"ParaphraseRC",
            "num_bytes_original_files":37709127,
            "num_bytes_parquet_files":37709127,
            "num_bytes_memory":704394283,
            "num_rows":100972,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"SelfRC",
            "num_bytes_original_files":21001846,
            "num_bytes_parquet_files":21001846,
            "num_bytes_memory":356348071,
            "num_rows":86241,
            "num_columns":7
         }
      ],
      "splits":[
         {
            "dataset":"ibm/duorc",
            "config":"ParaphraseRC",
            "split":"train",
            "num_bytes_parquet_files":26005668,
            "num_bytes_memory":494389683,
            "num_rows":69524,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"ParaphraseRC",
            "split":"validation",
            "num_bytes_parquet_files":5566868,
            "num_bytes_memory":106733319,
            "num_rows":15591,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"ParaphraseRC",
            "split":"test",
            "num_bytes_parquet_files":6136591,
            "num_bytes_memory":103271281,
            "num_rows":15857,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"SelfRC",
            "split":"train",
            "num_bytes_parquet_files":14851720,
            "num_bytes_memory":248966361,
            "num_rows":60721,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"SelfRC",
            "split":"validation",
            "num_bytes_parquet_files":3114390,
            "num_bytes_memory":56359392,
            "num_rows":12961,
            "num_columns":7
         },
         {
            "dataset":"ibm/duorc",
            "config":"SelfRC",
            "split":"test",
            "num_bytes_parquet_files":3035736,
            "num_bytes_memory":51022318,
            "num_rows":12559,
            "num_columns":7
         }
      ]
   },
   "pending":[
      
   ],
   "failed":[
      
   ],
   "partial":false
}
```
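
As an illustrative sketch (not part of the official guide), you could reuse the `data` dictionary returned by the Python `query()` helper above to print a human-readable summary of each configuration:

```python
# Hypothetical summary of the /size response, assuming `data` holds the JSON
# returned by query() above. Prints the row count and Parquet size per config.
for config in data["size"]["configs"]:
    size_mb = config["num_bytes_parquet_files"] / 1_000_000
    print(f'{config["config"]}: {config["num_rows"]:,} rows, {size_mb:.1f} MB (Parquet)')
```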

If the response has `partial: true`, it means that the actual size of the dataset couldn't be determined because the dataset is too big. In that case, the reported number of rows and bytes may be lower than the actual numbers.
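
For example, a minimal sketch (an assumption about how you might guard downstream code, not an official recommendation) that only trusts the totals when the size is complete:

```python
# Hypothetical check: only rely on the totals when the size is not partial.
if data.get("partial"):
    print("Size is partial; row and byte counts are lower bounds.")
else:
    total = data["size"]["dataset"]
    size_mb = total["num_bytes_parquet_files"] / 1_000_000
    print(f'{total["num_rows"]:,} rows, {size_mb:.1f} MB (Parquet)')
```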