feat: 🎸 truncate cell contents instead of removing rows (#178)
Add a ROWS_MIN_NUMBER environment variable, which defines the minimum
number of rows that should be returned. If the size of these rows is
greater than the ROWS_MAX_BYTES limit, the cells themselves are
truncated (serialized to strings, then cut to 100 bytes, a hardcoded
limit). In that case, the new field "truncated_cells" contains the
list of cells (column names) that have been truncated.

BREAKING CHANGE: 🧨 The /rows response format has changed
severo committed Mar 14, 2022
1 parent 4c542a7 commit f406c0d
Showing 9 changed files with 115 additions and 24 deletions.
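
To make the new behavior concrete, here is a minimal, self-contained sketch of the row-selection logic introduced by this commit. Names such as `select_rows` and `json_size` are illustrative only; the actual implementation, which also records the truncated column names in the `truncated_cells` field, lives in `src/datasets_preview_backend/io/cache.py` below.

```python
import json
from typing import Any, Dict, List

MIN_CELL_BYTES = 100  # hardcoded per-cell truncation size


def json_size(obj: Any) -> int:
    # approximate the size the object will take in the JSON response
    return len(json.dumps(obj).encode("utf-8"))


def truncate_cell(cell: Any) -> str:
    # serialize the cell, keep the first 100 bytes, drop any broken trailing character
    return json.dumps(cell).encode("utf-8")[:MIN_CELL_BYTES].decode("utf-8", "ignore")


def select_rows(
    rows: List[Dict[str, Any]], rows_min_number: int, rows_max_bytes: int
) -> List[Dict[str, Any]]:
    # 1. always keep the first rows_min_number rows
    selected = [dict(row) for row in rows[:rows_min_number]]
    total = sum(json_size(row) for row in selected)

    if total >= rows_max_bytes:
        # 2. over the byte budget: truncate the cells of the last rows
        #    (iterating backwards) until the total fits
        for row in reversed(selected):
            for column, cell in row.items():
                if json_size(cell) > MIN_CELL_BYTES:
                    row[column] = truncate_cell(cell)
            total = sum(json_size(r) for r in selected)
            if total < rows_max_bytes:
                break
        return selected

    # 3. under the budget: append the remaining rows until the byte limit is hit
    for row in rows[rows_min_number:]:
        total += json_size(row)
        if total >= rows_max_bytes:
            break
        selected.append(row)
    return selected
```

With the defaults below (`ROWS_MIN_NUMBER=10`, `ROWS_MAX_BYTES=1_000_000`), a split whose first ten rows already exceed 1 MB now gets those ten rows back with their large cells truncated, instead of having rows removed.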
3 changes: 3 additions & 0 deletions .env.example
@@ -41,6 +41,9 @@
# Max number of rows in the /rows endpoint response
# ROWS_MAX_NUMBER=100

# Min number of rows in the /rows endpoint response
# ROWS_MIN_NUMBER=10

# Number of uvicorn workers
# WEB_CONCURRENCY = 2

1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
@@ -28,6 +28,7 @@ jobs:
run: sudo docker run -d -p 27018:27017 mongo:latest
- name: Run unit tests
env:
ROWS_MIN_NUMBER: 2
ROWS_MAX_NUMBER: 5
MONGO_CACHE_DATABASE: datasets_preview_cache_test
MONGO_QUEUE_DATABASE: datasets_preview_queue_test
4 changes: 2 additions & 2 deletions Makefile
@@ -14,11 +14,11 @@ watch:

.PHONY: test
test:
ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -x tests
ROWS_MIN_NUMBER=2 ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -x tests

.PHONY: coverage
coverage:
ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
ROWS_MIN_NUMBER=2 ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests

# Check that source code meets quality standards + security
.PHONY: quality
16 changes: 10 additions & 6 deletions README.md
@@ -40,6 +40,7 @@ Set environment variables to configure the following aspects:
- `MONGO_URL`: the URL used to connect to the mongo db server. Defaults to `"mongodb://localhost:27018"`.
- `ROWS_MAX_BYTES`: max size of the /rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
- `ROWS_MAX_NUMBER`: max number of rows in the /rows endpoint response. Defaults to `100`.
- `ROWS_MIN_NUMBER`: min number of rows in the /rows endpoint response. Defaults to `10`.
- `WEB_CONCURRENCY`: the number of workers. For now, it's ignored and hardcoded to 1 because the cache is not shared yet. Defaults to `2`.

For example:
@@ -605,7 +606,7 @@ Parameters:

Responses:

- `200`: JSON content that provides the types of the columns (see features at https://huggingface.co/docs/datasets/about_dataset_features.html) and the data rows, with the following structure. Note that the features are ordered and this order can be used to display the columns in a table for example. Binary values are transmitted in UTF-8 encoded base64 strings. The number of rows depends on `ROWS_MAX_BYTES` and `ROWS_MAX_NUMBER`.
- `200`: JSON content that provides the types of the columns (see features at https://huggingface.co/docs/datasets/about_dataset_features.html) and the data rows, with the following structure. Note that the features are ordered, and this order can be used, for example, to display the columns in a table. Binary values are transmitted as UTF-8 encoded base64 strings. The number of rows depends on `ROWS_MAX_BYTES`, `ROWS_MIN_NUMBER` and `ROWS_MAX_NUMBER`. Note that the content of a cell might be truncated to fit within the limits, in which case the `truncated_cells` array contains the names of the truncated cells (see the last element in the example), and the truncated cell content is always a string.

```json
{
@@ -654,7 +655,8 @@ Responses:
"hypothesis": "The cat did not sit on the mat.",
"label": -1,
"idx": 0
}
},
"truncated_cells": []
},
{
"dataset": "glue",
@@ -666,19 +668,21 @@
"hypothesis": "The cat sat on the mat.",
"label": -1,
"idx": 1
}
},
"truncated_cells": []
},
{
"dataset": "glue",
"config": "ax",
"split": "test",
"row_idx": 2,
"row": {
"premise": "When you've got no snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow.",
"hypothesis": "When you've got snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow.",
"premise": "When you've got no snow, it's really hard to learn a snow sport so we lo",
"hypothesis": "When you've got snow, it's really hard to learn a snow sport so we looke",
"label": -1,
"idx": 2
}
},
"truncated_cells": ["premise", "hypothesis"]
}
]
}
2 changes: 2 additions & 0 deletions src/datasets_preview_backend/config.py
@@ -16,6 +16,7 @@
DEFAULT_MONGO_URL,
DEFAULT_ROWS_MAX_BYTES,
DEFAULT_ROWS_MAX_NUMBER,
DEFAULT_ROWS_MIN_NUMBER,
DEFAULT_WEB_CONCURRENCY,
)
from datasets_preview_backend.utils import (
@@ -43,6 +44,7 @@
MONGO_URL = get_str_value(d=os.environ, key="MONGO_URL", default=DEFAULT_MONGO_URL)
ROWS_MAX_BYTES = get_int_value(d=os.environ, key="ROWS_MAX_BYTES", default=DEFAULT_ROWS_MAX_BYTES)
ROWS_MAX_NUMBER = get_int_value(d=os.environ, key="ROWS_MAX_NUMBER", default=DEFAULT_ROWS_MAX_NUMBER)
ROWS_MIN_NUMBER = get_int_value(d=os.environ, key="ROWS_MIN_NUMBER", default=DEFAULT_ROWS_MIN_NUMBER)
WEB_CONCURRENCY = get_int_value(d=os.environ, key="WEB_CONCURRENCY", default=DEFAULT_WEB_CONCURRENCY)

# Ensure datasets library uses the expected revision for canonical datasets
4 changes: 4 additions & 0 deletions src/datasets_preview_backend/constants.py
@@ -13,6 +13,7 @@
DEFAULT_MONGO_URL: str = "mongodb://localhost:27018"
DEFAULT_ROWS_MAX_BYTES: int = 1_000_000
DEFAULT_ROWS_MAX_NUMBER: int = 100
DEFAULT_ROWS_MIN_NUMBER: int = 10
DEFAULT_WEB_CONCURRENCY: int = 2

DEFAULT_HF_TOKEN: Optional[str] = None
@@ -25,6 +26,9 @@

DEFAULT_REFRESH_PCT: int = 1

# below 100 bytes, the cell content will not be truncated
DEFAULT_MIN_CELL_BYTES: int = 100

# these datasets take too much time, we block them beforehand
DATASETS_BLOCKLIST: List[str] = [
"imthanhlv/binhvq_news21_raw",
97 changes: 86 additions & 11 deletions src/datasets_preview_backend/io/cache.py
@@ -28,6 +28,7 @@
from mongoengine.queryset.queryset import QuerySet

from datasets_preview_backend.config import MONGO_CACHE_DATABASE, MONGO_URL
from datasets_preview_backend.constants import DEFAULT_MIN_CELL_BYTES
from datasets_preview_backend.exceptions import (
Status400Error,
Status500Error,
@@ -134,6 +135,7 @@ class RowItem(TypedDict):
split: str
row_idx: int
row: Dict[str, Any]
truncated_cells: List[str]


class DbRow(Document):
@@ -150,6 +152,7 @@ def to_item(self) -> RowItem:
"split": self.split_name,
"row_idx": self.row_idx,
"row": self.row,
"truncated_cells": [],
}

meta = {"collection": "rows", "db_alias": "cache"}
@@ -484,8 +487,8 @@ def get_splits_response(dataset_name: str) -> Tuple[Union[SplitsResponse, None],
return splits_response, None, 200


def get_size_in_bytes(row: RowItem):
return sys.getsizeof(orjson_dumps(row))
def get_size_in_bytes(obj: Any):
return sys.getsizeof(orjson_dumps(obj))
# ^^ every row is transformed here in a string, because it corresponds to
# the size the row will contribute in the JSON response to /rows endpoint.
# The size of the string is measured in bytes.
@@ -494,25 +497,97 @@ def get_size_in_bytes(row: RowItem):
# dataset viewer table on the hub)


def to_row_items(rows: QuerySet[DbRow], rows_max_bytes: Optional[int]) -> List[RowItem]:
def truncate_cell(cell: Any, min_cell_bytes: int) -> str:
return orjson_dumps(cell)[:min_cell_bytes].decode("utf8", "ignore")


# Mutates row_item, and returns it anyway
def truncate_row_item(row_item: RowItem) -> RowItem:
min_cell_bytes = DEFAULT_MIN_CELL_BYTES
row = {}
for column_name, cell in row_item["row"].items():
# for now: all the cells, but the smallest ones, are truncated
cell_bytes = get_size_in_bytes(cell)
if cell_bytes > min_cell_bytes:
row_item["truncated_cells"].append(column_name)
row[column_name] = truncate_cell(cell, min_cell_bytes)
else:
row[column_name] = cell
row_item["row"] = row
return row_item


# Mutates row_items, and returns them anyway
def truncate_row_items(row_items: List[RowItem], rows_max_bytes: int) -> List[RowItem]:
# compute the current size
rows_bytes = sum(get_size_in_bytes(row_item) for row_item in row_items)

# Loop backwards, so that the last rows are truncated first
for row_item in reversed(row_items):
previous_size = get_size_in_bytes(row_item)
row_item = truncate_row_item(row_item)
new_size = get_size_in_bytes(row_item)
rows_bytes += new_size - previous_size
row_idx = row_item["row_idx"]
logger.debug(f"the size of the rows is now ({rows_bytes}) after truncating row idx={row_idx}")
if rows_bytes < rows_max_bytes:
break
return row_items


def to_row_items(
rows: QuerySet[DbRow], rows_max_bytes: Optional[int], rows_min_number: Optional[int]
) -> List[RowItem]:
row_items = []
bytes = 0
for idx, row in enumerate(rows):
rows_bytes = 0
if rows_min_number is None:
rows_min_number = 0
else:
logger.debug(f"min number of rows in the response: '{rows_min_number}'")
if rows_max_bytes is not None:
logger.debug(f"max number of bytes in the response: '{rows_max_bytes}'")

# two restrictions must be enforced:
# - at least rows_min_number rows
# - at most rows_max_bytes bytes
# To enforce this:
# 1. first get the first rows_min_number rows
for row in rows[:rows_min_number]:
row_item = row.to_item()
if rows_max_bytes is not None:
rows_bytes += get_size_in_bytes(row_item)
row_items.append(row_item)

# 2. if the total is over the bytes limit, truncate the values, iterating backwards starting
# from the last rows, until getting under the threshold
if rows_max_bytes is not None and rows_bytes >= rows_max_bytes:
logger.debug(
f"the size of the first {rows_min_number} rows ({rows_bytes}) is above the max number of bytes"
f" ({rows_max_bytes}), they will be truncated"
)
return truncate_row_items(row_items, rows_max_bytes)

# 3. else: add the remaining rows until the end, or until the bytes threshold
for idx, row in enumerate(rows[rows_min_number:]):
row_item = row.to_item()
if rows_max_bytes is not None:
bytes += get_size_in_bytes(row_item)
if bytes >= rows_max_bytes:
rows_bytes += get_size_in_bytes(row_item)
if rows_bytes >= rows_max_bytes:
logger.debug(
f"the rows in the split have been truncated to {idx} row(s) to keep the size ({bytes}) under the"
f" limit ({rows_max_bytes})"
f"the rows in the split have been truncated to {rows_min_number + idx} row(s) to keep the size"
f" ({rows_bytes}) under the limit ({rows_max_bytes})"
)
break
row_items.append(row_item)
return row_items


def get_rows_response(
dataset_name: str, config_name: str, split_name: str, rows_max_bytes: Optional[int] = None
dataset_name: str,
config_name: str,
split_name: str,
rows_max_bytes: Optional[int] = None,
rows_min_number: Optional[int] = None,
) -> Tuple[Union[RowsResponse, None], Union[ErrorItem, None], int]:
try:
split = DbSplit.objects(dataset_name=dataset_name, config_name=config_name, split_name=split_name).get()
@@ -540,7 +615,7 @@ def get_rows_response(
rows = DbRow.objects(dataset_name=dataset_name, config_name=config_name, split_name=split_name).order_by(
"+row_idx"
)
row_items = to_row_items(rows, rows_max_bytes)
row_items = to_row_items(rows, rows_max_bytes, rows_min_number)
rows_response: RowsResponse = {
"columns": [column.to_item() for column in columns],
"rows": row_items,
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/models/row.py
@@ -31,11 +31,9 @@ def get_rows(
elif not isinstance(dataset, Dataset):
raise TypeError("load_dataset should return a Dataset")
rows_plus_one = list(itertools.islice(dataset, ROWS_MAX_NUMBER + 1))
# ^^ to be able to detect if a split has exactly DEFAULT_ROWS_MAX_NUMBER rows
# ^^ to be able to detect if a split has exactly ROWS_MAX_NUMBER rows
if len(rows_plus_one) <= ROWS_MAX_NUMBER:
logger.debug(f"all the rows in the split have been fetched ({len(rows_plus_one)})")
else:
logger.debug(f"the rows in the split have been truncated ({ROWS_MAX_NUMBER} rows)")
return rows_plus_one[:ROWS_MAX_NUMBER]
# ^^ note that DEFAULT_ROWS_MAX_BYTES is not enforced here, but in typed_row.py
# after the type of the fields is known (ie: the row can be converted to JSON)
8 changes: 6 additions & 2 deletions src/datasets_preview_backend/routes/rows.py
@@ -3,7 +3,11 @@
from starlette.requests import Request
from starlette.responses import Response

from datasets_preview_backend.config import MAX_AGE_LONG_SECONDS, ROWS_MAX_BYTES
from datasets_preview_backend.config import (
MAX_AGE_LONG_SECONDS,
ROWS_MAX_BYTES,
ROWS_MIN_NUMBER,
)
from datasets_preview_backend.exceptions import StatusError
from datasets_preview_backend.io.cache import get_rows_response
from datasets_preview_backend.routes._utils import get_response
@@ -21,7 +25,7 @@ async def rows_endpoint(request: Request) -> Response:
if not isinstance(dataset_name, str) or not isinstance(config_name, str) or not isinstance(split_name, str):
raise StatusError("Parameters 'dataset', 'config' and 'split' are required", 400)
rows_response, rows_error, status_code = get_rows_response(
dataset_name, config_name, split_name, ROWS_MAX_BYTES
dataset_name, config_name, split_name, ROWS_MAX_BYTES, ROWS_MIN_NUMBER
)
return get_response(rows_response or rows_error, status_code, MAX_AGE_LONG_SECONDS)
except StatusError as err:
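
As a quick sanity check, the endpoint is queried the same way as before, and the new `truncated_cells` field can be inspected in each row item. A hypothetical local call is sketched below; the host/port and the `requests` dependency are assumptions on the client side, not part of this change.

```python
import requests  # assumed to be available in the client environment

# query the /rows endpoint for the split used in the README example above
response = requests.get(
    "http://localhost:8000/rows",
    params={"dataset": "glue", "config": "ax", "split": "test"},
)
response.raise_for_status()
for row_item in response.json()["rows"]:
    # truncated_cells lists the columns whose content was cut to fit ROWS_MAX_BYTES
    print(row_item["row_idx"], row_item["truncated_cells"])
```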
