Merge pull request #827 from martindurant/bits
Useful bits
martindurant committed Jun 20, 2024
2 parents cdea0c9 + dde0080 commit 63420dc
Showing 4 changed files with 51 additions and 3 deletions.
30 changes: 28 additions & 2 deletions intake/readers/convert.py
@@ -10,7 +10,7 @@

from intake import import_name, conf
from intake.readers import BaseData, BaseReader, readers
-from intake.readers.utils import all_to_one, subclasses
+from intake.readers.utils import all_to_one, subclasses, safe_dict


class ImportsProperty:
@@ -388,6 +388,32 @@ class PandasToPolars(BaseConverter):
    func = "polars:from_pandas"


+class DataFrameToMetadata(BaseConverter):
+    instances = all_to_one(
+        ["pandas:DataFrame", "dask.dataframe:DataFrame", "polars:DataFrame"], "builtins:dict"
+    )
+
+    def run(self, x, *args, **kwargs):
+        out = {"repr": repr(x), "shape": x.shape}  # cf Repr, the output converter
+        t = str(type(x)).lower()
+        # TODO: perhaps can split this class into several
+        # TODO: implement spark, daft, modin, ibis ...
+        # Note that FileSizeReader can give file size on disk (if origin is files)
+        if "pandas" in t:
+            out["memory"] = x.memory_usage(deep=True).sum()
+            out["schema"] = x.dtypes if hasattr(x, "dtypes") else x.dtype
+            out["shape"] = x.shape
+        elif "polars" in t:
+            out["memory"] = x.estimated_size()
+            out["shape"] = x.shape
+            out["schema"] = x.schema
+        elif "ray" in t:
+            out["memory"] = x.size_bytes()
+            out["shape"] = [x.count(), len(x.columns)]
+            out["schema"] = safe_dict(x.schema)
+        return safe_dict(out)
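As a rough sketch of what the new converter returns, assuming a `BaseConverter` subclass can be instantiated with no arguments (in practice intake invokes converters through its conversion machinery):

```python
import pandas as pd

from intake.readers.convert import DataFrameToMetadata

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
meta = DataFrameToMetadata().run(df)

# safe_dict stringifies leaf values, so the result is JSON-serializable:
print(meta["shape"])   # ['3', '2'], the (rows, columns) tuple as strings
print(meta["memory"])  # DataFrame.memory_usage(deep=True).sum(), as a string
print(meta["repr"])    # the DataFrame's repr, unchanged
```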


def convert_class(data, out_type: str):
    """Get conversion class from given data to out_type
@@ -413,7 +439,7 @@ def convert_classes(in_type: str):
    package = in_type.split(":", 1)[0].split(".", 1)[0]
    for cls in subclasses(BaseConverter):
        for intype, outtype in cls.instances.items():
-            if intype.split(":", 1)[0].split(".", 1)[0] != package:
+            if "*" not in intype and intype.split(":", 1)[0].split(".", 1)[0] != package:
                continue
            if re.findall(intype.lower(), in_type.lower()) or re.findall(
                in_type.lower(), intype.lower()
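The effect of the new `"*"` guard, reproduced in isolation with hypothetical `intype` values: wildcard patterns are now considered even when their package prefix differs from that of `in_type`.

```python
package = "pandas"  # as derived from an in_type of "pandas:DataFrame"

def considered(intype: str) -> bool:
    # mirrors the new guard: wildcard patterns bypass the package check
    if "*" not in intype and intype.split(":", 1)[0].split(".", 1)[0] != package:
        return False
    return True

print(considered("polars:DataFrame"))  # False: other package, no wildcard
print(considered(".*:DataFrame"))      # True: wildcard pattern is kept
print(considered("pandas:DataFrame"))  # True: same package
```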
2 changes: 1 addition & 1 deletion intake/readers/datatypes.py
@@ -506,7 +506,7 @@ class CatalogAPI(Catalog, Service):
class JSONFile(FileData):
    """Nested record format as readable text, very common over HTTP"""

-    filepattern = "json$"
+    filepattern = "json[l]$"
    mimetypes = "(text|application)/json"
    structure = {"nested", "table"}
    magic = {b"{"}
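Note that the character class `[l]` matches exactly one literal `l`, so the new pattern matches names ending in `jsonl` but not plain `json`; a pattern like `jsonl?$` would accept both. A quick check:

```python
import re

print(bool(re.search("json[l]$", "table.jsonl")))  # True
print(bool(re.search("json[l]$", "table.json")))   # False
print(bool(re.search("jsonl?$", "table.json")))    # True, "l" optional
```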
10 changes: 10 additions & 0 deletions intake/readers/readers.py
@@ -229,6 +229,16 @@ def _read(self, data, encoding=None, **kwargs):
        return "".join(out)


+class FileSizeReader(FileReader):
+    output_instance = "builtins:int"
+    implements = {datatypes.FileData}
+
+    def _read(self, data, **kw):
+        fs, path = fsspec.url_to_fs(data.url, **(data.storage_options or {}))
+        path = fs.expand_path(path)  # or use fs.du with deep
+        return sum(fs.info(p)["size"] for p in path)
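The reader body is plain fsspec, so the same logic can be exercised directly; this sketch uses an in-memory filesystem to stay self-contained:

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.pipe("/data/a.json", b'{"x": 1}')   # 8 bytes
fs.pipe("/data/b.json", b'{"y": 22}')  # 9 bytes

paths = fs.expand_path("/data/*.json")  # globs expand to concrete paths
print(sum(fs.info(p)["size"] for p in paths))  # 17
```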


class Pandas(FileReader):
    imports = {"pandas"}
    output_instance = "pandas:DataFrame"
12 changes: 12 additions & 0 deletions intake/readers/utils.py
@@ -3,6 +3,7 @@
import importlib.metadata
import numbers
import re
+import typing
from functools import lru_cache as cache
from hashlib import md5
from itertools import zip_longest
@@ -471,3 +472,14 @@ def pattern_to_glob(pattern: str) -> str:
        except ValueError:
            glob_path += "*"
    return glob_path


+def safe_dict(x):
+    """Make a dict or list-like into a form you can JSON serialize"""
+    if isinstance(x, str):
+        return x
+    if isinstance(x, typing.Mapping):
+        return {k: safe_dict(v) for k, v in x.items()}
+    if isinstance(x, typing.Iterable):
+        return [safe_dict(v) for v in x]
+    return str(x)
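A quick illustration of the recursion, with numpy supplying a non-serializable leaf value:

```python
import numpy as np

from intake.readers.utils import safe_dict

# strings pass through, mappings and iterables recurse,
# anything else is stringified
print(safe_dict({"a": np.dtype("int64"), "b": [1, "x", {"c": 2.5}]}))
# {'a': 'int64', 'b': ['1', 'x', {'c': '2.5'}]}
```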
