Merge pull request #827 from martindurant/bits
Useful bits
martindurant committed Jun 20, 2024
2 parents cdea0c9 + dde0080 commit 63420dc
Showing 4 changed files with 51 additions and 3 deletions.
30 changes: 28 additions & 2 deletions intake/readers/convert.py
@@ -10,7 +10,7 @@

from intake import import_name, conf
from intake.readers import BaseData, BaseReader, readers
-from intake.readers.utils import all_to_one, subclasses
+from intake.readers.utils import all_to_one, subclasses, safe_dict


class ImportsProperty:
@@ -388,6 +388,32 @@ class PandasToPolars(BaseConverter):
    func = "polars:from_pandas"


+class DataFrameToMetadata(BaseConverter):
+    instances = all_to_one(
+        ["pandas:DataFrame", "dask.dataframe:DataFrame", "polars:DataFrame"], "builtins:dict"
+    )
+
+    def run(self, x, *args, **kwargs):
+        out = {"repr": repr(x), "shape": x.shape}  # cf Repr, the output converter
+        t = str(type(x)).lower()
+        # TODO: perhaps can split this class into several
+        # TODO: implement spark, daft, modin, ibis ...
+        # Note that FileSizeReader can give file size on disk (if origin is files)
+        if "pandas" in t:
+            out["memory"] = x.memory_usage(deep=True).sum()
+            out["schema"] = x.dtypes if hasattr(x, "dtypes") else x.dtype
+            out["shape"] = x.shape
+        elif "polars" in t:
+            out["memory"] = x.estimated_size()
+            out["shape"] = x.shape
+            out["schema"] = x.schema
+        elif "ray" in t:
+            out["memory"] = x.size_bytes()
+            out["shape"] = [x.count(), len(x.columns)]
+            out["schema"] = safe_dict(x.schema)
+        return safe_dict(out)
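As a rough sketch of what the new converter returns, assuming a `BaseConverter` subclass can be instantiated with no arguments (in practice intake invokes converters through its conversion machinery):

```python
import pandas as pd

from intake.readers.convert import DataFrameToMetadata

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
meta = DataFrameToMetadata().run(df)

# safe_dict stringifies leaf values, so the result is JSON-serializable:
print(meta["shape"])   # ['3', '2'], the (rows, columns) tuple as strings
print(meta["memory"])  # DataFrame.memory_usage(deep=True).sum(), as a string
print(meta["repr"])    # the DataFrame's repr, unchanged
```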


def convert_class(data, out_type: str):
    """Get conversion class from given data to out_type
@@ -413,7 +439,7 @@ def convert_classes(in_type: str):
    package = in_type.split(":", 1)[0].split(".", 1)[0]
    for cls in subclasses(BaseConverter):
        for intype, outtype in cls.instances.items():
-            if intype.split(":", 1)[0].split(".", 1)[0] != package:
+            if "*" not in intype and intype.split(":", 1)[0].split(".", 1)[0] != package:
                continue
            if re.findall(intype.lower(), in_type.lower()) or re.findall(
                in_type.lower(), intype.lower()
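The effect of the new `"*"` guard, reproduced in isolation with hypothetical `intype` values: wildcard patterns are now considered even when their package prefix differs from that of `in_type`.

```python
package = "pandas"  # as derived from an in_type of "pandas:DataFrame"

def considered(intype: str) -> bool:
    # mirrors the new guard: wildcard patterns bypass the package check
    if "*" not in intype and intype.split(":", 1)[0].split(".", 1)[0] != package:
        return False
    return True

print(considered("polars:DataFrame"))  # False: other package, no wildcard
print(considered(".*:DataFrame"))      # True: wildcard pattern is kept
print(considered("pandas:DataFrame"))  # True: same package
```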
2 changes: 1 addition & 1 deletion intake/readers/datatypes.py
@@ -506,7 +506,7 @@ class CatalogAPI(Catalog, Service):
class JSONFile(FileData):
    """Nested record format as readable text, very common over HTTP"""

-    filepattern = "json$"
+    filepattern = "json[l]$"
    mimetypes = "(text|application)/json"
    structure = {"nested", "table"}
    magic = {b"{"}
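Note that the character class `[l]` matches exactly one literal `l`, so the new pattern matches names ending in `jsonl` but not plain `json`; a pattern like `jsonl?$` would accept both. A quick check:

```python
import re

print(bool(re.search("json[l]$", "table.jsonl")))  # True
print(bool(re.search("json[l]$", "table.json")))   # False
print(bool(re.search("jsonl?$", "table.json")))    # True, "l" optional
```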
10 changes: 10 additions & 0 deletions intake/readers/readers.py
@@ -229,6 +229,16 @@ def _read(self, data, encoding=None, **kwargs):
        return "".join(out)


+class FileSizeReader(FileReader):
+    output_instance = "builtins:int"
+    implements = {datatypes.FileData}
+
+    def _read(self, data, **kw):
+        fs, path = fsspec.url_to_fs(data.url, **(data.storage_options or {}))
+        path = fs.expand_path(path)  # or use fs.du with deep
+        return sum(fs.info(p)["size"] for p in path)
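The reader body is plain fsspec, so the same logic can be exercised directly; this sketch uses an in-memory filesystem to stay self-contained:

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.pipe("/data/a.json", b'{"x": 1}')   # 8 bytes
fs.pipe("/data/b.json", b'{"y": 22}')  # 9 bytes

paths = fs.expand_path("/data/*.json")  # globs expand to concrete paths
print(sum(fs.info(p)["size"] for p in paths))  # 17
```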


class Pandas(FileReader):
    imports = {"pandas"}
    output_instance = "pandas:DataFrame"
12 changes: 12 additions & 0 deletions intake/readers/utils.py
@@ -3,6 +3,7 @@
import importlib.metadata
import numbers
import re
+import typing
from functools import lru_cache as cache
from hashlib import md5
from itertools import zip_longest
@@ -471,3 +472,14 @@ def pattern_to_glob(pattern: str) -> str:
        except ValueError:
            glob_path += "*"
    return glob_path


+def safe_dict(x):
+    """Make a dict or list-like into a form you can JSON serialize"""
+    if isinstance(x, str):
+        return x
+    if isinstance(x, typing.Mapping):
+        return {k: safe_dict(v) for k, v in x.items()}
+    if isinstance(x, typing.Iterable):
+        return [safe_dict(v) for v in x]
+    return str(x)
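A quick illustration of the recursion, with numpy supplying a non-serializable leaf value:

```python
import numpy as np

from intake.readers.utils import safe_dict

# strings pass through, mappings and iterables recurse,
# anything else is stringified
print(safe_dict({"a": np.dtype("int64"), "b": [1, "x", {"c": 2.5}]}))
# {'a': 'int64', 'b': ['1', 'x', {'c': '2.5'}]}
```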
