Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions sdata/sclass/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import os
import base64
import json
from typing import Any, Dict, Optional, Union
import logging

Expand Down Expand Up @@ -408,6 +409,127 @@
tt.df.attrs.pop("_sdata")
return tt

# ------------------------------------------------------------------ CSV
def to_csv(self, path=None, filename=None, sidecar=False, **kwargs):
    """Serialize the df to CSV (pure pandas, no extra dependency).

    CSV carries data only; the qualifying metadata travels in the optional
    ``<sname>.meta.jsonld`` sidecar. The index is dropped by default
    (override via ``index=True``).

    :param path: directory to write ``<sname>.csv`` into (if given).
    :param filename: exact output filename (defaults to ``<sname>.csv`` when
        ``path`` is given); it is joined onto ``path`` when both are set.
    :param sidecar: also write a JSON-LD metadata sidecar next to the file
        (only honoured when a file is actually written).
    :param kwargs: forwarded to :meth:`pandas.DataFrame.to_csv`.
    :return: the file path (if written to disk) or the CSV string.
    """
    kwargs.setdefault("index", False)
    if filename is None and path is not None:
        filename = self.sname + ".csv"

    # Neither path nor filename: nothing is written, hand back the CSV text.
    if filename is None:
        return self.df.to_csv(**kwargs)

    filepath = os.path.join(path, filename) if path else filename
    self.df.to_csv(filepath, **kwargs)
    logger.info("DataFrame CSV saved to %s", filepath)
    if sidecar:
        # Use the directory the CSV really landed in, so the sidecar sits
        # next to it even when only ``filename`` (with a directory part) was
        # given; fall back to ``path`` for a bare filename.
        self.write_sidecar(os.path.dirname(filepath) or path)
    return filepath

@classmethod
def from_csv(cls, filepath, **kwargs):
    """Load a DataFrame from a CSV file (pure pandas).

    :param filepath: path to the CSV file (``str`` or :class:`os.PathLike`).
    :param kwargs: forwarded to :func:`pandas.read_csv`.
    :return: a :class:`DataFrame` instance (data only; use a sidecar for metadata).
    :raises FileNotFoundError: if ``filepath`` does not exist.
    """
    # Normalise path-like objects so the instance name is always a plain
    # string rather than e.g. a ``pathlib.Path`` object.
    filepath = os.fspath(filepath)
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"no CSV file {filepath}")
    df = pd.read_csv(filepath, **kwargs)
    tt = cls(name=filepath)
    tt.df = df
    return tt

# ---------------------------------------------------------------- Arrow
def to_arrow(self):
    """Convert the wrapped frame into a :class:`pyarrow.Table` carrying sdata metadata.

    ``metadata``, ``column_metadata`` and ``description`` are JSON-encoded and
    stored under the ``b"_sdata"`` key of the schema metadata; entries already
    present on the schema (such as pandas' own) are kept.

    :return: a ``pyarrow.Table``.
    :raises ImportError: if pyarrow is not installed (``pip install sdata[parquet]``).
    """
    _require_parquet("pyarrow")
    import pyarrow as pa

    arrow_table = pa.Table.from_pandas(self.df)
    schema_meta = dict(arrow_table.schema.metadata or {})
    sdata_payload = {
        "metadata": self.metadata.to_dict(),
        "column_metadata": self.column_metadata.to_dict(),
        "description": self.description,
    }
    # Arrow schema metadata maps bytes to bytes, hence the explicit encoding.
    schema_meta[b"_sdata"] = json.dumps(sdata_payload).encode("utf-8")
    return arrow_table.replace_schema_metadata(schema_meta)

@classmethod
def from_arrow(cls, table):
    """Create a DataFrame from a :class:`pyarrow.Table` such as one produced by :meth:`to_arrow`.

    :param table: a ``pyarrow.Table``; when its schema metadata holds a
        ``b"_sdata"`` entry, that JSON payload is used to restore the sdata
        annotations.
    :return: a :class:`DataFrame` instance.
    :raises ImportError: if pyarrow is not installed.
    """
    _require_parquet("pyarrow")
    instance = cls()
    instance.df = table.to_pandas()

    schema_meta = table.schema.metadata or {}
    encoded = schema_meta.get(b"_sdata")
    if encoded is None:
        # Plain Arrow table without sdata annotations: data only.
        return instance

    payload = json.loads(encoded.decode("utf-8"))
    instance._restore_from_attrs(payload)
    return instance

# -------------------------------------------------------------- Feather
def to_feather(self, path=None, filename=None, sidecar=False, **kwargs):
    """Serialize to the Feather (Arrow IPC) format, embedding sdata metadata.

    :param path: directory to write ``<sname>.feather`` into (if given).
    :param filename: exact output filename (defaults to ``<sname>.feather``
        when ``path`` is given); it is joined onto ``path`` when both are set.
    :param sidecar: also write a JSON-LD metadata sidecar next to the file
        (only honoured when a file is actually written).
    :param kwargs: forwarded to :func:`pyarrow.feather.write_feather`.
    :return: the file path (if written to disk) or the Feather bytes.
    :raises ImportError: if pyarrow is not installed.
    """
    _require_parquet("pyarrow")
    import pyarrow.feather as feather

    table = self.to_arrow()
    if filename is None and path is not None:
        filename = self.sname + ".feather"

    # Neither path nor filename: serialize into memory and return the bytes.
    if filename is None:
        sink = io.BytesIO()
        feather.write_feather(table, sink, **kwargs)
        return sink.getvalue()

    filepath = os.path.join(path, filename) if path else filename
    feather.write_feather(table, filepath, **kwargs)
    logger.info("DataFrame Feather saved to %s", filepath)
    if sidecar:
        # Use the directory the Feather file really landed in, so the sidecar
        # sits next to it even when only ``filename`` (with a directory part)
        # was given; fall back to ``path`` for a bare filename.
        self.write_sidecar(os.path.dirname(filepath) or path)
    return filepath

@classmethod
def from_feather(cls, filepath):
    """Read a Feather file (as written by :meth:`to_feather`) into a DataFrame.

    The existence check runs before the pyarrow check, so a missing file is
    reported as :class:`FileNotFoundError` first.

    :param filepath: path to the ``.feather`` file.
    :return: a :class:`DataFrame` instance.
    :raises FileNotFoundError: if ``filepath`` does not exist.
    :raises ImportError: if pyarrow is not installed.
    """
    if os.path.exists(filepath):
        _require_parquet("pyarrow")
        import pyarrow.feather as feather

        arrow_table = feather.read_table(filepath)
        return cls.from_arrow(arrow_table)
    raise FileNotFoundError(f"no Feather file {filepath}")


if __name__ == '__main__':
# Erstelle einen Pandas DataFrame
Expand Down
103 changes: 103 additions & 0 deletions tests/test_sclass_dataframe_interop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""Interop/Export (PR3) für sdata/sclass/dataframe.py:

Check notice on line 2 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L2

1 blank line required between summary line and description (found 0) (D205)

Check notice on line 2 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L2

First line should end with a period, question mark, or exclamation point (not ':') (D415)

Check notice on line 2 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L2

Multi-line docstring summary should start at the second line (D213)
CSV (pure pandas, immer verfügbar) sowie Arrow/Feather (pyarrow-guarded).
"""
import os

import pandas as pd
import pytest

from sdata.sclass.dataframe import DataFrame


def _df():
    """Build the small two-column sample frame used by the tests in this module."""
    columns = {
        "weight": [10, 20, 30],
        "height": [1.5, 1.6, 1.7],
    }
    return pd.DataFrame(columns)


# ----------------------------------------------------------------- CSV (immer)
def test_to_csv_string_roundtrip(tmp_path):
    """CSV text returned by ``to_csv()`` can be written to disk and read back via ``from_csv``."""
    source = DataFrame(df=_df(), name="rt")
    csv_text = source.to_csv()  # no path/filename -> CSV returned as a string
    assert isinstance(csv_text, str)
    assert "weight" in csv_text

    csv_file = tmp_path / "data.csv"
    csv_file.write_text(csv_text)
    restored = DataFrame.from_csv(str(csv_file))
    assert list(restored.df.columns) == ["weight", "height"]

Check warning on line 25 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L25

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_to_csv_file_and_sidecar(tmp_path):
    """Writing with ``path`` and ``sidecar=True`` yields a ``.csv`` file plus exactly one sidecar."""
    source = DataFrame(df=_df(), name="onfile", description="d")
    written = source.to_csv(path=str(tmp_path), sidecar=True)
    assert written.endswith(".csv")
    assert os.path.exists(written)

    # The sidecar is expected in the same directory.
    found = list(tmp_path.glob("*.meta.jsonld"))
    assert len(found) == 1

Check warning on line 34 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L34

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_to_csv_explicit_filename_only(tmp_path):
    """``to_csv(filename=...)`` without ``path`` writes to exactly that filename."""
    source = DataFrame(df=_df(), name="f")
    expected_path = str(tmp_path / "explicit.csv")
    written = source.to_csv(filename=expected_path)  # filename without path -> else branch
    assert written == expected_path
    assert os.path.exists(expected_path)

    reloaded = DataFrame.from_csv(written)
    assert list(reloaded.df.columns) == ["weight", "height"]

Check warning on line 43 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L43

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_from_csv_missing_file_raises(tmp_path):
    """``from_csv`` raises ``FileNotFoundError`` for a path that does not exist."""
    missing = str(tmp_path / "nope.csv")
    with pytest.raises(FileNotFoundError):
        DataFrame.from_csv(missing)


# ------------------------------------------------------------ Arrow (pyarrow)
def test_to_arrow_embeds_metadata():
    """``to_arrow`` stores a ``b"_sdata"`` entry in the table's schema metadata."""
    pytest.importorskip("pyarrow")
    source = DataFrame(df=_df(), name="a", description="d")
    source.set_column("weight", unit="kg")
    schema_meta = source.to_arrow().schema.metadata or {}
    assert b"_sdata" in schema_meta

Check warning on line 57 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L57

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_arrow_roundtrip_preserves_annotations():
    """Columns, description and a column unit survive ``to_arrow`` -> ``from_arrow``."""
    pytest.importorskip("pyarrow")
    source = DataFrame(df=_df(), name="a", description="desc")
    source.set_column("weight", unit="kg")

    arrow_table = source.to_arrow()
    restored = DataFrame.from_arrow(arrow_table)

    assert list(restored.df.columns) == ["weight", "height"]
    assert restored.description == "desc"
    assert restored.get_column("weight").unit == "kg"

Check warning on line 67 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L67

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_from_arrow_plain_table_without_sdata():
    """``from_arrow`` accepts a table that carries no ``_sdata`` schema entry."""
    pa = pytest.importorskip("pyarrow")
    plain_table = pa.Table.from_pandas(_df())  # no _sdata -> raw is None
    restored = DataFrame.from_arrow(plain_table)
    assert list(restored.df.columns) == ["weight", "height"]


# ---------------------------------------------------------- Feather (pyarrow)
def test_feather_file_roundtrip(tmp_path):
    """A Feather file written with a sidecar reloads with description and column unit intact."""
    pytest.importorskip("pyarrow")
    source = DataFrame(df=_df(), name="feat", description="fd")
    source.set_column("height", unit="m")

    written = source.to_feather(path=str(tmp_path), sidecar=True)
    assert written.endswith(".feather")
    assert os.path.exists(written)
    assert list(tmp_path.glob("*.meta.jsonld"))  # sidecar was written

    restored = DataFrame.from_feather(written)
    assert restored.description == "fd"
    assert restored.get_column("height").unit == "m"

Check warning on line 87 in tests/test_sclass_dataframe_interop.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tests/test_sclass_dataframe_interop.py#L87

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


def test_feather_bytes_and_explicit_filename(tmp_path):
    """``to_feather`` returns bytes without a target and honours an explicit filename."""
    pytest.importorskip("pyarrow")
    source = DataFrame(df=_df(), name="feat")

    payload = source.to_feather()  # no path -> bytes
    assert isinstance(payload, (bytes, bytearray))

    expected_path = str(tmp_path / "explicit.feather")
    written = source.to_feather(filename=expected_path)  # filename without path -> else branch
    assert written == expected_path
    assert os.path.exists(expected_path)


def test_from_feather_missing_file_raises(tmp_path):
    """``from_feather`` raises ``FileNotFoundError`` for a path that does not exist."""
    pytest.importorskip("pyarrow")
    missing = str(tmp_path / "nope.feather")
    with pytest.raises(FileNotFoundError):
        DataFrame.from_feather(missing)