Skip to content

Commit

Permalink
Fix regression for pandas < 2.0.0 in JSON loader (#6978)
Browse files Browse the repository at this point in the history
Pass `dtype_backend` to `pd.read_json` only when the installed pandas major version is >= 2.
  • Loading branch information
albertvillanova committed Jun 19, 2024
1 parent e59582a commit e47a746
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pyarrow.json as paj

import datasets
import datasets.config
from datasets.table import table_cast
from datasets.utils.file_utils import readline

Expand All @@ -31,6 +32,12 @@ def ujson_loads(*args, **kwargs):
return pd.io.json.loads(*args, **kwargs)


def pandas_read_json(path_or_buf, **kwargs):
    """Call ``pd.read_json``, requesting the pyarrow dtype backend when available.

    ``dtype_backend="pyarrow"`` is forced (overriding any caller-supplied
    value) only when ``datasets.config.PANDAS_VERSION.major`` is at least 2;
    for older pandas versions the keyword is not added at all.

    Args:
        path_or_buf: Forwarded unchanged as the first positional argument of
            ``pd.read_json``.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_json``.

    Returns:
        Whatever ``pd.read_json`` returns for the given input.
    """
    # Only add the keyword on pandas major version 2 or later.
    supports_dtype_backend = datasets.config.PANDAS_VERSION.major >= 2
    backend_kwargs = {"dtype_backend": "pyarrow"} if supports_dtype_backend else {}
    # Merge last so the forced backend wins over a caller-provided one.
    return pd.read_json(path_or_buf, **{**kwargs, **backend_kwargs})


@dataclass
class JsonConfig(datasets.BuilderConfig):
"""BuilderConfig for JSON."""
Expand Down Expand Up @@ -96,7 +103,7 @@ def _generate_tables(self, files):
dataset = ujson_loads(f.read())
# We keep only the field we are interested in
dataset = dataset[self.config.field]
df = pd.read_json(io.StringIO(ujson_dumps(dataset)), dtype_backend="pyarrow")
df = pandas_read_json(io.StringIO(ujson_dumps(dataset)))
if df.columns.tolist() == [0]:
df.columns = list(self.config.features) if self.config.features else ["text"]
pa_table = pa.Table.from_pandas(df, preserve_index=False)
Expand Down Expand Up @@ -150,7 +157,7 @@ def _generate_tables(self, files):
with open(
file, encoding=self.config.encoding, errors=self.config.encoding_errors
) as f:
df = pd.read_json(f, dtype_backend="pyarrow")
df = pandas_read_json(f)
except ValueError:
logger.error(f"Failed to load JSON from file '{file}' with error {type(e)}: {e}")
raise e
Expand Down

0 comments on commit e47a746

Please sign in to comment.