From 0036a5ea9b10719b735084038d7d98402d1fadfe Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 May 2024 12:49:23 +0200 Subject: [PATCH] Revert "Use pandas ujson in JSON loader to improve performance (#6874)" This reverts commit eafed0de8ddc95d34b37c4ca9cc5070015bf456f. --- src/datasets/packaged_modules/json/json.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index 8fa1e975d5a..6076af6b37a 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -1,9 +1,9 @@ import io import itertools +import json from dataclasses import dataclass from typing import Optional -import pandas as pd import pyarrow as pa import pyarrow.json as paj @@ -15,14 +15,6 @@ logger = datasets.utils.logging.get_logger(__name__) -def ujson_loads(*args, **kwargs): - try: - return pd.io.json.ujson_loads(*args, **kwargs) - except AttributeError: - # Before pandas-2.2.0, ujson_loads was renamed to loads: import ujson_loads as loads - return pd.io.json.loads(*args, **kwargs) - - @dataclass class JsonConfig(datasets.BuilderConfig): """BuilderConfig for JSON.""" @@ -88,7 +80,7 @@ def _generate_tables(self, files): # If the file is one json object and if we need to look at the list of items in one specific field if self.config.field is not None: with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f: - dataset = ujson_loads(f.read()) + dataset = json.load(f) # We keep only the field we are interested in dataset = dataset[self.config.field] @@ -150,8 +142,8 @@ def _generate_tables(self, files): with open( file, encoding=self.config.encoding, errors=self.config.encoding_errors ) as f: - dataset = ujson_loads(f.read()) - except ValueError: + dataset = json.load(f) + except json.JSONDecodeError: logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") raise e # If possible, parse the file as a list of json objects/strings and exit the loop