Skip to content

Commit

Permalink
Use pandas ujson in JSON loader to improve performance (#6874)
Browse files Browse the repository at this point in the history
* Use pandas ujson in JSON loader

* Fix renaming of ujson_loads to loads before pandas-2.1.0

* Fix pandas version in comment
  • Loading branch information
albertvillanova authored and lhoestq committed May 29, 2024
1 parent aeb162b commit eafed0d
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import io
import itertools
import json
from dataclasses import dataclass
from typing import Optional

import pandas as pd
import pyarrow as pa
import pyarrow.json as paj

Expand All @@ -15,6 +15,14 @@
logger = datasets.utils.logging.get_logger(__name__)


def ujson_loads(*args, **kwargs):
    """Deserialize JSON with the ujson-based loader exposed by ``pd.io.json``.

    All positional and keyword arguments are forwarded unchanged to the
    underlying pandas function, and its return value is returned as-is.

    Raises:
        AttributeError: If ``pd.io.json`` exposes neither ``ujson_loads`` nor
            ``loads``.
    """
    try:
        # Guard only the attribute lookup: an AttributeError raised while the
        # loader itself runs must not be mistaken for "function not available".
        loads = pd.io.json.ujson_loads
    except AttributeError:
        # Before pandas-2.2.0, ujson_loads was exposed under the name `loads`
        loads = pd.io.json.loads
    return loads(*args, **kwargs)


@dataclass
class JsonConfig(datasets.BuilderConfig):
"""BuilderConfig for JSON."""
Expand Down Expand Up @@ -80,7 +88,7 @@ def _generate_tables(self, files):
# If the file is one json object and if we need to look at the list of items in one specific field
if self.config.field is not None:
with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:
dataset = json.load(f)
dataset = ujson_loads(f.read())

# We keep only the field we are interested in
dataset = dataset[self.config.field]
Expand Down Expand Up @@ -142,8 +150,8 @@ def _generate_tables(self, files):
with open(
file, encoding=self.config.encoding, errors=self.config.encoding_errors
) as f:
dataset = json.load(f)
except json.JSONDecodeError:
dataset = ujson_loads(f.read())
except ValueError:
logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
raise e
# If possible, parse the file as a list of json objects/strings and exit the loop
Expand Down

0 comments on commit eafed0d

Please sign in to comment.