Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pandas ujson in JSON loader to improve performance #6874

Merged
merged 3 commits into from
May 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import io
import itertools
import json
from dataclasses import dataclass
from typing import Optional

import pandas as pd
import pyarrow as pa
import pyarrow.json as paj

Expand All @@ -15,6 +15,14 @@
logger = datasets.utils.logging.get_logger(__name__)


def ujson_loads(*args, **kwargs):
    """Deserialize JSON using the ujson-based parser exposed by pandas.

    All positional and keyword arguments are forwarded unchanged to the
    underlying pandas function, and its result is returned as-is.

    The parser is looked up under two names so that both old and new pandas
    releases work: ``pd.io.json.ujson_loads`` is tried first, and
    ``pd.io.json.loads`` is used when that attribute does not exist.

    Raises:
        AttributeError: if neither name is available on ``pd.io.json``.
        Any exception raised by the underlying parser is propagated
        untouched.
    """
    # Only the attribute lookup is guarded: an AttributeError raised while
    # actually parsing must surface as-is rather than trigger the fallback
    # (which would mask it with an unrelated "no attribute 'loads'" error).
    try:
        loads = pd.io.json.ujson_loads
    except AttributeError:
        # pandas < 2.2.0 exposes the same parser under the name ``loads``.
        loads = pd.io.json.loads
    return loads(*args, **kwargs)


@dataclass
class JsonConfig(datasets.BuilderConfig):
"""BuilderConfig for JSON."""
Expand Down Expand Up @@ -80,7 +88,7 @@ def _generate_tables(self, files):
# If the file is one json object and if we need to look at the list of items in one specific field
if self.config.field is not None:
with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:
dataset = json.load(f)
dataset = ujson_loads(f.read())

# We keep only the field we are interested in
dataset = dataset[self.config.field]
Expand Down Expand Up @@ -142,8 +150,8 @@ def _generate_tables(self, files):
with open(
file, encoding=self.config.encoding, errors=self.config.encoding_errors
) as f:
dataset = json.load(f)
except json.JSONDecodeError:
dataset = ujson_loads(f.read())
except ValueError:
logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
raise e
# If possible, parse the file as a list of json objects/strings and exit the loop
Expand Down
Loading