From 3782bd56b1673400e7080134c145a5bcf19b9ed0 Mon Sep 17 00:00:00 2001
From: Brian Yu
Date: Mon, 29 Apr 2024 18:55:49 +0800
Subject: [PATCH] fix webdataset filename split

---
 src/datasets/packaged_modules/webdataset/webdataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/datasets/packaged_modules/webdataset/webdataset.py b/src/datasets/packaged_modules/webdataset/webdataset.py
index 3ac1e86fc41..b0a188124a4 100644
--- a/src/datasets/packaged_modules/webdataset/webdataset.py
+++ b/src/datasets/packaged_modules/webdataset/webdataset.py
@@ -1,5 +1,6 @@
 import io
 import json
+import os
 from itertools import islice
 from typing import Any, Callable, Dict, List
 
@@ -24,7 +25,8 @@ def _get_pipeline_from_tar(cls, tar_path, tar_iterator):
         current_example = {}
         for filename, f in tar_iterator:
             if "." in filename:
-                example_key, field_name = filename.split(".", 1)
+                example_key, field_name = os.path.splitext(filename)
+                field_name = field_name.lstrip(".")
                 if current_example and current_example["__key__"] != example_key:
                     yield current_example
                     current_example = {}