### Simple conversion of a large json.gz file that doesn't fit into memory into a Hugging Face dataset, which uses pyarrow to reduce memory usage so that arbitrarily large datasets can be used

In [2]:
from datasets import load_dataset

In [3]:
import json
import gzip
# Write two tiny demo records as gzipped JSON Lines: one JSON object per line.
# Each record is terminated with '\n'; without the separator the objects are
# concatenated back to back, which is not valid JSON Lines.
records = [{"text": "x", "label": 1}, {"text": "y", "label": 0}]
with gzip.open('tmp.json.gz', 'wb') as f:
    for record in records:
        f.write((json.dumps(record) + '\n').encode('utf-8'))

#### i. map-style dataset

In [4]:
# streaming=False (map-style): the file is prepared into the local datasets cache
# up front, and the result supports indexing and slicing (see the cells below).
ds = load_dataset('json', data_files='tmp.json.gz', streaming=False)

Using custom data configuration default-fb38c71641562337


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-fb38c71641562337/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-fb38c71641562337/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Inspect the returned DatasetDict and its splits.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2
    })
})

In [8]:
# Index a single row of the train split; the result is one record as a dict.
ds['train'][0]

{'text': 'x', 'label': 1}

In [9]:
# Slice the whole train split; the result is a dict of column name -> list of values.
ds['train'][:]

{'text': ['x', 'y'], 'label': [1, 0]}

#### ii. iterable-style dataset

In [10]:
# streaming=True (iterable-style): rows are consumed by iterating over the split
# (see the next cell).
ds = load_dataset('json', data_files='tmp.json.gz', streaming=True)

Using custom data configuration default-fb38c71641562337


In [11]:
# Walk the streaming train split and print every record as it is yielded.
for example in ds['train']:
    print(example)

{'text': 'x', 'label': 1}
{'text': 'y', 'label': 0}


### Load a custom dataset .yaml config that contains DVC info into a Hugging Face dataset

In [1]:
import yaml 
import glob 
import logging 
logging.getLogger().setLevel(logging.INFO)
import dvc.api
import pathlib 
import hashlib
import os
import json
from tqdm import tqdm
import gzip
from datasets import load_dataset

In [2]:
# Parse the dataset config. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel.
with open('../../datasets/test.yaml', 'r') as config_file:
    data_source = yaml.safe_load(config_file)

##### Note: to avoid hassle, the columns listed under "cols" in data_source should hold only primitive values such as str or int, not list or dict

In [3]:
# Show the parsed config: split name -> list of DVC file entries (path, repo, rev, cols).
data_source

{'train': [{'path': 'data/wish_products/Wish_Meta_Val.json',
   'repo': 'git@github.com:junwang-wish/query_understanding_data.git',
   'rev': None,
   'cols': ['text']}],
 'val': [{'path': 'data/wish_products/Wish_Meta_Val.json',
   'repo': 'git@github.com:junwang-wish/query_understanding_data.git',
   'rev': None,
   'cols': ['text']}],
 'test': [{'path': 'data/wish_products/wish-mturk-labelled-09202022-clean.json',
   'repo': 'git@github.com:junwang-wish/query_understanding_data.git',
   'rev': None,
   'cols': ['text']}]}

In [4]:
# Root folder under which a hash-named cache folder is created for this config.
cache_dir = '.tmp'
# When True, the export cell rewrites the cached files even if some already exist.
overwrite_cache = False

In [5]:
# setup cache folder
data_hash = hashlib.md5(
    json.dumps(data_source).encode('utf-8')
).hexdigest()
cache_dir_folder = os.path.join(cache_dir, 
    data_hash)
pathlib.Path(cache_dir_folder).mkdir(parents=True, exist_ok=True)

In [6]:
# Show the resolved cache folder (cache_dir joined with the config hash).
cache_dir_folder

'.tmp/9858c132a96d50d4f57e377afe5798f6'

In [7]:
# Materialise one gzipped JSON Lines file per split inside the cache folder,
# unless a previous run already produced them.
#
# `data_dict` is bound before the branch because the load_dataset cells further
# down iterate over it; binding it only inside the `if` raised NameError
# whenever the cached files were reused.
data_dict = data_source

# NOTE(review): any *.json.gz in the folder counts as a complete cache, so an
# interrupted export can leave a partial set behind; set overwrite_cache=True
# to force a rebuild in that case.
existing_files = glob.glob(cache_dir_folder + '/*.json.gz')
if len(existing_files) == 0 or overwrite_cache:
    for stage in data_dict:
        # Only the three known splits are exported; other top-level keys are ignored.
        if stage not in ('train', 'val', 'test'):
            continue
        logging.info(f"Write {stage}.json.gz to {cache_dir_folder}")
        stage_path = os.path.join(cache_dir_folder, f'{stage}.json.gz')
        with gzip.open(stage_path, 'w') as fout:
            # A split may list several source files; they are appended in order.
            for file_dict in data_dict[stage]:
                with dvc.api.open(
                    path=file_dict['path'],
                    repo=file_dict['repo'],
                    rev=file_dict['rev']
                ) as f:
                    for line in tqdm(f):
                        # Blank lines (e.g. a trailing empty line) are not JSON records.
                        if not line.strip():
                            continue
                        dat = json.loads(line)
                        # Keep only the requested columns that exist in this record.
                        kept = {k: dat[k] for k in file_dict['cols'] if k in dat}
                        fout.write((json.dumps(kept) + '\n').encode('utf-8'))
else:
    logging.info(f"Use cache stored in {cache_dir_folder}")

INFO:root:Write train.json.gz to .tmp/9858c132a96d50d4f57e377afe5798f6
INFO:asyncssh:Opening SSH connection to github.com, port 22
INFO:asyncssh:[conn=0] Connected to SSH server at github.com, port 22
INFO:asyncssh:[conn=0]   Local address: 192.168.0.3, port 54862
INFO:asyncssh:[conn=0]   Peer address: 140.82.112.4, port 22
INFO:asyncssh:[conn=0] Beginning auth for user git
INFO:asyncssh:[conn=0] Auth for user git succeeded
INFO:asyncssh:[conn=0, chan=0] Requesting new SSH session
INFO:asyncssh:[conn=0, chan=0]   Command: git-upload-pack 'junwang-wish/query_understanding_data.git'
INFO:asyncssh:[conn=0, chan=0] Received exit status 0
INFO:asyncssh:[conn=0, chan=0] Received channel close
INFO:asyncssh:[conn=0, chan=0] Channel closed
INFO:asyncssh:[conn=0] Closing connection
INFO:asyncssh:[conn=0] Sending disconnect: Disconnected by application (11)
INFO:asyncssh:[conn=0] Connection closed
INFO:asyncssh:Opening SSH connection to github.com, port 22
INFO:asyncssh:[conn=1] Connected to SSH

In [8]:
# Map each split to its cached file. The split names come from `data_source`
# (always defined) and are limited to the splits the export cell writes, so
# this cell also works when the cache was reused and when the config carries
# extra top-level keys that have no exported file.
ds = load_dataset('json', data_files={
    stage: os.path.join(cache_dir_folder, f'{stage}.json.gz')
    for stage in data_source
    if stage in ('train', 'val', 'test')
})



Downloading and preparing dataset json/default to /data/junwang/.cache/huggingface/datasets/json/default-62e3ee0ddcaf453c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /data/junwang/.cache/huggingface/datasets/json/default-62e3ee0ddcaf453c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Inspect the loaded splits, their features and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 102471
    })
    val: Dataset({
        features: ['text'],
        num_rows: 102471
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6401
    })
})

In [10]:
# Peek at the first 10 rows of the test split.
ds['test'][:10]

{'text': ['Upgrade Pets Safety Waterproof Dog Car Mats Hammock Protector Rear Back Pet Dog Car  Cover -> [home & garden][pet products][dog doors, houses, & furniture][dog beds & mats]',
  'Turquoise Howlite Lapis Lazuli Earrings Gold Geometric Gemstone Boho Earrings Blue Statement Chendelier Earrings -> [jewelry & accessories][fine jewelry][earrings]',
  "Funny Mens T-shirt I Have A Crazy Wife t-shirt Birthday Gift For Husband Christmas Gifts For Husband -> [men's clothing][tops & tees][t-shirts]",
  'Travel Laptop Backpack Water Resistant Business College Print Pig Florals Farm Computer Bag For Men Women -> [computer & office][laptop accessories][laptop bags & cases]',
  "New Women Fashion Double-Layer Divided Skirt Sports Shorts Quick-Drying Yoga Sports Leggings Fitness Shorts -> [women's clothing][activewear & loungewear][skirts]",
  "Men Pants Cotton Linen Vintage Solid Wide Leg Loose Casual Comfy Trousers -> [men's clothing][pants][casual pants]",
  "Men's Vintage Nordic Viking Ha

In [11]:
# Same split -> file mapping as for the full load, but split='test' asks for
# that single split. The split names come from `data_source` (always defined)
# and are limited to the splits the export cell writes, so this cell also works
# when the cache was reused.
ds_test = load_dataset('json', data_files={
    stage: os.path.join(cache_dir_folder, f'{stage}.json.gz')
    for stage in data_source
    if stage in ('train', 'val', 'test')
}, split='test')



In [12]:
# With split='test' the result is a single Dataset rather than a DatasetDict.
ds_test

Dataset({
    features: ['text'],
    num_rows: 6401
})

### Clean up notebook-generated temp files

In [13]:
import shutil

# Delete only the artefacts this notebook created. The earlier `rm -rf tmp*`
# shell call was POSIX-only and would also wipe any unrelated file or folder
# in the working directory whose name starts with "tmp".
try:
    os.remove('tmp.json.gz')  # demo archive written in the first section
except FileNotFoundError:
    pass

# Remove the whole cache root configured in `cache_dir`; a missing folder is
# not an error.
shutil.rmtree(cache_dir, ignore_errors=True)

0