# Huggingface Dataset API

## load_dataset
1. Dataset Preprocessing Script 다운로드
2. Preprocessing Script에서 Dataset Builder Class 얻기
3. DatasetBuilder 객체화
4. Data Download
5. Dataset Build

## 수정할 부분
1. 무조건 w-lda repo 내부에 data 폴더로 파일이 생기게 만들 것
2. ./cache/huggingface/modules엔 cache 파일이 생기지 않게 수정할 것
3. extracted 파일이 원본 파일. 이를 정확히 매핑하기
4. 내부 파일에서 cache를 체크해서 데이터셋 로드하도록 코드 수정하기

In [1]:
import os
import copy
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, Callable, List, Union, Any, NewType, Iterable, Dict

In [2]:
path = "wikitext.py"
name = "wikitext-103-v1"
data_dir = None
data_files = None
split = "train"
cache_dir = "data"
features = None
download_config = None
download_mode = None
ignore_verifications = False
keep_in_memory = False
save_infos = False
script_version = None
use_auth_token = None
config_kwargs = {}

## 1. Dataset Preprocessing Script 다운
```python
module_path = prepare_module(
    path : str,
    name : str,
    cache_dir : str,
    download_mode : Optional[GenerateMode],
)
```

In [3]:
module_type = "dataset"
script_name = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
if not script_name.endswith(".py"):
    raise AttributeError("")
short_name = script_name[:-3]
short_name

'wikitext'

In [4]:
__name__

'__main__'

In [5]:
dynamic_modules_path = os.path.join(os.path.abspath(cache_dir), "datasets_modules")
dynamic_modules_path

'C:\\workspace\\W-LDA\\new\\data\\datasets_modules'

In [6]:
module_name_for_dynamic_modules = os.path.basename(dynamic_modules_path)
datasets_modules_path = os.path.join(dynamic_modules_path, "datasets")
datasets_modules_name = module_name_for_dynamic_modules + ".datasets"

module_name_for_dynamic_modules, datasets_modules_path, datasets_modules_name

('datasets_modules',
 'C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets',
 'datasets_modules.datasets')

In [7]:
main_folder_path = os.path.join(datasets_modules_path, short_name)
main_folder_path

'C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets\\wikitext'

In [8]:
file_path = path
local_path = path
path

'wikitext.py'

In [9]:
from datasets.utils.file_utils import (
    url_or_path_parent, url_or_path_join, cached_path)
from datasets.info import DATASET_INFOS_DICT_FILE_NAME

In [10]:
base_path = url_or_path_parent(file_path)  # remove the filename
dataset_infos = url_or_path_join(base_path, DATASET_INFOS_DICT_FILE_NAME)

base_path, dataset_infos

('', 'dataset_infos.json')

In [11]:
from datasets.load import get_imports # 굉장히 유용한 함수!

In [12]:
# Download external imports if needed
imports = get_imports(local_path)
local_path, imports # import_type, import_name, import_path, sub_directory

('wikitext.py',
 [('library', '__future__', '__future__', None),
  ('library', 'os', 'os', None),
  ('library', 'datasets', 'datasets', None)])

- only support library

In [13]:
library_imports = []
for import_type, import_name, import_path, sub_directory in imports:
    if import_name == short_name:
        raise ValueError(
            f"Error in {module_type} script at {file_path}, importing relative {import_name} module "
            f"but {import_name} is the name of the {module_type} script. "
            f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' "
            f"comment pointing to the original realtive import file path."
        )
    if import_type == "library":
        library_imports.append((import_name, import_path))  # Import from a library
    else:
        raise ValueError("Wrong import_type")
library_imports

[('__future__', '__future__'), ('os', 'os'), ('datasets', 'datasets')]

In [14]:
import importlib

In [15]:
# Check library imports
needs_to_be_installed = []
for library_import_name, library_import_path in library_imports:
    try:
        lib = importlib.import_module(library_import_name)  # noqa F841
    except ImportError:
        needs_to_be_installed.append((library_import_name, library_import_path))
if needs_to_be_installed:
    raise ImportError(
        f"To be able to use this {module_type}, you need to install the following dependencies"
        f"{[lib_name for lib_name, lib_path in needs_to_be_installed]} using 'pip install "
        f"{' '.join([lib_path for lib_name, lib_path in needs_to_be_installed])}' for instance'"
    )

- hash 안씀

In [48]:
[local_path] + []

['wikitext.py']

In [16]:
from datasets.load import files_to_hash


local_imports = []
hash_ = files_to_hash([local_path] + [loc[1] for loc in local_imports])
# hash_folder_path = os.path.join(main_folder_path, hash_)
hash_folder_path = main_folder_path

In [17]:
local_file_path = os.path.join(hash_folder_path, name)
dataset_infos_path = os.path.join(hash_folder_path, DATASET_INFOS_DICT_FILE_NAME)

local_file_path, dataset_infos_path

('C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets\\wikitext\\wikitext-103-v1',
 'C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets\\wikitext\\dataset_infos.json')

In [18]:
from datasets.utils.filelock import FileLock


# Prevent parallel disk operations
lock_path = local_path + ".lock"
filelock = FileLock(lock_path)

lock_path, filelock

('wikitext.py.lock',
 <datasets.utils.filelock.WindowsFileLock at 0x1d13d1ec748>)

In [19]:
import enum


class GenerateMode(enum.Enum):
    """`Enum` for how to treat pre-existing downloads and data.
    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.
    The generations modes:
    +------------------------------------+-----------+---------+
    |                                    | Downloads | Dataset |
    +====================================+===========+=========+
    | `REUSE_DATASET_IF_EXISTS` (default)| Reuse     | Reuse   |
    +------------------------------------+-----------+---------+
    | `REUSE_CACHE_IF_EXISTS`            | Reuse     | Fresh   |
    +------------------------------------+-----------+---------+
    | `FORCE_REDOWNLOAD`                 | Fresh     | Fresh   |
    +------------------------------------+-----------+---------+
    """

    REUSE_DATASET_IF_EXISTS = "reuse_dataset_if_exists"
    REUSE_CACHE_IF_EXISTS = "reuse_cache_if_exists"
    FORCE_REDOWNLOAD = "force_redownload"

In [20]:
download_mode = None

In [21]:
if (download_mode == GenerateMode.FORCE_REDOWNLOAD and
    os.path.exists(main_folder_path)
):
    print("FORCE REDOWNLOAD")

In [22]:
if not os.path.exists(main_folder_path):
    os.makedirs(main_folder_path, exist_ok=True)
else:
    print("Do not")

In [23]:
# add an __init__ file to the main dataset folder if needed
init_file_path = os.path.join(main_folder_path, "__init__.py")
print(init_file_path)
if not os.path.exists(init_file_path):
    with open(init_file_path, "w"):
        pass

C:\workspace\W-LDA\new\data\datasets_modules\datasets\wikitext\__init__.py


In [27]:
import shutil

In [30]:
local_path, local_file_path

('wikitext.py',
 'C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets\\wikitext\\wikitext-103-v1')

In [28]:
# Copy dataset.py file in hash folder if needed
if not os.path.exists(local_file_path):
    print("Copy")
    shutil.copyfile(local_path, local_file_path)
else:
    print("Do not")

Copy


- dataset info 아직 X

In [34]:
import json

In [37]:
# Record metadata associating original dataset path with local unique folder
meta_path = local_file_path.split(".py")[0] + ".json"
print(meta_path)
if not os.path.exists(meta_path):
    print("Meta file")
    meta = {"original file path": file_path, "local file path": local_file_path}
    # the filename is *.py in our case, so better rename to filenam.json instead of filename.py.json
    with open(meta_path, "w", encoding="utf-8") as meta_file:
        json.dump(meta, meta_file)
else:
    print("Do not")

C:\workspace\W-LDA\new\data\datasets_modules\datasets\wikitext\wikitext-103-v1.json
Meta file


In [39]:
meta

{'original file path': 'wikitext.py',
 'local file path': 'C:\\workspace\\W-LDA\\new\\data\\datasets_modules\\datasets\\wikitext\\wikitext-103-v1'}

In [41]:
filelock.release() # EXIT

In [44]:
module_path = ".".join(
    [datasets_modules_name, short_name]
)
module_path

'datasets_modules.datasets.wikitext'

In [45]:
file_path

'wikitext.py'