# 概要
[PyTorch Tutorial](https://pytorch.org/data/main/tutorial.html) を元にPyTorch DataのDataPipeの使い方を解説。

In [1]:
import pandas as pd
import torchdata.datapipes as dp
from torchdata.datapipes.iter import IterDataPipe

## Using DataPipes
DataPipeをつなげて使う方法の基本

In [2]:
ls data/

a.csv  b.csv  c.csv


In [3]:
# サンプルのCSVファイル
pd.read_csv('data/a.csv').head()

Unnamed: 0,a,b,c,d
0,3,2,8,7
1,7,7,5,0
2,8,3,4,5
3,0,1,8,4
4,8,6,0,4


In [4]:
# FileListerでファイルのリストを取得する
FOLDER = 'data'
datapipe = dp.iter.FileLister([FOLDER]).filter(filter_fn=(lambda filename: filename.endswith('.csv')))
datapipe



<torch.utils.data.datapipes.iter.selecting.FilterIterDataPipe at 0x15756f550>

In [5]:
list(datapipe)

['data/a.csv', 'data/b.csv', 'data/c.csv']

In [6]:
# FileOpenerでファイルを開く
# 引数にFileListenerで取得したDataPipeを与えている
datapipe = dp.iter.FileOpener(datapipe, mode='rt')
datapipe

<torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe at 0x15756fa60>

In [7]:
# (ファイル名, TextIO, StreamWrapper) というtupleで構成されたイテレータが返ってくる
list(datapipe)

[('data/a.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/a.csv' mode='rt' encoding='UTF-8'>>),
 ('data/b.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/b.csv' mode='rt' encoding='UTF-8'>>),
 ('data/c.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/c.csv' mode='rt' encoding='UTF-8'>>)]

In [8]:
# StreamWrapperからファイルの内容を取得できる
sw = list(datapipe)[0][1]
sw.read()

'a,b,c,d\n3,2,8,7\n7,7,5,0\n8,3,4,5\n0,1,8,4\n8,6,0,4\n7,0,6,2\n7,4,5,5\n9,6,0,2\n9,9,8,8\n8,3,8,9\n3,7,2,1\n8,8,9,2\n6,6,2,9\n4,9,4,6\n2,1,0,4\n2,2,8,2\n8,6,0,5\n4,6,6,4\n1,2,4,7\n3,8,9,3\n7,5,1,4\n9,7,9,6\n2,6,2,5\n1,4,0,7\n3,6,5,0\n8,0,2,0\n1,3,9,0\n7,3,8,2\n4,4,6,1\n2,0,8,3\n0,4,6,6\n9,6,6,4\n9,2,9,1\n8,2,7,0\n0,1,8,4\n1,3,4,7\n6,9,0,1\n0,3,4,9\n5,3,3,5\n5,6,3,1\n9,7,2,9\n7,1,1,7\n4,0,7,6\n7,6,3,5\n8,6,9,3\n2,9,1,9\n4,5,5,2\n3,0,7,4\n0,1,5,6\n6,5,8,7\n4,4,8,1\n6,4,9,0\n3,6,8,0\n3,1,6,5\n8,8,5,3\n8,1,6,1\n7,3,9,8\n9,2,7,2\n4,5,4,4\n9,7,9,1\n5,3,9,3\n6,9,0,9\n9,3,4,2\n0,9,0,3\n2,6,5,4\n8,5,6,1\n6,2,9,2\n9,1,3,7\n7,7,4,4\n0,5,3,7\n4,6,0,3\n0,0,2,2\n6,8,3,0\n1,4,6,6\n4,7,8,6\n2,3,9,9\n6,6,4,8\n2,1,2,7\n0,8,3,0\n0,2,4,1\n4,0,8,0\n8,9,8,6\n7,0,1,5\n0,7,5,6\n3,2,9,6\n8,6,0,5\n2,4,1,7\n1,7,9,1\n5,9,8,5\n9,3,4,0\n2,8,0,2\n9,4,0,5\n4,9,1,1\n1,0,0,6\n0,0,8,1\n8,0,6,0\n5,1,2,3\n2,5,2,0\n4,5,1,0\n7,8,7,3\n'

In [9]:
# parse_csvメソッドでCSVをパース
datapipe_csv = datapipe.parse_csv(delimiter=',')
datapipe_csv

<torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe at 0x15758cbb0>

In [10]:
list(datapipe_csv)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['8', '3', '8', '9'],
 ['3', '7', '2', '1'],
 ['8', '8', '9', '2'],
 ['6', '6', '2', '9'],
 ['4', '9', '4', '6'],
 ['2', '1', '0', '4'],
 ['2', '2', '8', '2'],
 ['8', '6', '0', '5'],
 ['4', '6', '6', '4'],
 ['1', '2', '4', '7'],
 ['3', '8', '9', '3'],
 ['7', '5', '1', '4'],
 ['9', '7', '9', '6'],
 ['2', '6', '2', '5'],
 ['1', '4', '0', '7'],
 ['3', '6', '5', '0'],
 ['8', '0', '2', '0'],
 ['1', '3', '9', '0'],
 ['7', '3', '8', '2'],
 ['4', '4', '6', '1'],
 ['2', '0', '8', '3'],
 ['0', '4', '6', '6'],
 ['9', '6', '6', '4'],
 ['9', '2', '9', '1'],
 ['8', '2', '7', '0'],
 ['0', '1', '8', '4'],
 ['1', '3', '4', '7'],
 ['6', '9', '0', '1'],
 ['0', '3', '4', '9'],
 ['5', '3', '3', '5'],
 ['5', '6', '3', '1'],
 ['9', '7', '2', '9'],
 ['7', '1', '1', '7'],
 ['4', '0',

In [11]:
# parse_csvはCSVParserをDataPipeに登録したものなので、CSVParserを使ってもできる
datapipe_with_csv_parser = dp.iter.CSVParser(datapipe, delimiter=',')

In [12]:
list(datapipe_with_csv_parser)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['8', '3', '8', '9'],
 ['3', '7', '2', '1'],
 ['8', '8', '9', '2'],
 ['6', '6', '2', '9'],
 ['4', '9', '4', '6'],
 ['2', '1', '0', '4'],
 ['2', '2', '8', '2'],
 ['8', '6', '0', '5'],
 ['4', '6', '6', '4'],
 ['1', '2', '4', '7'],
 ['3', '8', '9', '3'],
 ['7', '5', '1', '4'],
 ['9', '7', '9', '6'],
 ['2', '6', '2', '5'],
 ['1', '4', '0', '7'],
 ['3', '6', '5', '0'],
 ['8', '0', '2', '0'],
 ['1', '3', '9', '0'],
 ['7', '3', '8', '2'],
 ['4', '4', '6', '1'],
 ['2', '0', '8', '3'],
 ['0', '4', '6', '6'],
 ['9', '6', '6', '4'],
 ['9', '2', '9', '1'],
 ['8', '2', '7', '0'],
 ['0', '1', '8', '4'],
 ['1', '3', '4', '7'],
 ['6', '9', '0', '1'],
 ['0', '3', '4', '9'],
 ['5', '3', '3', '5'],
 ['5', '6', '3', '1'],
 ['9', '7', '2', '9'],
 ['7', '1', '1', '7'],
 ['4', '0',

## Working with DataLoader
DataPipeのDataLoaderへの渡し方。

In [13]:
# 1個のlabel、num_features個(defaultは20)のfeature、labelとfeatureは0 ~ 9の整数となるデータnum_rows行(defaultは5000行)を生成する関数。
# sample_data{file_label}.csvというファイルに保存される
import csv
import random

def generate_csv(file_label, num_rows: int = 5000, num_features: int = 20) -> None:
    fieldnames = ["label"] + [f"c{i}" for i in range(num_features)]
    writer = csv.DictWriter(
        open(f"sample_data{file_label}.csv", "w"), fieldnames=fieldnames
    )
    writer.writeheader()
    for i in range(num_rows):
        row_data = {col: random.random() for col in fieldnames}
        row_data["label"] = random.randint(0, 9)
        writer.writerow(row_data)

In [14]:
# sample_data*.csvというファイルをFileListerでリストし、FileOpenerで開き、parse_csvでCSVをparseし、
# mapにより、labelとdataに分割する関数。
# DataPipeを返す。

import numpy as np
import torchdata.datapipes as dp

def build_datapipes(root_dir="."):
    datapipe = dp.iter.FileLister(root_dir)
    datapipe = datapipe.filter(
        filter_fn=(
            lambda filename: "sample_data" in filename and filename.endswith(".csv")
        )
    )
    datapipe = dp.iter.FileOpener(datapipe, mode="rt")
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1)
    datapipe = datapipe.map(
        lambda row: {
            "label": np.array(row[0], np.int32),
            "data": np.array(row[1:], dtype=np.float64),
        }
    )
    return datapipe

In [15]:
# ファイルを生成
num_files_to_generate = 3
for i in range(num_files_to_generate):
    generate_csv(file_label=i)

In [16]:
ls sample* # 3つのファイルが作成されている

sample_data0.csv  sample_data1.csv  sample_data2.csv


In [17]:
# 5000行21列のデータが存在
pd.read_csv("sample_data0.csv").shape

(5000, 21)

In [18]:
# labelとc0~c19のfeatureのデータ
pd.read_csv("sample_data0.csv").head()

Unnamed: 0,label,c0,c1,c2,c3,c4,c5,c6,c7,c8,...,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19
0,9,0.770069,0.029474,0.198095,0.688346,0.191504,0.198946,0.956313,0.726481,0.376896,...,0.81793,0.879342,0.801088,0.023843,0.892752,0.057292,0.224261,0.58492,0.194753,0.481447
1,7,0.580594,0.46902,0.319878,0.646977,0.638935,0.471579,0.323761,0.206051,0.306807,...,0.949363,0.341413,0.46015,0.903608,0.605171,0.713195,0.773231,0.014193,0.425304,0.611953
2,3,0.788178,0.598047,0.340142,0.308882,0.307197,0.713945,0.027379,0.091055,0.309071,...,0.968641,0.912998,0.269261,0.917444,0.56371,0.253239,0.926488,0.376882,0.777439,0.016354
3,3,0.515739,0.711961,0.508292,0.045265,0.496492,0.251312,0.811969,0.653827,0.621458,...,0.240137,0.273841,0.191646,0.762991,0.073985,0.642444,0.39838,0.325599,0.971472,0.274836
4,3,0.609357,0.289502,0.159863,0.907769,0.90141,0.666342,0.436811,0.683564,0.874084,...,0.1053,0.231176,0.624455,0.807864,0.43045,0.900816,0.02449,0.330543,0.84941,0.104651


In [19]:
# datapipeを作成
datapipe = build_datapipes()



In [20]:
# DataLoaderにはdataset=datapipeで渡せば良い
from torch.utils.data import DataLoader

dl = DataLoader(dataset=datapipe, batch_size=50, shuffle=True)
dl

<torch.utils.data.dataloader.DataLoader at 0x1575b2b50>

In [21]:
first = next(iter(dl))
first

{'label': tensor([9, 7, 3, 3, 3, 2, 2, 1, 7, 1, 3, 0, 5, 4, 2, 8, 7, 0, 8, 5, 1, 9, 1, 8,
         3, 0, 0, 5, 4, 5, 8, 3, 0, 7, 7, 9, 0, 8, 7, 7, 9, 4, 7, 2, 4, 2, 0, 5,
         1, 3], dtype=torch.int32),
 'data': tensor([[7.7007e-01, 2.9474e-02, 1.9809e-01, 6.8835e-01, 1.9150e-01, 1.9895e-01,
          9.5631e-01, 7.2648e-01, 3.7690e-01, 4.1570e-01, 8.1793e-01, 8.7934e-01,
          8.0109e-01, 2.3843e-02, 8.9275e-01, 5.7292e-02, 2.2426e-01, 5.8492e-01,
          1.9475e-01, 4.8145e-01],
         [5.8059e-01, 4.6902e-01, 3.1988e-01, 6.4698e-01, 6.3893e-01, 4.7158e-01,
          3.2376e-01, 2.0605e-01, 3.0681e-01, 3.5842e-01, 9.4936e-01, 3.4141e-01,
          4.6015e-01, 9.0361e-01, 6.0517e-01, 7.1319e-01, 7.7323e-01, 1.4193e-02,
          4.2530e-01, 6.1195e-01],
         [7.8818e-01, 5.9805e-01, 3.4014e-01, 3.0888e-01, 3.0720e-01, 7.1394e-01,
          2.7379e-02, 9.1055e-02, 3.0907e-01, 1.9919e-01, 9.6864e-01, 9.1300e-01,
          2.6926e-01, 9.1744e-01, 5.6371e-01, 2.5324e-01, 9

In [22]:
labels, features = first["label"], first["data"]

In [23]:
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")

Labels batch shape: torch.Size([50])
Feature batch shape: torch.Size([50, 20])


## Implementing a Custom DataPipe
独自のDataPipeを作成する。

命名規則は"Operation"-eｒ + IterDataPipe or MapDataPipe。エイリアスではIterDataPipeとMapDataPipeは取り除く。

この例では、 `MapperIterDataPipe` を作る。

In [24]:
# IterDataPipeを継承して、MapperIterDataPipeを作成。
from torchdata.datapipes.iter import IterDataPipe



@functional_datapipe("new_map")   # DataPipeにmapメソッドを登録
class MapperIterDataPipe(IterDataPipe):
    def __init__(self, source_dp: IterDataPipe, fn) -> None:
        super().__init__()
        self.dp = source_dp
        self.fn = fn   # 関数により変換を加える

    def __iter__(self):
        for d in self.dp:
            yield self.fn(d["data"])   # 変換を加えたあとのiteratorを作成

    def __len__(self):   # DataPipeの長さを返す
        return len(self.dp)

NameError: name 'functional_datapipe' is not defined

In [None]:
# MaapperIterDataPipe内で実行したい関数を定義
def decoder(x):
    return x*2

In [None]:
datapipe = build_datapipes()
list(datapipe)

In [None]:
list(datapipe.new_map(fn=decoder))