# 概要
[PyTorch Tutorial](https://pytorch.org/data/main/tutorial.html) を元にPyTorch DataのDataPipeの使い方を解説。

In [1]:
import pandas as pd
import torchdata.datapipes as dp

## Using DataPipes
DataPipeをつなげて使う方法の基本

In [2]:
ls data/

a.csv  b.csv  c.csv


In [3]:
# Preview the first rows of one of the sample CSV files.
pd.read_csv('data/a.csv').head()

Unnamed: 0,a,b,c,d
0,3,2,8,7
1,7,7,5,0
2,8,3,4,5
3,0,1,8,4
4,8,6,0,4


In [4]:
# Use FileLister to list the files under FOLDER, keeping only paths that end with ".csv".
FOLDER = 'data'
datapipe = dp.iter.FileLister([FOLDER]).filter(filter_fn=(lambda filename: filename.endswith('.csv')))
datapipe



<torch.utils.data.datapipes.iter.selecting.FilterIterDataPipe at 0x129284850>

In [5]:
list(datapipe)

['data/a.csv', 'data/b.csv', 'data/c.csv']

In [6]:
# Open the listed files with FileOpener in text mode ('rt').
# The DataPipe obtained from FileLister is passed in as the source argument.
datapipe = dp.iter.FileOpener(datapipe, mode='rt')
datapipe

<torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe at 0x129284910>

In [7]:
# Iterating yields (file path, StreamWrapper) 2-tuples; each StreamWrapper wraps the opened text stream.
list(datapipe)

[('data/a.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/a.csv' mode='rt' encoding='UTF-8'>>),
 ('data/b.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/b.csv' mode='rt' encoding='UTF-8'>>),
 ('data/c.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/c.csv' mode='rt' encoding='UTF-8'>>)]

In [8]:
# The file contents can be read through the StreamWrapper (second element of each tuple).
sw = list(datapipe)[0][1]
sw.read()

'a,b,c,d\n3,2,8,7\n7,7,5,0\n8,3,4,5\n0,1,8,4\n8,6,0,4\n7,0,6,2\n7,4,5,5\n9,6,0,2\n9,9,8,8\n'

In [9]:
# Parse the opened CSV streams with the parse_csv method.
datapipe_csv = datapipe.parse_csv(delimiter=',')
datapipe_csv

<torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe at 0x1292a1100>

In [10]:
list(datapipe_csv)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

In [11]:
# parse_csv is CSVParser registered as a DataPipe method, so using CSVParser directly works as well.
datapipe_with_csv_parser = dp.iter.CSVParser(datapipe, delimiter=',')

In [12]:
list(datapipe_with_csv_parser)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

## Working with DataLoader
DataPipeのDataLoaderへの渡し方。

In [13]:
# Helper that writes a synthetic data set to sample_data{file_label}.csv.
# Each of the num_rows rows (default 5000) holds one integer label in 0-9 and
# num_features (default 20) float features drawn from [0, 1).
import csv
import random

def generate_csv(file_label, num_rows: int = 5000, num_features: int = 20) -> None:
    """Write ``sample_data{file_label}.csv`` in the current directory.

    The file starts with a header row ``label, c0, ..., c{num_features - 1}``
    followed by ``num_rows`` data rows.

    Args:
        file_label: Value interpolated into the output file name.
        num_rows: Number of data rows to write (header not counted).
        num_features: Number of feature columns per row.

    Returns:
        None. The only effect is the file written to disk.
    """
    fieldnames = ["label"] + [f"c{i}" for i in range(num_features)]
    # The with-block guarantees the file is flushed and closed even if a write
    # fails; newline="" is what the csv module requires for file objects so the
    # writer's own line terminator is not translated a second time.
    with open(f"sample_data{file_label}.csv", "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for _ in range(num_rows):
            # Every column first gets a random float; the label column is then
            # overwritten with an integer class in 0-9 (draw order kept as-is so
            # seeded runs produce the same values as before).
            row_data = {col: random.random() for col in fieldnames}
            row_data["label"] = random.randint(0, 9)
            writer.writerow(row_data)

In [14]:
# sample_data*.csvというファイルをFileListerでリストし、FileOpenerで開き、parse_csvでCSVをparseし、
# mapにより、labelとdataに分割する関数。
# DataPipeを返す。

import numpy as np
import torchdata.datapipes as dp

def _is_sample_csv(filename):
    """Return True for paths that contain "sample_data" and end with ".csv"."""
    return "sample_data" in filename and filename.endswith(".csv")


def _row_to_sample(row):
    """Turn one parsed CSV row (a list of strings) into a label/data dict of NumPy arrays."""
    return {
        "label": np.array(row[0], np.int32),
        "data": np.array(row[1:], dtype=np.float64),
    }


def build_datapipes(root_dir="."):
    """Build the DataPipe that reads every ``sample_data*.csv`` under ``root_dir``.

    Pipeline stages, in order: FileLister -> filter -> FileOpener -> parse_csv -> map.

    The filter and map callbacks are named module-level functions instead of
    lambdas: lambdas cannot be serialized by plain pickle, which makes the
    datapipe emit a warning and prevents it from being sent to DataLoader
    worker processes.

    Args:
        root_dir: Directory handed to FileLister.

    Returns:
        A DataPipe yielding ``{"label": int32 array, "data": float64 array}``
        dicts, one per CSV data row.
    """
    datapipe = dp.iter.FileLister(root_dir)
    datapipe = datapipe.filter(filter_fn=_is_sample_csv)
    datapipe = dp.iter.FileOpener(datapipe, mode="rt")
    # skip_lines=1 skips the first line of each file, i.e. the header row
    # written by generate_csv.
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1)
    datapipe = datapipe.map(_row_to_sample)
    return datapipe

In [15]:
# Generate the sample files: 3 files, each with 10 rows and 3 feature columns.
num_files_to_generate = 3
for i in range(num_files_to_generate):
    generate_csv(file_label=i, num_rows=10, num_features=3)

In [16]:
ls sample* # 3つのファイルが作成されている

sample_data0.csv  sample_data1.csv  sample_data2.csv


In [17]:
# Each generated file holds 10 rows and 4 columns.
pd.read_csv("sample_data0.csv").shape

(10, 4)

In [18]:
# The columns are label plus the three features c0-c2.
pd.read_csv("sample_data0.csv").head()

Unnamed: 0,label,c0,c1,c2
0,3,0.895064,0.924749,0.431571
1,6,0.414127,0.905068,0.726391
2,2,0.852688,0.701216,0.855399
3,8,0.65512,0.511357,0.131295
4,0,0.759595,0.098484,0.654312


In [19]:
# Build the DataPipe over the generated sample files.
datapipe = build_datapipes()



In [20]:
# A DataPipe is handed to DataLoader simply by passing it as dataset=datapipe.
from torch.utils.data import DataLoader

# batch_size (50) is larger than the 30 generated rows (3 files x 10 rows),
# so the first batch contains every row.
dl = DataLoader(dataset=datapipe, batch_size=50, shuffle=True)
dl

<torch.utils.data.dataloader.DataLoader at 0x1292ae100>

In [21]:
# Pull the first batch out of the DataLoader and display it.
first = next(iter(dl))
first

{'label': tensor([3, 6, 2, 8, 0, 8, 7, 3, 8, 4, 2, 3, 9, 9, 1, 6, 8, 6, 3, 1, 6, 5, 8, 3,
         0, 1, 7, 6, 6, 1], dtype=torch.int32),
 'data': tensor([[0.8951, 0.9247, 0.4316],
         [0.4141, 0.9051, 0.7264],
         [0.8527, 0.7012, 0.8554],
         [0.6551, 0.5114, 0.1313],
         [0.7596, 0.0985, 0.6543],
         [0.4558, 0.4930, 0.9421],
         [0.3073, 0.6333, 0.9986],
         [0.6445, 0.2302, 0.7013],
         [0.2375, 0.0948, 0.8011],
         [0.3167, 0.2150, 0.4711],
         [0.3578, 0.3191, 0.4842],
         [0.9679, 0.7934, 0.3112],
         [0.8872, 0.8991, 0.1010],
         [0.9713, 0.0377, 0.3574],
         [0.8612, 0.2653, 0.5862],
         [0.9899, 0.6150, 0.5098],
         [0.9031, 0.0252, 0.1263],
         [0.2821, 0.9735, 0.5889],
         [0.5679, 0.3990, 0.5816],
         [0.4920, 0.8569, 0.3976],
         [0.6824, 0.4844, 0.5860],
         [0.5530, 0.1539, 0.7113],
         [0.7474, 0.1720, 0.3412],
         [0.9984, 0.9120, 0.3845],
         [0.62

In [22]:
# Split the batch dict into its label tensor and its feature tensor.
labels, features = first["label"], first["data"]

In [23]:
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")

Labels batch shape: torch.Size([30])
Feature batch shape: torch.Size([30, 3])


## Implementing a Custom DataPipe
独自のDataPipeを作成する。

命名規則は "Operation"-er + IterDataPipe または MapDataPipe(例: `MapperIterDataPipe`)。エイリアスでは末尾の IterDataPipe / MapDataPipe を取り除く(例: `Mapper`)。

この例では、 `MapperIterDataPipe` を作る。

In [24]:
# IterDataPipeを継承して、MapperIterDataPipeを作成。
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe


@functional_datapipe("new_map")  # registers this class as the `new_map` method on DataPipes
class MapperIterDataPipe(IterDataPipe):
    """Iterable DataPipe that applies ``fn`` to the ``"data"`` entry of every source item.

    Attributes are kept as ``dp`` (the wrapped source DataPipe) and ``fn``
    (the transformation callable).
    """

    def __init__(self, source_dp: IterDataPipe, fn) -> None:
        super().__init__()
        # Remember the upstream pipe and the function used to transform its items.
        self.dp, self.fn = source_dp, fn

    def __len__(self):
        """Report the same length as the wrapped source DataPipe."""
        return len(self.dp)

    def __iter__(self):
        """Yield ``fn(item["data"])`` for each item of the source, in order."""
        for sample in self.dp:
            payload = sample["data"]
            yield self.fn(payload)

In [25]:
# Function to be executed inside MapperIterDataPipe.
def decoder(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [26]:
# Rebuild the DataPipe and list the samples it yields before the custom mapping is applied.
datapipe = build_datapipes()
list(datapipe)

[{'label': array(3, dtype=int32),
  'data': array([0.89506394, 0.92474888, 0.43157119])},
 {'label': array(6, dtype=int32),
  'data': array([0.41412658, 0.90506806, 0.72639143])},
 {'label': array(2, dtype=int32),
  'data': array([0.85268786, 0.70121568, 0.85539912])},
 {'label': array(8, dtype=int32),
  'data': array([0.65512039, 0.51135713, 0.13129459])},
 {'label': array(0, dtype=int32),
  'data': array([0.75959461, 0.09848438, 0.654312  ])},
 {'label': array(8, dtype=int32),
  'data': array([0.45578675, 0.49300666, 0.94212415])},
 {'label': array(7, dtype=int32),
  'data': array([0.30729635, 0.6333446 , 0.99856541])},
 {'label': array(3, dtype=int32),
  'data': array([0.64453027, 0.23023957, 0.70131864])},
 {'label': array(8, dtype=int32),
  'data': array([0.23751087, 0.09482439, 0.80113369])},
 {'label': array(4, dtype=int32),
  'data': array([0.31673474, 0.21504659, 0.47108446])},
 {'label': array(2, dtype=int32),
  'data': array([0.35781245, 0.31914146, 0.48423898])},
 {'label':

In [27]:
# Call the custom DataPipe through its registered name new_map; decoder is applied to each sample's "data" array.
list(datapipe.new_map(fn=decoder))

[array([1.79012787, 1.84949777, 0.86314238]),
 array([0.82825316, 1.81013611, 1.45278286]),
 array([1.70537573, 1.40243137, 1.71079824]),
 array([1.31024078, 1.02271426, 0.26258918]),
 array([1.51918922, 0.19696876, 1.308624  ]),
 array([0.91157351, 0.98601332, 1.88424831]),
 array([0.6145927 , 1.26668919, 1.99713081]),
 array([1.28906054, 0.46047913, 1.40263729]),
 array([0.47502174, 0.18964879, 1.60226738]),
 array([0.63346948, 0.43009319, 0.94216893]),
 array([0.7156249 , 0.63828292, 0.96847796]),
 array([1.93583951, 1.58679532, 0.62233211]),
 array([1.77435307, 1.79828352, 0.20207387]),
 array([1.94263223, 0.07542345, 0.7147563 ]),
 array([1.72234111, 0.53055875, 1.17242697]),
 array([1.97984012, 1.22992541, 1.01965569]),
 array([1.80622778, 0.05045025, 0.25257663]),
 array([0.56414267, 1.94695729, 1.177755  ]),
 array([1.13576472, 0.7979792 , 1.16318282]),
 array([0.98399653, 1.71371626, 0.79520561]),
 array([1.3648904 , 0.9688196 , 1.17198577]),
 array([1.10601677, 0.30772393, 1.