# 概要
[PyTorch Tutorial](https://pytorch.org/data/main/tutorial.html) を元にPyTorch DataのDataPipeの使い方を解説。

In [28]:
import pandas as pd
import torchdata.datapipes as dp
from torchdata.datapipes.iter import IterDataPipe

## Using DataPipes
DataPipeをつなげて使う方法の基本

In [29]:
ls data/

a.csv  b.csv  c.csv


In [30]:
# サンプルのCSVファイル
pd.read_csv('data/a.csv').head()

Unnamed: 0,a,b,c,d
0,3,2,8,7
1,7,7,5,0
2,8,3,4,5
3,0,1,8,4
4,8,6,0,4


In [31]:
# FileListerでファイルのリストを取得する
FOLDER = 'data'
datapipe = dp.iter.FileLister([FOLDER]).filter(filter_fn=(lambda filename: filename.endswith('.csv')))
datapipe



<torch.utils.data.datapipes.iter.selecting.FilterIterDataPipe at 0x1230a47f0>

In [32]:
list(datapipe)

['data/a.csv', 'data/b.csv', 'data/c.csv']

In [33]:
# FileOpenerでファイルを開く
# 引数にFileListenerで取得したDataPipeを与えている
datapipe = dp.iter.FileOpener(datapipe, mode='rt')
datapipe

<torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe at 0x1230a4370>

In [34]:
# (ファイル名, TextIO, StreamWrapper) というtupleで構成されたイテレータが返ってくる
list(datapipe)

[('data/a.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/a.csv' mode='rt' encoding='UTF-8'>>),
 ('data/b.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/b.csv' mode='rt' encoding='UTF-8'>>),
 ('data/c.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/c.csv' mode='rt' encoding='UTF-8'>>)]

In [35]:
# StreamWrapperからファイルの内容を取得できる
sw = list(datapipe)[0][1]
sw.read()

'a,b,c,d\n3,2,8,7\n7,7,5,0\n8,3,4,5\n0,1,8,4\n8,6,0,4\n7,0,6,2\n7,4,5,5\n9,6,0,2\n9,9,8,8\n'

In [36]:
# parse_csvメソッドでCSVをパース
datapipe_csv = datapipe.parse_csv(delimiter=',')
datapipe_csv

<torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe at 0x1230a4640>

In [37]:
list(datapipe_csv)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

In [38]:
# parse_csvはCSVParserをDataPipeに登録したものなので、CSVParserを使ってもできる
datapipe_with_csv_parser = dp.iter.CSVParser(datapipe, delimiter=',')

In [39]:
list(datapipe_with_csv_parser)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

## Working with DataLoader
DataPipeのDataLoaderへの渡し方。

In [40]:
# 1個のlabel、num_features個(defaultは20)のfeature、labelとfeatureは0 ~ 9の整数となるデータnum_rows行(defaultは5000行)を生成する関数。
# sample_data{file_label}.csvというファイルに保存される
import csv
import random

def generate_csv(file_label, num_rows: int = 5000, num_features: int = 20) -> None:
    fieldnames = ["label"] + [f"c{i}" for i in range(num_features)]
    writer = csv.DictWriter(
        open(f"sample_data{file_label}.csv", "w"), fieldnames=fieldnames
    )
    writer.writeheader()
    for i in range(num_rows):
        row_data = {col: random.random() for col in fieldnames}
        row_data["label"] = random.randint(0, 9)
        writer.writerow(row_data)

In [41]:
# sample_data*.csvというファイルをFileListerでリストし、FileOpenerで開き、parse_csvでCSVをparseし、
# mapにより、labelとdataに分割する関数。
# DataPipeを返す。

import numpy as np
import torchdata.datapipes as dp

def build_datapipes(root_dir="."):
    datapipe = dp.iter.FileLister(root_dir)
    datapipe = datapipe.filter(
        filter_fn=(
            lambda filename: "sample_data" in filename and filename.endswith(".csv")
        )
    )
    datapipe = dp.iter.FileOpener(datapipe, mode="rt")
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1)
    datapipe = datapipe.map(
        lambda row: {
            "label": np.array(row[0], np.int32),
            "data": np.array(row[1:], dtype=np.float64),
        }
    )
    return datapipe

In [44]:
# ファイルを生成
num_files_to_generate = 3
for i in range(num_files_to_generate):
    generate_csv(file_label=i, num_rows=10, num_features=3)

In [45]:
ls sample* # 3つのファイルが作成されている

sample_data0.csv  sample_data1.csv  sample_data2.csv


In [49]:
# 10行4列のデータが存在
pd.read_csv("sample_data0.csv").shape

(10, 4)

In [50]:
# labelとc0~c3のfeatureのデータ
pd.read_csv("sample_data0.csv").head()

Unnamed: 0,label,c0,c1,c2
0,8,0.707899,0.640586,0.926556
1,5,0.978543,0.194637,0.646268
2,2,0.028777,0.924437,0.190284
3,0,0.575103,0.59161,0.003323
4,0,0.929907,0.17142,0.194055


In [51]:
# datapipeを作成
datapipe = build_datapipes()



In [52]:
# DataLoaderにはdataset=datapipeで渡せば良い
from torch.utils.data import DataLoader

dl = DataLoader(dataset=datapipe, batch_size=50, shuffle=True)
dl

<torch.utils.data.dataloader.DataLoader at 0x1230a3910>

In [53]:
first = next(iter(dl))
first

{'label': tensor([8, 5, 2, 0, 0, 1, 2, 9, 6, 1, 4, 2, 5, 3, 7, 4, 1, 7, 9, 3, 4, 7, 4, 7,
         5, 1, 5, 8, 6, 0], dtype=torch.int32),
 'data': tensor([[0.7079, 0.6406, 0.9266],
         [0.9785, 0.1946, 0.6463],
         [0.0288, 0.9244, 0.1903],
         [0.5751, 0.5916, 0.0033],
         [0.9299, 0.1714, 0.1941],
         [0.0785, 0.3823, 0.1205],
         [0.6802, 0.3624, 0.4522],
         [0.9347, 0.0974, 0.5963],
         [0.9029, 0.9254, 0.6475],
         [0.1692, 0.6611, 0.8039],
         [0.2860, 0.9179, 0.6300],
         [0.9532, 0.9159, 0.3532],
         [0.1223, 0.5679, 0.4849],
         [0.6386, 0.4697, 0.5507],
         [0.3606, 0.0493, 0.8139],
         [0.7041, 0.6794, 0.8758],
         [0.7119, 0.7821, 0.8630],
         [0.6355, 0.1004, 0.5058],
         [0.1063, 0.2484, 0.1809],
         [0.0642, 0.0136, 0.3147],
         [0.6619, 0.8079, 0.5933],
         [0.2914, 0.1478, 0.6013],
         [0.1344, 0.0756, 0.5934],
         [0.9839, 0.7426, 0.3704],
         [0.00

In [54]:
labels, features = first["label"], first["data"]

In [55]:
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")

Labels batch shape: torch.Size([30])
Feature batch shape: torch.Size([30, 3])


## Implementing a Custom DataPipe
独自のDataPipeを作成する。

命名規則は"Operation"-eｒ + IterDataPipe or MapDataPipe。エイリアスではIterDataPipeとMapDataPipeは取り除く。

この例では、 `MapperIterDataPipe` を作る。

In [56]:
# IterDataPipeを継承して、MapperIterDataPipeを作成。
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe


@functional_datapipe("new_map")   # DataPipeにmapメソッドを登録
class MapperIterDataPipe(IterDataPipe):
    def __init__(self, source_dp: IterDataPipe, fn) -> None:
        super().__init__()
        self.dp = source_dp
        self.fn = fn   # 関数により変換を加える

    def __iter__(self):
        for d in self.dp:
            yield self.fn(d["data"])   # 変換を加えたあとのiteratorを作成

    def __len__(self):   # DataPipeの長さを返す
        return len(self.dp)

Exception: Unable to add DataPipe function name new_map as it is already taken

In [57]:
# MaapperIterDataPipe内で実行したい関数を定義
def decoder(x):
    return x*2

In [58]:
datapipe = build_datapipes()
list(datapipe)

[{'label': array(8, dtype=int32),
  'data': array([0.70789942, 0.64058597, 0.92655558])},
 {'label': array(5, dtype=int32),
  'data': array([0.978543  , 0.19463733, 0.64626799])},
 {'label': array(2, dtype=int32),
  'data': array([0.02877691, 0.92443743, 0.19028357])},
 {'label': array(0, dtype=int32),
  'data': array([0.57510324, 0.59160979, 0.00332339])},
 {'label': array(0, dtype=int32),
  'data': array([0.92990661, 0.17142011, 0.19405499])},
 {'label': array(1, dtype=int32),
  'data': array([0.07845751, 0.38231127, 0.12054644])},
 {'label': array(2, dtype=int32),
  'data': array([0.6801781 , 0.36237215, 0.45218438])},
 {'label': array(9, dtype=int32),
  'data': array([0.93472486, 0.09737738, 0.59626577])},
 {'label': array(6, dtype=int32),
  'data': array([0.90292777, 0.92543488, 0.64747137])},
 {'label': array(1, dtype=int32),
  'data': array([0.16919006, 0.66110592, 0.8039367 ])},
 {'label': array(4, dtype=int32),
  'data': array([0.28600261, 0.91787277, 0.62996581])},
 {'label':

In [59]:
list(datapipe.new_map(fn=decoder))

[array([1.41579885, 1.28117194, 1.85311117]),
 array([1.957086  , 0.38927466, 1.29253598]),
 array([0.05755382, 1.84887486, 0.38056714]),
 array([1.15020649, 1.18321958, 0.00664677]),
 array([1.85981322, 0.34284023, 0.38810997]),
 array([0.15691501, 0.76462253, 0.24109289]),
 array([1.3603562 , 0.72474431, 0.90436875]),
 array([1.86944973, 0.19475476, 1.19253155]),
 array([1.80585554, 1.85086977, 1.29494274]),
 array([0.33838012, 1.32221183, 1.60787341]),
 array([0.57200521, 1.83574555, 1.25993161]),
 array([1.90639935, 1.83173671, 0.70645664]),
 array([0.24466339, 1.13576874, 0.96970183]),
 array([1.27721084, 0.93933899, 1.10138735]),
 array([0.72115058, 0.09863202, 1.62778735]),
 array([1.40825237, 1.35884443, 1.75167997]),
 array([1.42385689, 1.56427786, 1.72594981]),
 array([1.27096698, 0.20072711, 1.01153497]),
 array([0.21266189, 0.49679598, 0.36177743]),
 array([0.1283542 , 0.02727019, 0.62936114]),
 array([1.32379681, 1.61582757, 1.18666437]),
 array([0.58281502, 0.29555189, 1.