# 概要
[PyTorch Tutorial](https://pytorch.org/data/main/tutorial.html) を元にPyTorch DataのDataPipeの使い方を解説。

In [1]:
import pandas as pd
import torchdata.datapipes as dp
from torchdata.datapipes.iter import IterDataPipe

## Using DataPipes
DataPipeをつなげて使う方法の基本

In [2]:
ls data/

a.csv  b.csv  c.csv


In [3]:
# Preview the first rows of one of the sample CSV files
pd.read_csv('data/a.csv').head()

Unnamed: 0,a,b,c,d
0,3,2,8,7
1,7,7,5,0
2,8,3,4,5
3,0,1,8,4
4,8,6,0,4


In [4]:
# Use FileLister to list the files under FOLDER, then keep only the paths ending in ".csv"
FOLDER = 'data'
datapipe = dp.iter.FileLister([FOLDER]).filter(filter_fn=(lambda filename: filename.endswith('.csv')))
datapipe



<torch.utils.data.datapipes.iter.selecting.FilterIterDataPipe at 0x12d4ef4c0>

In [5]:
list(datapipe)

['data/a.csv', 'data/b.csv', 'data/c.csv']

In [6]:
# Open the listed files with FileOpener
# The DataPipe obtained from FileLister above is passed in as the source argument
datapipe = dp.iter.FileOpener(datapipe, mode='rt')
datapipe

<torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe at 0x12d4ef760>

In [7]:
# Iterating yields (file name, StreamWrapper) tuples; each StreamWrapper wraps the opened text stream
list(datapipe)

[('data/a.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/a.csv' mode='rt' encoding='UTF-8'>>),
 ('data/b.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/b.csv' mode='rt' encoding='UTF-8'>>),
 ('data/c.csv',
  StreamWrapper<<_io.TextIOWrapper name='data/c.csv' mode='rt' encoding='UTF-8'>>)]

In [8]:
# The file contents can be read from the StreamWrapper
sw = list(datapipe)[0][1]  # second element of the first (file name, stream) tuple
sw.read()

'a,b,c,d\n3,2,8,7\n7,7,5,0\n8,3,4,5\n0,1,8,4\n8,6,0,4\n7,0,6,2\n7,4,5,5\n9,6,0,2\n9,9,8,8\n'

In [9]:
# Parse the opened files as CSV with the parse_csv method
# (as the output below shows, rows come out as lists of strings, header rows included)
datapipe_csv = datapipe.parse_csv(delimiter=',')
datapipe_csv

<torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe at 0x12d50ebb0>

In [10]:
list(datapipe_csv)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

In [11]:
# parse_csv is CSVParser registered as a DataPipe method, so calling CSVParser directly does the same thing
datapipe_with_csv_parser = dp.iter.CSVParser(datapipe, delimiter=',')

In [12]:
list(datapipe_with_csv_parser)

[['a', 'b', 'c', 'd'],
 ['3', '2', '8', '7'],
 ['7', '7', '5', '0'],
 ['8', '3', '4', '5'],
 ['0', '1', '8', '4'],
 ['8', '6', '0', '4'],
 ['7', '0', '6', '2'],
 ['7', '4', '5', '5'],
 ['9', '6', '0', '2'],
 ['9', '9', '8', '8'],
 ['a', 'b', 'c', 'd'],
 ['7', '4', '7', '4'],
 ['5', '8', '5', '6'],
 ['6', '8', '4', '5'],
 ['6', '7', '7', '7'],
 ['9', '3', '2', '1'],
 ['1', '9', '3', '5'],
 ['2', '1', '7', '2'],
 ['4', '4', '2', '6'],
 ['8', '2', '1', '6'],
 ['a', 'b', 'c', 'd'],
 ['1', '1', '2', '8'],
 ['9', '9', '9', '3'],
 ['4', '4', '1', '3'],
 ['8', '5', '5', '8'],
 ['9', '6', '3', '5'],
 ['9', '3', '9', '8'],
 ['7', '6', '7', '6'],
 ['7', '2', '5', '1'],
 ['9', '3', '7', '3']]

## Working with DataLoader
DataPipeのDataLoaderへの渡し方。

In [13]:
# Generates num_rows rows (default 5000), each with one integer label in 0-9 and num_features features (default 20) that are random floats in [0, 1).
# The rows are saved to a file named sample_data{file_label}.csv
import csv
import random

def generate_csv(file_label, num_rows: int = 5000, num_features: int = 20) -> None:
    """Write a randomly generated dataset to ``sample_data{file_label}.csv``.

    The file is created in the current working directory and starts with a
    header row ``label, c0, c1, ...``.

    Args:
        file_label: Value interpolated into the output file name.
        num_rows: Number of data rows to write (the header is not counted).
        num_features: Number of feature columns ``c0`` .. ``c{num_features-1}``.

    Each row holds an integer ``label`` drawn with ``random.randint(0, 9)``
    and feature values drawn with ``random.random()``.
    """
    fieldnames = ["label"] + [f"c{i}" for i in range(num_features)]
    # The with-block closes (and therefore flushes) the file deterministically;
    # newline="" is what the csv module asks for when handing a file to a writer.
    with open(f"sample_data{file_label}.csv", "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for _ in range(num_rows):
            # A random float is drawn for every column first (label included) and the
            # label is then overwritten, which keeps the sequence of random draws stable.
            row_data = {col: random.random() for col in fieldnames}
            row_data["label"] = random.randint(0, 9)
            writer.writerow(row_data)

In [14]:
# sample_data*.csvというファイルをFileListerでリストし、FileOpenerで開き、parse_csvでCSVをparseし、
# mapにより、labelとdataに分割する関数。
# DataPipeを返す。

import numpy as np
import torchdata.datapipes as dp

def _is_sample_csv(filename):
    """Return True for paths that contain "sample_data" and end with ".csv"."""
    return "sample_data" in filename and filename.endswith(".csv")


def _row_to_sample(row):
    """Convert one parsed CSV row into a dict with "label" and "data" NumPy arrays."""
    return {
        "label": np.array(row[0], np.int32),
        "data": np.array(row[1:], dtype=np.float64),
    }


def build_datapipes(root_dir="."):
    """Build a DataPipe that yields samples from the sample_data*.csv files.

    Args:
        root_dir: Directory passed to FileLister to look for the CSV files.

    Returns:
        A DataPipe whose elements are dicts with:
        "label" - the first column of the row as an np.int32 array
        "data"  - the remaining columns of the row as an np.float64 array
    """
    datapipe = dp.iter.FileLister(root_dir)
    # Named module-level functions are used instead of lambdas because lambdas
    # cannot be pickled, which DataPipes need when they are sent to worker processes.
    datapipe = datapipe.filter(filter_fn=_is_sample_csv)
    datapipe = dp.iter.FileOpener(datapipe, mode="rt")
    # skip_lines=1 skips the header line of each file
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1)
    datapipe = datapipe.map(_row_to_sample)
    return datapipe

In [15]:
# Generate the sample files (small ones: 10 rows and 3 features each)
num_files_to_generate = 3
for i in range(num_files_to_generate):
    generate_csv(file_label=i, num_rows=10, num_features=3)

In [16]:
ls sample* # 3つのファイルが作成されている

sample_data0.csv  sample_data1.csv  sample_data2.csv


In [17]:
# The file holds 10 rows and 4 columns
pd.read_csv("sample_data0.csv").shape

(10, 4)

In [18]:
# The columns are label plus the features c0-c2
pd.read_csv("sample_data0.csv").head()

Unnamed: 0,label,c0,c1,c2
0,2,0.411509,0.089731,0.151847
1,1,0.534293,0.877951,0.164484
2,5,0.772742,0.544814,0.025273
3,2,0.289256,0.841444,0.164631
4,1,0.287817,0.998701,0.249298


In [19]:
# Build the datapipe
datapipe = build_datapipes()



In [20]:
# A DataPipe is handed to DataLoader simply by passing it as dataset=datapipe
from torch.utils.data import DataLoader

# NOTE(review): the batch displayed below is in the same order as the rows in the
# files, so shuffle=True appears to have no effect here; presumably the pipeline
# needs a .shuffle() DataPipe for it to apply - confirm against the DataLoader docs.
dl = DataLoader(dataset=datapipe, batch_size=50, shuffle=True)
dl

<torch.utils.data.dataloader.DataLoader at 0x12d524d60>

In [21]:
first = next(iter(dl))
first

{'label': tensor([2, 1, 5, 2, 1, 0, 3, 4, 9, 2, 6, 4, 6, 8, 1, 4, 5, 8, 3, 0, 9, 5, 7, 3,
         9, 8, 2, 7, 5, 7], dtype=torch.int32),
 'data': tensor([[0.4115, 0.0897, 0.1518],
         [0.5343, 0.8780, 0.1645],
         [0.7727, 0.5448, 0.0253],
         [0.2893, 0.8414, 0.1646],
         [0.2878, 0.9987, 0.2493],
         [0.8975, 0.2730, 0.2066],
         [0.2429, 0.1378, 0.5813],
         [0.3795, 0.9935, 0.3914],
         [0.7206, 0.1795, 0.3394],
         [0.4527, 0.1677, 0.7876],
         [0.4920, 0.1908, 0.6641],
         [0.6943, 0.6721, 0.0710],
         [0.9745, 0.6595, 0.6428],
         [0.4184, 0.6974, 0.3108],
         [0.4663, 0.2757, 0.9421],
         [0.4060, 0.4092, 0.4178],
         [0.2787, 0.9335, 0.9715],
         [0.3401, 0.6360, 0.7720],
         [0.6377, 0.2902, 0.2166],
         [0.4633, 0.1607, 0.6943],
         [0.6654, 0.2164, 0.6375],
         [0.1918, 0.0659, 0.5253],
         [0.5968, 0.0169, 0.0211],
         [0.6368, 0.4482, 0.7481],
         [0.09

In [22]:
labels, features = first["label"], first["data"]

In [23]:
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")

Labels batch shape: torch.Size([30])
Feature batch shape: torch.Size([30, 3])


## Implementing a Custom DataPipe
独自のDataPipeを作成する。

命名規則は "Operation"-er + IterDataPipe または MapDataPipe。エイリアスでは IterDataPipe と MapDataPipe は取り除く。

この例では、 `MapperIterDataPipe` を作る。

In [24]:
# IterDataPipeを継承して、MapperIterDataPipeを作成。
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe


@functional_datapipe("new_map")   # makes this class callable as the `new_map` method on a DataPipe
class MapperIterDataPipe(IterDataPipe):
    """DataPipe that applies ``fn`` to the "data" entry of every source element."""

    def __init__(self, source_dp: IterDataPipe, fn) -> None:
        super().__init__()
        # source_dp: upstream DataPipe to read from; fn: transformation to apply
        self.dp, self.fn = source_dp, fn

    def __iter__(self):
        # Only the value stored under "data" is transformed and yielded
        yield from (self.fn(sample["data"]) for sample in self.dp)

    def __len__(self):
        # The length is delegated to the source DataPipe
        return len(self.dp)

In [25]:
# Function to be executed inside MapperIterDataPipe
def decoder(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [26]:
datapipe = build_datapipes()
list(datapipe)

[{'label': array(2, dtype=int32),
  'data': array([0.41150948, 0.08973145, 0.15184653])},
 {'label': array(1, dtype=int32),
  'data': array([0.53429336, 0.87795149, 0.16448412])},
 {'label': array(5, dtype=int32),
  'data': array([0.77274194, 0.54481436, 0.02527348])},
 {'label': array(2, dtype=int32),
  'data': array([0.2892562 , 0.84144434, 0.16463116])},
 {'label': array(1, dtype=int32),
  'data': array([0.28781748, 0.9987013 , 0.24929841])},
 {'label': array(0, dtype=int32),
  'data': array([0.8974815 , 0.27295833, 0.20664081])},
 {'label': array(3, dtype=int32),
  'data': array([0.24289124, 0.13775199, 0.58130367])},
 {'label': array(4, dtype=int32),
  'data': array([0.37947377, 0.99350872, 0.391429  ])},
 {'label': array(9, dtype=int32),
  'data': array([0.72064325, 0.17954854, 0.33941531])},
 {'label': array(2, dtype=int32),
  'data': array([0.45265091, 0.16765636, 0.78756425])},
 {'label': array(6, dtype=int32),
  'data': array([0.49195699, 0.19082781, 0.66414669])},
 {'label':

In [27]:
list(datapipe.new_map(fn=decoder))

[array([0.82301896, 0.1794629 , 0.30369307]),
 array([1.06858672, 1.75590297, 0.32896825]),
 array([1.54548389, 1.08962872, 0.05054696]),
 array([0.5785124 , 1.68288869, 0.32926233]),
 array([0.57563495, 1.9974026 , 0.49859681]),
 array([1.794963  , 0.54591666, 0.41328163]),
 array([0.48578248, 0.27550397, 1.16260735]),
 array([0.75894754, 1.98701744, 0.782858  ]),
 array([1.4412865 , 0.35909708, 0.67883062]),
 array([0.90530181, 0.33531271, 1.5751285 ]),
 array([0.98391398, 0.38165563, 1.32829339]),
 array([1.38864058, 1.34418865, 0.14190761]),
 array([1.94890809, 1.31906146, 1.28564795]),
 array([0.83683492, 1.39473268, 0.62160024]),
 array([0.9326525 , 0.55146405, 1.88425235]),
 array([0.81196392, 0.81833195, 0.83553118]),
 array([0.55732696, 1.86692046, 1.94295986]),
 array([0.6802617 , 1.27198617, 1.54393995]),
 array([1.27537602, 0.58040838, 0.43326925]),
 array([0.92663378, 0.32131246, 1.38858627]),
 array([1.33087933, 0.43274836, 1.27495289]),
 array([0.38360256, 0.13185853, 1.