# PyTorch Dataloaders
Dataloaders let you load data into memory one chunk at a time. By implementing the dataset's `__getitem__` method, you can load only the set of files needed for a single batch of model training/evaluation/validation.

In [None]:
import os
from torch.utils.data import Dataset
import pandas as pd
from torchvision.io import read_image

class CustomImageDataset(Dataset):
    """Placeholder map-style dataset used to demonstrate ``DataLoader`` batching.

    The dataset reports a fixed length of 10 and every lookup returns the same
    dummy ``({}, 0)`` pair; no file is read. ``img_dir``, ``transform`` and
    ``target_transform`` are stored on the instance but are not consulted by
    ``__getitem__`` in its current form.
    """

    def __init__(self, img_dir, transform=None, target_transform=None):
        """Store the constructor arguments next to a fixed list of three labels."""
        # Hard-coded labels; nothing in the active code indexes into this list.
        self.img_labels = ['JP', 'T', 'ISIS']
        # Kept for a future implementation that actually reads from disk.
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the hard-coded number of samples (10)."""
        # NOTE(review): this is independent of len(self.img_labels) (3) --
        # confirm that is intended before indexing img_labels by idx.
        sample_count = 10
        return sample_count

    def __getitem__(self, idx):
        """Print ``idx`` and return the dummy ``({}, 0)`` sample."""
        # Echo the requested index so the access order is visible when
        # iterating through a DataLoader.
        print(idx)
        # Path of the large CSV this demo refers to. It is assigned but never
        # read: a disabled sketch loaded a per-label CSV from img_dir with
        # pandas and applied transform / target_transform to the result.
        csv_path = "./data/customers-2000000.csv"
        features, label = {}, 0
        return features, label

In [None]:
# Instantiate the demo dataset. img_dir is stored on the instance, but the
# current __getitem__ returns a dummy sample and does not read from it.
data = CustomImageDataset(img_dir="./data")

In [None]:
from torch.utils.data import DataLoader

# Wrap the dataset so samples are served five at a time, in index order
# (shuffling is disabled).
train_dataloader = DataLoader(data, batch_size=5, shuffle=False)

In [None]:
# Take the first batch from a fresh iterator and unpack it into its two parts.
train_features, train_labels = next(iter(train_dataloader))

In [None]:
# Keep a single iterator so that successive next(it) calls advance through
# the batches instead of restarting from the first one.
it = iter(train_dataloader)

In [None]:
# Fetch the next batch from the shared iterator.
next(it)

By using a dataloader, you can repeatedly read multiple large files (in this case, 300 MB CSVs) without needing to load them all into memory at once.

The caveat is that you have to divide your data into chunks beforehand. If your data is already partitioned, that's even better.

In [None]:
# Walk through every batch once. The loop body does nothing, so the only
# visible effect is the index printed by the dataset's __getitem__ for each
# sample that gets fetched.
for t, l in train_dataloader:
    pass

In [None]:
# Advance the shared iterator `it` by one more batch; the for-loop over
# train_dataloader uses its own iterator and does not move this one.
next(it)

In [None]:
# NOTE(review): with 10 samples and batch_size=5 the iterator holds two
# batches. If the cells were run top to bottom, both are already consumed,
# so this call is expected to raise StopIteration -- confirm that is intended.
next(it)

## Working with Parquet files
The `Dataset`/`DataLoader` pattern works very well with Parquet files generated by Spark.
See [this post](https://stackoverflow.com/questions/68199072/pytorch-dataloader-for-reading-a-large-parquet-csv-file)

The example below uses Dask, but the idea is the same.
```python
# Define the Dataset class
class UsersDataset(Dataset):
    """Map-style dataset that serves one partition of a Dask DataFrame per index.

    Args:
        dask_df: Object exposing ``get_partition(idx)`` whose result has a
            ``compute()`` method returning the partition's rows (a Dask
            DataFrame in the original example).
        labels: Sequence indexed by partition number.
            NOTE(review): the code uses one label per partition, not one per
            row -- confirm this matches how the labels were built.
    """

    def __init__(self, dask_df, labels):
        self.dask_df = dask_df
        self.labels = labels

    def __len__(self):
        """Return the number of labels, which doubles as the partition count."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for partition ``idx``.

        Only the requested partition is materialised, so the full dataset
        never has to fit in memory at once.
        """
        X_df = self.dask_df.get_partition(idx).compute()
        # np.vstack replaces the deprecated np.row_stack alias; the result is
        # the same 2-D array of the partition's rows.
        X = np.vstack([X_df])
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return X_tensor, y_tensor
```