In [1]:
from sklearn.datasets import make_classification
import torch

In [13]:
# Tiny, reproducible toy problem: 10 samples, 2 informative features, 2 classes.
X, y = make_classification(
  n_samples=10,
  n_features=2,
  n_informative=2,
  n_redundant=0,
  n_classes=2,
  random_state=42,
)
X, y

(array([[ 1.06833894, -0.97007347],
        [-1.14021544, -0.83879234],
        [-2.8953973 ,  1.97686236],
        [-0.72063436, -0.96059253],
        [-1.96287438, -0.99225135],
        [-0.9382051 , -0.54304815],
        [ 1.72725924, -1.18582677],
        [ 1.77736657,  1.51157598],
        [ 1.89969252,  0.83444483],
        [-0.58723065, -1.97171753]]),
 array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0]))

In [None]:
import matplotlib.pyplot as plt

# Explicit Axes interface; the title and axis labels let the figure stand alone
# when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)  # one point per sample, colored by class label
ax.set(
  title="Synthetic classification data (color = class)",
  xlabel="Feature 0",
  ylabel="Feature 1",
)
plt.show()

In [15]:
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell stays
# safe to re-run even though it rebinds X and y (torch.from_numpy raises a
# TypeError on a second run because its input is already a tensor).
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [17]:
# Inspect X and y after their conversion to float32 tensors.
X, y

(tensor([[ 1.0683, -0.9701],
         [-1.1402, -0.8388],
         [-2.8954,  1.9769],
         [-0.7206, -0.9606],
         [-1.9629, -0.9923],
         [-0.9382, -0.5430],
         [ 1.7273, -1.1858],
         [ 1.7774,  1.5116],
         [ 1.8997,  0.8344],
         [-0.5872, -1.9717]]),
 tensor([1., 0., 0., 0., 0., 1., 1., 1., 1., 0.]))

In [18]:
from torch.utils.data import Dataset, DataLoader

In [37]:
class CustomDatasetClass(Dataset):
  """Map-style dataset pairing each row of `features` with its label.

  Args:
    features: indexable container exposing `.shape` (e.g. a 2-D tensor);
      its first dimension is the number of samples.
    labels: indexable, sized container holding one entry per sample.

  Raises:
    ValueError: if `features` and `labels` hold a different number of samples.
  """

  def __init__(self, features, labels):
    # Fail fast on misaligned inputs; otherwise the mismatch only shows up
    # later as an IndexError (too few labels) or as labels that are silently
    # never used (too many labels).
    n_rows, n_labels = features.shape[0], len(labels)
    if n_rows != n_labels:
      raise ValueError(
        "features and labels must have the same number of samples, "
        f"got {n_rows} and {n_labels}"
      )
    self.features = features
    self.labels = labels

  def __len__(self):
    """Return the number of samples (size of the first feature dimension)."""
    return self.features.shape[0]

  def __getitem__(self, index):
    """Return the `(features, label)` pair stored at `index`."""
    # transformation to data can be applied here,
    # image aug, text preprocessing etc
    return self.features[index], self.labels[index]

In [38]:
# Wrap the feature/label tensors so they can be indexed sample by sample.
dataset = CustomDatasetClass(features=X, labels=y)

In [39]:
# len() calls CustomDatasetClass.__len__, which returns the number of feature rows.
len(dataset)

10

In [40]:
# dataset[5] is shorthand for dataset.__getitem__(5); both return the same
# (features, label) pair.
dataset.__getitem__(5), dataset[5]

((tensor([-0.9382, -0.5430]), tensor(1.)),
 (tensor([-0.9382, -0.5430]), tensor(1.)))

In [41]:
# Give the shuffling its own seeded generator so a fresh Restart & Run All draws
# the batches in the same order; without it the order depends on the global
# RNG state and differs between runs.
dataloader = DataLoader(
  dataset,
  batch_size=2,
  shuffle=True,
  generator=torch.Generator().manual_seed(42),
)

In PyTorch, the sampler used by a `DataLoader` determines the strategy for selecting samples from the dataset during data loading: it controls the order in which dataset indices are drawn to form each batch.

Types:

PyTorch provides several predefined samplers, and you can also create custom ones:

1. Sequential sampler (`SequentialSampler`)
   - Samples elements sequentially, in the order they appear in the dataset.
   - Default when `shuffle=False`.
2. Random sampler (`RandomSampler`)
   - Samples elements randomly, without replacement.
   - Default when `shuffle=True`.

In [42]:
# Walk through one epoch and show every mini-batch, one item per line,
# followed by a dashed separator.
for batch_features, batch_labels in dataloader:
  print(batch_features, batch_labels, "-"*30, sep="\n")

tensor([[-2.8954,  1.9769],
        [-0.9382, -0.5430]])
tensor([0., 1.])
------------------------------
tensor([[-1.9629, -0.9923],
        [-1.1402, -0.8388]])
tensor([0., 0.])
------------------------------
tensor([[-0.7206, -0.9606],
        [ 1.7273, -1.1858]])
tensor([0., 1.])
------------------------------
tensor([[ 1.7774,  1.5116],
        [ 1.0683, -0.9701]])
tensor([1., 1.])
------------------------------
tensor([[ 1.8997,  0.8344],
        [-0.5872, -1.9717]])
tensor([1., 0.])
------------------------------


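### `num_workers` brings parallelism to forming batches
`num_workers` sets how many worker subprocesses the `DataLoader` uses to load samples in parallel; the default, `0`, loads the data in the main process.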



### `collate_fn`
The `collate_fn` argument of PyTorch's `DataLoader` is a function that specifies how to combine a list of samples from a dataset into a single batch. By default, the `DataLoader` uses a simple batch collation mechanism, but `collate_fn` allows you to customize how the data is processed and batched.

Use case:
Padding variable-length text sequences to a common length within each batch so they can be fed to sequence models.
