<a href="https://colab.research.google.com/github/kaifkh20/d2l/blob/main/d2laich3_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install d2l
%matplotlib inline
import random
import torch
from d2l import torch as d2l


In [2]:
class SyntheticRegressionData(d2l.DataModule):
    """Synthetic data for linear regression."""
    def __init__(self, w, b, noise=0.01, num_train=1000 ,num_val=1000,
                 batch_size=32):
        super().__init__()
        self.save_hyperparameters()
        n = num_train + num_val
        self.X = torch.randn(n, len(w))
        noise = torch.randn(n, 1) * noise
        self.y = torch.matmul(self.X, w.reshape((-1, 1))) + b + noise

data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
# data.w,data.X, data.y


In [3]:
print('features:', data.X[0],'\nlabel:', data.y[0])


features: tensor([1.3110, 1.4650]) 
label: tensor([1.8469])


In [4]:
@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    if train:
        indices = list(range(0, self.num_train))
        # The examples are read in random order
        random.shuffle(indices)
    else:
      # examples are read in squential manner in batches
        indices = list(range(self.num_train, self.num_train+self.num_val))
    for i in range(0, len(indices), self.batch_size):
        batch_indices = torch.tensor(indices[i: i+self.batch_size])
        yield self.X[batch_indices], self.y[batch_indices]


In [5]:
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)
# X, y

X shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])


In [14]:
@d2l.add_to_class(d2l.DataModule)
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    tensors = tuple(a[indices] for a in tensors)
    dataset = torch.utils.data.TensorDataset(*tensors)
    return torch.utils.data.DataLoader(dataset, self.batch_size,
                                       shuffle=train)

@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    i = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, i)


In [15]:
#train_dataloader meaning trainingset data
# val meaning validateset data

X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)


X shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])


In [16]:
# ex 3.3.1

# ?torch.utils.data.DataLoader

# set the drop = True so it drops the incomplete batch
@d2l.add_to_class(d2l.DataModule)
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    tensors = tuple(a[indices] for a in tensors)
    dataset = torch.utils.data.TensorDataset(*tensors)
    return torch.utils.data.DataLoader(dataset, self.batch_size,
                                       shuffle=train,drop=True)

@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self, train):
    i = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, i)


In [17]:
class SyntheticRegressionData_onTheFly(d2l.HyperParameters):
    def __init__(self, w, b, noise=0.01, batch_size=8):
        self.save_hyperparameters()
        self.w = self.w.reshape((-1, 1))
    def get_dataloader(self):
        X = torch.randn(self.batch_size, len(self.w))
        noise_tmp = torch.randn(self.batch_size, 1) * self.noise
        y = torch.matmul(X, self.w) + self.b + noise_tmp
        return X, y

test = SyntheticRegressionData_onTheFly(w=torch.tensor([1., -2.]), b=3.)
print(test.get_dataloader()[0],'\n',test.get_dataloader()[1])

tensor([[-1.3147,  0.7387],
        [ 0.1440,  0.7255],
        [ 0.2668,  1.4983],
        [-0.7530, -1.5150],
        [-0.1774, -0.8184],
        [-2.2493, -2.8079],
        [ 0.3686,  0.2213],
        [ 1.6447, -1.2799]]) 
 tensor([[ 2.2353],
        [ 4.9785],
        [ 2.0221],
        [ 3.2174],
        [ 1.0880],
        [ 2.9492],
        [ 3.0800],
        [-0.1181]])


In [None]:
# this would generate same because there is only 8 examples and batch size is
# 8 when we call next iter the same batch is repeated

data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, num_train=1, num_val=8, batch_size = 8)
X, y = next(iter(data.val_dataloader()))
print(X)
print(y)
X_, y_ = next(iter(data.val_dataloader()))
print(X_)
print(y_)