In [23]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import math
import os
from pathlib import Path

## Iris

In [24]:
# Load the Iris CSV into a DataFrame and print its column / dtype summary.
# NOTE(review): this path spells the file 'iris.csv' while IrisDataset defaults to
# 'Iris.csv'; on a case-sensitive filesystem only one spelling can match — confirm
# the real file name.
data = pd.read_csv('datasets/iris.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [25]:
# Preview the first three rows of the table.
data.head(3)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa


In [26]:
# List the distinct values of the Species column (the class names).
data.Species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

## Dataset

In [27]:
class IrisDataset(Dataset):
    """Map-style dataset over the Iris CSV: flower measurements plus an integer class label.

    The whole file is read eagerly in ``__init__`` and kept in memory as two tensors.

    Args:
        root: Directory that contains the CSV file. The default, ``<cwd>/datasets``,
            is computed once, when the class is defined.
        data_file: Name of the CSV file inside ``root``.
        dtype: torch dtype of the feature tensor. Defaults to ``torch.float64``;
            pass ``torch.float32`` to feed default-precision ``torch.nn`` layers.

    Attributes:
        x: Feature tensor, one row per sample, built from the columns
            ``SepalLengthCm`` through ``PetalWidthCm``.
        y: ``torch.int64`` tensor of class indices, one per sample.
        class_to_idx: Mapping from species name to class index, numbered in order
            of first appearance in the file.
        n_samples: Number of rows in the CSV.
    """

    def __init__(self, root=os.path.join(Path(os.getcwd()), 'datasets'), data_file='Iris.csv',
                 dtype=torch.float64):
        # data loading
        xy = pd.read_csv(os.path.join(root, data_file))

        # Named class_to_idx rather than `map` so the builtin is not shadowed.
        self.class_to_idx = {specie: i for i, specie in enumerate(xy['Species'].unique())}

        # copy=True hands torch a writable buffer that the tensor owns outright.
        features = xy.loc[:, 'SepalLengthCm':'PetalWidthCm'].to_numpy(dtype='float64', copy=True)
        # Series.map does the label lookup directly; int64 is explicit because a
        # plain 'int' is 32-bit on some platforms and losses expect long targets.
        labels = xy['Species'].map(self.class_to_idx).to_numpy(dtype='int64', copy=True)

        self.x = torch.from_numpy(features).to(dtype)
        self.y = torch.from_numpy(labels)
        self.n_samples = len(xy)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples

In [28]:
# Build the dataset with its default root directory and file name.
dataset = IrisDataset()

In [29]:
# Indexing goes through IrisDataset.__getitem__, which returns a (features, label) pair.
sample, label = dataset[4]

In [30]:
# Show the feature tensor of the sample fetched above.
sample

tensor([5.0000, 3.6000, 1.4000, 0.2000], dtype=torch.float64)

In [31]:
# Show the integer class label of the sample fetched above.
label

tensor(0)

## Dataloader

In [48]:
batch_size = 30
# Shuffling draws from this dedicated, seeded generator, so re-running the cell
# (or the whole notebook on a fresh kernel) yields the same batch order.
shuffle_generator = torch.Generator().manual_seed(0)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True,
                        generator=shuffle_generator)

In [49]:
# Create an iterator so batches can be pulled one at a time with next().
dataiter = iter(dataloader)

In [50]:
# Pull the next batch from the iterator. It is bound to `batch` rather than `data`
# so the DataFrame loaded at the top of the notebook is not overwritten.
batch = next(dataiter)
batch

[tensor([[6.5000, 2.8000, 4.6000, 1.5000],
         [5.5000, 2.6000, 4.4000, 1.2000],
         [5.2000, 3.5000, 1.5000, 0.2000],
         [6.8000, 2.8000, 4.8000, 1.4000],
         [4.8000, 3.4000, 1.6000, 0.2000],
         [6.3000, 2.9000, 5.6000, 1.8000],
         [5.6000, 3.0000, 4.5000, 1.5000],
         [5.0000, 3.6000, 1.4000, 0.2000],
         [5.7000, 3.0000, 4.2000, 1.2000],
         [4.9000, 3.1000, 1.5000, 0.1000],
         [4.6000, 3.6000, 1.0000, 0.2000],
         [6.9000, 3.1000, 5.1000, 2.3000],
         [6.5000, 3.0000, 5.5000, 1.8000],
         [7.0000, 3.2000, 4.7000, 1.4000],
         [5.7000, 2.6000, 3.5000, 1.0000],
         [5.4000, 3.0000, 4.5000, 1.5000],
         [5.8000, 2.8000, 5.1000, 2.4000],
         [4.6000, 3.1000, 1.5000, 0.2000],
         [5.5000, 2.5000, 4.0000, 1.3000],
         [4.7000, 3.2000, 1.6000, 0.2000],
         [5.0000, 2.0000, 3.5000, 1.0000],
         [6.7000, 3.1000, 4.7000, 1.5000],
         [7.9000, 3.8000, 6.4000, 2.0000],
         [4

In [51]:
# Pull the following batch from the same iterator. It is bound to `batch` rather
# than `data` so the DataFrame loaded at the top of the notebook is not overwritten.
batch = next(dataiter)
batch

[tensor([[6.7000, 3.1000, 4.4000, 1.4000],
         [5.2000, 4.1000, 1.5000, 0.1000],
         [5.8000, 4.0000, 1.2000, 0.2000],
         [5.4000, 3.4000, 1.5000, 0.4000],
         [5.5000, 3.5000, 1.3000, 0.2000],
         [5.1000, 3.4000, 1.5000, 0.2000],
         [5.1000, 3.8000, 1.9000, 0.4000],
         [4.4000, 2.9000, 1.4000, 0.2000],
         [6.4000, 3.1000, 5.5000, 1.8000],
         [5.8000, 2.7000, 5.1000, 1.9000],
         [5.0000, 3.5000, 1.3000, 0.3000],
         [6.4000, 3.2000, 5.3000, 2.3000],
         [6.5000, 3.0000, 5.8000, 2.2000],
         [6.6000, 3.0000, 4.4000, 1.4000],
         [6.1000, 3.0000, 4.6000, 1.4000],
         [6.3000, 3.3000, 6.0000, 2.5000],
         [6.7000, 3.0000, 5.0000, 1.7000],
         [5.0000, 3.4000, 1.5000, 0.2000],
         [4.5000, 2.3000, 1.3000, 0.3000],
         [5.6000, 2.8000, 4.9000, 2.0000],
         [5.8000, 2.6000, 4.0000, 1.2000],
         [5.7000, 2.9000, 4.2000, 1.3000],
         [5.1000, 3.7000, 1.5000, 0.4000],
         [5

## Training

In [56]:
max_epochs = 3
total_samples = len(dataset)
# Ask the DataLoader for its batch count instead of re-deriving it by hand. With
# the default drop_last=False this equals ceil(total_samples / batch_size), and it
# stays correct if the loader is later rebuilt with different options.
n_iterations = len(dataloader)
print(max_epochs, total_samples, n_iterations)

3 150 5


In [57]:
for epoch in range(max_epochs):
    # One full pass over the DataLoader per epoch.
    for i, batch in enumerate(dataloader):
        inputs, labels = batch
        # forward backward update would go here

        print(f'epoch {epoch+1}/{max_epochs}, step {i+1}/{n_iterations}, inputs {inputs.shape}')

epoch 1/3, step 1/5, inputs torch.Size([30, 4])
epoch 1/3, step 2/5, inputs torch.Size([30, 4])
epoch 1/3, step 3/5, inputs torch.Size([30, 4])
epoch 1/3, step 4/5, inputs torch.Size([30, 4])
epoch 1/3, step 5/5, inputs torch.Size([30, 4])
epoch 2/3, step 1/5, inputs torch.Size([30, 4])
epoch 2/3, step 2/5, inputs torch.Size([30, 4])
epoch 2/3, step 3/5, inputs torch.Size([30, 4])
epoch 2/3, step 4/5, inputs torch.Size([30, 4])
epoch 2/3, step 5/5, inputs torch.Size([30, 4])
epoch 3/3, step 1/5, inputs torch.Size([30, 4])
epoch 3/3, step 2/5, inputs torch.Size([30, 4])
epoch 3/3, step 3/5, inputs torch.Size([30, 4])
epoch 3/3, step 4/5, inputs torch.Size([30, 4])
epoch 3/3, step 5/5, inputs torch.Size([30, 4])


O número de iterações do dataloader é igual ao número total de amostras dividido pelo tamanho do batch, arredondado para cima. Se o dataset tem tamanho 100 e o batch tamanho 20, o dataloader gera 5 batches que podem ser iterados e encerra. Repetimos essas 5 iterações por n épocas.