<a href="https://colab.research.google.com/github/lima-breno/deep_learning_frameworks/blob/main/frameworks_pytorch_DATASET_LOADERS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FRAMEWORKS DE DEEP LEARNING**
# **Prática - PyTorch**


**Descrição**: Este notebook apresenta uma introdução ao framework PyTorch.

## **Datasets e DataLoaders**

### Dataset
Um **`Dataset`** no PyTorch é uma **coleção de dados**. Ele define **como acessar** cada amostra individualmente.

Exemplo: Um conjunto de imagens com seus rótulos, onde cada item acessado é um par `(imagem, label)`.

---

### DataLoader
O **`DataLoader`** é um **iterador** que carrega os dados do `Dataset` em **mini-batches**, de forma eficiente.

Ele cuida de:
- Embaralhar os dados (`shuffle=True`)
- Carregar em lotes (`batch_size=32`, por exemplo)
- Fazer leitura paralela (`num_workers`)

---

### Analogia
- `Dataset`: é a **biblioteca com os livros**.
- `DataLoader`: é o **bibliotecário** que embaralha a ordem dos livros e os traz para você em pilhas (batches), uma pilha de cada vez.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

In [None]:
# URL (raw GitHub content) of the fetal_health.csv file that is read with pandas below.
file_path = "https://raw.githubusercontent.com/renansantosmendes/lectures-cdas-2023/master/fetal_health.csv"

In [None]:
# Read the CSV located at `file_path` into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


In [None]:
# First, separate the input data from the output data: drop the last column,
# 'fetal_health', which holds the results (targets).
# The result is not assigned, so `df` itself is left unchanged; this cell only
# displays the frame without the target column.
df.drop(columns=['fetal_health'])

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,40.0,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,66.0,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,67.0,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,66.0,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0


In [None]:
# Build a tensor from the data without the final result column --> used as the inputs.
# `.values` exposes the frame as an array so that it can be converted to a tensor.
torch.tensor(
    df.drop(columns=['fetal_health']).values,
    dtype=torch.float32,
)

tensor([[1.2000e+02, 0.0000e+00, 0.0000e+00,  ..., 1.2100e+02, 7.3000e+01,
         1.0000e+00],
        [1.3200e+02, 6.0000e-03, 0.0000e+00,  ..., 1.4000e+02, 1.2000e+01,
         0.0000e+00],
        [1.3300e+02, 3.0000e-03, 0.0000e+00,  ..., 1.3800e+02, 1.3000e+01,
         0.0000e+00],
        ...,
        [1.4000e+02, 1.0000e-03, 0.0000e+00,  ..., 1.5200e+02, 4.0000e+00,
         1.0000e+00],
        [1.4000e+02, 1.0000e-03, 0.0000e+00,  ..., 1.5100e+02, 4.0000e+00,
         1.0000e+00],
        [1.4200e+02, 2.0000e-03, 2.0000e-03,  ..., 1.4500e+02, 1.0000e+00,
         0.0000e+00]])

In [None]:
# Select only the final column ('fetal_health') and its values --> used as the targets.
torch.tensor(
    df['fetal_health'].values,
    dtype=torch.float32,
)

tensor([2., 1., 1.,  ..., 2., 2., 1.])

In [None]:
# The tensor built directly from the 'fetal_health' column is 1-D: a single row
# holding every label (shape (N,)). Add an extra dimension at position 1 so that
# each sample gets its own row and there is exactly one column (shape (N, 1)).
torch.tensor(
    data=df['fetal_health'].values,
    dtype=torch.float32,
).unsqueeze(dim=1)

tensor([[2.],
        [1.],
        [1.],
        ...,
        [2.],
        [2.],
        [1.]])

In [None]:
# Check the shape of the target tensor after adding the extra dimension.
(
    torch
    .tensor(df['fetal_health'].values, dtype=torch.float32)
    .unsqueeze(1)
    .shape
)

torch.Size([2126, 1])

In [None]:
# O resultado ficou como eu queria: N linhas e apenas uma coluna. Agora que o dado está organizado,
# fica mais fácil criar o Dataset e o DataLoader.
# A indexação de um tensor é a mesma de uma lista!

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Dataset class that reads a CSV file and serves (inputs, target) pairs.
class CSVDataset(Dataset):
    """Tabular dataset loaded from a CSV file and held in memory as tensors.

    The whole file is read once in ``__init__``. One column is used as the
    target; every other column is used as an input feature. Both parts are
    converted to ``torch.float32`` tensors.

    Attributes are plain tensors:
        inputs:  2-D tensor with one row per CSV row and one column per
                 non-target column.
        targets: 2-D tensor with one row per CSV row and a single column.
    """

    def __init__(self, csv_path: str, target_column: str = 'fetal_health') -> None:
        """Read ``csv_path`` and split it into input and target tensors.

        Args:
            csv_path: Path or URL handed to ``pandas.read_csv``.
            target_column: Name of the column holding the targets. Defaults
                to ``'fetal_health'`` so existing callers keep working.

        Raises:
            KeyError: If ``target_column`` is not a column of the CSV.
        """
        df = pd.read_csv(csv_path)
        if target_column not in df.columns:
            raise KeyError(
                f"target column {target_column!r} not found in {csv_path!r}"
            )

        # `.values` exposes the data as an array that torch.tensor can convert.
        self.inputs = torch.tensor(
            df.drop(columns=[target_column]).values,
            dtype=torch.float32,
        )
        # unsqueeze(dim=1) turns the 1-D label vector into a single-column
        # 2-D tensor, i.e. one row per sample.
        self.targets = torch.tensor(
            df[target_column].values,
            dtype=torch.float32,
        ).unsqueeze(dim=1)

    def __len__(self) -> int:
        """Return the number of samples (rows of ``inputs``)."""
        return len(self.inputs)

    def __getitem__(self, idx: int) -> tuple:
        """Return the ``(inputs, target)`` pair stored at position ``idx``."""
        return self.inputs[idx], self.targets[idx]

# Instantiate the dataset from the CSV located at `file_path`.
dataset = CSVDataset(file_path)

In [None]:
# Create the DataLoader. A DataLoader needs a Dataset to read from; the
# reverse is not required (a Dataset works without a DataLoader).
# It serves the dataset in shuffled mini-batches of 128 examples; going through
# every batch once, until all examples have been seen, completes one epoch.
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

# Use the DataLoader. Only the first batch is inspected here: the `break`
# leaves the loop right after printing its shapes.
for x_batch, y_batch in dataloader:
    print("x:", x_batch.shape)
    print("y:", y_batch.shape)
    print("-" * 30)
    break

x: torch.Size([128, 21])
y: torch.Size([128, 1])
------------------------------
