# PyTorch DataLoader
---
by Pranpaveen Lay.  
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/layel2/pytorch-course/blob/main/4.dataloader.ipynb)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Iris CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='data/iris/iris.csv')
df.head(n=10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [3]:
# Convert the species column to pandas' categorical dtype (its integer codes are used next).
df['variety'] = df['variety'].astype(dtype='category')
df.head(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [4]:
# Add an integer label column taken from the categorical codes of `variety`.
df['variety_code'] = df['variety'].cat.codes
df.head(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_code
0,5.1,3.5,1.4,0.2,Setosa,0
1,4.9,3.0,1.4,0.2,Setosa,0
2,4.7,3.2,1.3,0.2,Setosa,0
3,4.6,3.1,1.5,0.2,Setosa,0
4,5.0,3.6,1.4,0.2,Setosa,0


In [5]:
# Shuffle every row before the positional train/test split that follows.
# A fixed random_state makes the shuffle (and therefore the split) reproducible
# when the notebook is re-run from a fresh kernel.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_code
0,5.0,3.2,1.2,0.2,Setosa,0
1,4.6,3.2,1.4,0.2,Setosa,0
2,5.1,3.8,1.5,0.3,Setosa,0
3,4.5,2.3,1.3,0.3,Setosa,0
4,5.0,3.5,1.3,0.3,Setosa,0
...,...,...,...,...,...,...
145,5.0,3.5,1.6,0.6,Setosa,0
146,5.1,2.5,3.0,1.1,Versicolor,1
147,6.0,2.9,4.5,1.5,Versicolor,1
148,7.4,2.8,6.1,1.9,Virginica,2


In [6]:
# Row position where the training portion ends: 80% of the rows, truncated to an int.
n_split = int(len(df) * 0.8)
n_split

120

In [7]:
# Positional split: the first n_split rows are for training, the remainder for testing.
df_train, df_test = df.iloc[:n_split], df.iloc[n_split:]
(len(df_train), len(df_test))

(120, 30)

In [8]:
# Assume a training step exists; this placeholder stands in for it.
def train(data, label):
    """Placeholder training step: accepts one batch of data and labels and does nothing."""
    return None

In [9]:
# Features: every column except the species name and its integer code, as a NumPy array.
X_train = df_train.drop(columns=['variety', 'variety_code']).to_numpy()
# Labels: the integer species codes, as a NumPy array.
y_train = df_train['variety_code'].to_numpy()

In [10]:
# Training-loop settings: number of epochs and number of samples per mini-batch.
n_epochs, batch_size = 100, 32

In [13]:
# Number of mini-batches per epoch: ceiling division, so a final partial batch is counted.
n_batch = -(-len(X_train) // batch_size)
n_batch

4

In [14]:
# Manual mini-batching: walk the training arrays in consecutive batch_size-row windows.
for epoch in range(1):
    for batch_idx in range(n_batch):
        start = batch_idx * batch_size
        batch_rows = slice(start, start + batch_size)  # the last window may be shorter
        X_batch, y_batch = X_train[batch_rows], y_train[batch_rows]
        print(X_batch.shape)
        train(X_batch, y_batch)

(32, 4)
(32, 4)
(32, 4)
(24, 4)


In [16]:
# Sanity check: the printed batch sizes add up to the number of training rows.
(sum([32, 32, 32, 24]), len(X_train))

(120, 120)

# PyTorch Dataset / DataLoader

In [17]:
import torch

In [18]:
class myDataset(torch.utils.data.Dataset):
    """Dataset that pairs an indexable feature container with its labels."""

    def __init__(self, data, labels):
        # Keep references to the inputs unchanged; no copying or conversion happens here.
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of samples is taken from the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(data, label)`` pair selected by ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

In [19]:
# Wrap the training arrays in the dataset class.
train_ds = myDataset(data=X_train, labels=y_train)

In [20]:
# Subscripting dispatches to __getitem__; fetch the first ten samples at once.
train_ds[range(10)]

(array([[5. , 3.2, 1.2, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.1, 3.8, 1.5, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [5. , 3.5, 1.3, 0.3],
        [5.7, 2.9, 4.2, 1.3],
        [5.1, 3.8, 1.6, 0.2],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 3. , 5.5, 1.8],
        [6.3, 2.8, 5.1, 1.5]]),
 array([0, 0, 0, 0, 0, 1, 0, 1, 2, 2], dtype=int8))

In [58]:
class myDataset(torch.utils.data.Dataset):
    """Dataset built directly from a DataFrame that holds features and a label column.

    Args:
        df: DataFrame containing the feature columns and the label column.
        label_col: Name of the column used as labels. Defaults to ``'variety_code'``.
        drop_cols: Additional non-feature columns to exclude from the features.
            Defaults to ``('variety',)``. ``label_col`` is always excluded.
    """

    def __init__(self, df, label_col='variety_code', drop_cols=('variety',)):
        # Everything that is neither the label nor an explicitly dropped column is a feature.
        # label_col is filtered out of drop_cols first so it is never listed twice.
        non_feature_cols = [col for col in drop_cols if col != label_col]
        non_feature_cols.append(label_col)
        self.data = df.drop(columns=non_feature_cols).values
        self.labels = df[label_col].values

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair selected by ``idx``."""
        return self.data[idx], self.labels[idx]

In [59]:
# Rebuild train_ds with the DataFrame-based myDataset defined above; without this the
# variable would still point at the instance created from the earlier (data, labels) class.
train_ds = myDataset(df_train)
train_ds[range(0, 10)]

(array([[5. , 3.2, 1.2, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.1, 3.8, 1.5, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [5. , 3.5, 1.3, 0.3],
        [5.7, 2.9, 4.2, 1.3],
        [5.1, 3.8, 1.6, 0.2],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 3. , 5.5, 1.8],
        [6.3, 2.8, 5.1, 1.5]]),
 array([0, 0, 0, 0, 0, 1, 0, 1, 2, 2], dtype=int8))

In [65]:
# The class defines no __repr__, so the notebook shows Python's default object repr.
train_ds

<__main__.myDataset at 0x2145a0038b0>

In [66]:
# Let DataLoader do the batching and shuffling; reuse the notebook's batch_size
# setting instead of repeating the literal 32 so both batching demos stay in sync.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
)

In [69]:
# Each iteration yields one (features, labels) batch; print the feature batch shape.
for features, targets in train_loader:
    print(features.shape)

torch.Size([32, 4])
torch.Size([32, 4])
torch.Size([32, 4])
torch.Size([24, 4])
