# Import Packages

In [4]:
import torch
import pandas as pd

In [5]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Pandas

In [6]:
# load the csv file into a DataFrame (relative path: resolved against the current working directory)
csv_data = pd.read_csv('simple.csv')

In [7]:
# print the 'values' column of the loaded csv
# (at this point each cell is still a string such as '[1,2,3,4]', hence dtype: object)
print(csv_data['values'])

0      [1,2,3,4]
1      [2,4,6,8]
2     [3,6,9,12]
3    [4,8,12,16]
Name: values, dtype: object


## check the values attribute

In [8]:
# (opt) ast: change the type from str to list
# Only string cells are parsed, so re-running this cell is safe: cells that were
# already converted to lists are passed through unchanged instead of making
# ast.literal_eval raise ValueError.
import ast
csv_data['values'] = csv_data['values'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [9]:
# after ast.literal_eval each cell is a Python list; the column dtype is still object
csv_data['values']

0      [1, 2, 3, 4]
1      [2, 4, 6, 8]
2     [3, 6, 9, 12]
3    [4, 8, 12, 16]
Name: values, dtype: object

In [10]:
# show the values with print(): same content, written to stdout as plain text
print(csv_data['values'])

0      [1, 2, 3, 4]
1      [2, 4, 6, 8]
2     [3, 6, 9, 12]
3    [4, 8, 12, 16]
Name: values, dtype: object


In [11]:
# read the row at integer position 3 (.iloc is position-based); the result is a Series
csv_data.iloc[3]

id                     3
values    [4, 8, 12, 16]
label                  2
Name: 3, dtype: object

In [12]:
# turn the list to a tensor
# check the type

In [13]:
# turn the list to a float tensor
# check the type

## check the label attribute

# From DataFrame to Dataset

In [14]:
# what does a dataset class look like
from torch.utils.data import Dataset
class MyDataset(Dataset):
    """Minimal skeleton of a map-style dataset.

    A subclass of ``torch.utils.data.Dataset`` supplies ``__len__`` (so that
    ``len(dataset)`` works) and ``__getitem__`` (so that ``dataset[index]``
    works). Both are placeholders here and must be filled in.
    """

    def __init__(self):
        # Load or prepare the underlying data here.
        super().__init__()

    def __len__(self):
        """Return the number of samples in the dataset."""
        # Must be the dunder method: a plain method named ``len`` is never
        # invoked by the built-in ``len()``.
        raise NotImplementedError

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        raise NotImplementedError


In [15]:
# build our own dataset

class NumberSequence(Dataset):
    """Map-style dataset backed by a CSV file of number sequences.

    The CSV must provide a ``values`` column whose cells are list literals
    written as text (e.g. ``"[1,2,3,4]"``) and a ``label`` column.

    Args:
        csv_path: CSV file to read. Defaults to ``'simple.csv'``.
        verbose: when True (the default), ``__getitem__`` prints the Python
            types of the fetched values and label. Pass False to keep
            DataLoader loops quiet.
    """

    def __init__(self, csv_path='simple.csv', verbose=True):
        super().__init__()
        self.verbose = verbose
        self.data = pd.read_csv(csv_path)
        # CSV cells are plain strings; parse each list literal into a real
        # Python list once, up front, rather than on every __getitem__ call.
        self.data['values'] = self.data['values'].apply(ast.literal_eval)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``[values_tensor, label_tensor]`` for the row at position ``index``."""
        # .iloc is position-based; self.data[index] would look up a column instead.
        row = self.data.iloc[index]
        # Direct indexing raises KeyError when a column is missing, whereas
        # .get() would silently hand None to torch.tensor.
        values = row['values']
        label = row['label']
        if self.verbose:
            print('type values', type(values))
            print('label', type(label))
        return [torch.tensor(values), torch.tensor(label)]

In [16]:
# instantiate the dataset (its __init__ reads and parses simple.csv)
dataset = NumberSequence()

In [17]:
# check the len(): number of rows, answered by NumberSequence.__len__
print(len(dataset))

4


In [18]:
# fetch one sample via __getitem__: a [values tensor, label tensor] pair
print(dataset[3])

type values <class 'list'>
label <class 'numpy.int64'>
[tensor([ 4,  8, 12, 16]), tensor(2)]


# collate_fn

In [19]:
# opt: typing
from typing import List
def collate_fn(list: List[List[torch.Tensor]]):
    """Merge per-sample ``[values, label]`` pairs into two batched tensors.

    Args:
        list: the samples of one batch; element 0 of each sample is the values
            tensor and element 1 is the label tensor. (The name shadows the
            built-in ``list`` inside this function.)

    Returns:
        A ``(value_batch, label_batch)`` tuple, each produced by ``torch.stack``
        over the corresponding per-sample tensors.
    """
    # Split the samples into two parallel lists before stacking anything.
    values_per_sample = [sample[0] for sample in list]
    labels_per_sample = [sample[1] for sample in list]
    return torch.stack(values_per_sample), torch.stack(labels_per_sample)

In [20]:
# collate all samples to a list
# iterate over len(dataset) rather than a hard-coded 4, so the cell keeps
# working when the csv has a different number of rows
alllist = [dataset[i] for i in range(len(dataset))]
print(alllist)

type values <class 'list'>
label <class 'numpy.int64'>
type values <class 'list'>
label <class 'numpy.int64'>
type values <class 'list'>
label <class 'numpy.int64'>
type values <class 'list'>
label <class 'numpy.int64'>
[[tensor([1, 2, 3, 4]), tensor(0)], [tensor([2, 4, 6, 8]), tensor(1)], [tensor([ 3,  6,  9, 12]), tensor(1)], [tensor([ 4,  8, 12, 16]), tensor(2)]]


In [21]:
# run collate_fn by hand on the full sample list
audio_batch, label_batch = collate_fn(alllist)
# one item per line: the two shapes first, then the two tensors
print(audio_batch.shape, label_batch.shape, audio_batch, label_batch, sep='\n')

torch.Size([4, 4])
torch.Size([4])
tensor([[ 1,  2,  3,  4],
        [ 2,  4,  6,  8],
        [ 3,  6,  9, 12],
        [ 4,  8, 12, 16]])
tensor([0, 1, 1, 2])


# Combine everything together

In [22]:
# dataloader
# batch_size = 2
# (opt) shuffle
# Import the DataLoader class itself. The lowercase `dataloader` is a submodule
# of torch.utils.data, and importing it here would be overwritten by the
# assignment below anyway.
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset,
                        batch_size=2,
                        shuffle=True,
                        collate_fn=collate_fn)

In [23]:
# loop over the dataloader
# each batch is the (value_batch, label_batch) tuple returned by collate_fn;
# with shuffle=True the sample order differs from run to run
for batch in dataloader:
    print(batch)

type values <class 'list'>
label <class 'numpy.int64'>
type values <class 'list'>
label <class 'numpy.int64'>
(tensor([[ 1,  2,  3,  4],
        [ 4,  8, 12, 16]]), tensor([0, 2]))
type values <class 'list'>
label <class 'numpy.int64'>
type values <class 'list'>
label <class 'numpy.int64'>
(tensor([[ 3,  6,  9, 12],
        [ 2,  4,  6,  8]]), tensor([1, 1]))


# Reading Materials:

https://docs.pytorch.org/tutorials/beginner/basics/data_tutorial.html

You should read this during your self-study time.