# Workshop 12 Datasets & DataLoaders
สำหรับข้อมูลที่กำหนดให้ ให้นักศึกษา
1. สร้าง Dataset
2. สร้าง DataLoader โดยกำหนด batch size เป็น 32
3. ทดลองเรียกข้อมูลจากข้อ 1 มาแสดงให้เห็นว่าสามารถทำได้
4. ทดลองเรียกข้อมูลจากข้อ 2 มาแสดงให้เห็นว่าสามารถทำได้
5. สร้าง transform ของข้อมูลแต่ละชุดแล้วทำตามข้อ 3 และ 4 เพื่อแสดงให้เห็นว่าสามารถทำได้


In [None]:
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
from PIL import Image
import numpy as np

## A. Tabular data
ใช้ข้อมูลที่เตรียมไว้ให้

In [None]:
# Fetch the UCI Iris table. The raw file has no header row, so the column
# names are supplied explicitly.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
header_list = ["sepal_length","sepal_width", "petal_length", "petal_width","species"]
df = pd.read_csv(data_url, names=header_list, header=None, encoding='utf-8')

# Features are every column except the last one; the last column holds the
# species name, which is encoded to integer class ids.
X = df[header_list[:-1]].to_numpy()
le = LabelEncoder()
y = le.fit_transform(df[header_list[-1]].to_numpy())

## A.1

In [None]:
class CustomTaburaDataset(Dataset):
    """Map-style dataset over an in-memory feature matrix and label vector.

    Args:
        X: 2-D array indexed as ``X[[idx], :]`` (e.g. a NumPy array), one
            row per sample.
        y: Sequence of labels aligned with the rows of ``X``.
        transform: Optional callable applied to the feature row of a sample.
    """

    def __init__(self, X, y, transform=None):
        self.X, self.y, self.transform = X, y, transform

    def __len__(self):
        # The number of samples is defined by the label vector.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        The list index ``[idx]`` keeps the row dimension, so the features
        come back with shape ``(1, n_features)``.
        """
        features = self.X[[idx], :]
        target = self.y[idx]
        if not self.transform:
            return features, target
        return self.transform(features), target


In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap each split in the custom dataset (no transform at this stage).
train_tabular_data = CustomTaburaDataset(X=X_train, y=y_train)
test_tabular_data = CustomTaburaDataset(X=X_test, y=y_test)

## A.2

In [None]:
# Batches of 32 samples; only the training loader reshuffles the data.
train_tabular_dataloader = DataLoader(
    dataset=train_tabular_data,
    batch_size=32,
    shuffle=True,
)
test_tabular_dataloader = DataLoader(
    dataset=test_tabular_data,
    batch_size=32,
    shuffle=False,
)

## A.3

In [None]:
# Fetch the first training sample through the dataset's indexing protocol.
train_tabular_data[0]

(array([[4.6, 3.6, 1. , 0.2]]), 0)

## A.4

In [None]:
# Pull a single mini-batch from the training loader and display it to confirm
# that the DataLoader works.
train_features, train_labels = next(iter(train_tabular_dataloader))
train_features, train_labels

(tensor([[[5.4000, 3.7000, 1.5000, 0.2000]],
 
         [[7.1000, 3.0000, 5.9000, 2.1000]],
 
         [[6.3000, 2.8000, 5.1000, 1.5000]],
 
         [[5.5000, 2.3000, 4.0000, 1.3000]],
 
         [[4.6000, 3.2000, 1.4000, 0.2000]],
 
         [[7.4000, 2.8000, 6.1000, 1.9000]],
 
         [[6.4000, 2.7000, 5.3000, 1.9000]],
 
         [[5.0000, 3.6000, 1.4000, 0.2000]],
 
         [[5.9000, 3.0000, 4.2000, 1.5000]],
 
         [[5.8000, 4.0000, 1.2000, 0.2000]],
 
         [[5.1000, 2.5000, 3.0000, 1.1000]],
 
         [[6.3000, 2.3000, 4.4000, 1.3000]],
 
         [[5.7000, 2.9000, 4.2000, 1.3000]],
 
         [[6.5000, 2.8000, 4.6000, 1.5000]],
 
         [[5.2000, 4.1000, 1.5000, 0.1000]],
 
         [[5.0000, 3.2000, 1.2000, 0.2000]],
 
         [[5.0000, 2.3000, 3.3000, 1.0000]],
 
         [[5.8000, 2.7000, 5.1000, 1.9000]],
 
         [[6.0000, 2.2000, 5.0000, 1.5000]],
 
         [[5.2000, 2.7000, 3.9000, 1.4000]],
 
         [[4.9000, 2.4000, 3.3000, 1.0000]],
 
         [[5.

## A.5

In [None]:
# Per-feature statistics are computed on the training split only and reused
# for the test split, so both splits are scaled with the same parameters.
mean_train_tabural = X_train.mean(axis=0)
std_train_tabural = X_train.std(axis=0)
# The parentheses are required: `/` binds tighter than `-`, so without them
# the expression is x - (mean / std) rather than the z-score (x - mean) / std.
transform=transforms.Lambda(lambda x: (x - mean_train_tabural) / std_train_tabural)
train_tabular_data = CustomTaburaDataset(X_train, y_train, transform=transform)
test_tabular_data = CustomTaburaDataset(X_test, y_test, transform=transform)

# Rebuild the loaders so they serve the standardized datasets.
train_tabular_dataloader = DataLoader(train_tabular_data, batch_size = 32, shuffle = True)
test_tabular_dataloader = DataLoader(test_tabular_data, batch_size =32, shuffle = False)


In [None]:
# Inspect one transformed training sample (index 49).
train_tabular_data[49]

(array([[-2.08119459, -4.57799014,  1.16259861, -0.57604513]]), 1)

## B. Image data from folders with labels inferred from folder names
โหลดข้อมูลแล้วสร้าง dataset โดยใช้ ImageFolder เป็นหลัก
data:  https://drive.google.com/file/d/1yCH8zIGo9KjdscRfrvpR1R7mel4uq1GH/view?usp=drive_link

Pytorch doc: [ImageFolder](https://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html)

## B.1

In [None]:
from torchvision.datasets import ImageFolder

In [None]:
# Minimal preprocessing: bring every image to 224x224, then convert it to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(224, 224)), transforms.ToTensor()]
)

In [None]:
# One ImageFolder per split; all three share the same preprocessing.
train_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_train',
    transform=transform,
)
test_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_test',
    transform=transform,
)
val_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_val',
    transform=transform,
)

## B.2

In [None]:
# Batches of 32 images; the train and val loaders shuffle, the test loader does not.
train_img_dataloader = DataLoader(dataset=train_img_dataset, batch_size=32, shuffle=True)
test_img_dataloader = DataLoader(dataset=test_img_dataset, batch_size=32, shuffle=False)
val_img_dataloader = DataLoader(dataset=val_img_dataset, batch_size=32, shuffle=True)

## B.3

In [None]:
# Fetch the first (image, label) pair straight from the dataset.
train_img_dataset[0]

(tensor([[[0.7490, 0.7765, 0.7882,  ..., 0.7804, 0.7569, 0.7569],
          [0.7451, 0.7647, 0.7569,  ..., 0.7804, 0.7843, 0.7451],
          [0.7647, 0.7843, 0.7608,  ..., 0.7725, 0.7686, 0.7490],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.7490, 0.7765, 0.7882,  ..., 0.7804, 0.7569, 0.7569],
          [0.7451, 0.7647, 0.7569,  ..., 0.7804, 0.7843, 0.7451],
          [0.7647, 0.7843, 0.7608,  ..., 0.7725, 0.7686, 0.7490],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.7490, 0.7765, 0.7882,  ..., 0.7804, 0.7569, 0.7569],
          [0.7451, 0.7647, 0.7569,  ..., 0.7804, 0.7843, 0.7451],
          [0.7647, 0.7843, 0.7608,  ...,

## B.4

In [None]:
# Pull a single mini-batch of images and labels from the training loader and
# display it to confirm that the DataLoader works.
train_features, train_labels = next(iter(train_img_dataloader))
train_features, train_labels

(tensor([[[[0.7490, 0.7490, 0.7608,  ..., 0.7529, 0.7529, 0.7647],
           [0.7647, 0.7451, 0.7725,  ..., 0.7647, 0.7725, 0.7725],
           [0.7647, 0.7451, 0.7569,  ..., 0.7490, 0.7608, 0.7686],
           ...,
           [0.7412, 0.7569, 0.7098,  ..., 0.2745, 0.2941, 0.3098],
           [0.7686, 0.7529, 0.7686,  ..., 0.2824, 0.3333, 0.3216],
           [0.7490, 0.7804, 0.7882,  ..., 0.3137, 0.2980, 0.2941]],
 
          [[0.7490, 0.7490, 0.7608,  ..., 0.7529, 0.7529, 0.7647],
           [0.7647, 0.7451, 0.7725,  ..., 0.7647, 0.7725, 0.7725],
           [0.7647, 0.7451, 0.7569,  ..., 0.7490, 0.7608, 0.7686],
           ...,
           [0.7412, 0.7569, 0.7098,  ..., 0.2745, 0.2941, 0.3098],
           [0.7686, 0.7529, 0.7686,  ..., 0.2824, 0.3333, 0.3216],
           [0.7490, 0.7804, 0.7882,  ..., 0.3137, 0.2980, 0.2941]],
 
          [[0.7490, 0.7490, 0.7608,  ..., 0.7529, 0.7529, 0.7647],
           [0.7647, 0.7451, 0.7725,  ..., 0.7647, 0.7725, 0.7725],
           [0.7647, 0.74

## B.5

In [None]:
# Training pipeline: resize, random augmentation (horizontal flip with
# probability 0.5, rotation of up to 5 degrees), tensor conversion, then
# per-channel normalization.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Evaluation pipeline: the same resize and normalization, with no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

In [None]:
# Rebuild the datasets: augmentation for training, the plain pipeline for test and val.
train_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_train',
    transform=train_transform,
)
test_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_test',
    transform=test_transform,
)
val_img_dataset = ImageFolder(
    root='/content/image_dataset/Small_Data_CoV2_val',
    transform=test_transform,
)

In [None]:
# Inspect the first training sample after augmentation and normalization.
train_img_dataset[0]

(tensor([[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],
 
         [[-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],
 
         [[-1.8044, -1.8044, -1.8044,  ..., -1.8044, -1.8044, -1.8044],
          [-1.8044, -1.8044,

In [None]:
# Recreate the loaders so they serve the datasets built with the new transforms.
train_img_dataloader = DataLoader(dataset=train_img_dataset, batch_size=32, shuffle=True)
test_img_dataloader = DataLoader(dataset=test_img_dataset, batch_size=32, shuffle=False)
val_img_dataloader = DataLoader(dataset=val_img_dataset, batch_size=32, shuffle=True)

## C. Image data from file names and label list
โหลดข้อมูลแล้วอ่านชื่อทุกไฟล์ภาพใน folder ย่อยมาเก็บไว้ แล้วสร้าง dataset ขึ้นมาใช้เอง

data: https://drive.google.com/file/d/1yCH8zIGo9KjdscRfrvpR1R7mel4uq1GH/view?usp=drive_link

Pytorch tutorial: [data tutorial](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

## C.1

In [None]:
class CustomImageDataset(Dataset):
    """Dataset that opens images on demand from a collection of file paths.

    Args:
        im_filename: Image file paths, indexed positionally via ``.iloc``
            (e.g. a pandas Series).
        y: Labels aligned with ``im_filename``, also indexed via ``.iloc``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, im_filename, y, transform=None):
        self.im_filename, self.y, self.transform = im_filename, y, transform

    def __len__(self):
        # The number of samples is defined by the label collection.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(image_as_ndarray, label)`` for sample ``idx``."""
        rgb_image = Image.open(self.im_filename.iloc[idx]).convert('RGB')
        target = self.y.iloc[idx]
        processed = self.transform(rgb_image) if self.transform else rgb_image
        # The sample is always handed back as a NumPy array, whatever the
        # transform produced.
        return np.array(processed), target

In [None]:
def read_image_dir(path):
    """Collect every PNG file under ``path`` together with its class label.

    The label of a file is the name of the folder that directly contains it,
    so the result does not depend on how deep ``path`` sits in the file
    system. Folders and files are visited in sorted order to make the row
    order reproducible between runs.

    Args:
        path: Root directory whose sub-folders hold the images.

    Returns:
        pandas.DataFrame with the columns ``filename`` (full path to the
        image) and ``label`` (name of the containing folder). The frame is
        empty when no PNG file is found.
    """
    filename = []
    label = []
    for dirname, subdirs, filenames in os.walk(path):
        # Sorting in place makes os.walk descend into folders in a stable order.
        subdirs.sort()
        for file in sorted(filenames):
            # Match on the real extension only; a substring test would also
            # pick up names such as "scan.png.bak" or folders containing ".png".
            if not file.lower().endswith('.png'):
                continue
            filename.append(os.path.join(dirname, file))
            label.append(os.path.basename(os.path.normpath(dirname)))
    return pd.DataFrame({'filename': filename, 'label': label})

In [None]:
# Build one file/label table per split. Each variable must read from the
# directory of the same name: the train and test paths were previously
# swapped, which made the "train" table hold the test images and vice versa.
train_img_df = read_image_dir('/content/image_dataset/Small_Data_CoV2_train')
test_img_df = read_image_dir('/content/image_dataset/Small_Data_CoV2_test')
val_img_df = read_image_dir('/content/image_dataset/Small_Data_CoV2_val')

In [None]:
# Fit a single encoder on the labels of all three splits, then apply it to
# each split. Refitting per split would let the same class name map to
# different integers whenever the splits do not contain identical class sets.
le = LabelEncoder()
le.fit(pd.concat([train_img_df['label'], test_img_df['label'], val_img_df['label']]))
# The loop variable is deliberately not called `df`, so the iris DataFrame
# defined earlier in the notebook is not overwritten.
for img_df in (train_img_df, test_img_df, val_img_df):
    img_df['label'] = le.transform(img_df['label'])

In [None]:
# Minimal preprocessing for the custom dataset: resize to 224x224, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(224, 224)), transforms.ToTensor()]
)

In [None]:
# One custom dataset per split, built from the path and label columns of each table.
train_img_dataset = CustomImageDataset(
    im_filename=train_img_df['filename'],
    y=train_img_df['label'],
    transform=transform,
)
test_img_dataset = CustomImageDataset(
    im_filename=test_img_df['filename'],
    y=test_img_df['label'],
    transform=transform,
)
val_img_dataset = CustomImageDataset(
    im_filename=val_img_df['filename'],
    y=val_img_df['label'],
    transform=transform,
)

## C.2

In [None]:
# Batches of 32 images; the train and val loaders shuffle, the test loader does not.
train_img_dataloader = DataLoader(dataset=train_img_dataset, batch_size=32, shuffle=True)
test_img_dataloader = DataLoader(dataset=test_img_dataset, batch_size=32, shuffle=False)
val_img_dataloader = DataLoader(dataset=val_img_dataset, batch_size=32, shuffle=True)

## C.3

In [None]:
# Fetch the first (image, label) pair from the custom dataset.
train_img_dataset[0]

(array([[[0.81960785, 0.8235294 , 0.80784315, ..., 0.79607844,
          0.8       , 0.8039216 ],
         [0.8235294 , 0.8039216 , 0.8       , ..., 0.8039216 ,
          0.79607844, 0.80784315],
         [0.8117647 , 0.8       , 0.8       , ..., 0.8117647 ,
          0.80784315, 0.80784315],
         ...,
         [0.8392157 , 0.8392157 , 0.8235294 , ..., 0.79607844,
          0.78431374, 0.7882353 ],
         [0.7882353 , 0.8       , 0.7921569 , ..., 0.8       ,
          0.80784315, 0.8156863 ],
         [0.77254903, 0.7764706 , 0.7764706 , ..., 0.79607844,
          0.8117647 , 0.8235294 ]],
 
        [[0.81960785, 0.8235294 , 0.80784315, ..., 0.79607844,
          0.8       , 0.8039216 ],
         [0.8235294 , 0.8039216 , 0.8       , ..., 0.8039216 ,
          0.79607844, 0.80784315],
         [0.8117647 , 0.8       , 0.8       , ..., 0.8117647 ,
          0.80784315, 0.80784315],
         ...,
         [0.8392157 , 0.8392157 , 0.8235294 , ..., 0.79607844,
          0.78431374, 0.

In [None]:
# Pull a single mini-batch from the training loader and display it to confirm
# that the DataLoader works with the custom dataset.
next(iter(train_img_dataloader))

[tensor([[[[0.8039, 0.8118, 0.8275,  ..., 0.9137, 0.7529, 0.3882],
           [0.7961, 0.7961, 0.8196,  ..., 0.8745, 0.8902, 0.5961],
           [0.8039, 0.8196, 0.8275,  ..., 0.8275, 0.9059, 0.8078],
           ...,
           [0.8392, 0.8275, 0.8275,  ..., 0.8431, 0.8196, 0.8078],
           [0.8078, 0.8118, 0.8157,  ..., 0.7765, 0.7686, 0.7882],
           [0.7608, 0.7647, 0.7725,  ..., 0.8549, 0.8392, 0.8549]],
 
          [[0.8039, 0.8118, 0.8275,  ..., 0.9137, 0.7529, 0.3882],
           [0.7961, 0.7961, 0.8196,  ..., 0.8745, 0.8902, 0.5961],
           [0.8039, 0.8196, 0.8275,  ..., 0.8275, 0.9059, 0.8078],
           ...,
           [0.8392, 0.8275, 0.8275,  ..., 0.8431, 0.8196, 0.8078],
           [0.8078, 0.8118, 0.8157,  ..., 0.7765, 0.7686, 0.7882],
           [0.7608, 0.7647, 0.7725,  ..., 0.8549, 0.8392, 0.8549]],
 
          [[0.8039, 0.8118, 0.8275,  ..., 0.9137, 0.7529, 0.3882],
           [0.7961, 0.7961, 0.8196,  ..., 0.8745, 0.8902, 0.5961],
           [0.8039, 0.81

## C.5


In [None]:
# Training pipeline: resize, random augmentation (horizontal flip with
# probability 0.5, rotation of up to 5 degrees), tensor conversion, then
# per-channel normalization.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Evaluation pipeline: the same resize and normalization, with no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

In [None]:
# Rebuild the custom datasets: augmentation for training, the plain pipeline for test and val.
train_img_dataset = CustomImageDataset(
    im_filename=train_img_df['filename'],
    y=train_img_df['label'],
    transform=train_transform,
)
test_img_dataset = CustomImageDataset(
    im_filename=test_img_df['filename'],
    y=test_img_df['label'],
    transform=test_transform,
)
val_img_dataset = CustomImageDataset(
    im_filename=val_img_df['filename'],
    y=val_img_df['label'],
    transform=test_transform,
)

In [None]:
# Recreate the loaders so they serve the datasets built with the new transforms.
train_img_dataloader = DataLoader(dataset=train_img_dataset, batch_size=32, shuffle=True)
test_img_dataloader = DataLoader(dataset=test_img_dataset, batch_size=32, shuffle=False)
val_img_dataloader = DataLoader(dataset=val_img_dataset, batch_size=32, shuffle=True)

In [None]:
# Inspect the first training sample after augmentation and normalization.
train_img_dataset[0]

(array([[[-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ],
         [-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ],
         [-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ],
         ...,
         [-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ],
         [-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ],
         [-2.117904 , -2.117904 , -2.117904 , ..., -2.117904 ,
          -2.117904 , -2.117904 ]],
 
        [[-2.0357141, -2.0357141, -2.0357141, ..., -2.0357141,
          -2.0357141, -2.0357141],
         [-2.0357141, -2.0357141, -2.0357141, ..., -2.0357141,
          -2.0357141, -2.0357141],
         [-2.0357141, -2.0357141, -2.0357141, ..., -2.0357141,
          -2.0357141, -2.0357141],
         ...,
         [-2.0357141, -2.0357141, -2.0357141, ..., -2.0357141,
          -2.0357141, -2

In [None]:
# Pull a single mini-batch from the rebuilt training loader and display it to
# confirm that batching works with the new transforms.
next(iter(train_img_dataloader))

[tensor([[[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           ...,
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],
 
          [[-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           ...,
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],
 
          [[-1.8044, -1.8044, -1.8044,  ..., -1.8044, -1.8044, -1.8044],
           [-