#2. Kaggle Dataset

- Although simple datasets are provided by PyTorch, constructing own custom dataset is the most difficult and time-consuming part in deep learning.

- Therefore, we will use *somewhat* raw data and train a model.

- We will use [100 Sports Image Classification](https://www.kaggle.com/datasets/gpiosenka/sports-classification) dataset which contains 13,572 training data, 500 validation and 500 test data with the shape of [H, W, C] = [224, 244, 3] in jpeg format.

- Since the structure of dataset is constructed as the subfolders of each class with image, there are two ways we can construct dataset as follows:
    - Building custom dataset 
    - Use `torch.utils.data.DataFolder`

`TODO: Add a brief explanation why we need to make custom dataset`

In [165]:
# Import libraries to use for Deep Learning 
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision import datasets, transforms
from torchvision.io import read_image
from torchsummary import summary
import pandas as pd
from PIL import Image
import os 

import cv2 as cv2

In [166]:
!pip install gdown && gdown 'https://drive.google.com/uc?id=1rctM1HDoc24XOcRzsYyTSavaFrvuoKZc' && unzip ./archive.zip -d ./sports

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1rctM1HDoc24XOcRzsYyTSavaFrvuoKZc
To: /content/archive.zip
100% 500M/500M [00:01<00:00, 302MB/s]
Archive:  ./archive.zip
replace ./sports/EfficientNetB3-sports-0.97.h5? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

### Custom Dataset can be constructed as follows:


```
class CustomDataset(torch.utils.data.Dataset):
    # Inherit torch.utils.data.Dataset class

    def __init__(self,):
        # Initialize the dataset (handling data paths, check input and target data, data augmentation, etc.)

    def __len__(self):
        # Return the number of data or sample in dataset 
    
    def __getitem__(self, index):
        # Return the input and target by index
```

- Further information, please refer [here](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html).

- Making custom dataset will be given as an assignment!

In [226]:
import pandas as pd
### PLEASE WRITE YOUR CODE BELOW.

d1 = pd.read_csv('/content/sports/sports.csv')
sports_train = d1[d1['data set'] == 'train']
sports_valid = d1[d1['data set'] == 'valid'].reset_index(drop=True)
sports_test = d1[d1['data set'] == 'test'].reset_index(drop=True)

class_dict = pd.read_csv('/content/sports/class_dict.csv')
class CustomDataset(Dataset):

    def __init__(self,first):
        # Initialize the dataset (handling data paths, check input and target data, data augmentation, etc.)
        
        self.first = first

        dict1 = {}
        for i in range(100):
            dict1[class_dict['class'][i]] = class_dict['class_index'][i]

        self.new_csv = []

        if first == 'original':
            self.sports = d1           
        elif first ==  'train':
            self.sports = sports_train
        elif first ==  'valid':
            self.sports = sports_valid
        elif first ==  'test':
            self.sports = sports_test

        
        leng = len(self.sports['filepaths'])
        for i in range(leng):
           self.new_csv.append((self.sports['filepaths'][i], dict1[self.sports['labels'][i]])) 

    def __len__(self):
        # Return the number of data or sample in dataset 
        return len(self.new_csv)

    def __getitem__(self, index):
        # Return the input and target by index
        data_tuple = self.new_csv[index]
        img_path = data_tuple[0]
        label = data_tuple[1]

        image = cv2.imread('/content/sports/' + img_path)
        image = torch.tensor(image, dtype = torch.float32).permute(2, 0, 1)
        label = torch.tensor(label) 
        # image = cv2.imread(os.path.join('/content/sports/' , img_path))

        return image, label
        
# Deep learning model이 학습이나 model 자체를 통과시키려면, data type이 tensor인 것도 중요하지만, float32
### END OF THE CODE.

In [227]:
len(CustomDataset('original'))

14572

In [228]:
len(CustomDataset('valid'))

500

In [229]:
len(CustomDataset('test'))

500

In [230]:
### PLEASE WRITE YOUR CODE BELOW.

train_dataset = CustomDataset('train')
valid_dataset = CustomDataset('valid')
test_dataset = CustomDataset('test')

### YOU CAN USE ANY TRANSFORMS YOU WANT. MAKE IT RUNNABLE!

### NOTE: Fixed errata - changed train_dataloader, valid_dataloader, test_dataloader to train_loader, valid_loader, test_loader by Seungwoo

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### END OF THE CODE.

In [231]:
### PLEASE WRITE YOUR CODE BELOW.
class SimpleCNN(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=16, kernel_size=3,
                               stride=1, padding=1),
            nn.BatchNorm2d(num_features=16),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.AvgPool2d(kernel_size=2, stride=2),                   
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3,
                               stride=1, padding=1),
            nn.BatchNorm2d(num_features=32),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.AvgPool2d(kernel_size=2, stride=2),                   
        )

        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3,
                               stride=1, padding=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.AvgPool2d(kernel_size=2, stride=2),                   
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3,
                               stride=1, padding=1),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.AvgPool2d(kernel_size=2, stride=2),                   
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features = 128*14*14, out_features = 5012),
            nn.ReLU(),
            nn.Linear(5012, 100)

        )

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = torch.flatten(out, 1)
        out = self.classifier(out)

        return out


### END OF THE CODE.

In [232]:

### NOTE: Fixed errata - changed the below codes by Seungwoo
### model = SimpleCNN(in_channels=3, num_classes=100).to(device)
### summary(model, (3, 256, 256), device='cuda') 

model = SimpleCNN(in_channels=3, num_classes=100).cuda()
summary(model, (3, 224, 224), device='cuda')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 224, 224]             448
       BatchNorm2d-2         [-1, 16, 224, 224]              32
              ReLU-3         [-1, 16, 224, 224]               0
           Dropout-4         [-1, 16, 224, 224]               0
         AvgPool2d-5         [-1, 16, 112, 112]               0
            Conv2d-6         [-1, 32, 112, 112]           4,640
       BatchNorm2d-7         [-1, 32, 112, 112]              64
              ReLU-8         [-1, 32, 112, 112]               0
           Dropout-9         [-1, 32, 112, 112]               0
        AvgPool2d-10           [-1, 32, 56, 56]               0
           Conv2d-11           [-1, 64, 56, 56]          18,496
      BatchNorm2d-12           [-1, 64, 56, 56]             128
             ReLU-13           [-1, 64, 56, 56]               0
          Dropout-14           [-1, 64,

In [233]:
lr = 1e-4
epochs = 2

model = SimpleCNN(in_channels=3, num_classes=100).cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr= lr)

In [234]:
def train(model, optimizer, criterion, data_loader, epoch):
    model.train()
    total_loss = 0.0
    for idx, batch in enumerate(data_loader):
        img, target = batch[0].cuda(), batch[1].cuda()

        optimizer.zero_grad()
        output = model(img)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() 

        if idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, idx * img.size(0), len(data_loader.dataset),
                100. * idx * img.size(0) / len(data_loader.dataset), 
                loss.data))

    return total_loss / len(data_loader)


def validate(model, criterion, data_loader):
    model.eval()
    val_loss = 0.0
    val_acc = 0.0

    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            img, target = batch[0].cuda(), batch[1].cuda()

            ### PLEASE WRITE YOUR CODE BELOW.

            # Make a prediction
            output = model(img)
            # Calculate validation loss (although it is optional)
            loss = criterion(output, target)
            # Get the right prediction - make sure naming the prediction as 'predicted' 
            _, predicted = torch.max(output.data, 1)

            val_loss += loss.item()
            val_acc += (predicted == target).sum().item()
            
            ### END OF THE CODE.

            val_loss += loss.item()
            val_acc += (predicted == target).sum().item()

        total_val_acc = val_acc / len(data_loader.dataset)
        print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            val_loss / len(data_loader), val_acc, len(data_loader.dataset),
            100. * total_val_acc))
    
    return total_val_acc

In [235]:
def test(model, criterion, data_loader):
    model.eval()
    test_loss = 0.0
    test_acc = 0.0

    for idx, batch in enumerate(data_loader):
        img, target = batch[0].cuda(), batch[1].cuda()

        output = model(img)
        loss = criterion(output, target)
        _, predicted = torch.max(output.data, 1)
        test_loss += loss.item()
        test_acc += (predicted == target).sum().item()

    print('\n Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss  / len(data_loader), test_acc, len(data_loader.dataset),
        100. * test_acc / len(data_loader.dataset)))

    return test_acc / len(data_loader.dataset)

In [236]:
for epoch in range(epochs):
    train_loss = train(model, optimizer, criterion, train_loader, epoch)
    validation_accuracy = validate(model, criterion, valid_loader)


Validation set: Average loss: 5.9168, Accuracy: 252.0/500 (50%)


Validation set: Average loss: 4.9979, Accuracy: 360.0/500 (72%)



In [237]:
test_accuracy = test(model, criterion, test_loader)


 Test set: Average loss: 2.2934, Accuracy: 201.0/500 (40%)



In [238]:
print(train_dataset[0][0].size())
print(model(torch.rand(1, 3, 224, 224, device='cuda')).size())
test(model, criterion, test_loader)

torch.Size([3, 224, 224])
torch.Size([1, 100])

 Test set: Average loss: 2.2934, Accuracy: 201.0/500 (40%)



0.402