# Data preparation (customDataset.py)

https://www.learnpytorch.io/04_pytorch_custom_datasets/#what-is-a-custom-dataset
https://github.com/mrdbourke/pytorch-deep-learning/blob/main/extras/04_custom_data_creation.ipynb

Build custom dataset

- Organize into folders
- Turn into tensors

Image classification format contains separate classes of images in seperate directories titled with a particular class name.

The format we aim for:

LBST/ <- overall dataset folder
    train/ <- training images
        Approved/ <- class name as folder name
            Parcel-1
                image01.jpeg
                image02.jpeg
                ...
            Parcel-2
                image-05.jpeg
                image-06.jpeg
                ...
        Not-Approved/
            Parcel-3
                image07.jpeg
                image08.jpeg
                ...
            Parcel-4
                image-09.jpeg
                image-10.jpeg
                ...
    test/ <- testing images
        Approved/ <- class name as folder name
            Parcel-5
                image11.jpeg
                image12.jpeg
                ...
            Parcel-6
                image-13.jpeg
                image-14.jpeg
                ...
        Not-Approved/
            Parcel-7
                image15.jpeg
                image16.jpeg
                ...
            Parcel-8
                image-17.jpeg
                image-18.jpeg
                ...

## Create csv file

In [47]:
import os, sys
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision
import random
from torch.utils.data import DataLoader
from PIL import Image


def set_all_seeds(seed):
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
# Hyperparameters
in_channel = 3
num_classes = 2
learning_rate = 1e-3
batch_size = 32
num_epochs = 1
random_seed = 1

set_all_seeds(random_seed)

# Find folder paths

base_path = r"C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/"
target_dirs = os.listdir(base_path)
print(target_dirs)

['All', 'Approved', 'NonApproved', 'test_set.csv', 'train_set.csv', 'valid_set.csv', 'xlbst.csv']


In [48]:
# Create 2 separate df's: one for Approved images, one for NonApproved, containing file_name and label

# Assign label = 0 to Approved images
approved = pd.DataFrame(data = os.listdir(os.path.join(base_path, target_dirs[1])), columns = ['file_name'])
approved = approved.assign(label = 0)

# Assign label = 1 to NonApproved images
nonapproved = pd.DataFrame(data = os.listdir(os.path.join(base_path, target_dirs[2])), columns = ['file_name'])
nonapproved = nonapproved.assign(label = 1)

# Merge into 1 df
df = pd.concat([approved, nonapproved])

# Add parcel_id column containing character 3-10 from file_name column
df['parcel_id'] = df['file_name'].str[3:10]
print(df)

# Write .csv
df.to_csv(r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/xlbst.csv', sep = ',', encoding='utf-8', index=False)

                                           file_name  label parcel_id
0   22-0223360_F796528C3F8CECCDE0530EEE260A12DE.jpeg      0   0223360
1   22-0223360_F796528C3F8DECCDE0530EEE260A12DE.jpeg      0   0223360
2   22-0223360_F796528C3F8EECCDE0530EEE260A12DE.jpeg      0   0223360
3   22-0223360_F796528C40F1ECCDE0530EEE260A12DE.jpeg      0   0223360
4   22-0223360_F796528C40F2ECCDE0530EEE260A12DE.jpeg      0   0223360
..                                               ...    ...       ...
35  22-0225160_F78EFF88701BA742E0530EEE260AEFC6.jpeg      1   0225160
36  22-0225160_F78EFF88701CA742E0530EEE260AEFC6.jpeg      1   0225160
37  22-0225160_F78EFF88701DA742E0530EEE260AEFC6.jpeg      1   0225160
38  22-0225160_F78EFF88701EA742E0530EEE260AEFC6.jpeg      1   0225160
39  22-0225160_F78EFF88701FA742E0530EEE260AEFC6.jpeg      1   0225160

[80 rows x 3 columns]


In [53]:
# Split group by

set_all_seeds(random_seed)

from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size = .20, n_splits = 1, random_state = random_seed) # n_splits saved 2 splits of train, test and valid: 
split = splitter.split(df, groups = df.parcel_id)
train_inds, test_inds = next(split)

train_set = df.iloc[train_inds]
test_set = df.iloc[test_inds]

print(test_set.to_string())

                                           file_name  label parcel_id
0   22-0223605_F78EFF88702EA742E0530EEE260AEFC6.jpeg      1   0223605
1   22-0223605_F78EFF887030A742E0530EEE260AEFC6.jpeg      1   0223605
2   22-0223605_F78EFF88703AA742E0530EEE260AEFC6.jpeg      1   0223605
28  22-0225160_F78EFF887012A742E0530EEE260AEFC6.jpeg      1   0225160
29  22-0225160_F78EFF887013A742E0530EEE260AEFC6.jpeg      1   0225160
30  22-0225160_F78EFF887014A742E0530EEE260AEFC6.jpeg      1   0225160
31  22-0225160_F78EFF887015A742E0530EEE260AEFC6.jpeg      1   0225160
32  22-0225160_F78EFF887016A742E0530EEE260AEFC6.jpeg      1   0225160
33  22-0225160_F78EFF887017A742E0530EEE260AEFC6.jpeg      1   0225160
34  22-0225160_F78EFF88701AA742E0530EEE260AEFC6.jpeg      1   0225160
35  22-0225160_F78EFF88701BA742E0530EEE260AEFC6.jpeg      1   0225160
36  22-0225160_F78EFF88701CA742E0530EEE260AEFC6.jpeg      1   0225160
37  22-0225160_F78EFF88701DA742E0530EEE260AEFC6.jpeg      1   0225160
38  22-0225160_F78EF

In [54]:
# Split train into 80/20 train/valid

train_set2 = train_set
splitter2 = GroupShuffleSplit(test_size = .2, n_splits = 1, random_state = random_seed)
split2 = splitter2.split(train_set2, groups = train_set2.parcel_id)
train_inds2, valid_inds = next(split2)

train_set2 = train_set.iloc[train_inds2]
valid_set = train_set.iloc[valid_inds]

# Print train, valid, test
print(train_set.to_string())
print(train_set2)
print(valid_set)

# Save to csv

train_set2.to_csv(r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/train_set.csv', sep = ',', encoding='utf-8', index=False)
valid_set.to_csv(r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/valid_set.csv', sep = ',', encoding='utf-8', index=False)
test_set.to_csv(r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/test_set.csv', sep = ',', encoding='utf-8', index=False)

                                           file_name  label parcel_id
0   22-0223360_F796528C3F8CECCDE0530EEE260A12DE.jpeg      0   0223360
1   22-0223360_F796528C3F8DECCDE0530EEE260A12DE.jpeg      0   0223360
2   22-0223360_F796528C3F8EECCDE0530EEE260A12DE.jpeg      0   0223360
3   22-0223360_F796528C40F1ECCDE0530EEE260A12DE.jpeg      0   0223360
4   22-0223360_F796528C40F2ECCDE0530EEE260A12DE.jpeg      0   0223360
5   22-0223360_F796528C40F3ECCDE0530EEE260A12DE.jpeg      0   0223360
6   22-0223360_F796528C4169ECCDE0530EEE260A12DE.jpeg      0   0223360
7   22-0223360_F796528C416AECCDE0530EEE260A12DE.jpeg      0   0223360
8   22-0223360_F796528C416BECCDE0530EEE260A12DE.jpeg      0   0223360
9   22-0223402_F796528C48EDECCDE0530EEE260A12DE.jpeg      0   0223402
10  22-0223402_F796528C48EEECCDE0530EEE260A12DE.jpeg      0   0223402
11  22-0223402_F796528C48EFECCDE0530EEE260A12DE.jpeg      0   0223402
12  22-0223402_F796528C48F0ECCDE0530EEE260A12DE.jpeg      0   0223402
13  22-0223402_F7965

## Data preparation

In [4]:
# https://www.youtube.com/watch?v=ZoZHd0Zm3RY&ab_channel=AladdinPersson

set_all_seeds(random_seed)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



from customDataset import LBSTDataset



# Load data: From images to tensor

train_set = LBSTDataset(csv_file = r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/train_set.csv', 
                      root_dir = r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/All',
                      transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
]))

test_set = LBSTDataset(csv_file = r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/test_set.csv', 
                      root_dir = r'C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/All',
                      transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
]))

image, label = train_set[0]
print(type(image))
len(train_set)
print(train_set)

<class 'torch.Tensor'>
<customDataset.LBSTDataset object at 0x000001E6B8894C40>


In [5]:
set_all_seeds(random_seed)


# Split data properly without information leak

#train_set, test_set = torch.utils.data.random_split(dataset, [60,20])

print('The shape of tensor for 50th image in train dataset: ',train_set[49][0].shape)
print('The label for 50th image in train dataset: ',train_set[49][1])

# Traind and test loader

train_loader = DataLoader(dataset = train_set,
                          batch_size = batch_size,
                          shuffle = True)

# print batch of image tensor
print('print 1st batch of image tensor:', next(iter(train_loader))[0].shape)
print('print batch of corresponding labels:', next(iter(train_loader))[1].shape)

test_loader = DataLoader(dataset = test_set,
                         batch_size = batch_size,
                         shuffle = True)

# Model

set_all_seeds(random_seed)

model = torchvision.models.googlenet(weights='GoogLeNet_Weights.DEFAULT')
model.to(device)

# Loss and optimizer

criterion = nn.CrossEntropyLoss() #BCELoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

The shape of tensor for 50th image in train dataset:  torch.Size([3, 224, 224])
The label for 50th image in train dataset:  tensor(1)
print 1st batch of image tensor: torch.Size([32, 3, 224, 224])
print batch of corresponding labels: torch.Size([32])


In [6]:
# Train network

for epoch in range(num_epochs):
    losses = []
    
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Get data to cuda if possible
        data = data.to(device=device)
        targets = targets.to(device=device)
        

        #forward
        scores = model(data)
        loss = criterion(scores, targets)

        losses.append(loss.item())

        #backward
        optimizer.zero_grad()
        loss.backward()

        #gradient descent or adam step
        optimizer.step()

    print(f'Cost at epoch {epoch} is {sum(losses)/len(losses)}')

# Check acc on training

def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
            
        print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100}')
        
    model.train()
    
print("Checking accuracy on training set")
check_accuracy(train_loader, model)

print("Checking accuracy on test set")
check_accuracy(test_loader, model)

Cost at epoch 0 is 6.375879287719727
Checking accuracy on training set
Got 34 / 65 with accuracy 52.307692307692314
Checking accuracy on test set
Got 0 / 15 with accuracy 0.0


In [3]:
import os

In [5]:
os.getcwd()

'C:\\Users\\kaspe'

# data_loader.py
specifies how the data should be fed to the network: LightningDataModule

In short, data preparation has 4 steps:

- Download images
- Image transforms (these are highly subjective).
- Generate training, validation and test dataset splits.
- Wrap each dataset split in a DataLoader


In [5]:
# https://lightning.ai/docs/pytorch/latest/data/datamodule.html

import lightning.pytorch as pl
from torch.utils.data import random_split, DataLoader

# Note - you must have torchvision installed for this example
from torchvision.datasets import MNIST
from torchvision import transforms


class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "./"):
        super().__init__()
        self.data_dir = data_dir
        self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

    def prepare_data(self):
        # download
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage: str):
        # Assign train/val datasets for use in dataloaders
        if stage == "fit":
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test":
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

        if stage == "predict":
            self.mnist_predict = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=32)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=32)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=32)

    def predict_dataloader(self):
        return DataLoader(self.mnist_predict, batch_size=32)

data_module = MNISTDataModule()

# model.py
specifies the neural network architecture, the loss function and evaluation metrics

In [4]:
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import datasets, transforms
import os

class LightningMNISTClassifier(pl.LightningModule):

  def __init__(self):
    super().__init__()

    # mnist images are (1, 28, 28) (channels, width, height) 
    self.layer_1 = torch.nn.Linear(28 * 28, 128)
    self.layer_2 = torch.nn.Linear(128, 256)
    self.layer_3 = torch.nn.Linear(256, 10)

  def forward(self, x):
      batch_size, channels, width, height = x.size()

      # (b, 1, 28, 28) -> (b, 1*28*28)
      x = x.view(batch_size, -1)

      # layer 1 (b, 1*28*28) -> (b, 128)
      x = self.layer_1(x)
      x = torch.relu(x)

      # layer 2 (b, 128) -> (b, 256)
      x = self.layer_2(x)
      x = torch.relu(x)

      # layer 3 (b, 256) -> (b, 10)
      x = self.layer_3(x)

      # probability distribution over labels
      x = torch.log_softmax(x, dim=1)

      return x

  def cross_entropy_loss(self, logits, labels):
    return F.nll_loss(logits, labels)

  def training_step(self, train_batch, batch_idx):
      x, y = train_batch
      logits = self.forward(x)
      loss = self.cross_entropy_loss(logits, y)
      self.log('train_loss', loss)
      return loss


  def validation_step(self, val_batch, batch_idx):
      x, y = val_batch
      logits = self.forward(x)
      loss = self.cross_entropy_loss(logits, y)
      self.log('val_loss', loss)

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    return optimizer


model = LightningMNISTClassifier()

# train.py
train models and save them

In [None]:
---
---
---
trainer.fit(model, data_module)

# Save the model with help from utils.py
utils.save_model(model=model1,
                 target_dir="models",
                 model_name="Scratch.pth")
utils.save_model(model=model2,
                 target_dir="models",
                 model_name="ResNet-18.pth")
utils.save_model(model=model3,
                 target_dir="models",
                 model_name="VGG-16.pth")

# evaluate.py
contains the main loop for evaluating the model

# search_hyperparams.py
hyper parameter search

# synthesize_results.py
An author synthesizes study data by combining the results together to enable comparison and to allow others to draw further conclusions from them

# evaluate.py
contains the main loop for evaluating the model

# utils.py
utility functions for handling hyperparams/logging/storing model

In [None]:
BATCH_SIZE = 128
NUM_EPOCHS = 20
LEARNING_RATE = 0.005
NUM_WORKERS = 4