In [127]:
import os, sys
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import random
from PIL import Image
import pytorch_lightning as pl
def set_all_seeds(seed):
    """Seed every random number generator used in this notebook.

    Publishes the seed through the ``PL_GLOBAL_SEED`` environment variable and
    then seeds, in order, Python's ``random`` module, NumPy's global generator,
    PyTorch's default generator and PyTorch's CUDA generators.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


In [128]:
# Compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters
random_seed = 1
num_classes = 2
batch_size = 32
num_epochs = 3
learning_rate = 1e-3
input_size = 3 * 224 * 224  # length of one flattened image: 3 channels * 224 * 224 pixels

# Seed all random number generators before any data or model code runs
set_all_seeds(random_seed)

In [129]:
# Reduce code length by creating a _common_step function, in order not to have to repeat code in training_step, validation_step and test_step


class NN(pl.LightningModule): # pl.LightningModule inherits from nn.Module and adds extra functionality
    """Two-layer fully connected classifier wrapped as a LightningModule.

    Each batch is an ``(x, y)`` pair. ``x`` is flattened to ``(batch, -1)``
    before it is passed through ``fc1 -> ReLU -> fc2``; the resulting scores
    are compared with ``y`` using cross-entropy.

    Args:
        input_size: Number of features per sample after flattening.
        num_classes: Number of output scores per sample.
        learning_rate: Step size handed to Adam. Defaults to ``1e-3``, the
            value that used to be hard-coded in ``configure_optimizers``.
    """

    def __init__(self, input_size, num_classes, learning_rate=1e-3):
        super().__init__()
        self.learning_rate = learning_rate
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        """Map already-flattened inputs to raw (unnormalised) class scores."""
        x = F.relu(self.fc1(x))
        return self.fc2(x)

    def _flat_scores(self, x):
        """Flatten everything but the batch dimension and return class scores."""
        # Call the module itself rather than .forward() so that registered
        # module hooks are honoured.
        return self(x.reshape(x.size(0), -1))

    def _common_step(self, batch, batch_idx):
        """Shared body of the train/val/test steps.

        Returns:
            Tuple ``(loss, scores, y)`` for the given batch.
        """
        x, y = batch
        scores = self._flat_scores(x)
        loss = self.loss_fn(scores, y)
        return loss, scores, y

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute, log (as ``test_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('test_loss', loss)
        return loss

    def predict_step(self, batch, batch_idx):
        """Return the index of the highest-scoring class for each sample."""
        x, _ = batch  # labels are part of the batch but not needed here
        scores = self._flat_scores(x)
        return torch.argmax(scores, dim=1)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at ``self.learning_rate``."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [130]:
# Find folder paths

base_path = r"C:/Users/kaspe/OneDrive - Aarhus Universitet/Skrivebord/BI/4. semester/Data/LBST/Danish Challenge/2023 J#/"
# os.listdir returns entries in arbitrary order. Sort the listing so that
# positional lookups into target_dirs give the same entry on every run.
target_dirs = sorted(os.listdir(base_path))
print(target_dirs)

['All', 'Approved', 'NonApproved', 'test_set.csv', 'train_set.csv', 'val_set.csv', 'xlbst.csv']


In [131]:
# Create 2 separate df's: one for Approved images, one for NonApproved, containing file_name and label

# Address the class folders by name. A position in a directory listing is not
# stable: os.listdir order is arbitrary, and every file written into base_path
# (such as the CSVs produced by this notebook) can shift the indices.
approved_dir = os.path.join(base_path, 'Approved')
nonapproved_dir = os.path.join(base_path, 'NonApproved')

# Assign label = 0 to Approved images (sorted so the row order is reproducible)
approved = pd.DataFrame(data = sorted(os.listdir(approved_dir)), columns = ['file_name'])
approved = approved.assign(label = 0)

# Assign label = 1 to NonApproved images
nonapproved = pd.DataFrame(data = sorted(os.listdir(nonapproved_dir)), columns = ['file_name'])
nonapproved = nonapproved.assign(label = 1)

# Merge into 1 df (each part keeps its own 0-based index, so index labels repeat)
df = pd.concat([approved, nonapproved])

# Add parcel_id column: the 7 characters at positions 3..9 of file_name,
# e.g. '22-0223605_....jpeg' -> '0223605'
df['parcel_id'] = df['file_name'].str[3:10]

# Write .csv next to the image folders
df.to_csv(os.path.join(base_path, 'xlbst.csv'), sep = ',', encoding='utf-8', index=False)

In [132]:
# Split group by
# Hold out a test set with a group-aware shuffle split: rows are grouped by
# parcel_id, so all images of one parcel end up on the same side of the split.

set_all_seeds(random_seed)

from sklearn.model_selection import GroupShuffleSplit

# test_size=0.20 is the held-out fraction. n_splits=2 configures two
# re-shufflings, but only the first one is consumed by next() below.
splitter = GroupShuffleSplit(test_size=0.20, n_splits=2, random_state=random_seed)
split = splitter.split(df, groups=df.parcel_id)
train_inds, test_inds = next(split)

# The splitter yields positional indices, hence .iloc rather than .loc
train_set = df.iloc[train_inds]
test_set = df.iloc[test_inds]


# NOTE(review): this prints every row of the test frame; a .head() may be enough
print(test_set.to_string())
print('Test set length:',len(test_set), 'Train set length:', len(train_set), sep='\n')

                                           file_name  label parcel_id
0   22-0223605_F78EFF88702EA742E0530EEE260AEFC6.jpeg      1   0223605
1   22-0223605_F78EFF887030A742E0530EEE260AEFC6.jpeg      1   0223605
2   22-0223605_F78EFF88703AA742E0530EEE260AEFC6.jpeg      1   0223605
28  22-0225160_F78EFF887012A742E0530EEE260AEFC6.jpeg      1   0225160
29  22-0225160_F78EFF887013A742E0530EEE260AEFC6.jpeg      1   0225160
30  22-0225160_F78EFF887014A742E0530EEE260AEFC6.jpeg      1   0225160
31  22-0225160_F78EFF887015A742E0530EEE260AEFC6.jpeg      1   0225160
32  22-0225160_F78EFF887016A742E0530EEE260AEFC6.jpeg      1   0225160
33  22-0225160_F78EFF887017A742E0530EEE260AEFC6.jpeg      1   0225160
34  22-0225160_F78EFF88701AA742E0530EEE260AEFC6.jpeg      1   0225160
35  22-0225160_F78EFF88701BA742E0530EEE260AEFC6.jpeg      1   0225160
36  22-0225160_F78EFF88701CA742E0530EEE260AEFC6.jpeg      1   0225160
37  22-0225160_F78EFF88701DA742E0530EEE260AEFC6.jpeg      1   0225160
38  22-0225160_F78EF

In [133]:
# Split train into 80/20 train/val
# Grouped by parcel_id again, so a parcel never appears in both train and val.

splitter2 = GroupShuffleSplit(test_size=0.2, n_splits =1, random_state=random_seed)
split2 = splitter2.split(train_set, groups=train_set.parcel_id)
train_inds2, val_inds = next(split2)

# The splitter yields positional indices into train_set
train_set2 = train_set.iloc[train_inds2]
val_set = train_set.iloc[val_inds]

# Sanity checks: the partition is complete and no parcel leaks across splits
assert len(train_set2) + len(val_set) == len(train_set)
assert set(train_set2.parcel_id).isdisjoint(val_set.parcel_id)
assert set(test_set.parcel_id).isdisjoint(train_set.parcel_id)

# Save to csv, next to the image folders under base_path

train_set2.to_csv(os.path.join(base_path, 'train_set.csv'), sep = ',', encoding='utf-8', index=False)
val_set.to_csv(os.path.join(base_path, 'val_set.csv'), sep = ',', encoding='utf-8', index=False)
test_set.to_csv(os.path.join(base_path, 'test_set.csv'), sep = ',', encoding='utf-8', index=False)

In [134]:
# https://www.youtube.com/watch?v=ZoZHd0Zm3RY&ab_channel=AladdinPersson

set_all_seeds(random_seed)

# Create PyTorch compatible datasets: From images to tensors

from customDataset import LBSTDataset

# One preprocessing pipeline shared by all three splits: resize to 224x224 and
# convert to a tensor. Augmentations (RandomHorizontalFlip, RandomResizedCrop)
# and Normalize are currently disabled. If augmentation is enabled later, give
# the training set its own Compose so val/test stay un-augmented.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Every split reads its images from the same folder; only the CSV differs
image_dir = os.path.join(base_path, 'All')

train_set = LBSTDataset(csv_file = os.path.join(base_path, 'train_set.csv'),
                        root_dir = image_dir,
                        transform = image_transform)

val_set = LBSTDataset(csv_file = os.path.join(base_path, 'val_set.csv'),
                      root_dir = image_dir,
                      transform = image_transform)

test_set = LBSTDataset(csv_file = os.path.join(base_path, 'test_set.csv'),
                       root_dir = image_dir,
                       transform = image_transform)

# A bare len(train_set) in the middle of a cell is never displayed, so print it
print('Number of images in train dataset: ', len(train_set))
print(train_set)
print('The shape of tensor for 50th image in train dataset: ',train_set[49][0].shape)
print('The label for 50th image in train dataset: ',train_set[49][1])

<customDataset.LBSTDataset object at 0x000001BB00118880>
The shape of tensor for 50th image in train dataset:  torch.Size([3, 224, 224])
The label for 50th image in train dataset:  tensor(1)


In [135]:
set_all_seeds(random_seed)

# Train, val and test loader

train_loader = DataLoader(dataset=train_set,
                          batch_size=batch_size,
                          shuffle=True) # Shuffling is done during the training to make sure we aren’t exposing our model to the same cycle (order) of data in every epoch.

val_loader = DataLoader(dataset=val_set,
                        batch_size=batch_size,
                        shuffle=False) # You don’t need to shuffle the validation and test datasets, since no training is done, the model is used in model.eval() and thus the order of samples won’t change the results.

test_loader = DataLoader(dataset=test_set,
                         batch_size=batch_size,
                         shuffle=False)

# Draw ONE batch and inspect both halves of it. Calling next(iter(train_loader))
# twice would load two different shuffled batches, so the labels shown would not
# belong to the images shown.
images, labels = next(iter(train_loader))
print('print 1st batch of image tensor:', images.shape)
print('print batch of corresponding labels:', labels.shape)

print 1st batch of image tensor: torch.Size([32, 3, 224, 224])
print batch of corresponding labels: torch.Size([32])


In [136]:
# Instantiate the classifier with the configured sizes and move it to `device`
model = NN(input_size=input_size, num_classes=num_classes).to(device)

In [137]:
# Show how many CPUs the OS reports (a common starting point when choosing DataLoader num_workers)
os.cpu_count() 

12

In [138]:
# Train network
# NOTE(review): seed_everything is imported from `lightning.pytorch` while the
# module and trainer use `pytorch_lightning` (pl) - consider using one package.
from lightning.pytorch import Trainer, seed_everything

# NOTE(review): this seed (42) differs from random_seed used by set_all_seeds - confirm intentional.
seed_everything(42, workers=True) # By setting workers=True in seed_everything(), Lightning derives unique seeds across all dataloader workers and processes for torch, numpy and stdlib random number generators. When turned on, it ensures that e.g. data augmentations are not repeated across workers.

# max_epochs comes from the num_epochs hyperparameter instead of a second
# hard-coded 3, so changing the config cell actually changes the training length.
trainer = pl.Trainer(accelerator='auto', min_epochs=1, max_epochs=num_epochs, deterministic=True) # deterministic ensures random seed reproducibility
# trainer.tune finds optimal hyper parameters, eg batch size and learning rate

trainer.fit(model, train_loader, val_loader)
trainer.validate(model, val_loader)
trainer.test(model, test_loader)

# A general place to start is to set num_workers equal to the number of CPU cores on that machine. You can get the number of CPU cores in python using os.cpu_count(), but note that depending on your batch size, you may overflow RAM memory.


Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type             | Params
---------------------------------------------
0 | fc1     | Linear           | 7.5 M 
1 | fc2     | Linear           | 102   
2 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
7.5 M     Trainable params
0         Non-trainable params
7.5 M     Total params
30.106    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                            

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 2/2 [00:01<00:00,  1.43it/s, v_num=11]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 2/2 [00:01<00:00,  1.11it/s, v_num=11]33it/s][A
Epoch 1: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s, v_num=11]       [A
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 1: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s, v_num=11]08it/s][A
Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.48it/s, v_num=11]       [A
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s, v_num=11]51it/s][A
Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s, v_num=11]       [A

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s, v_num=11]
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 98.89it/s] 


  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 111.38it/s]


[{'test_loss': 4.881039058091119e-05}]

In [114]:
class NN(nn.Module):
    """Plain-PyTorch two-layer perceptron: input -> 50 units (ReLU) -> class scores.

    NOTE(review): a second ``class NN`` follows in this cell and rebinds the
    name, so this definition is shadowed once the cell has run.
    """

    def __init__(self, input_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: Number of input features per sample.
            num_classes: Number of output scores per sample.
        """
        super().__init__()
        hidden_units = 50
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) class scores for ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    
class NN(pl.LightningModule): # pl.LightningModule inherits from nn.Module and adds extra functionality
    """Two-layer fully connected classifier wrapped as a LightningModule.

    Each batch is an ``(x, y)`` pair. ``x`` is flattened to ``(batch, -1)``
    before it is passed through ``fc1 -> ReLU -> fc2``; the resulting scores
    are compared with ``y`` using cross-entropy.

    Args:
        input_size: Number of features per sample after flattening.
        num_classes: Number of output scores per sample.
        learning_rate: Step size handed to Adam. Defaults to ``1e-3``, the
            value that used to be hard-coded in ``configure_optimizers``.
    """

    def __init__(self, input_size, num_classes, learning_rate=1e-3):
        super().__init__()
        self.learning_rate = learning_rate
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        """Map already-flattened inputs to raw (unnormalised) class scores."""
        x = F.relu(self.fc1(x))
        return self.fc2(x)

    def _flat_scores(self, x):
        """Flatten everything but the batch dimension and return class scores."""
        return self(x.reshape(x.size(0), -1))

    def _common_step(self, batch, batch_idx):
        """Shared body of the train/val/test steps; returns ``(loss, scores, y)``."""
        x, y = batch
        scores = self._flat_scores(x) # compute scores
        # The criterion lives on the module (self.loss_fn). torch.nn.functional
        # has no `loss_fn`, so `F.loss_fn(...)` raised AttributeError.
        loss = self.loss_fn(scores, y) # compute losses
        return loss, scores, y

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute, log (as ``test_loss``) and return the batch loss."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('test_loss', loss)
        return loss

    def predict_step(self, batch, batch_idx):
        """Return the index of the highest-scoring class for each sample."""
        x, _ = batch  # labels are part of the batch but not needed here
        scores = self._flat_scores(x)
        return torch.argmax(scores, dim=1)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at ``self.learning_rate``."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)