# Importing libraries

In [145]:
# Fundamental libraries

import os
import random
import time
import copy

# Working libraries
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from torch.autograd import Variable
import torch.utils.data as data
import torchvision
from torchvision import transforms
from PIL import Image
from transformers import AutoImageProcessor, ViTForImageClassification, ViTConfig
import torchvision.transforms.functional as TF
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets, models
import torch.optim as optim
from torch.optim import lr_scheduler
from importlib import reload
import utility
reload(utility)
from utility import load_data, plot_confusion_matrix, plot_average_f1_scores, train_model, get_classification_details, get_hard_disk_path, show_samples, plot_features_importance, visualize_correlation, get_shap

# Evaluation
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay

# FILE PATHS on local environment

#### Each folder path represents a class (label):

**Folder name - class name : description**

0 - 0 class : dead

1 - 1 class : empty

2 - 2 class : keep0

3 - 3 class : keep1

4 - 4 class : keep2

5 - 5 class : reseed0

6 - 6 class : reseed1

7 -  7 class : split

In [86]:
# Root directory of the data set, resolved through the project helper
FILE_PATH = get_hard_disk_path("DL")

# One sub-folder per class; the position in the list is the class index
PATHES_LIST = [
    FILE_PATH + class_folder
    for class_folder in ('dead', 'empty', 'keep0', 'keep1', 'keep2', 'reseed0', 'reseed1', 'split')
]

# Individual per-class path constants, exposed under their usual names
(
    TRAIN_FEATURES_PATH_0,
    TRAIN_FEATURES_PATH_1,
    TRAIN_FEATURES_PATH_2,
    TRAIN_FEATURES_PATH_3,
    TRAIN_FEATURES_PATH_4,
    TRAIN_FEATURES_PATH_5,
    TRAIN_FEATURES_PATH_6,
    TRAIN_FEATURES_PATH_7,
) = PATHES_LIST

Successfully loaded data from D:/data_for_DL_augmented/


# Data set : train & test

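The data was divided into training and testing sets for each class, with a split of 80% for training and 20% for testing, maintaining the same ratio for each class. Note that the file counts printed below (576 training vs. 28 test images in total) do not show this ratio directly; the training folders appear to also contain augmented copies (every training count is a multiple of 6). The dataset also exhibits an imbalance issue: some classes have significantly more samples than others (for example, `keep1` has 114 training images while `reseed0` has only 24).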

In [87]:
# Human-readable class names, in the same order as PATHES_LIST
labels = ['dead', 'empty', 'keep0', 'keep1', 'keep2', 'reseed0', 'reseed1', 'split']

# Report how many entries each class has in its train/ and test/ sub-folders
for i, path in enumerate(PATHES_LIST):
    print(path)
    n_train = len(os.listdir(os.path.join(path, "train")))
    n_test = len(os.listdir(os.path.join(path, "test")))
    print("Class ", labels[i], " : train : ", n_train, " test : ", n_test)

D:/data_for_DL_augmented/dead
Class  dead  : train :  78  test :  4
D:/data_for_DL_augmented/empty
Class  empty  : train :  72  test :  3
D:/data_for_DL_augmented/keep0
Class  keep0  : train :  96  test :  5
D:/data_for_DL_augmented/keep1
Class  keep1  : train :  114  test :  5
D:/data_for_DL_augmented/keep2
Class  keep2  : train :  90  test :  4
D:/data_for_DL_augmented/reseed0
Class  reseed0  : train :  24  test :  2
D:/data_for_DL_augmented/reseed1
Class  reseed1  : train :  48  test :  2
D:/data_for_DL_augmented/split
Class  split  : train :  54  test :  3


### Customise test data set

The class `CustomImageDataset_test` is a custom dataset class used for testing or evaluating a machine learning model on a specific set of images. Here's a brief explanation of why it's needed:

1. **Path List**: The `path_list` parameter contains the list of paths to directories where the images are located. Each directory represents a different class or category.

2. **Transform**: The `transform` parameter represents the image transformations that need to be applied to each image, such as resizing, normalization, or augmentation.

3. **Ratio**: The `ratio` parameter is the fraction of each class assigned to training; the remaining `1 - ratio` share is what this class keeps for testing. It allows you to specify the desired split between the training and testing datasets.

4. **Initialization**: During initialization, the class calculates the size of each class based on the number of image files in the corresponding directory. It then determines the number of images to be included in the test set based on the specified ratio.

5. **Data Organization**: The class organizes the image paths and their corresponding labels, keeping only the images that will be used for testing. This ensures that the test dataset contains the desired proportion of samples from each class.

6. **Length and Indexing**: The `__len__` method returns the total number of images in the test set, while the `__getitem__` method allows indexing to retrieve a specific image and its label.

7. **Image Processing**: Within `__getitem__`, the class reads the image from the file path, converts it to a floating-point tensor scaled to the range [0, 1], applies the specified transformations, and finally passes the result through the `AutoImageProcessor` of the chosen model. The processed image tensor and its corresponding label are then returned as a tuple.

By implementing this custom dataset class, you can easily load and process the test data in a standardized manner, making it convenient to evaluate the model's performance on a specific test set.

In [88]:
class CustomImageDataset_test(Dataset):
    """Held-out (test) portion of a folder-per-class image data set.

    Every directory in ``path_list`` is one class; its position in the list is
    the integer label. Inside each directory the ``.jpg`` files are sorted by
    name and the LAST ``round(n * (1 - ratio))`` of them form the test split,
    which is the complement of the files ``CustomImageDataset_train`` keeps
    for the same ``ratio``.

    Args:
        path_list: class directories, in label order.
        transform: optional callable applied to the float image tensor before
            the image processor; pass ``None`` to skip it.
        ratio: fraction of each class reserved for training.
        model_name: checkpoint name handed to
            ``AutoImageProcessor.from_pretrained``.
    """

    def __init__(self, path_list, transform, ratio, model_name):
        self.path_list = path_list
        self.transform = transform
        self.processor = AutoImageProcessor.from_pretrained(model_name)  # Initialize the AutoImageProcessor
        self.img_labels = []
        self.img_sort = []

        for label, img_dir in enumerate(path_list):
            held_out = self._held_out_files(img_dir, ratio)
            self.img_labels.extend([label] * len(held_out))
            self.img_sort.extend(held_out)

        # Kept as an attribute for callers that read ``.len`` directly
        self.len = len(self.img_sort)

    @staticmethod
    def _held_out_files(img_dir, ratio):
        """Return the sorted test-split ``.jpg`` paths of one class directory."""
        # Sort BEFORE slicing: os.listdir returns names in arbitrary order, so
        # slicing the unsorted listing could pick files that the training
        # split (first files in sorted order) also uses.
        files = sorted(f for f in os.listdir(img_dir) if f.endswith(".jpg"))
        n_test = min(max(round(len(files) * (1 - ratio)), 0), len(files))
        # Slice from an explicit start index: files[-0:] would return every
        # file when the test share rounds to zero.
        return [os.path.join(img_dir, f) for f in files[len(files) - n_test:]]

    def __len__(self):
        """Number of test images over all classes."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for the image at position ``idx``."""
        img_path = self.img_sort[idx]
        image = read_image(img_path).float() / 255  # Loading the image and scaling to [0, 1]
        label = self.img_labels[idx]

        # Grayscale files decode to a single channel; replicate it to three
        if image.shape[0] == 1:
            image = image.repeat(3, 1, 1)

        if self.transform:
            image = self.transform(image)

        # The image is already in [0, 1], so the processor must not divide by
        # 255 a second time (its default do_rescale=True would do exactly that)
        encoded = self.processor(images=image, return_tensors="pt", do_rescale=False)

        # Drop only the batch dimension the processor adds
        return encoded.pixel_values.squeeze(0), label

### Customise train data set

The class `CustomImageDataset_train` is another custom dataset class specifically designed for training a machine learning model using augmented images. Here's a brief explanation of why it's needed, along with the rationale behind using data augmentation techniques:

1. **Path List**: Similar to the previous class, the `path_list` parameter contains the list of paths to directories where the training images are located.

2. **Transform**: The `transform` parameter represents the image transformations that need to be applied to each training image, such as resizing, normalization, or augmentation.

3. **Ratio**: The `ratio` parameter determines the proportion of images that will be used for training. It allows you to specify the desired split between the training and testing datasets.

4. **Initialization**: During initialization, the class calculates the size of each class based on the number of image files in the corresponding directory. It then determines the number of images to be included in the training set based on the specified ratio.

5. **Data Organization**: The class organizes the image paths and their corresponding labels, keeping only the images that will be used for training.

6. **Length and Indexing**: The `__len__` method returns the total number of images in the training set, while the `__getitem__` method allows indexing to retrieve a specific image and its label.

7. **Image Processing**: Within `__getitem__`, the class reads the image from the file path, converts it to a floating-point tensor, and applies the specified transformations. Additionally, the class applies data augmentation techniques to the training images to enhance the diversity and generalizability of the dataset.

8. **Augmentation Techniques**: The `augmentation` method implements several augmentation techniques commonly used for image data: random horizontal flipping and random vertical flipping (each applied with probability 0.5), followed by a rotation whose angle is drawn at random from 0, 90, 180, or 270 degrees. By applying these techniques randomly to each training image, the dataset becomes more robust and less sensitive to variations in the input data.

The purpose of using data augmentation is to artificially increase the size and variability of the training dataset. This helps prevent overfitting and improves the model's ability to generalize to new, unseen data. By introducing random transformations during training, the model becomes more resilient to variations in the test data and can better handle real-world scenarios.

Overall, the `CustomImageDataset_train` class provides a convenient way to load, preprocess, and augment the training data, enabling effective training of machine learning models on a diverse and expanded dataset.

In [89]:
class CustomImageDataset_train(Dataset):
    """Training portion of a folder-per-class image data set, with augmentation.

    Every directory in ``path_list`` is one class; its position in the list is
    the integer label. Inside each directory the ``.jpg`` files are sorted by
    name and the FIRST ``n - round(n * (1 - ratio))`` of them form the
    training split. Each image is randomly flipped and rotated every time it
    is fetched.

    Args:
        path_list: class directories, in label order.
        transform: optional callable applied to the float image tensor before
            augmentation; pass ``None`` to skip it.
        ratio: fraction of each class used for training.
        model_name: checkpoint name handed to
            ``AutoImageProcessor.from_pretrained``.
    """

    def __init__(self, path_list, transform, ratio, model_name):
        self.path_list = path_list
        self.transform = transform
        self.processor = AutoImageProcessor.from_pretrained(model_name)  # Initialize the AutoImageProcessor
        self.img_labels = []
        self.img_sort = []

        for label, img_dir in enumerate(path_list):
            kept = self._training_files(img_dir, ratio)
            self.img_labels.extend([label] * len(kept))
            self.img_sort.extend(kept)

        # Kept as an attribute for callers that read ``.len`` directly
        self.len = len(self.img_sort)

    @staticmethod
    def _training_files(img_dir, ratio):
        """Return the sorted training-split ``.jpg`` paths of one class directory."""
        # List the directory once and sort by name so the split is deterministic
        files = sorted(f for f in os.listdir(img_dir) if f.endswith(".jpg"))
        n_test = min(max(round(len(files) * (1 - ratio)), 0), len(files))
        n_train = len(files) - n_test
        return [os.path.join(img_dir, f) for f in files[:n_train]]

    def __len__(self):
        """Number of training images over all classes."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for a freshly augmented image."""
        img_path = self.img_sort[idx]
        image = read_image(img_path).float() / 255.  # Loading the image and scaling to [0, 1]
        label = self.img_labels[idx]

        # Grayscale files decode to a single channel; replicate it to three
        if image.shape[0] == 1:
            image = image.repeat(3, 1, 1)

        if self.transform:
            image = self.transform(image)

        image = self.augmentation(image)  # Applying custom augmentation

        # The image is already in [0, 1], so the processor must not divide by
        # 255 a second time (its default do_rescale=True would do exactly that)
        encoded = self.processor(images=image, return_tensors="pt", do_rescale=False)

        # Drop only the batch dimension the processor adds
        return encoded.pixel_values.squeeze(0), label

    def augmentation(self, image):
        """Randomly flip the image and rotate it by a multiple of 90 degrees."""
        # Random horizontal flipping
        if random.random() > 0.5:
            image = TF.hflip(image)

        # Random vertical flipping
        if random.random() > 0.5:
            image = TF.vflip(image)

        # Random rotation by 0, 90, 180, or 270 degrees
        degree = random.choice([0, 90, 180, 270])
        image = TF.rotate(image, degree)

        return image

### Split the data into train and test sets with ratio 0.8

To utilize the CustomImageDataset_test and CustomImageDataset_train, it is necessary to specify the desired transformations for each case, such as Normalize and CenterCrop.

The following code demonstrates how to define the transformations:

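For test data transformation using CenterCrop (note: the next cell instead redefines `CustomImageDataset_test` so that it loads every `.jpg` file from the directories it is given, without a `ratio` argument; this later definition replaces the ratio-based one above):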

In [144]:
class CustomImageDataset_test(Dataset):
    """Folder-per-class image data set that uses every ``.jpg`` it finds.

    Each entry of ``path_list`` is a class directory and its position in the
    list is the integer label. ``transform`` (optional) is applied to the
    float image tensor; when ``model_name`` is given, the matching
    ``AutoImageProcessor`` is loaded and applied last.
    """

    def __init__(self, path_list, transform=None, model_name=None):
        self.path_list = path_list
        self.transform = transform

        # The processor is only fetched when a checkpoint name is supplied
        self.processor = (
            AutoImageProcessor.from_pretrained(model_name)
            if model_name is not None
            else None
        )

        self.img_labels = []
        self.img_sort = []

        for class_idx, folder in enumerate(path_list):
            # All .jpg files of the class, in name order
            jpg_names = [name for name in os.listdir(folder) if name.endswith(".jpg")]
            jpg_names.sort()
            self.img_labels += [class_idx] * len(jpg_names)
            self.img_sort += [os.path.join(folder, name) for name in jpg_names]

    def __len__(self):
        """Total number of images over all class directories."""
        return len(self.img_sort)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the file at position ``idx``."""
        img_path = self.img_sort[idx]
        image = read_image(img_path).float()
        label = self.img_labels[idx]

        # Single-channel input (channel-first layout assumed) becomes 3-channel
        channels = image.shape[0]
        if channels == 1:
            image = image.repeat(3, 1, 1)

        # Optional user transform
        if self.transform:
            image = self.transform(image)

        # Without a processor the (possibly transformed) tensor is returned as is
        if not self.processor:
            return image, label

        encoded = self.processor(images=image, return_tensors="pt")
        return encoded.pixel_values.squeeze(), label

In [136]:
# Per-class directories of each split, normalised to the platform's path style
train_path_list = []
test_path_list = []
for class_dir in PATHES_LIST:
    train_path_list.append(os.path.normpath(os.path.join(class_dir, "train")))
    test_path_list.append(os.path.normpath(os.path.join(class_dir, "test")))

In [137]:
# Sanity check: show the resolved per-class training directories
print(train_path_list)

['D:\\data_for_DL_augmented\\dead\\train', 'D:\\data_for_DL_augmented\\empty\\train', 'D:\\data_for_DL_augmented\\keep0\\train', 'D:\\data_for_DL_augmented\\keep1\\train', 'D:\\data_for_DL_augmented\\keep2\\train', 'D:\\data_for_DL_augmented\\reseed0\\train', 'D:\\data_for_DL_augmented\\reseed1\\train', 'D:\\data_for_DL_augmented\\split\\train']


In [138]:
# Both data sets share the same settings: no extra transform, and the image
# processor that belongs to the ViT checkpoint
dataset_options = {"transform": None, "model_name": 'google/vit-base-patch16-224'}

test_dataset = CustomImageDataset_test(path_list=test_path_list, **dataset_options)

# NOTE(review): the training set is also built with CustomImageDataset_test,
# not CustomImageDataset_train - confirm that skipping the random
# flip/rotate augmentation at load time is intended.
train_dataset = CustomImageDataset_test(path_list=train_path_list, **dataset_options)

In [139]:
# Sanity check: number of samples in the test set, then in the training set
print(len(test_dataset), len(train_dataset))

28 576


In [140]:
model_name = "google/vit-base-patch16-224"

# The data sets assign labels as indices into the class-folder list, so its
# length is the number of classes the classification head must output
num_labels = len(PATHES_LIST)

# Load the configuration of the checkpoint with the head resized to our classes
config = ViTConfig.from_pretrained(model_name, num_labels=num_labels)

# Load the PRETRAINED weights. ViTForImageClassification(config) would only
# build the architecture with randomly initialised weights, i.e. train from
# scratch. The checkpoint ships a 1000-class ImageNet head whose shape does not
# match num_labels, so ignore_mismatched_sizes=True lets that layer be freshly
# initialised while every other weight comes from the checkpoint.
model = ViTForImageClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True,
)

In [141]:
from torch.utils.data import DataLoader

# Batch size shared by both loaders
batch_size = 32

# Training batches are reshuffled every epoch; evaluation keeps a fixed order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [142]:
# Multi-class classification loss, computed on the raw logits
criterion = nn.CrossEntropyLoss()

# Adam over every model parameter
optimizer = optim.Adam(params=model.parameters(), lr=5e-5)

In [148]:
# Initialize a new wandb run
wandb.init(project="organoid_classification", entity="laurent-gurtler")

num_epochs = 100  # Number of epochs to train for

# Configurations (hyperparameters and model architecture).
# Record the values that are actually in use: the learning rate is read back
# from the optimizer instead of being typed in a second time, so the logged
# value cannot drift from the real one.
# NOTE(review): `config` previously held the ViTConfig; the rebinding is kept
# for backward compatibility - consider a distinct name.
config = wandb.config
config.learning_rate = optimizer.param_groups[0]["lr"]
config.num_epochs = num_epochs
config.batch_size = train_loader.batch_size

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

wandb.watch(model, criterion, log="all", log_freq=10)

# Stays None when no epoch is run, so nothing misleading is logged at the end
accuracy = None

try:
    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        running_loss = 0.0
        # The batch labels are called `targets` so that the module-level
        # `labels` list of class names is not overwritten by a tensor
        for images, targets in train_loader:
            # Move tensors to the configured device
            images = images.to(device)
            targets = targets.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass and loss
            logits = model(images).logits
            loss = criterion(logits, targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean loss per batch; guarded against an empty training loader
        epoch_loss = running_loss / max(len(train_loader), 1)

        # ---- Validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, targets in test_loader:
                images = images.to(device)
                targets = targets.to(device)
                predicted = model(images).logits.argmax(dim=1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        # Accuracy in percent; 0.0 when the test loader yields no samples
        accuracy = 100 * correct / total if total else 0.0

        # One log call per epoch keeps loss and accuracy on the same wandb step
        wandb.log({"epoch": epoch, "loss": epoch_loss, "accuracy": accuracy})

        # Print statistics
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}, Accuracy: {accuracy} %")

    # Final log
    if accuracy is not None:
        wandb.log({"final_accuracy": accuracy})

    print('Finished Training')
finally:
    # Close the wandb run even when training is interrupted or raises
    wandb.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

Problem at: c:\Users\Laurent\anaconda3\envs\organoids-project\Lib\site-packages\wandb\sdk\wandb_init.py 852 getcaller


CommError: Run initialization has timed out after 90.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-