In [38]:
# import libraries
import os
import numpy as np
import pandas as pd
from PIL import Image
# import cv2
from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import matplotlib.pyplot as plt
# import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

# 1. Label and One-Hot encode the data
Some images could have multiple labels (i.e. someone could be using the computer and have bad posture). The following code goes through the entire folder and creates a csv file with the image path and one-hot encoded labels of each image.

In [9]:
# Define the dataset directory and labels
dataset_dir = 'dataset/images'
labels = ['computer_use', 'asleep', 'reading', 'not_present', 'good_posture', 'bad_posture']

# Create an empty dataframe with columns for image paths, image names, and labels
df = pd.DataFrame(columns=['image_path', 'image_name'] + labels)

# Traverse the dataset directory
for root, dirs, files in os.walk(dataset_dir):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg', '.webp')):  # Check for image files
            image_path = os.path.join(root, file)
            image_name = os.path.basename(image_path)
            
            # Create a row with default label values set to 0
            row = {'image_path': image_path, 'image_name': image_name}
            for label in labels:
                row[label] = 0
            
            # Check the parent folder name and set the corresponding label to 1
            parent_folder = os.path.basename(root)
            if parent_folder in labels:
                row[parent_folder] = 1
            
            # Append the row to the dataframe
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Filter the dataframe to only include rows with duplicate image_name values
duplicates = df[df.duplicated('image_name', keep=False)]

# Sort the duplicates dataframe based on image_name
duplicates = duplicates.sort_values(by='image_name')

# Extract the root folders for each duplicate image_name
duplicates['root_folder'] = duplicates['image_path'].apply(lambda x: os.path.basename(os.path.dirname(x)))

# Group by image_name and aggregate the labels
aggregated_labels = duplicates.groupby('image_name')[labels].sum().clip(upper=1)

# Update the original dataframe with the aggregated labels
for image_name, row in aggregated_labels.iterrows():
    mask = df['image_name'] == image_name
    for label in labels:
        df.loc[mask, label] = row[label]

# Remove duplicates based on image_name, keeping only the first occurrence
df = df.drop_duplicates(subset='image_name', keep='first')

df.to_csv('dataset/labels.csv', index=False)
df

Unnamed: 0,image_path,image_name,computer_use,asleep,reading,not_present,good_posture,bad_posture
0,dataset/images/reading/rgp2 (2).jpg,rgp2 (2).jpg,0,0,1,0,1,0
1,dataset/images/reading/rbp1 (30).jpg,rbp1 (30).jpg,0,0,1,0,0,1
2,dataset/images/reading/rgp2 (14).jpg,rgp2 (14).jpg,0,0,1,0,1,0
3,dataset/images/reading/rgp1 (4).jpg,rgp1 (4).jpg,0,0,1,0,1,0
4,dataset/images/reading/rgp3 (14).jpg,rgp3 (14).jpg,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...
856,dataset/images/computer_use/busy-work-laptop-s...,busy-work-laptop-sits-large-empty-table-bright...,1,0,0,0,0,0
875,dataset/images/computer_use/20231015122142.jpg,20231015122142.jpg,1,0,0,0,0,0
876,dataset/images/computer_use/20231015122817.jpg,20231015122817.jpg,1,0,0,0,0,0
887,dataset/images/computer_use/istockphoto-164929...,istockphoto-1649292491-1024x1024.jpg,1,0,0,0,0,0


# 2. Preprocess the images & load the data
This involves resizing the images to a consistent size, converting them to grayscale, reducing noise and normalizing the pixel values

In [42]:
# Data Preprocessing
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 0.2)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

Create a custom dataset class to load the images and one-hot encoded labels from the csv file

In [61]:
# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.dataframe = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 0]
        image = Image.open(img_path).convert('RGB')
        
        labels = self.dataframe.iloc[idx, 2:].values.astype(float) # gets label columns and converts to float
        labels = torch.tensor(labels, dtype=torch.float32)
        
        if self.transform:
            image = self.transform(image)
        
        return image, labels

# Load the dataset
dataset = CustomDataset(csv_file='dataset/labels.csv', transform=transform)

# Split the dataset into training and testing sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [62]:
# Check data loader 
data_iter = iter(train_loader)
images, labels = next(data_iter)
print(type(images), type(labels))
print(images.shape, labels.shape)
print("Labels: ", labels)

<class 'torch.Tensor'> <class 'torch.Tensor'>
torch.Size([32, 3, 128, 128]) torch.Size([32, 6])
Labels:  tensor([[1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 1.],
        [1., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 1.],
        [0., 0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 0., 1.],
        [1., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 1., 0.],
        [0., 1., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0., 0.],
        [1.

# 3. Create a CNN
Creating a convolutional neural network (CNN) for a multi-label image classification problem using PyTorch involves defining a model architecture that ends with a sigmoid activation function for each label, rather than a single softmax layer as in multi-class classification.

In [63]:
class MultiLabelCNN(nn.Module):
    def __init__(self):
        super(MultiLabelCNN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        
        # Max pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        # Fully connected layers
        self.fc1 = nn.Linear(64 * 16 * 16, 512)  # Input image size is 128x128
        self.fc2 = nn.Linear(512, 6)  # 6 labels in your dataset
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        
        # Flatten the tensor
        x = x.view(-1, 64 * 16 * 16)
        
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # Sigmoid activation for multi-label classification
        
        return x

# Instantiate the model
model = MultiLabelCNN()
print(model)

MultiLabelCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=16384, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=6, bias=True)
)


# 4. Train the model
When training the model, use the BCEWithLogitsLoss as the loss function. This loss function is designed for multi-label classification and combines the sigmoid activation function with binary cross-entropy loss.

During training, compute the loss using the predicted outputs and the true labels, and then backpropagate the errors to update the model's weights.

Remember, in multi-label classification, an image can belong to multiple classes simultaneously. The model will output a value between 0 and 1 for each label, indicating the probability of the image belonging to that label.

In [64]:
# define training hyperparameters
lr = 1e-3
num_epochs = 5

# set the device we will be using to train the model (to enable hardware acceleration)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu") # setting for mac
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.to(device)


for epoch in range(num_epochs):
    running_loss = 0.0
    correct_preds = 0
    total_samples = 0
    
    for inputs, labels in train_loader:
        # Shift data to device as well
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Update running loss
        running_loss += loss.item() * inputs.size(0)
        
        # Compute predicted labels
        predicted_labels = (torch.sigmoid(outputs) > 0.5).float()
        
        # Update number of correct predictions
        correct_preds += (predicted_labels == labels).sum().item()
        
        # Update total samples
        total_samples += labels.numel()
    
    # Compute average loss and accuracy for the epoch
    epoch_loss = running_loss / total_samples
    epoch_accuracy = correct_preds / total_samples
    print(torch.sigmoid(outputs))
    print(labels)
    
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")


tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='mps:0',
       grad_fn=<SigmoidBackward0>)
tensor([[0., 0., 1., 0., 0., 1.]], device='mps:0')
Epoch [1/5] - Loss: 0.1174 - Accuracy: 0.5987
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='mps:0',
       grad_fn=<SigmoidBackward0>)
tensor([[0., 0., 1., 0., 0., 1.]], device='mps:0')
Epoch [2/5] - Loss: 0.1155 - Accuracy: 0.7094
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='mps:0',
       grad_fn=<SigmoidBackward0>)
tensor([[0., 0., 1., 0., 0., 1.]], device='mps:0')
Epoch [3/5] - Loss: 0.1155 - Accuracy: 0.7094
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='mps:0',
       grad_fn=<SigmoidBackward0>)
tensor([[0., 0., 1., 0., 0., 1.]], device='mps:0')
Epoch [4/5] - Loss: 0.1155 - Accuracy: 0.7094
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='mps:0',
       grad_fn=<SigmoidBackward0>)
tensor([[0., 0., 1., 0., 0., 1.]], device='mps:0')
Epoch [5/5]

# 5. Evaluate the model

In [54]:
model.eval()

label_names = ["computer_use", "asleep", "reading", "not_present", "good_posture", "bad_posture"]
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = (torch.sigmoid(outputs) > 0.5) # Apply sigmoid and threshold at 0.5
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Concatenate results from all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)
print(all_preds)
print(all_labels)

# Convert predictions to binary (0 or 1)
binary_preds = all_preds.astype(float)

print(classification_report(all_labels, binary_preds, target_names=label_names))  # Assuming label_names is a list of your label names

[[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False False

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
