## Importing necessary libraries

In [2]:
import numpy as np
from PIL import Image
import torch
from torch.utils.data import DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.datasets import CocoDetection
from torchvision.transforms import transforms

## defining own data loader and loading the data

In [3]:
# Set paths and configurations
data_dir = 'data/train2017'
batch_size = 10
num_epochs = 10

# Define transforms
transform = transforms.Compose([
    transforms.ToTensor(),
    # Add any other necessary transformations
])

# Define a custom collate function
def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    # Resize images to a consistent size
    max_width = max(img.shape[1] for img in images)
    max_height = max(img.shape[2] for img in images)
    resized_images = []
    for img in images:
        pad_width = max_width - img.shape[1]
        pad_height = max_height - img.shape[2]
        padded_img = torch.nn.functional.pad(img, (0, pad_width, 0, pad_height))
        resized_images.append(padded_img)

    return resized_images, targets

# Load the COCO dataset
train_dataset = CocoDetection(root=data_dir, annFile='data/annotations/instances_train2017.json', transform=transform)
val_dataset = CocoDetection(root=data_dir, annFile='data/annotations/instances_val2017.json', transform=transform)

# Create data loaders with the custom collate function
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=collate_fn)

# # Create the model
# model = fasterrcnn_resnet50_fpn(pretrained=True)
# num_classes = len(train_dataset.coco.cats) + 1  # +1 for background class
# model.roi_heads.box_predictor.cls_score.out_features = num_classes
# model.roi_heads.box_predictor.bbox_pred.out_features = 4 * num_classes

# # Define optimizer and loss function
# optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model.to(device)

# # Training loop
# for epoch in range(num_epochs):
#     model.train()
#     for images, targets in train_data_loader:
#         images = list(image.to(device) for image in images)
#         #targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#         labels=[]
#         for t in targets:
#             for j in range(len(t)):
#                 labels.append(t[j]['category_id'])

#         loss_dict = model(images, targets)
#         losses = sum(loss for loss in loss_dict.values())

#         optimizer.zero_grad()
#         losses.backward()
#         optimizer.step()

#     # Validation loop
#     model.eval()
#     with torch.no_grad():
#         for images, targets in val_data_loader:
#             images = list(image.to(device) for image in images)
#             #targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#             labels=[]
#             for t in targets:
#                 for j in range(len(t)):
#                     labels.append(t[j]['category_id'])

#             val_loss_dict = model(images, targets)
#             val_losses = sum(loss for loss in val_loss_dict.values())

#     print(f"Epoch {epoch+1}: Train Loss = {losses:.4f}, Val Loss = {val_losses:.4f}")


loading annotations into memory...
Done (t=18.96s)
creating index...
index created!
loading annotations into memory...
Done (t=0.51s)
creating index...
index created!


In [4]:
len(train_data_loader)

11829

## Getting fewer number of images
### It is tough to train the models directly from the train dataset, thus I am doing it for 10 batches only (total 100 images) and storing them in new variables

In [5]:
all_labels=[]
labels=[]
all_images=[]
count=1
for images, targets in train_data_loader:
    images = list(image.to(device) for image in images)
    all_images.extend(images)
    print(count)
    count=count+1
    print("How many images?:",len(images))
    #print(images)
#     all_labels=[]
#     labels=[]
    for t in targets:
        one_image_label=[]
        for j in range(len(t)):
            one_image_label.append(t[j]['category_id'])
        one_image_label_unique=list(set(one_image_label))
        labels.append(one_image_label_unique)
        all_labels.append(one_image_label)
    #print(all_labels)
    if count>10:
        break
    #print(labels)
#     break
all_labels

1
How many images?: 10
2
How many images?: 10
3
How many images?: 10
4
How many images?: 10
5
How many images?: 10
6
How many images?: 10
7
How many images?: 10
8
How many images?: 10
9
How many images?: 10
10
How many images?: 10


[[1, 1, 1, 61, 67, 1, 44, 62],
 [5, 28, 1, 28, 28, 28, 28, 1, 28, 28],
 [3, 8, 14, 14, 1, 1, 3, 14, 31, 31, 47, 47, 3],
 [9, 9, 9, 9, 9],
 [1, 23, 1],
 [82, 62, 62, 67, 51, 79, 81, 47, 51, 78, 47, 79],
 [1, 35],
 [44, 44, 67, 1, 47, 55, 49, 47, 55, 55, 55, 79, 47],
 [44,
  32,
  32,
  67,
  1,
  1,
  1,
  47,
  47,
  47,
  44,
  44,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  85,
  47,
  1,
  47,
  47],
 [62,
  62,
  62,
  62,
  62,
  62,
  62,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  39,
  40,
  1,
  37,
  62,
  62,
  62,
  1],
 [67,
  1,
  1,
  1,
  1,
  1,
  47,
  46,
  46,
  46,
  46,
  46,
  46,
  46,
  46,
  62,
  67,
  1,
  1,
  46,
  46,
  46,
  62,
  46,
  67,
  1,
  1,
  1,
  1,
  46],
 [70],
 [82, 78, 79],
 [16, 9],
 [1, 1, 1, 1, 1, 1, 1, 1, 38, 38, 1, 1, 1, 1, 1, 1],
 [24, 24],
 [18, 19, 19, 19, 19, 1, 27, 1, 1, 27, 1, 1],
 [85, 3, 3, 1, 1, 1, 1, 1, 31, 3, 1, 31, 31, 1, 31],
 [13],
 [35, 35, 35, 35, 35, 1, 1, 1, 1, 1, 1, 1, 1, 1, 35, 35, 1, 1,

In [6]:
len(all_labels)

100

In [7]:
len(all_images)

100

In [8]:
cuda_images = all_images  # Your list of CUDA tensors here

# Create an empty list to store the NumPy arrays
numpy_images = []

# Iterate through the CUDA tensors and convert them to NumPy arrays
for cuda_tensor in cuda_images:
    # Copy the tensor from GPU to CPU
    cpu_tensor = cuda_tensor.cpu()

    # Convert the CPU tensor to a NumPy array
    numpy_array = cpu_tensor.numpy()

    # Append the NumPy array to the list
    numpy_images.append(numpy_array)

def resize_images(all_images, width, height):
    resized_images = []
    for image in all_images:
        image = np.transpose(image, (1, 2, 0))  # Convert from (3, height, width) to (height, width, 3)
        pil_image = Image.fromarray((image * 255).astype('uint8'))
        resized_image = pil_image.resize((width, height))
        resized_image = np.array(resized_image) / 255.0
        resized_image = np.transpose(resized_image, (2, 0, 1))  # Convert back to (3, height, width)
        resized_images.append(resized_image)
    return resized_images

# Determine the common shape for all images
common_width = 713
common_height = 480

# Example usage
resized_images = resize_images(numpy_images, common_width, common_height)

## Do it for ResNet50 first

In [9]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Preprocess the images and labels
# Assuming you have already resized the images and loaded them into a list
images = resized_images  # Replace with your actual image data
labels = all_labels  # Replace with your actual labels

# Convert the images and labels to NumPy arrays
images = np.array(images)
all_labels = np.array(all_labels)

# Reshape the images to match the input shape of ResNet50 (480, 713, 3)
images = np.transpose(images, (0, 2, 3, 1))


# Convert the images and labels to NumPy arrays
images = np.array(images)
all_labels = np.array(all_labels)

# Step 2: Perform one-hot encoding of the labels
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(all_labels)

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, labels_encoded, test_size=0.2, random_state=42)

# Determine the number of classes
num_classes = labels_encoded.shape[1]

# Step 4: Build and train the model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(480, 713, 3))

# Add custom layers on top of the base model
x = base_model.output
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dense(128, activation='relu')(x)
predictions = keras.layers.Dense(num_classes, activation='sigmoid')(x)

# Create the model
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the base model layers
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Step 5: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

# Step 6: Make predictions
predictions = model.predict(X_test)

# Apply a threshold to convert probabilities to binary indicators
threshold = 1
binary_predictions = (predictions >= threshold).astype(int)

# Convert binary indicators to labels
decoded_predictions = []
for pred in binary_predictions:
    labels = [label for label, binary in zip(mlb.classes_, pred) if binary == 1]
    decoded_predictions.append(labels)
    
print('Predictions:', decoded_predictions)


2023-06-26 16:21:36.564496: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 16:21:38.872447: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-26 16:21:38.872655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  all_labels = np.array(all_labels)
2023-06-26 16:22:10.607630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic li

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 31.45534324645996
Test Accuracy: 0.550000011920929
Predictions: [[1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 50, 51, 52, 55, 58, 61, 62, 63, 67, 70, 75, 79, 81], [1, 3, 9, 15, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81], [1, 3, 7, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 75, 79, 81], [1, 3, 9, 15, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81], [1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81], [1, 3, 7, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81], [1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 37, 39, 40, 43,

In [10]:
decoded_originals = []
for pred in labels_encoded:
    labels = [label for label, binary in zip(mlb.classes_, pred) if binary == 1]
    decoded_originals.append(labels)
    
print('Predictions:', decoded_originals)


Predictions: [[1, 44, 61, 62, 67], [1, 5, 28], [1, 3, 8, 14, 31, 47], [9], [1, 23], [47, 51, 62, 67, 78, 79, 81, 82], [1, 35], [1, 44, 47, 49, 55, 67, 79], [1, 32, 44, 47, 67, 85], [1, 37, 39, 40, 62], [1, 46, 47, 62, 67], [70], [78, 79, 82], [9, 16], [1, 38], [24], [1, 18, 19, 27], [1, 3, 31, 85], [13], [1, 35], [1, 38], [47, 70, 81], [17, 73, 74, 76], [1, 15, 47, 48, 49, 50, 51, 57, 62, 67], [1, 15, 37, 39], [62, 63, 67, 72], [1, 15, 27, 28, 31, 62], [1, 38], [44, 47, 81], [21], [1, 3, 38], [1, 9, 42], [3], [1, 33, 63, 73], [1, 18, 28, 34, 62], [7], [1, 3, 38], [52, 55, 84], [8, 15], [62, 67, 86], [17, 28], [1, 28, 37, 43, 47, 62, 67], [1, 52], [20], [1, 36], [70, 81], [1, 31, 44, 58], [3, 8, 10], [24, 25], [53], [6], [], [1, 3, 34], [1, 27, 35], [62, 86], [17, 73, 74, 76], [61], [13], [17, 63, 75], [1, 3, 41], [1, 15, 41], [1, 41], [1, 3, 27, 41], [1, 18, 31, 62], [47, 50, 54], [1, 48, 61], [1, 38], [1, 41], [1, 35], [1, 31, 47, 60, 67], [15, 18, 31, 33, 65], [48, 49, 56, 67], [1, 3

In [11]:
for i in range(len(decoded_predictions)):
    print(decoded_predictions[i],"\n")

[1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 50, 51, 52, 55, 58, 61, 62, 63, 67, 70, 75, 79, 81] 

[1, 3, 9, 15, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81] 

[1, 3, 7, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 75, 79, 81] 

[1, 3, 9, 15, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81] 

[1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81] 

[1, 3, 7, 9, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81] 

[1, 3, 9, 16, 18, 19, 24, 27, 28, 31, 32, 37, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 67, 70, 72, 75, 79, 81] 

[1, 3, 9, 15, 16, 18, 19, 24, 27, 28, 31, 32, 39, 40, 43, 44, 47, 48, 50, 51, 52, 53, 55, 58, 61, 62, 63, 

## The result is very bad because we have too many labels and very less images compared to that. We need to do the training with all the images
Find an efficient way to do that

## Trying it on rough model

In [12]:
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer



# Step 1: Preprocess the images and labels
# Assuming you have already resized the images and loaded them into a list
images = resized_images  # Replace with your actual image data
labels = all_labels  # Replace with your actual labels

# Convert the images and labels to NumPy arrays
images = np.array(images)
all_labels = np.array(all_labels)


# Step 2: Perform one-hot encoding of the labels
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(all_labels)

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, labels_encoded, test_size=0.2, random_state=42)

# Determine the number of classes
num_classes = labels_encoded.shape[1]

# Step 4: Build and train the model
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(3, 480, 713)),  # Flatten the input
    keras.layers.Dense(128, activation='relu'),  # Add a hidden dense layer
    keras.layers.Dense(num_classes, activation='softmax')  # Output layer with softmax activation
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Step 5: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

# Step 6: Make predictions
predictions = model.predict(X_test)

# Apply a threshold to convert probabilities to binary indicators
threshold = 0
binary_predictions = (predictions > threshold).astype(int)

# Convert binary indicators to labels
decoded_predictions = []
for pred in binary_predictions:
    labels = [label for label, binary in zip(mlb.classes_, pred) if binary == 1]
    decoded_predictions.append(labels)
    
#decoded_predictions = mlb.inverse_transform(predictions)
print('Predictions:', decoded_predictions)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 7375.4296875
Test Accuracy: 0.30000001192092896
Predictions: [[47, 67], [1, 41, 49], [1, 47], [1, 38, 49], [1, 49], [3], [47], [1, 3], [1, 41, 49], [1, 49], [3], [1, 67], [1, 3], [1, 3, 9, 18, 41, 47], [1, 3], [1], [49], [1], [1, 49], [1]]


In [13]:
decoded_originals = mlb.inverse_transform(y_test)
decoded_originals

[(),
 (1, 27, 35),
 (15, 18, 31, 33, 65),
 (70, 81),
 (1, 36),
 (62, 67, 86),
 (17, 73, 74, 76),
 (70, 81),
 (1, 46, 47, 62, 67),
 (1, 44, 61, 62, 67),
 (13,),
 (1, 3, 38),
 (25,),
 (1, 33, 63, 73),
 (1, 13, 19),
 (1, 23),
 (60,),
 (1, 3, 7),
 (78, 79, 82),
 (1, 9, 42)]

In [14]:
decoded_predictions

[[47, 67],
 [1, 41, 49],
 [1, 47],
 [1, 38, 49],
 [1, 49],
 [3],
 [47],
 [1, 3],
 [1, 41, 49],
 [1, 49],
 [3],
 [1, 67],
 [1, 3],
 [1, 3, 9, 18, 41, 47],
 [1, 3],
 [1],
 [49],
 [1],
 [1, 49],
 [1]]