# Resnet18 Retrieval

Steps:

1. Get 150 gallery images (15 classes x 10 images per class) and 75 query images from this link:

https://www.kaggle.com/datasets/shashankrapolu/human-action-recognition-dataset

2. Run the notebook on a Kaggle GPU. Feel free to use Colab Pro if you have it.

3. Load the ResNet 18 model you have trained.
Encode the images with the ResNet18 Model

4. Perform retrieval with the 75 query images from the 150 gallery images
6. Record the accuracy and speed
7. Improve the speed (Your Idea)
8. Record accuracy and speed again
Compare 6 and 8
Submit the code, a video, etc...

In [None]:
# Source: Assignment 05: Resnet18

### Load the Model and the Pretrained checkpoint


In [5]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import os

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model: rebuild the ResNet18 architecture, give it the same
# 1000-unit head the checkpoint was saved with, then restore the weights.
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, 1000)  # Adjust to match the original model's output units
model.load_state_dict(
    torch.load(
        'Human_Action_Recognition.pth',
        # `model` still lives on the CPU here, so map the checkpoint there;
        # this also lets a GPU-saved checkpoint load on a CPU-only machine.
        map_location='cpu',
        # A state dict only needs tensor data, so use the restricted unpickler.
        weights_only=True,
    )
)
model.eval()
# NOTE(review): `model` is never moved to `device` in this cell — move it
# before feeding it batches that have been sent to `device`.

# Create a new model with a 2-unit final layer.
# NOTE(review): this backbone is built from the torchvision pretrained
# weights, not from the checkpoint loaded above; only the head rows copied
# below come from the checkpoint. Confirm that this is intended.
new_model = models.resnet18(pretrained=True)
new_model.fc = nn.Linear(new_model.fc.in_features, 2)  # Adjust to match the desired output units

# Copy the first 2 output units of the loaded head into the new head.
# clone() gives new_model its own storage; a bare slice is a view, so on CPU
# the two models would otherwise share (and mutate) the same memory.
new_model.fc.weight.data = model.fc.weight.data[0:2].clone()
new_model.fc.bias.data = model.fc.bias.data[0:2].clone()

new_model = new_model.to(device)
# Inference mode: BatchNorm layers use their running statistics instead of
# per-batch statistics, so results do not depend on batch composition.
new_model.eval()

  model.load_state_dict(torch.load('Human_Action_Recognition.pth'))


In [6]:
# Load the evaluation images.
# NOTE(review): the echoed dataset size below is 150, which matches the
# gallery size in the assignment steps rather than the 75 queries — confirm
# which split lives under data/test.

# =======================================================#
# 1. Define Data Transforms
# =======================================================#
# Deterministic preprocessing only (no augmentation): resize, centre-crop to
# 224x224, convert to a tensor, then normalise each channel with the
# mean/std conventionally used for ImageNet-pretrained torchvision models.
data_transforms = {
    'test': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
}

# =======================================================#
# 2. Define Data Directory
# =======================================================#
data_dir = 'data'

# Image datasets: ImageFolder derives the class labels from the sub-folder
# names under data/test.
image_data = {
    x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
    for x in ['test']
}

# Dataloaders are responsible for loading the data in batches.
# shuffle=False keeps batches in dataset order, so results are reproducible
# and each encoded image can be matched back to image_data['test'].samples.
# num_workers is the number of parallel processes used to load the data.
dataloaders = {
    x: DataLoader(image_data[x], batch_size=4, shuffle=False, num_workers=4)
    for x in ['test']
}
dataset_sizes = {x: len(image_data[x]) for x in ['test']}
print(dataset_sizes)

class_names = image_data['test'].classes

# Bare expression so the notebook cell displays the class list.
class_names

{'test': 150}


['calling',
 'clapping',
 'cycling',
 'dancing',
 'drinking',
 'eating',
 'fighting',
 'hugging',
 'laughing',
 'listening_to_music',
 'running',
 'sitting',
 'sleeping',
 'texting',
 'using_laptop']

In [None]:
# Perform the inference on the test data and evaluate accuracy and speed
# NOTE(review): in the lines visible here the loop only transfers each batch
# to `device`; no forward pass, accuracy count, or timing is present yet.

# Iterate over the 'test' dataloader one batch of (images, labels) at a time.
for inputs, labels in dataloaders['test']:
    # Move the batch to the same device selected at model-setup time.
    inputs = inputs.to(device)
    labels = labels.to(device)
    
    