In [1]:
# !wget http://vis-www.cs.umass.edu/lfw/lfw.tgz
# !tar -xvf /content/lfw.tgz

# Part 2 Using CNN for Feature Extraction (30 points)
* Use ResNet50 to extract features vectors from raw images. You can use TF or Pytorch APIs to:
* Obtain a ResNet-50 model pre-trained on a dataset such as ImageNet.
* Perform necessary preprocessing on the images before feeding them into the network.
* Extract the features from the penultimate layer of the network (before the fully connected layer - the classification head).
* Store the features in a dictionary, where the key is the name of the image and the value is the feature vector.

In [2]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image
import os

In [3]:

# Obtain a ResNet-50 model pre-trained on a dataset such as ImageNet.
model = models.resnet50(pretrained=True)




In [4]:
# Perform necessary preprocessing on the images before feeding them into the network.

def preprocess_image(image_path):
    # Load the image and resize it to (224, 224)
    image = Image.open(image_path)
    image = image.resize((224, 224))
    
    # Convert the image to a tensor and normalize it
    transformation = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])
    
    image = transformation(image).float()
    
    # Add a batch dimension
    image = image.unsqueeze(0)
    
    return image


In [5]:
# Extract the features from the penultimate layer of the network (before the fully connected layer - the classification head).

def extract_features(image, model, is_cuda=False):
    # Extract features
    features = model(Variable(image))
    # features = features.data.squeeze().numpy()
    features = features.data.squeeze().cpu().numpy() if is_cuda else features.data.squeeze().numpy()
    
    return features
    

In [6]:
# from tqdm import tqdm
# Store the features in a dictionary, where the key is the name of the image and the value is the feature vector.


features_dict = {}

lfw_dir = "./data/lfw/"
img_list = []

class ImageData():
    def __init__(self, image_path, label):
        self.image_path = image_path
        self.label = label

    def __str__(self):
        return f'Image path: {self.image_path}, label: {self.label}'

for dirname, folders, filenames in os.walk(lfw_dir):
    for filename in filenames:
        # full path of the image
        full_path = os.path.join(dirname, filename)
        if full_path.endswith('.jpg'):
            # foldername, last foldername, split by / or \, get the last one
            foldername = dirname.split('/')[-1]
            foldername = foldername.split('\\')[-1]

            image_name = full_path.split('/')[-1].split('\\')[-1]
            label = foldername + '/' + image_name
            img = ImageData(full_path, image_name)
            img_list.append(img)


In [9]:
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
is_cuda = torch.cuda.is_available()

model = model.to(device)


for img in tqdm(img_list):
    image_path = img.image_path
    label = img.label
    processed_image = preprocess_image(image_path)
    # Speed up the process by using GPU
    processed_image = processed_image.to(device)
    features = extract_features(processed_image, model, is_cuda)
    image_path = image_path.split('/')[-1]
    image_name = image_path.split('.')[0]
    features_dict[label] = features


100%|██████████| 13233/13233 [03:52<00:00, 56.99it/s]


In [10]:
import pickle

with open("lfw_features.pkl", "wb") as f:
    pickle.dump(features_dict, f)

# Part 3 Retrieving most similar images (30 points)
Use a nearest neighbor algorithm such as this to obtain the 10 most similar images to each query image.

In [11]:
import pickle

with open("lfw_features.pkl", "rb") as f:
    features_dict = pickle.load(f)

In [12]:
# print keys of the dictionary
print(list(features_dict.keys())[:10])

['Aaron_Eckhart_0001.jpg', 'Aaron_Guiel_0001.jpg', 'Aaron_Patterson_0001.jpg', 'Aaron_Peirsol_0001.jpg', 'Aaron_Peirsol_0002.jpg', 'Aaron_Peirsol_0003.jpg', 'Aaron_Peirsol_0004.jpg', 'Aaron_Pena_0001.jpg', 'Aaron_Sorkin_0001.jpg', 'Aaron_Sorkin_0002.jpg']


In [13]:
import numpy as np

# Convert the dictionary of features to a matrix
features_matrix = np.array(list(features_dict.values()))

# Convert the matrix to a PyTorch tensor and move it to the GPU
features_tensor = torch.from_numpy(features_matrix).to(device)

# Set the batch size
batch_size = 128

# Compute the distance matrix in batches
distance_matrix = np.empty((len(features_dict), len(features_dict)), dtype=np.float32)
for i in range(0, len(features_dict), batch_size):
    batch_features = features_tensor[i:i+batch_size]
    batch_distances = torch.norm(batch_features[:, None] - features_tensor, dim=2).cpu().numpy()
    distance_matrix[i:i+batch_size] = batch_distances

In [14]:
# Save the matrix to a file
np.save("lfw_distance_matrix.npy", distance_matrix)

In [15]:
def retrieve_similar_images(query_image, features_dict, distance_matrix):
    # Retrieve the feature vector of the query image
    query_features = features_dict[query_image]
    
    # Compute the distances between the query image and all other images
    distances = distance_matrix[list(features_dict.keys()).index(query_image)]
        
    # Sort the distances in ascending order and retrieve the indices of the 10 closest images
    closest_indices = np.argsort(distances)[:10]
    
    # Retrieve the filenames of the closest images
    closest_images = [list(features_dict.keys())[i] for i in closest_indices]
    
    return closest_images


In [17]:
query_image = "George_W_Bush_0001.jpg"
similar_images = retrieve_similar_images(query_image, features_dict, distance_matrix)
for similar_image in similar_images:
    print(similar_image)

George_W_Bush_0001.jpg
Colin_Powell_0200.jpg
George_Robertson_0016.jpg
Emyr_Jones_Parry_0001.jpg
Kristen_Breitweiser_0003.jpg
Chakib_Khelil_0002.jpg
Tommy_Thompson_0007.jpg
Felipe_Perez_Roque_0001.jpg
Tony_Blair_0049.jpg
Bill_Gates_0013.jpg
