In [103]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import os
import cv2
import torch
import torch.nn as nn
import torchvision
import numpy as np

In [3]:
def select_images(association_obj):
    """
    This function iteratively scrapes 100 records from the API. Each record consists of a variery of \
    child elements, but we chose to only scrape the association name (object type) and the URL to the \
    image, because we only needed the furnitures as requested by the client. 
    """
    
    img_prefix = "https://images.memorix.nl/rce/thumb/1600x1600/"
    img_postfix = ".jpg"
    base_url = "https://rcerijswijk.adlibhosting.com/api.wo2/wwwopac.ashx?database=collect"
    search_param = "&search=all"
    limit = 100
    startfrom = 0
    
    all_data = []
    
    while True:
        url = f"{base_url}{search_param}&limit={limit}&startfrom={startfrom}"
        response = requests.get(url)
        root = ET.fromstring(response.content)

        records = root.findall(".//record")
        if not records:
            break
            
        for record in records:
            associations = [association.text for association in record.findall(
            ".//Associated_subject/association.subject")]
            if association_obj in associations:
                uuids = [reproduction_ref.text for reproduction_ref in record.findall(
                ".//Reproduction/reproduction.reference")]
                association_name = association_obj
                for uuid in uuids:
                    if uuid:
                        img_url = img_prefix + uuid + img_postfix
                        all_data.append({
                            "Association": association_name,
                            "Url": img_url
                        })
            else:
                continue
                
        startfrom += limit
    
    pd.set_option('display.max_colwidth', None)
    df = pd.DataFrame(all_data)
    return df
    
images = select_images("meubel")

In [93]:
def download_images(df, directory_name="nk_collection_meubels"):
    """
    This function takes a dataframe produced by the function select_images, and a directory name. \
    Then it creates a directory with the given name if it does not exist, and it downloades all the \
    images in the dataframe and puts it in the directory. 
    """
    
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)
        
    for idx, row in df.iterrows():
        img_url = row["Url"]
        img_name = os.path.join(directory_name, f"meubel_{idx+1}.jpg")
        response = requests.get(img_url, stream=True)
        with open(img_name, "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        
download_images(images)

In [150]:
def gray_scale(curr_path="nk_collection_meubels", new_path="nk_collection_meubels_cleaned"):
    """
    This function takes a path to the images that are not yet gray scaled, called curr_path. It also \
    takes a path to the directory that we are going to create to put the gray scaled images in, called \
    new_path. It then iteratively gray scales each image and puts it in the new directory. 
    """
    
    imgs = os.listdir(curr_path)
    if not os.path.exists(new_path):
        os.makedirs(new_path)
        
    for img_name in imgs:
        old_directory = os.path.join(curr_path, img_name)
        new_directory = os.path.join(new_path, img_name)
        img = cv2.imread(old_directory)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(new_directory, img)

gray_scale()

In [102]:
munich_imgs = os.listdir("/home/hamid/Downloads/scraped_images_grayscaled_big")
furnitures = [r for r in munich_imgs if "linz" and "cp13150" in r ]
furnitures

['1153_8691_id=cp131501_linz.jpg',
 '0270_1455_id=cp131505_linz.jpg',
 '1950_33908_id=cp131508_linz.jpg',
 '0606_3010_id=cp131502_linz.jpg',
 '1949_33905_id=cp131507_linz.jpg',
 '0791_4970_id=cp131503_linz.jpg',
 '0271_1458_id=cp131506_linz.jpg',
 '1950_33912_id=cp131509_linz.jpg',
 '1141_8563_id=cp131500_linz.jpg',
 '0711_4263_id=cp131504_linz.jpg']

In [None]:
cv2.imshow("img", preprocess_image("/home/hamid/Downloads/scraped_images_grayscaled_big/1950_33912_id=cp131509_linz.jpg").squeeze().numpy())
cv2.waitKey(0)
cv2.destroyAllWindows()

In [112]:
model = torchvision.models.vgg16(pretrained=True)
model.features[0] = nn.Conv2d(1,64,kernel_size=(3,3), stride=(1,1),padding=(1,1))
model = nn.Sequential(*[*list(model.children())[:-1][0][:-12]])

def preprocess_image(image_path):
    """
    This function takes a path to a single image, it then resizes it to size 50x50 \
    and normalizes it to the range [0,1]. Lastly, it adds an extra dimension to the image \
    which represents the batch size. These steps are needed, because we want to pass the image \
    to a CNN. 
    """
    
    img = cv2.imread(image_path, -1)
    img = cv2.resize(img, (50, 50), interpolation=cv2.INTER_AREA)
    preprocess = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor()
    ])
    img = preprocess(img).unsqueeze(0)
    return img

def extract_features(image_path):
    """
    This function takes a path to a single image, it then preprocesses the image with the \
    function preprocess_image. Afterwards it passes the image to the pretrained CNN to extract \
    a feature descriptor. 
    """
    
    img = preprocess_image(image_path)
    with torch.no_grad():
        features = model(img)
    return features.squeeze(0).numpy()

def normalize_features(features):
    """
    This function takes the feature descriptor and normalizes it. This is needed as we want \
    to compute the dot-product similarity between feature descriptors of different images. \
    And for similarity it is convenient to have all pixels on the same scale without too \
    much magnitude differences and this also ensures stability. 
    """
    
    return features / np.linalg.norm(features)
    

In [125]:
list(model.children())[0]

Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

In [114]:
features = extract_features("/home/hamid/Downloads/scraped_images_grayscaled_big/1950_33912_id=cp131509_linz.jpg").flatten() / np.linalg.norm(extract_features("/home/hamid/Downloads/scraped_images_grayscaled_big/1950_33912_id=cp131509_linz.jpg").flatten())

In [115]:
f = extract_features("nk_collection_meubels_cleaned/meubel_1681.jpg").flatten() / np.linalg.norm(extract_features("nk_collection_meubels_cleaned/meubel_1681.jpg").flatten())

In [116]:
(np.dot(features, f)).item()

0.6367800235748291

In [92]:
for name, param in model.named_parameters():
    print(f"name: {param.shape}")

name: torch.Size([64, 1, 3, 3])
name: torch.Size([64])
name: torch.Size([64, 64, 3, 3])
name: torch.Size([64])
name: torch.Size([128, 64, 3, 3])
name: torch.Size([128])
name: torch.Size([128, 128, 3, 3])
name: torch.Size([128])
name: torch.Size([256, 128, 3, 3])
name: torch.Size([256])
name: torch.Size([256, 256, 3, 3])
name: torch.Size([256])
name: torch.Size([256, 256, 3, 3])
name: torch.Size([256])
name: torch.Size([512, 256, 3, 3])
name: torch.Size([512])
name: torch.Size([512, 512, 3, 3])
name: torch.Size([512])


In [117]:
munich_img = "/home/hamid/Downloads/scraped_images_grayscaled_big/1950_33912_id=cp131509_linz.jpg"
nk_furnitures = os.listdir("nk_collection_meubels_cleaned")

def compute_similarities(munich_img, nk_collection, path="nk_collection_meubels_cleaned"):
    """
    This function takes three arguments: 
    - munich_img, which is a single image from the Munich Database. 
    - nk_collection, this collection contains all the furnitures from the nk collection API. 
    - path, this is the path to the gray scaled nk collection.
    
    It then computes the feature descriptor for the munich image and all the images in the nk \
    collection. Afterwards takes the dot-product to get the dot-product similiarity. It then \
    saves the similarity and the two images as key-value pairs in a dictionary. 
    """
    
    similarities = {}
    for img in nk_collection:
        img_path = os.path.join(path, img)
        nk_img_feature_descriptor = normalize_features(extract_features(img_path).flatten())
        munich_img_feature_descriptor = normalize_features(extract_features(munich_img).flatten())
        similarity = np.dot(
            nk_img_feature_descriptor,
            munich_img_feature_descriptor
        )
        munich_img_name = munich_img[munich_img.rfind("/")+1:]
        similarities[(munich_img_name, img)] = similarity.item()
        
    return similarities
    
compute_similarities(munich_img, nk_furnitures)

{('1950_33912_id=cp131509_linz.jpg', 'meubel_1336.jpg'): 0.673613429069519,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_1014.jpg'): 0.7335500717163086,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_1439.jpg'): 0.6912555694580078,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_1394.jpg'): 0.7666996717453003,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_316.jpg'): 0.6834836006164551,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_2.jpg'): 0.6459488868713379,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_659.jpg'): 0.6661323308944702,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_214.jpg'): 0.6554477214813232,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_781.jpg'): 0.727974534034729,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_1076.jpg'): 0.7294822931289673,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_290.jpg'): 0.6305097341537476,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_49.jpg'): 0.6910910606384277,
 ('1950_33912_id=cp131509_linz.jpg', 'meubel_406.jpg'): 0.6082338094711304,
 ('1950_3391

In [133]:
nk_testset = os.listdir("/home/hamid/Downloads/nk_testset")
munich_testset = os.listdir("/home/hamid/Downloads/munich_testset")

def compute_similarities_testsets(munich_testset, nk_testset, 
                                  munich_path="/home/hamid/Downloads/munich_testset", 
                                  nk_path="/home/hamid/Downloads/nk_testset"):
    """
    This function takes four arguments: 
    - munich_testset, which contains 5 grayscaled images from the munich database.
    - nk_testset, which contains 5 grayscaled images from the nk collection API.
    - munich path, the path to the directory of the munich images. 
    - nk_path, the path to the directory of the nk images. 
    
    It then computes the feature descriptors for the munich images and all the \
    nk collection images. Afterwards takes the dot-product to get the dot-product similiarity. 
    It then saves the similarity and the two images as key-value pairs in a dictionary. 
    """
    
    similarities = {}
    for nk_img in nk_testset:
        nk_img_path = os.path.join(nk_path, nk_img)
        for munich_img in munich_testset:
            munich_img_path = os.path.join(munich_path, munich_img)
            nk_img_feature_descriptor = normalize_features(extract_features(nk_img_path).flatten())
            munich_img_feature_descriptor = normalize_features(extract_features(munich_img_path).flatten())
            similarity = np.dot(
                nk_img_feature_descriptor,
                munich_img_feature_descriptor
            )
            similarities[(nk_img, munich_img)] = similarity.item()
        
    return similarities
    
compute_similarities_testsets(munich_testset, nk_testset)

{('kast_nk.jpg', 'stoel_mccp.jpg'): 0.753388524055481,
 ('kast_nk.jpg', 'tafel_mccp.jpg'): 0.8200876116752625,
 ('kast_nk.jpg', 'kast_mccp.jpg'): 0.9317895770072937,
 ('kast_nk.jpg', 'dressoir_mccp.jpg'): 0.9365372657775879,
 ('kast_nk.jpg', 'speeltafel_mccp.png'): 0.8263669013977051,
 ('speeltafel_nk.png', 'stoel_mccp.jpg'): 0.5993944406509399,
 ('speeltafel_nk.png', 'tafel_mccp.jpg'): 0.5733804106712341,
 ('speeltafel_nk.png', 'kast_mccp.jpg'): 0.6685010194778442,
 ('speeltafel_nk.png', 'dressoir_mccp.jpg'): 0.6546347737312317,
 ('speeltafel_nk.png', 'speeltafel_mccp.png'): 0.6084338426589966,
 ('tafel_nk.jpg', 'stoel_mccp.jpg'): 0.6687464714050293,
 ('tafel_nk.jpg', 'tafel_mccp.jpg'): 0.662061870098114,
 ('tafel_nk.jpg', 'kast_mccp.jpg'): 0.6944741010665894,
 ('tafel_nk.jpg', 'dressoir_mccp.jpg'): 0.7222106456756592,
 ('tafel_nk.jpg', 'speeltafel_mccp.png'): 0.7990135550498962,
 ('dressoir_nk.jpg', 'stoel_mccp.jpg'): 0.6469377875328064,
 ('dressoir_nk.jpg', 'tafel_mccp.jpg'): 0.6692