In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import os
import cv2
import torch
import torch.nn as nn
import torchvision
import numpy as np
import pandas as pd

In [3]:
def select_images(association_obj):
    """
    Scrape the collection API and collect the image URLs for one association.

    The API is paged through 100 records at a time until a page comes back
    without any ``record`` elements. Every record whose
    ``Associated_subject/association.subject`` values contain
    ``association_obj`` contributes one row per non-empty
    ``Reproduction/reproduction.reference`` value.

    Side effect: sets the pandas option ``display.max_colwidth`` to ``None``
    so that full URLs are shown when the frame is displayed.

    Parameters
    ----------
    association_obj : str
        Association name (object type) to keep, e.g. "meubel".

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns "Association" and "Url". Both
        columns exist even when no record matched.

    Raises
    ------
    requests.HTTPError
        If a page request is answered with an HTTP error status.
    requests.Timeout
        If a page request does not answer within the timeout.
    """

    img_prefix = "https://images.memorix.nl/rce/thumb/1600x1600/"
    img_postfix = ".jpg"
    base_url = "https://rcerijswijk.adlibhosting.com/api.wo2/wwwopac.ashx?database=collect"
    search_param = "&search=all"
    limit = 100
    request_timeout = 30  # seconds; without it a stalled connection hangs the cell forever
    startfrom = 0

    all_data = []

    # A single session is shared by all page requests of this scrape.
    with requests.Session() as session:
        while True:
            url = f"{base_url}{search_param}&limit={limit}&startfrom={startfrom}"
            response = session.get(url, timeout=request_timeout)
            # Fail loudly instead of trying to parse an error page as XML.
            response.raise_for_status()
            root = ET.fromstring(response.content)

            records = root.findall(".//record")
            if not records:
                # An empty page marks the end of the collection.
                break

            for record in records:
                associations = {
                    node.text
                    for node in record.findall(".//Associated_subject/association.subject")
                }
                if association_obj not in associations:
                    continue
                for reference in record.findall(".//Reproduction/reproduction.reference"):
                    uuid = reference.text
                    if uuid:  # skip references without an identifier
                        all_data.append({
                            "Association": association_obj,
                            "Url": img_prefix + uuid + img_postfix
                        })

            startfrom += limit

    pd.set_option('display.max_colwidth', None)
    # Explicit columns keep the schema stable when nothing was found.
    return pd.DataFrame(all_data, columns=["Association", "Url"])
    
# Scrape all records associated with "meubel"; this issues live requests to the collection API.
images = select_images("meubel")

In [93]:
def download_images(df, directory_name="nk_collection_meubels"):
    """
    Download every image listed in ``df`` into ``directory_name``.

    The directory is created when it does not exist yet. Each row is saved as
    ``meubel_<index + 1>.jpg``. URLs that answer with an HTTP error status are
    reported and skipped, so no error page is ever stored as an image.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as produced by select_images; the "Url" column is read and the
        row index is used to number the files.
    directory_name : str
        Target directory for the downloaded files.

    Returns
    -------
    None
    """

    os.makedirs(directory_name, exist_ok=True)

    for idx, row in df.iterrows():
        img_url = row["Url"]
        img_name = os.path.join(directory_name, f"meubel_{idx+1}.jpg")
        # The context manager releases the streamed connection even on errors.
        with requests.get(img_url, stream=True, timeout=30) as response:
            if not response.ok:
                print(f"Skipping {img_url}: HTTP {response.status_code}")
                continue
            with open(img_name, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        
# Download every scraped image into the default "nk_collection_meubels" directory.
download_images(images)

In [150]:
def gray_scale(curr_path="nk_collection_meubels", new_path="nk_collection_meubels_cleaned"):
    """
    Write a grayscale copy of every image in ``curr_path`` to ``new_path``.

    ``new_path`` is created when it does not exist yet. Files keep their
    names. Entries that OpenCV cannot read as an image are reported and
    skipped instead of aborting the whole run.

    Parameters
    ----------
    curr_path : str
        Directory holding the original images.
    new_path : str
        Directory that receives the grayscale images.

    Returns
    -------
    None
    """

    imgs = os.listdir(curr_path)
    os.makedirs(new_path, exist_ok=True)

    for img_name in imgs:
        source_file = os.path.join(curr_path, img_name)
        target_file = os.path.join(new_path, img_name)
        img = cv2.imread(source_file)
        if img is None:
            # imread signals an unreadable file by returning None; cvtColor would raise on it.
            print(f"Skipping {source_file}: not a readable image")
            continue
        cv2.imwrite(target_file, cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))

# Convert the downloaded images to grayscale using the default source and target directories.
gray_scale()

In [2]:
# File names of the grayscaled Munich images (absolute path on the author's machine).
munich_imgs = os.listdir("/home/hamid/Downloads/scraped_images_grayscaled_big")
# Both substrings have to be tested explicitly: `"linz" and "cp13150" in r`
# only evaluates `"cp13150" in r`, because the non-empty literal "linz" is
# always truthy and `and` then returns its right-hand operand.
furnitures = [r for r in munich_imgs if "linz" in r and "cp13150" in r]
furnitures

['1153_8691_id=cp131501_linz.jpg',
 '0270_1455_id=cp131505_linz.jpg',
 '1950_33908_id=cp131508_linz.jpg',
 '0606_3010_id=cp131502_linz.jpg',
 '1949_33905_id=cp131507_linz.jpg',
 '0791_4970_id=cp131503_linz.jpg',
 '0271_1458_id=cp131506_linz.jpg',
 '1950_33912_id=cp131509_linz.jpg',
 '1141_8563_id=cp131500_linz.jpg',
 '0711_4263_id=cp131504_linz.jpg']

In [121]:
# Start from torchvision's VGG16 with its pretrained weights.
model = torchvision.models.vgg16(pretrained=True)
# Swap the first convolution for one that accepts a single input channel.
# This replacement layer starts from freshly initialised weights, not pretrained ones.
model.features[0] = nn.Conv2d(
    in_channels=1,
    out_channels=64,
    kernel_size=(3, 3),
    stride=(1, 1),
    padding=(1, 1),
)
# Keep only the convolutional feature stack minus its last 10 layers and
# wrap those layers in a plain Sequential.
model = nn.Sequential(*model.features[:-10])

def preprocess_image(image_path, size=(50, 50)):
    """
    Load one image and turn it into an input batch for the CNN.

    Processing steps, in order:
    1. read the file unchanged and reduce colour images to one channel,
    2. CLAHE contrast enhancement,
    3. binary threshold at 127,
    4. build a white mask with the largest contour filled black and add it
       to the thresholded image,
    5. erode with a 5x5 kernel and invert the result,
    6. resize to ``size`` and convert to a tensor, which normalizes the
       values to the range [0, 1],
    7. add a leading batch dimension.

    Parameters
    ----------
    image_path : str
        Path to a single image file.
    size : tuple of int
        Target (width, height) passed to cv2.resize. Defaults to (50, 50).

    Returns
    -------
    torch.Tensor
        The preprocessed image with an extra leading batch dimension.

    Raises
    ------
    OSError
        If OpenCV cannot read ``image_path`` (missing file or not an image).
    """

    img = cv2.imread(image_path, -1)
    if img is None:
        raise OSError(f"Could not read image: {image_path}")
    if img.ndim == 3:
        # CLAHE needs single-channel input; collapse colour / alpha channels first.
        conversion = cv2.COLOR_BGRA2GRAY if img.shape[2] == 4 else cv2.COLOR_BGR2GRAY
        img = cv2.cvtColor(img, conversion)

    clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
    img = clahe.apply(img)
    _, thresh = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)

    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    mask = np.full(img.shape, 255, np.uint8)
    if len(contours) > 0:
        # Only the largest contour is blacked out; a blank image has none at all.
        largest = max(contours, key=cv2.contourArea)
        cv2.drawContours(mask, [largest], 0, 0, -1)
    img = cv2.add(thresh, mask)

    kernel = np.ones((5,5), dtype=np.uint8)
    # NOTE(review): this call used to be `cv2.erode(img, kernel, 10)`. The third
    # positional parameter of cv2.erode is `dst`, not `iterations`, so the 10 had
    # no effect and a single pass was applied. That behaviour is kept here and
    # made explicit; switch to iterations=10 only if that was the real intent.
    img = cv2.erode(img, kernel, iterations=1)
    # Invert the image.
    img = np.max(img) - img

    img = cv2.resize(img, tuple(size), interpolation=cv2.INTER_AREA)
    img = torchvision.transforms.ToTensor()(img).unsqueeze(0)
    return img

def extract_features(image_path):
    """
    Compute the CNN feature descriptor of a single image.

    The image at ``image_path`` is run through preprocess_image and the
    resulting batch is passed to the module-level ``model`` with gradient
    tracking disabled. The batch dimension is dropped again and the result
    is returned as a NumPy array.
    """

    batch = preprocess_image(image_path)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        activations = model(batch)
    descriptor = activations.squeeze(0)
    return descriptor.numpy()

def normalize_features(features):
    """
    Scale a feature descriptor to unit Euclidean (L2) length.

    After this scaling the dot product of two descriptors no longer depends
    on their magnitudes, which is what the similarity computation relies on.

    Parameters
    ----------
    features : array-like
        Feature descriptor.

    Returns
    -------
    numpy.ndarray
        ``features`` divided by its L2 norm. An all-zero descriptor is
        returned as zeros instead of NaN.
    """

    features = np.asarray(features)
    norm = np.linalg.norm(features)
    # A zero vector has no direction; dividing by 1 keeps it zero and avoids 0/0 = NaN.
    divisor = 1.0 if norm == 0 else norm
    return features / divisor
    

In [7]:
# Show the top-level child modules of `model` as the cell output.
list(model.children())

[Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU(inplace=True),
 Conv2d(512, 512, kernel_size=(3, 3), stride=(1

In [28]:
# Single query image from the Munich database (absolute path on the author's machine).
munich_img = "/home/hamid/Downloads/scraped_images_grayscaled_big/1950_33912_id=cp131509_linz.jpg"
# File names in the directory that gray_scale() writes to by default.
nk_furnitures = os.listdir("nk_collection_meubels_cleaned")

def compute_similarities(munich_img, nk_collection, path="nk_collection_meubels_cleaned"):
    """
    Compare one Munich image against every image of the NK collection.

    Parameters
    ----------
    munich_img : str
        Path to a single image from the Munich database.
    nk_collection : iterable of str
        File names of the NK collection images.
    path : str
        Directory that contains the files named in ``nk_collection``.

    Returns
    -------
    dict
        Maps ``(munich file name, nk file name)`` to the dot product of the
        two normalized feature descriptors, as a Python float.
    """

    # The Munich image is the same for every comparison, so its name and
    # descriptor are computed once instead of once per NK image.
    munich_img_name = os.path.basename(munich_img)
    munich_img_feature_descriptor = normalize_features(extract_features(munich_img).flatten())

    similarities = {}
    for img in nk_collection:
        img_path = os.path.join(path, img)
        nk_img_feature_descriptor = normalize_features(extract_features(img_path).flatten())
        similarity = np.dot(nk_img_feature_descriptor, munich_img_feature_descriptor)
        similarities[(munich_img_name, img)] = similarity.item()

    return similarities
    
# Compare the single Munich image against every file listed in nk_furnitures.
compute_similarities(munich_img, nk_furnitures)

KeyboardInterrupt: 

In [123]:
# The two commented-out lines below show how the weights file was written
# (presumably from a `model` built the same way as in this notebook).
#best_model_state_dict = model.state_dict()
#torch.save(best_model_state_dict, "best2_vgg16_weights.pth")
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location="cpu" lets a checkpoint that was saved from a
# GPU load on a CPU-only machine.
best_model_state_dict = torch.load("best2_vgg16_weights.pth", map_location="cpu")
model.load_state_dict(best_model_state_dict)
# Switch to inference mode; the returned module is shown as the cell output.
model.eval()

Sequential(
  (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [122]:
# File names of the NK and Munich test sets (absolute paths on the author's machine).
nk_testset = os.listdir("/home/hamid/Downloads/nk_testset")
munich_testset = os.listdir("/home/hamid/Downloads/munich_testset")

def compute_similarities_testsets(munich_testset, nk_testset,
                                  munich_path="/home/hamid/Downloads/munich_testset",
                                  nk_path="/home/hamid/Downloads/nk_testset"):
    """
    Compute the similarity of every NK test image with every Munich test image.

    Parameters
    ----------
    munich_testset : iterable of str
        File names of the Munich test images.
    nk_testset : iterable of str
        File names of the NK test images.
    munich_path : str
        Directory that contains the Munich test images.
    nk_path : str
        Directory that contains the NK test images.

    Returns
    -------
    dict
        Maps ``(nk file name, munich file name)`` to the dot product of the
        two normalized feature descriptors, as a Python float. Keys are
        ordered NK image first, then Munich image.
    """

    # Each descriptor is computed exactly once. Recomputing both inside the
    # nested loop costs two forward passes per pair instead of one per image.
    munich_descriptors = [
        (
            munich_img,
            normalize_features(
                extract_features(os.path.join(munich_path, munich_img)).flatten()
            ),
        )
        for munich_img in munich_testset
    ]

    similarities = {}
    for nk_img in nk_testset:
        nk_img_path = os.path.join(nk_path, nk_img)
        nk_img_feature_descriptor = normalize_features(extract_features(nk_img_path).flatten())
        for munich_img, munich_img_feature_descriptor in munich_descriptors:
            similarity = np.dot(nk_img_feature_descriptor, munich_img_feature_descriptor)
            similarities[(nk_img, munich_img)] = similarity.item()

    return similarities
    
# Pairwise similarities between the two test sets; the dictionary is shown as the cell output.
sims = compute_similarities_testsets(munich_testset, nk_testset)
sims

{('kast_nk.jpg', 'stoel_mccp.jpg'): 0.6604210138320923,
 ('kast_nk.jpg', 'tafel_mccp.jpg'): 0.6619276404380798,
 ('kast_nk.jpg', 'kast_mccp.jpg'): 0.8534377813339233,
 ('kast_nk.jpg', 'dressoir_mccp.jpg'): 0.8410069942474365,
 ('kast_nk.jpg', 'speeltafel_mccp.png'): 0.6869429349899292,
 ('speeltafel_nk.png', 'stoel_mccp.jpg'): 0.548683226108551,
 ('speeltafel_nk.png', 'tafel_mccp.jpg'): 0.5701091885566711,
 ('speeltafel_nk.png', 'kast_mccp.jpg'): 0.5543745756149292,
 ('speeltafel_nk.png', 'dressoir_mccp.jpg'): 0.5458159446716309,
 ('speeltafel_nk.png', 'speeltafel_mccp.png'): 0.6365468502044678,
 ('tafel_nk.jpg', 'stoel_mccp.jpg'): 0.4823276102542877,
 ('tafel_nk.jpg', 'tafel_mccp.jpg'): 0.5581287145614624,
 ('tafel_nk.jpg', 'kast_mccp.jpg'): 0.5177386999130249,
 ('tafel_nk.jpg', 'dressoir_mccp.jpg'): 0.4840072989463806,
 ('tafel_nk.jpg', 'speeltafel_mccp.png'): 0.5526392459869385,
 ('dressoir_nk.jpg', 'stoel_mccp.jpg'): 0.6967198848724365,
 ('dressoir_nk.jpg', 'tafel_mccp.jpg'): 0.688

In [152]:
# Manual walk-through of the first preprocessing steps on one test image.
# The names defined here (imagee2, thresh) are reused by the following cell.
# Flag -1 loads the file unchanged.
imagee2 = cv2.imread("/home/hamid/Downloads/nk_testset/stoel_nk.jpg", -1)

# Contrast-limited adaptive histogram equalisation.
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
e_image2 = clahe.apply(imagee2)
# Binary threshold at 127 with a maximum value of 255.
_, thresh = cv2.threshold(e_image2, 127, 255, cv2.THRESH_BINARY)

In [125]:
# Continues the manual preprocessing walk-through; relies on `imagee2` and
# `thresh` defined in the preceding cell.
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Largest contour first.
contours = sorted(contours, key=cv2.contourArea, reverse=True)

# White mask with the largest contour filled in black.
mask = np.full(imagee2.shape, 255, np.uint8)
cv2.drawContours(mask, contours, 0, 0, -1)
new_img = cv2.add(thresh, mask)
kernel = np.ones((5,5), dtype=np.uint8)
# NOTE(review): this call used to be `cv2.erode(new_img, kernel, 10)`. The third
# positional parameter of cv2.erode is `dst`, not `iterations`, so the 10 had no
# effect and a single pass was applied. That behaviour is kept and made explicit.
new_img = cv2.erode(new_img, kernel, iterations=1)
# Invert the image.
new_img = np.max(new_img) - new_img
# Opens a native OpenCV window and blocks the cell until a key is pressed.
cv2.imshow("img", new_img)
cv2.imwrite("stoel_nk_preprocessed.jpg", new_img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [142]:
def get_table(sims, excel_path=None):
    """
    Turn a similarity dictionary into a table.

    Parameters
    ----------
    sims : dict
        Output of compute_similarities or compute_similarities_testsets:
        keys are ``(first file name, second file name)`` tuples, values are
        the similarities.
    excel_path : str, optional
        When given, the table is also written to this path with
        ``DataFrame.to_excel``. Nothing is written by default.

    Returns
    -------
    pandas.DataFrame
        One row per first file name and one column per second file name,
        both without their file extension and in order of first appearance.
        Pairs that are missing from ``sims`` are NaN.
    """

    # Dicts are used as ordered sets so labels keep their first-seen order.
    row_labels = {}
    col_labels = {}
    cells = {}

    for (first_name, second_name), value in sims.items():
        # splitext leaves names without an extension untouched, whereas
        # name[:name.rfind(".")] would cut off their last character.
        row = os.path.splitext(first_name)[0]
        col = os.path.splitext(second_name)[0]
        row_labels.setdefault(row)
        col_labels.setdefault(col)
        cells[(row, col)] = value

    table = pd.DataFrame(
        [[cells.get((row, col), float("nan")) for col in col_labels] for row in row_labels],
        index=list(row_labels),
        columns=list(col_labels),
    )

    if excel_path is not None:
        table.to_excel(excel_path)
    return table
    
# Show the test-set similarities as a table (the returned frame is the cell output).
get_table(sims)

Unnamed: 0,stoel_mccp,tafel_mccp,kast_mccp,dressoir_mccp,speeltafel_mccp
kast_nk,0.660421,0.661928,0.853438,0.841007,0.686943
speeltafel_nk,0.548683,0.570109,0.554375,0.545816,0.636547
tafel_nk,0.482328,0.558129,0.517739,0.484007,0.552639
dressoir_nk,0.69672,0.688964,0.709567,0.752601,0.590738
stoel_nk,0.778562,0.685584,0.7317,0.749232,0.589983
