# Script to find similar homes using Autoencoder embedding

In [None]:
# --- Standard library ---------------------------------------------------------
import importlib
import os
import pickle
import random
import sys
from random import randint

# Make the project package importable before pulling in its modules.
sys.path.insert(0, '../visual_home_finder')

# --- Project-local modules ----------------------------------------------------
import config
import paths
import utilities

# Re-import the project modules so edits made on disk are picked up without
# restarting the kernel. `importlib.reload` replaces `imp.reload`: the `imp`
# module was deprecated since Python 3.4 and removed in Python 3.12.
importlib.reload(config)
importlib.reload(utilities)

# --- Third-party --------------------------------------------------------------
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import pearsonr, spearmanr
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing import image

# Default figure size for the image previews shown below.
plt.rcParams['figure.figsize'] = (5,5)

In [None]:
# --- Settings for the similarity search ---------------------------------------
# Path of the query ("favorite") image.
# Other images tried previously: '98105_27.jpg', '98117_83.jpg', '98117_81.jpg'
favorite_image = 'modern.jpg'

# True  -> use the favorite image above as the query.
# False -> use a listing picked from the data set as the query instead.
use_favorite_image = True

# Only listings whose cosine similarity to the query exceeds this are shown.
similarity_threshold = 0.9

Read in embeddings for all home listings

In [None]:
# Load the stored autoencoder feature vectors of all home listings.
# The first CSV column becomes the dataframe index.
feature_path_parts = [config.FEATURE_PATH, 'home_features_autoencoder.csv']
feature_file = os.path.sep.join(feature_path_parts)
home_listings_df = pd.read_csv(feature_file, index_col=0)

# Convert every 'home_feature' entry with utilities.str_to_array
# (presumably parsing the serialized text back into a numeric array).
home_listings_df['home_feature'] = home_listings_df['home_feature'].apply(
    utilities.str_to_array
)

Generate embeddings for the favorite image

In [None]:
if use_favorite_image:
    # Load the saved autoencoder from the configured model directory.
    model_name = "autoencoder_2.h5"
    my_model = load_model(os.path.sep.join([config.MODEL_PATH, model_name]))

    # Build a truncated model that outputs the activations of the
    # 'max_pooling2d_7' layer; these serve as the home-style features.
    # NOTE(review): the layer is looked up by name, so the saved model must
    # contain a layer with exactly this name.
    embedding_layer = my_model.get_layer('max_pooling2d_7')
    home_feature_model = Model(
        inputs=my_model.input,
        outputs=embedding_layer.output,
    )

In [None]:
# Print the layer-by-layer summary of the loaded autoencoder.
# `my_model` only exists when a favorite image is used (the model is loaded
# under the same flag), so guard the call to avoid a NameError otherwise.
if use_favorite_image:
    my_model.summary()

In [None]:
if use_favorite_image:
    # Read the favorite image, resized to a config.IMAGE_SIZE square, and
    # keep a float-array copy for display.
    display_size = (config.IMAGE_SIZE, config.IMAGE_SIZE)
    fav_img = image.load_img(favorite_image, target_size=display_size)
    fav_img_orig = image.img_to_array(fav_img)

    # Compute the feature vector of the favorite image with the truncated model.
    fav_feature = utilities.get_features_for_image_with_scaling(
        favorite_image,
        home_feature_model,
    )

    # Show the query image; dividing by 255 maps pixel values into [0, 1]
    # for imshow.
    plt.imshow(fav_img_orig / 255)
    plt.show()

In [None]:
def show_home_images(home_index, home_listings_df_local, similarity_score=None):
    """
    Print a listing's name (optionally with its similarity) and show its image.

    home_index is the index label of the listing; its image is read from
        <config.LISTINGS_PATH>/<home_index>.jpg
    home_listings_df_local is the dataframe that contains home listing
        information (not used here; kept so existing callers keep working)
    similarity_score is the similarity score for the home; None (the default)
        prints the name only
    """
    home_img_name = os.path.sep.join([config.LISTINGS_PATH, str(home_index) + '.jpg'])

    # Take the file name without directory or extension. The previous
    # `.strip(".jpg")` removed any leading/trailing '.', 'j', 'p', 'g'
    # characters (e.g. "garage.jpg" -> "arage"), and `split("/")` assumed
    # a POSIX path separator.
    home_name_only = os.path.splitext(os.path.basename(home_img_name))[0]

    # Compare against None so that a legitimate score of 0.0 is still printed.
    if similarity_score is not None:
        print(home_name_only + " Similarity: %.2f"%(similarity_score))
    else:
        print(home_name_only)

    img = image.load_img(home_img_name)
    img = image.img_to_array(img)
    # Scale pixel values into [0, 1] for imshow.
    plt.imshow(img/255)
    plt.show()

In [None]:
def vpearsonr(x_vector, y_array):
    """
    Pearson correlation of one vector against every row of a matrix.

    x_vector is a 1 x N array (it is flattened before use) and y_array is an
    M x N matrix. Returns a list of length M whose m-th entry is the Pearson
    correlation coefficient between x_vector and row m of y_array.
    """
    # Flatten the reference vector once; it is the same for every row.
    flat_x = np.transpose(np.ravel(x_vector))
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    return [pearsonr(flat_x, np.transpose(row))[0] for row in y_array]

In [None]:
def euclidean_similarity(x_vector, y_array):
    """
    Similarity derived from Euclidean distance: 1 / (1 + distance).

    x_vector is a K x N array (a single 1-D vector of length N is accepted
    and treated as 1 x N); y_array is an M x N array.
    Returns a K x M array whose [k, m] entry is the similarity between row k
    of x_vector and row m of y_array. Identical rows score 1.0 and the score
    decreases towards 0 as the distance grows.
    """
    # Local import: the previous body called `euclidean_distances`, which is
    # never imported in this file, so any call raised NameError. SciPy's
    # `cdist` yields the same pairwise-distance matrix.
    from scipy.spatial.distance import cdist

    x_rows = np.atleast_2d(np.asarray(x_vector, dtype=float))
    y_rows = np.atleast_2d(np.asarray(y_array, dtype=float))
    values = cdist(x_rows, y_rows, metric='euclidean')
    return 1.0/(1 + values)

In [None]:
# Using home-style features
# Choose the query embedding as a single-row matrix: either the favorite
# image's features or the features of one listing from the data set.
if use_favorite_image:
    selected_home_feature = np.reshape(fav_feature, [1,-1])
else:
    selected_ind = 76
    selected_home_feature = np.reshape(home_listings_df["home_feature"].iloc[selected_ind], [1,-1])

# Cosine similarity between the query and every listing (one score per listing).
all_listing_features = np.vstack(home_listings_df.home_feature).astype(float)
home_similarities = np.ravel(cosine_similarity(selected_home_feature, all_listing_features))

# Keep the listings scoring above the user-selected threshold ...
filtered_indices = np.flatnonzero(home_similarities > similarity_threshold)
home_similarities_filtered = home_similarities[filtered_indices]
# ... and order them from most to least similar.
sorted_similarity_arg = np.argsort(home_similarities_filtered)[::-1]
filtered_indices = filtered_indices[sorted_similarity_arg]
home_similarities_filtered = home_similarities_filtered[sorted_similarity_arg]

# Show the matching listings and count how many were displayed.
count = 0
for listing_pos, listing_score in zip(filtered_indices, home_similarities_filtered):
    # A score of (numerically) 1 means the favorite image itself is part of
    # the data set; do not show it as its own match.
    if use_favorite_image and abs(listing_score - 1.0) <= 1e-10:
        continue
    home_index = home_listings_df.index[listing_pos]
    show_home_images(home_index, home_listings_df, home_similarities[listing_pos])
    count += 1
num_listings_to_show = count

In [None]:
# 95th percentile of the similarity scores of all listings. Left as a bare
# expression so the notebook displays the value as the cell output
# (presumably used as a guide when choosing similarity_threshold).
np.percentile(home_similarities, 95)

In [None]:
# Generate t-SNE plot by using all images of home styles
# -WARNING. THIS IS TIME CONSUMING. DONT RUN OFTEN
# NOTE: this cell needs `home_feature_model`, which is only built when
# use_favorite_image is True.
num_samples = 300  # Maximum number of sample images from each class to plot

# For each style class, sample up to "num_samples" images and generate features.
# The single-row frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
class_feature_frames = []
for style in config.CLASSES:
    img_paths = list(paths.list_images(os.path.sep.join([config.ORIG_INPUT_DIR, style])))
    total_images = len(img_paths)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of images available for this class.
    selection = random.sample(range(total_images), min(num_samples, total_images))
    for ii in selection:
        image_feature = utilities.get_features_for_image_with_scaling(img_paths[ii], home_feature_model)
        class_feature_frames.append(pd.DataFrame({"class":style,
                                                  "home_feature":[image_feature]}))

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
if class_feature_frames:
    features_df = pd.concat(class_feature_frames)
else:
    features_df = pd.DataFrame([])

In [None]:
# Project the sampled class images, all listings and the selected home into
# one shared 2-D space for the t-SNE cluster plot.

# Row order of the stacked matrix: class samples, then all listings, then the
# selected home as the final row.
all_listings_home_np = np.vstack(home_listings_df.home_feature).astype(float)
features_np = np.vstack([
    np.vstack(features_df['home_feature']),
    all_listings_home_np,
    selected_home_feature,
])

# Reduce to 30 principal components before running t-SNE.
pca = PCA(n_components=30)
features_np_pca = pca.fit_transform(features_np)

# Embed the PCA-reduced features in two dimensions.
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` — confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=300)
features_2d_tsne = tsne.fit_transform(features_np_pca)

In [None]:
plt.rcParams['figure.figsize'] = (10,10)

# Split the joint t-SNE output back into the groups it was stacked from:
#   [ class-sample rows | listing rows | selected-home row ]
# Explicit offsets are used instead of negative slices because `[-nn:]` and
# `[0:-nn]` select the wrong rows when nn == 0 (`-0` is just `0`).
features_copy = np.copy(features_2d_tsne)
nn = all_listings_home_np.shape[0]
num_class_rows = features_copy.shape[0] - nn - 1

# The selected home is the last row; keep it as a 1 x 2 array.
selected_home_2d_tsne = np.reshape(features_copy[-1,:], [1,-1])
all_listings_2d_tsne = features_copy[num_class_rows:num_class_rows + nn, :]
features_copy = features_copy[:num_class_rows, :]

# 2-D coordinates of the listings that passed the similarity threshold,
# in the same order as filtered_indices.
home_features_2d_tsne = all_listings_2d_tsne[filtered_indices,:]

# Attach each class sample's 2-D coordinates to its row in features_df.
features_df['2DTSNE'] = list(features_copy)

In [None]:
# Scatter the 2-D t-SNE coordinates of the sampled images, one series per
# style class.
for style in config.CLASSES:
    style_rows = features_df['class'] == style
    plt_array = np.vstack(features_df.loc[style_rows, '2DTSNE'])
    xs, ys = plt_array[:,0], plt_array[:,1]
    plt.scatter(xs, ys, label=style, alpha=0.5)
plt.legend()

In [None]:
# Scatter the sampled images per style class, then overlay every listing
# (black dots) and the selected home (red cross).
for style in config.CLASSES:
    style_rows = features_df['class'] == style
    plt_array = np.vstack(features_df.loc[style_rows, '2DTSNE'])
    xs, ys = plt_array[:,0], plt_array[:,1]
    plt.scatter(xs, ys, label=style, alpha=0.5)
plt.legend()

listing_xs, listing_ys = all_listings_2d_tsne[:,0], all_listings_2d_tsne[:,1]
plt.plot(listing_xs, listing_ys, '.k', alpha=0.8)
plt.plot(selected_home_2d_tsne[0,0], selected_home_2d_tsne[0,1], 'rx', alpha=1, markersize=16)

In [None]:
# Scatter the sampled images per style class, overlay every listing and the
# selected home, and link the selected home to each similar listing.
for style in config.CLASSES:
    style_rows = features_df['class'] == style
    plt_array = np.vstack(features_df.loc[style_rows, '2DTSNE'])
    xs, ys = plt_array[:,0], plt_array[:,1]
    plt.scatter(xs, ys, label=style, alpha=0.5)
plt.legend()

plt.plot(all_listings_2d_tsne[:,0], all_listings_2d_tsne[:,1], '.k', alpha=0.8)
selected_x = selected_home_2d_tsne[0,0]
selected_y = selected_home_2d_tsne[0,1]
plt.plot(selected_x, selected_y, 'rx', alpha=1, markersize=16)

# Draw a dotted line from each similar listing to the selected home.
for match_score, match_point in zip(home_similarities_filtered, home_features_2d_tsne):
    # A score of (numerically) 1 is the selected home itself; no line needed.
    if abs(match_score - 1.0) <= 1e-10:
        continue
    plt.plot([match_point[0], selected_x],
             [match_point[1], selected_y],
             'k:', alpha=0.8)

plt.xlabel('Feature Space')
plt.ylabel('Feature Space')
plt.grid()

In [None]:
# Show randomly selected homes: as many as were shown in the similarity
# search. Listings are drawn independently, so one may appear more than once.
num_listings = len(home_similarities)
count = 1
while not count > num_listings_to_show:
    ii = randint(0, num_listings - 1)
    home_index = home_listings_df.index[ii]
    show_home_images(home_index, home_listings_df)
    count += 1

In [None]:
# Using Resnet Features
# Only runs when the query is a listing from the data set; `selected_ind` is
# assigned in that same mode by the home-style similarity cell.
if not use_favorite_image:
    selected_home_feature = home_listings_df["resnet_feature"].iloc[selected_ind]

    # Find the cosine similarity of selected home with other homes
    home_similarities = np.ravel(cosine_similarity(np.reshape(selected_home_feature, [1,-1]),
                        np.vstack(home_listings_df.resnet_feature)))
    # Listing positions ordered from most to least similar.
    similar_homes_arg = np.ravel(np.flip(np.argsort(home_similarities)))

    # Plot the images of the selected home and other similar homes.
    # Slicing caps the output at exactly num_listings_to_show entries; the
    # previous break-after-show loop displayed one listing even when that
    # number was 0. The former `use_favorite_image and iv==0` skip was
    # unreachable inside this branch and has been removed.
    count = 0
    for ii in similar_homes_arg[:num_listings_to_show]:
        home_index = home_listings_df.index[ii]
        show_home_images(home_index, home_listings_df, home_similarities[ii])
        count += 1