In [None]:
import pandas as pd
import numpy as np
import cv2  # for image processing
import scipy.io
import os
from arts_preprocess_utils import load_dataset, get_image
import scipy
#from scipy.spatial.distance import directed_hausdorff, cosine
from IPython import display
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
from graph_utils import get_edge_dict, plot_graph

In [None]:
import zipfile

# Training images archive (dataset from Kaggle). The handle stays open because
# later cells extract individual members from it on demand.
filezip = zipfile.ZipFile(file='/root/work/datasets/train.zip', mode='r')

## Load Data

**Artwork metadata**

In [None]:
# Artwork metadata table ("_with_url" variant of the encoded training set).
# The variant without the suffix is train_mayors_style_encoded.csv in the same folder.
df_artworks = pd.read_csv(
    filepath_or_buffer='/root/work/datasets/train_mayors_style_encoded_with_url.csv'
)
df_artworks.head(5)

**Artwork codes**

In [None]:
# Artwork codes: array of encoded artworks ("_w_encoded" variant).
# The alternative file is train_mayors_style_encode.npy in the same folder.
artwork_code_matrix = np.load(file='/root/work/datasets/train_mayors_style_w_encoded.npy')
artwork_code_matrix.shape

**Influence graph**

In [None]:
# Edge list of the artist influence graph.
df_edges = pd.read_csv(filepath_or_buffer='/root/work/datasets/artist-influences-edges.csv')
df_edges.head(5)

In [None]:
# Build the artist influence graph from the edge table.
artist_dict = get_edge_dict(
    df=df_edges,
    col_to_index='Artist',
    col_to_split='Influence',
    col_to_clean='Influence',
)

g_artist = nx.from_dict_of_lists(artist_dict)

# Tag every edge and node with a constant attribute value.
nx.set_edge_attributes(G=g_artist, values='red', name='color')
nx.set_node_attributes(G=g_artist, values='artist', name='type')

**Artwork pivot**

In [None]:
# Load one specific image file through the project helper get_image.
# NOTE(review): image_conflict is not referenced by any other cell visible here — confirm it is still needed.
image_conflict = get_image('agusil-pelo-naranja.jpg')

In [None]:
# Keep only the artworks whose artist is Claude Monet.
is_monet = df_artworks['artist'].eq('Claude Monet')
df_monet = df_artworks[is_monet]
df_monet.head(n=10)

In [None]:
# Show the query artwork: the Monet row with index label 627.
img_path = filezip.extract(member='train/' + df_monet['filename'].loc[627])
image = get_image(img_path)

fig = plt.gcf()
fig.set_size_inches((14.5, 6.5))

# Reverse the last axis of the first element before display
# (presumably BGR -> RGB because images are read with OpenCV — confirm in get_image).
plt.imshow(image[0][..., ::-1])

In [None]:
# Persist the query figure. Create the output directory first: savefig does not
# create missing folders and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./image_result', exist_ok=True)
fig.savefig('./image_result/query.jpg')

In [None]:
# Code vector of the query artwork, taken as row 627 of the code matrix.
# NOTE(review): the image shown earlier is looked up by index label 627 in df_artworks;
# this assumes metadata rows and code-matrix rows are aligned — TODO confirm.
code_image = artwork_code_matrix[627]

## Get similar artworks 

**Cosine similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query code (as a single-row matrix) and every
# artwork code; the result has one row with one score per artwork.
sim_matrix = cosine_similarity(X=code_image.reshape(1, -1), Y=artwork_code_matrix)

**Earth mover's distance**

In [None]:
from scipy.stats import wasserstein_distance

# Earth mover's distance between the query code and every artwork code.
query_code = code_image.reshape((-1,))  # loop-invariant: flatten once
sim_list = [
    wasserstein_distance(query_code, artwork_code)
    for artwork_code in artwork_code_matrix
]

# wasserstein_distance is a *distance* (0 = identical, larger = less alike), but the
# ranking cells that consume `sim_matrix` keep its LARGEST values and sort descending.
# Map the distance to a similarity in (0, 1] so "larger = more similar" holds for this
# variant exactly as it does for the cosine-similarity variant.
sim_matrix = 1.0 / (1.0 + np.array(sim_list))
sim_matrix = sim_matrix.reshape((1, -1))

In [None]:
# Positions of the scores in ascending order (sorted along the last axis).
index_sorted = sim_matrix.argsort()

In [None]:
# Tail of the ascending ordering: positions of the 1001 largest scores, best last.
top_n = index_sorted[0, -1001:]

In [None]:
# Scores of the selected artworks, read from the flattened score array.
top_n_matrix = sim_matrix.ravel()[top_n]
top_n_matrix.shape

**Get top n metadata**

In [None]:
# Metadata rows of the selected artworks, in the same order as `top_n`.
# .copy() makes df_top_n an independent frame: without it pandas flags the result as
# derived from df_artworks and the column assignment below raises SettingWithCopyWarning.
df_top_n = df_artworks.iloc[top_n].copy()
df_top_n['sim_distance'] = top_n_matrix
df_top_n.head()

In [None]:
# Artist of the last row of df_top_n (the highest-scoring entry of the selection).
df_top_n['artist'].iloc[-1]

**Re-rank taking artist influence into account**

In [None]:
# Pre-compute shortest-path lengths between artists:
# length[source][target] -> number of hops in g_artist (only reachable targets appear).
length = {
    source: hops_by_target
    for source, hops_by_target in nx.all_pairs_shortest_path_length(g_artist)
}

In [None]:
# Running count of same-artist rows already scored; sim_influence reads and
# increments this module-level counter through a `global` statement.
artist_ocurrence = 0

In [None]:
def sim_influence(sim_distance, artist_source, artist_target):
    """Re-weight a score by how closely two artists are related in the influence graph.

    Parameters
    ----------
    sim_distance : float
        Score of the candidate artwork with respect to the query artwork.
    artist_source : str
        Artist of the query artwork.
    artist_target : str
        Artist of the candidate artwork.

    Returns
    -------
    float
        * Same artist: ``sim_distance / 2**k`` where ``k`` is the number of same-artist
          calls made so far (the global ``artist_ocurrence`` is then incremented).
        * Target reachable from source: ``sim_distance / shortest_path_length``.
        * Otherwise (unreachable target, or source missing from ``length``):
          ``sim_distance / 100``.

    Notes
    -----
    Reads the module-level ``length`` mapping and mutates the module-level
    ``artist_ocurrence`` counter, so results depend on call order.
    """
    global artist_ocurrence
    if artist_source == artist_target:
        # Each further artwork by the same artist is halved again.
        artist_decay = 2 ** artist_ocurrence
        artist_ocurrence += 1
        return sim_distance * (1./artist_decay)

    # .get on both levels: an artist missing from the graph is treated like an
    # unreachable one instead of raising KeyError.
    hops = length.get(artist_source, {}).get(artist_target)
    if hops:  # None (unreachable) and 0 (would divide by zero) both fall through
        return sim_distance * (1./hops)
    return sim_distance * (1./100)

In [None]:
# sim_influence keeps a global running counter of same-artist rows. Reset it here so
# that re-running this cell yields the same scores instead of ever-growing decay.
artist_ocurrence = 0

# NOTE(review): rows are visited in df_top_n order, so the same-artist decay grows
# along that order — confirm this is the intended direction.
df_top_n['sim_influence'] = df_top_n.apply(
    lambda row: sim_influence(sim_distance=row['sim_distance'],
                              artist_source='Claude Monet',
                              artist_target=row['artist']),
    axis=1)

df_top_n.head()

In [None]:
# Rank by influence-adjusted score, best first, and keep the first five rows
# (the variable is called df_top_ten but holds five rows).
df_top_ten = (
    df_top_n
    .sort_values(by='sim_influence', ascending=False)
    .head(n=5)
)
df_top_ten

In [None]:
# Discard rows that have no image URL.
df_top_ten = df_top_ten[df_top_ten['imageUrl'].notna()]

In [None]:
# {row label: {'title': ..., 'artist': ..., 'imageUrl': ...}} for the selected artworks.
sim_artworks = df_top_ten.loc[:, ['title', 'artist', 'imageUrl']].T.to_dict()
sim_artworks

In [None]:
import json

# Flatten {row label: record} into a list of records, storing the label under 'id'.
# The records are the same dict objects held by sim_artworks (updated in place).
values = list(sim_artworks.values())
data = []
for artwork_id, record in zip(sim_artworks.keys(), values):
    record['id'] = artwork_id
    data.append(record)

data

**Plot artworks**

In [None]:
# Grid of r x c panels showing the selected artworks.
r, c = 2, 5

relative_path = list(df_top_ten['filename'])
fig, axs = plt.subplots(r, c, squeeze=False)  # squeeze=False: axs is always 2-D
fig.set_size_inches(18.5, 10.5)

# The grid has r*c panels but df_top_ten may hold fewer artworks (it is cut to five
# rows and rows without URL are dropped); indexing relative_path blindly raised
# IndexError. Draw what exists and leave the remaining panels blank.
for cnt, ax in enumerate(axs.flat):
    if cnt < len(relative_path):
        img_path = filezip.extract('train/' + relative_path[cnt])
        # Reverse the last axis of the first element before display
        # (presumably BGR -> RGB — confirm in get_image).
        ax.imshow(get_image(img_path)[0][..., ::-1])
    ax.axis('off')

plt.show()


In [None]:
# Persist the result grid. Create the output directory first: savefig does not
# create missing folders and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./image_result', exist_ok=True)
fig.savefig('./image_result/query_result_wasserstein_II.jpg')