# Sequence generation based on the artwork most similar to the previous ones

In [1]:
import numpy as np
import pandas as pd
import os

## Load data

**Load artwork code and metadata**

In [2]:
# Dataset root: the metadata CSV and the code matrix are read from here.
BASE_PATH = '/root/work/datasets/artwork_sequence/'
# Directory with the train/test split files (X_train / X_test CSVs and matrices).
CONFIG_PATH = '/root/work/artwork_sequence/train_test_configuration'

In [3]:
# Locate the two dataset files first, then read them.
metadata_csv = os.path.join(BASE_PATH, 'all_metadata.csv')
code_matrix_npy = os.path.join(BASE_PATH, 'all_code_matrix.npy')

df_all_metadata = pd.read_csv(metadata_csv)
all_data_matrix = np.load(code_matrix_npy)

In [4]:
# Sanity check: predictions look up `df_all_metadata` with row positions taken
# from `all_data_matrix`, so both must have the same number of rows.
assert df_all_metadata.shape[0] == all_data_matrix.shape[0], (
    'metadata rows and code-matrix rows are misaligned'
)
print(df_all_metadata.shape)
print(all_data_matrix.shape)

(633, 6)
(633, 300)


**Load tours**

In [5]:
# File name of each split artifact inside CONFIG_PATH, keyed by the name used
# to look it up later.
_split_files = (
    ('x_train', 'X_train.csv'),
    ('x_test', 'X_test.csv'),
    ('x_train_matrix', 'X_train_matrix.npy'),
    ('x_test_matrix', 'X_test_matrix.npy'),
)
museum_sequence_path = {
    split: os.path.join(CONFIG_PATH, file_name) for split, file_name in _split_files
}
museum_sequence_path

{'x_test': '/root/work/artwork_sequence/train_test_configuration/X_test.csv',
 'x_test_matrix': '/root/work/artwork_sequence/train_test_configuration/X_test_matrix.npy',
 'x_train': '/root/work/artwork_sequence/train_test_configuration/X_train.csv',
 'x_train_matrix': '/root/work/artwork_sequence/train_test_configuration/X_train_matrix.npy'}

In [6]:
# Tour listings of each split; the first CSV column is used as the index.
train_csv, test_csv = museum_sequence_path['x_train'], museum_sequence_path['x_test']
df_x_train = pd.read_csv(train_csv, index_col=0)
df_x_test = pd.read_csv(test_csv, index_col=0)

# Matrices stored alongside the split CSVs.
x_train_matrix, x_test_matrix = (
    np.load(museum_sequence_path[key]) for key in ('x_train_matrix', 'x_test_matrix')
)
df_x_train.head()

Unnamed: 0,tour_path
20,/root/work/datasets/artwork_sequence/rijksmuse...
7,/root/work/datasets/artwork_sequence/rijksmuse...
40,/root/work/datasets/artwork_sequence/prado_cra...
0,/root/work/datasets/artwork_sequence/rijksmuse...
23,/root/work/datasets/artwork_sequence/prado_cra...


In [7]:
# Path of the first tour listed in the test split.
df_x_test['tour_path'].iloc[0]

'/root/work/datasets/artwork_sequence/prado_crawler/tour_18'

In [8]:
# Work on the first test tour: its length is the number of entries in its
# `images` folder, and its codes are taken as the leading rows of the test matrix.
# NOTE(review): assumes the first tour's rows come first in x_test_matrix - confirm.
first_tour_path = df_x_test['tour_path'].values[0]
images_path = os.path.join(first_tour_path, 'images')
tour_length = len(os.listdir(images_path))
X_tour = x_test_matrix[:tour_length]
X_tour.shape

(13, 300)

In [9]:
# Metadata file stored in the folder of the first test tour.
tour_dir = df_x_test['tour_path'].values[0]
metadata_path = os.path.join(tour_dir, 'metadata.csv')
df_X_tour = pd.read_csv(metadata_path)
df_X_tour.head()

Unnamed: 0,id,author,data,image_url,title
0,1,"angelico, fra",1425 - 1426. tempera on poplar panel,https://content3.cdnprado.net/imagenes/Documen...,the annunciation
1,2,"weyden, rogier van der",before 1443. oil on panel,https://content3.cdnprado.net/imagenes/Documen...,the descent from the cross
2,3,"juanes, juan de (vicente juan masip)",1555 - 1562. oil on panel,https://content3.cdnprado.net/imagenes/Documen...,the last supper
3,4,anonymous,xii century. fresco painting on mural transfer...,https://content3.cdnprado.net/imagenes/Documen...,"christ pantocrator held by four angels, hermit..."
4,5,"piombo, sebastiano del",1516. oil on canvas,https://content3.cdnprado.net/imagenes/Documen...,christs descent into limbo


## Find similar artwork

In [10]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances,euclidean_distances

In [42]:
# Empty table that will hold the predicted artworks.
_predicted_columns = ('title', 'author', 'sim_value', 'tour_path', 'image_url')
df_predicted_tour = pd.DataFrame({column: [] for column in _predicted_columns})
df_predicted_tour

Unnamed: 0,author,image_url,sim_value,title,tour_path


In [43]:
def get_sim_matrix(code, all_data_matrix):
    """Cosine similarity between the mean of `code` and every row of `all_data_matrix`.

    `code` is averaged over axis 0 and the resulting vector is compared, as a
    single-row query, against `all_data_matrix`.

    Returns the output of `cosine_similarity` for that one-row query.
    """
    query = np.mean(code, axis=0).reshape((1, -1))
    return cosine_similarity(query, all_data_matrix)
    

In [44]:
def get_artwork_index(sim_matrix):
    """Return the index of the most similar artwork, skipping an exact self-match.

    Parameters
    ----------
    sim_matrix : array-like
        Similarity scores of one query against every candidate, given either
        as a 1-D vector or as a single-row matrix.

    Returns
    -------
    int
        Position of the highest score. When that score is numerically 1 the
        best match is taken to be the query artwork itself, so the runner-up
        is returned instead (if there is one).

    Raises
    ------
    ValueError
        If `sim_matrix` contains no scores.
    """
    # Flatten once and use the same view for ranking and for reading the score,
    # so 1-D and single-row inputs behave the same.
    scores = np.asarray(sim_matrix).reshape(-1)
    if scores.size == 0:
        raise ValueError('sim_matrix is empty: no artwork to choose from')

    # Ascending order: the best candidate is last, the runner-up second to last.
    ranking = np.argsort(scores)
    best = ranking[-1]

    # With a single candidate there is no runner-up to fall back to.
    if scores.size > 1 and np.isclose(scores[best], 1.):
        return ranking[-2]
    return best

In [45]:
# Slide a window over the tour; for each window, predict the next artwork as the
# one in the whole collection whose code is most similar to the window's mean code.
window_size = 3

predicted_rows = []
predicted_code_list = []

for start in range(X_tour.shape[0] - window_size):

    # Codes of the artworks inside the current window
    code = X_tour[start:start + window_size, :]

    # Similarity of the window against every artwork, then the chosen artwork
    sim_matrix = get_sim_matrix(code, all_data_matrix)
    sim_artwork_index = get_artwork_index(sim_matrix)

    # Look the metadata row up once and keep it as a plain dict; the DataFrame
    # is built a single time after the loop.
    artwork = df_all_metadata.iloc[sim_artwork_index]
    predicted_rows.append({'title': artwork['title'],
                           'author': artwork['author'],
                           'tour_path': artwork['tour_path'],
                           'image_url': artwork['image_url'],
                           'sim_value': sim_matrix[:, sim_artwork_index][0]})

    # Save predicted code
    predicted_code_list.append(all_data_matrix[sim_artwork_index])

# Rebuilt from scratch on every run, so re-executing this cell does not pile up
# duplicate rows. `DataFrame.append` is avoided: it copies the whole frame on
# each call and was removed in pandas 2.0.
df_predicted_tour = pd.DataFrame(
    predicted_rows,
    columns=['author', 'image_url', 'sim_value', 'title', 'tour_path'])
df_predicted_tour

False
False
False
False
False
False
False
False
False
False


Unnamed: 0,author,image_url,sim_value,title,tour_path
0,"juanes, juan de (vicente juan masip)",https://content3.cdnprado.net/imagenes/Documen...,0.887453,the last supper,/root/work/datasets/artwork_sequence/prado_cra...
1,"juanes, juan de (vicente juan masip)",https://content3.cdnprado.net/imagenes/Documen...,0.90431,the last supper,/root/work/datasets/artwork_sequence/prado_cra...
2,"juanes, juan de (vicente juan masip)",https://content3.cdnprado.net/imagenes/Documen...,0.900055,the last supper,/root/work/datasets/artwork_sequence/prado_cra...
3,"rubens, peter paul",https://content3.cdnprado.net/imagenes/Documen...,0.923725,the adoration of the magi,/root/work/datasets/artwork_sequence/prado_cra...
4,"rubens, peter paul",https://content3.cdnprado.net/imagenes/Documen...,0.942827,the adoration of the magi,/root/work/datasets/artwork_sequence/prado_cra...
5,"veronese, paolo (paolo cagliari)",https://content3.cdnprado.net/imagenes/Documen...,0.926839,christ among the doctors in the temple,/root/work/datasets/artwork_sequence/prado_cra...
6,"veronese, paolo (paolo cagliari)",https://content3.cdnprado.net/imagenes/Documen...,0.921281,christ among the doctors in the temple,/root/work/datasets/artwork_sequence/prado_cra...
7,"tintoretto, jacopo robusti",https://content3.cdnprado.net/imagenes/Documen...,0.895887,the washing of the feet,/root/work/datasets/artwork_sequence/prado_cra...
8,"zurbaran, francisco de",https://content3.cdnprado.net/imagenes/Documen...,0.910291,agnus dei,/root/work/datasets/artwork_sequence/prado_cra...
9,"velazquez, diego rodriguez de silva y",https://content3.cdnprado.net/imagenes/Documen...,0.914807,the crucified christ,/root/work/datasets/artwork_sequence/prado_cra...


## Evaluate model

**MAE between predicted artworks and true artworks**

In [46]:
from sklearn.metrics import mean_absolute_error

In [47]:
# Stack the predicted codes into a matrix, one row per prediction.
forecast_matrix = np.stack(predicted_code_list, axis=0)
forecast_matrix.shape

(10, 300)

In [49]:
# MAE of each predicted code against the true one: transposing turns every
# artwork into one "output" for `multioutput='raw_values'`. The per-artwork
# errors are then averaged.
true_codes = X_tour[window_size:, :]
mae_per_artwork = mean_absolute_error(true_codes.T, forecast_matrix.T, multioutput='raw_values')
np.mean(mae_per_artwork)

0.49383798