# Sequence generation based on the artwork most similar to the previous ones

In [1]:
import numpy as np
import pandas as pd
import os

## Load data

**Load artwork code and metadata**

In [2]:
# Dataset root: the metadata CSV and the code matrix of all artworks are read from here
BASE_PATH = '/root/work/datasets/artwork_sequence/'
# Folder with the train/test split files (X_train/X_test CSVs and .npy matrices)
CONFIG_PATH = '/root/work/artwork_sequence/train_test_configuration'
# Folder where the predicted tour is written as a CSV file
RESULT_PATH = '/root/work/artwork_sequence/predicted_tours/generated_sequence_based_previous_most_similar'

In [3]:
# Resolve both catalogue files first, then load them
all_metadata_file = os.path.join(BASE_PATH, 'all_metadata.csv')
all_code_matrix_file = os.path.join(BASE_PATH, 'all_code_matrix.npy')

df_all_metadata = pd.read_csv(all_metadata_file)
all_data_matrix = np.load(all_code_matrix_file)

In [4]:
# Metadata rows and code-matrix rows are indexed together later on,
# so show the shape of both objects
for loaded in (df_all_metadata, all_data_matrix):
    print(loaded.shape)

(633, 6)
(633, 300)


**Load tours**

In [5]:
# Map each split name to its file inside CONFIG_PATH
museum_sequence_path = {
    split_name: os.path.join(CONFIG_PATH, file_name)
    for split_name, file_name in (
        ('x_train', 'X_train.csv'),
        ('x_test', 'X_test.csv'),
        ('x_train_matrix', 'X_train_matrix.npy'),
        ('x_test_matrix', 'X_test_matrix.npy'),
    )
}
museum_sequence_path

{'x_test': '/root/work/artwork_sequence/train_test_configuration/X_test.csv',
 'x_test_matrix': '/root/work/artwork_sequence/train_test_configuration/X_test_matrix.npy',
 'x_train': '/root/work/artwork_sequence/train_test_configuration/X_train.csv',
 'x_train_matrix': '/root/work/artwork_sequence/train_test_configuration/X_train_matrix.npy'}

In [6]:
# Tour listings of each split (the first CSV column is used as the index)
df_x_train, df_x_test = (
    pd.read_csv(museum_sequence_path[split_name], index_col=0)
    for split_name in ('x_train', 'x_test')
)
# Code matrices of each split
x_train_matrix, x_test_matrix = (
    np.load(museum_sequence_path[split_name])
    for split_name in ('x_train_matrix', 'x_test_matrix')
)
df_x_train.head()

Unnamed: 0,tour_path
20,/root/work/datasets/artwork_sequence/rijksmuse...
7,/root/work/datasets/artwork_sequence/rijksmuse...
40,/root/work/datasets/artwork_sequence/prado_cra...
0,/root/work/datasets/artwork_sequence/rijksmuse...
23,/root/work/datasets/artwork_sequence/prado_cra...


In [7]:
# Path of the test tour at position 1, the one loaded in the following cells
df_x_test['tour_path'].iloc[1]

'/root/work/datasets/artwork_sequence/rijksmuseum_crawler/tour_14'

In [8]:
tour_dir = df_x_test['tour_path'].values[1]
images_path = os.path.join(tour_dir, 'images')
# One file per artwork image gives the number of artworks in the tour
tour_length = len(os.listdir(images_path))

# NOTE(review): the start row 13 is hard-coded; presumably it is the number of
# rows taken by the tour(s) stored before this one in x_test_matrix - confirm
tour_start = 13
X_tour = x_test_matrix[tour_start:tour_start + tour_length]
X_tour.shape

(13, 300)

In [9]:
# Metadata of the artworks of the selected test tour
tour_dir = df_x_test['tour_path'].values[1]
metadata_path = os.path.join(tour_dir, 'metadata.csv')
df_X_tour = pd.read_csv(metadata_path)
df_X_tour.head()

Unnamed: 0,id,author,data,image_url,title
0,1,pieter claesz.,1627,https://lh3.googleusercontent.com/uqHsSOyPEFvu...,still life with a turkey pie
1,2,jan havicksz. steen,1655,https://lh3.googleusercontent.com/fLsffYqD3Ex7...,adolf and catharina croeser known as the burgo...
2,3,jan brueghel (ii),c. 1625 - c. 1630,https://lh3.googleusercontent.com/Z3PTxWguIfYH...,still life with flowers in a glass
3,4,jacob jordaens (i),c. 1635 - c. 1636,https://lh3.googleusercontent.com/iogmFpVbkXpz...,portrait of magdalena de cuyper
4,5,jacob coeman,1665,https://lh3.googleusercontent.com/-iNszdtfmRNt...,pieter cnoll cornelia van nijenrode and their ...


## Find similar artwork

In [10]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances,euclidean_distances

In [11]:
# Empty table that will receive one row per predicted artwork
predicted_tour_columns = ('title', 'author', 'sim_value', 'tour_path', 'image_url')
df_predicted_tour = pd.DataFrame({column: [] for column in predicted_tour_columns})
df_predicted_tour

Unnamed: 0,author,image_url,sim_value,title,tour_path


In [12]:
def get_sim_matrix(code, all_data_matrix):
    """Cosine similarity between the mean of ``code`` and each candidate code.

    Parameters
    ----------
    code : numpy.ndarray, 2-D
        Codes that are averaged over axis 0 into a single query vector.
    all_data_matrix : numpy.ndarray, 2-D
        Candidate codes, one row per candidate, with as many columns as
        ``code``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_candidates)`` holding the similarity of the
        query vector to every row of ``all_data_matrix``.
    """
    # Average the codes into one query vector
    mean_code = np.mean(code, axis=0)

    # cosine_similarity works on 2-D inputs, hence the (1, n_features) reshape
    return cosine_similarity(mean_code.reshape((1, -1)), all_data_matrix)
    

In [13]:
def get_artwork_index(sim_matrix):
    """Return the position of the most similar candidate in ``sim_matrix``.

    When the best score is (numerically) 1 the best candidate is treated as
    the query artwork itself and the runner-up is returned instead. If there
    is no runner-up (a single candidate), that single candidate is returned
    rather than failing.

    Parameters
    ----------
    sim_matrix : array-like
        Similarity scores, one per candidate; the notebook passes the
        ``(1, n_candidates)`` output of ``get_sim_matrix``.

    Returns
    -------
    numpy integer
        Index of the selected candidate along the flattened scores.

    Raises
    ------
    IndexError
        If ``sim_matrix`` contains no score at all.
    """
    scores = np.asarray(sim_matrix).reshape((-1,))

    # Ascending order: the best candidate is last
    sort_index = np.argsort(scores)
    best_index = sort_index[-1]

    # A score of 1 means the top hit is the current artwork: skip it,
    # but only when another candidate exists
    if sort_index.size > 1 and np.isclose(scores[best_index], 1.):
        return sort_index[-2]
    return best_index

In [14]:
def drop_selected_artwork(indexes, df_all_metadata, all_data_matrix):
    """Return the metadata and the code matrix without the given artworks.

    The inputs are not modified: ``DataFrame.drop`` and ``numpy.delete`` both
    return new objects, so no explicit copy is needed.

    Parameters
    ----------
    indexes : int or list of int
        Artworks to remove. They are used as index *labels* for the metadata
        and as row *positions* for the matrix, so ``df_all_metadata`` is
        expected to carry the default ``0..n-1`` index, where both coincide.
    df_all_metadata : pandas.DataFrame
        Metadata, one row per artwork.
    all_data_matrix : numpy.ndarray
        Codes, one row per artwork.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Metadata with a fresh ``0..m-1`` index and the matching code matrix.
    """
    # Remove from metadata and renumber the rows
    df_removed = df_all_metadata.drop(indexes).reset_index(drop=True)

    # Remove from code matrix
    code_matrix = np.delete(all_data_matrix, indexes, 0)

    return df_removed, code_matrix

In [None]:
def find_artworks_indexes(i, window_size, df_all_metadata, df_X_tour):
    """Locate the artworks of a tour window inside ``df_all_metadata``.

    The window covers rows ``i`` to ``i + window_size - 1`` of ``df_X_tour``.
    An artwork matches when both its ``author`` and its ``title`` are equal.

    Returns
    -------
    list
        Index label of the first match of each window artwork, in window
        order; artworks without any match are skipped.
    """
    found = []
    for position in range(i, i + window_size):
        artwork = df_X_tour.iloc[position]
        same_author = df_all_metadata['author'] == artwork['author']
        same_title = df_all_metadata['title'] == artwork['title']
        matches = df_all_metadata.index[same_author & same_title]

        # No match means the artwork was already removed from the metadata
        if len(matches) > 0:
            found.append(matches[0])

    return found
    

In [None]:
predicted_code_list = []
predicted_records = []

window_size = 3

# Search in a private view of the catalogue: `df_all_metadata` and
# `all_data_matrix` stay untouched, so this cell can be re-run safely.
df_pool_metadata = df_all_metadata.reset_index(drop=True)
pool_data_matrix = all_data_matrix

for i in range(X_tour.shape[0] - window_size):

    # Codes of the artworks in the current window
    code = X_tour[i:i + window_size, :]

    # The window artworks themselves must not be proposed as the next artwork
    window_indexes = find_artworks_indexes(i, window_size, df_pool_metadata, df_X_tour)
    df_current_metadata, current_data_matrix = drop_selected_artwork(
        window_indexes, df_pool_metadata, pool_data_matrix)

    # Similarity of the window to every remaining candidate
    sim_matrix = get_sim_matrix(code, current_data_matrix)

    # Position of the most similar candidate *in the reduced catalogue*
    sim_artwork_index = get_artwork_index(sim_matrix)

    # Read the chosen artwork from the reduced catalogue: its positions are
    # shifted with respect to the pool once the window artworks are removed
    chosen_artwork = df_current_metadata.iloc[sim_artwork_index]
    predicted_records.append({'title': chosen_artwork['title'],
                              'author': chosen_artwork['author'],
                              'tour_path': chosen_artwork['tour_path'],
                              'image_url': chosen_artwork['image_url'],
                              'sim_value': sim_matrix[:, sim_artwork_index][0]})

    # Save predicted code
    predicted_code_list.append(current_data_matrix[sim_artwork_index])

    # Translate the reduced-catalogue position back to a pool position:
    # candidate k of the reduced catalogue is row candidate_rows[k] of the pool
    candidate_rows = np.delete(np.arange(pool_data_matrix.shape[0]),
                               np.asarray(window_indexes, dtype=int))
    pool_index = candidate_rows[sim_artwork_index]

    # Remove the chosen artwork so it cannot be predicted twice
    df_pool_metadata, pool_data_matrix = drop_selected_artwork(
        pool_index, df_pool_metadata, pool_data_matrix)

# Build the table once (DataFrame.append no longer exists in pandas >= 2.0),
# keeping the column layout of the empty table created earlier
df_predicted_tour = pd.DataFrame(predicted_records, columns=df_predicted_tour.columns)
df_predicted_tour

In [None]:
# Name the file after the tour that was actually predicted: X_tour and
# df_X_tour are built from the test tour at position 1, not position 0
tour_name = df_x_test['tour_path'].values[1].split('/')[-1]

# to_csv does not create missing folders
os.makedirs(RESULT_PATH, exist_ok=True)
df_predicted_tour.to_csv(os.path.join(RESULT_PATH, 'predicted_' + tour_name + '.csv'), index=False)

## Evaluate model

**MAE between predicted artworks and true artworks**

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# One row per predicted artwork code
forecast_matrix = np.stack(predicted_code_list, axis=0)
forecast_matrix.shape

In [None]:
# True codes of the artworks that follow the first window
true_codes = X_tour[window_size:, :]
# After transposing, every artwork is one "output": one MAE value per artwork
mae_per_artwork = mean_absolute_error(true_codes.T, forecast_matrix.T, multioutput='raw_values')
np.mean(mae_per_artwork)