# Artist sequence prediction: find the most similar artist and artwork

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

In [None]:
# Root folder under which this notebook looks for the train/test files and
# the per-configuration sub-folders.
BASE_PATH = '/root/work/artwork_sequence/train_test_configuration'

# One sub-folder per configuration family, all rooted at BASE_PATH.
(
    MULTI_CONFIG_PATH,
    UNI_CONFIG_PATH,
    UNI_PRED_MULTI_CONFIG_PATH,
    ARTIST_CODE_CONFIG_PATH,
) = (
    os.path.join(BASE_PATH, folder)
    for folder in (
        'multivariate',
        'univariate',
        'univariate_predict_multiple',
        'artist_code',
    )
)

In [None]:
# Window configuration selected for this run; reused further down as the
# window size (length of the history).
window_index = 3

# Locations of the sequence data files, all directly under BASE_PATH.
museum_sequence_path = {
    key: os.path.join(BASE_PATH, file_name)
    for key, file_name in (
        ('x_train', 'X_train.csv'),
        ('x_test', 'X_test.csv'),
        ('x_train_matrix', 'X_train_matrix.npy'),
        ('x_test_matrix', 'X_test_matrix.npy'),
        ('artist_code_train_matrix', 'artist_code_train_matrix.npy'),
        ('artist_code_test_matrix', 'artist_code_test_matrix.npy'),
    )
}
museum_sequence_path

In [None]:
def get_trained_weights_path(CONFIG_PATH, window_size):
    """Build the dict locating the trained weights for one window size.

    Args:
        CONFIG_PATH: Folder of a configuration family.
        window_size: Window size; its ``str()`` form names the sub-folder.

    Returns:
        A dict with the single key ``'weights_folder'`` pointing at
        ``<CONFIG_PATH>/config_<window_size>/trained_model_weights``.
    """
    weights_subdir = f'config_{window_size!s}/trained_model_weights'
    return {'weights_folder': os.path.join(CONFIG_PATH, weights_subdir)}


## Load data

In [None]:
# CSV tables for the train and test splits; the first CSV column becomes the index.
df_x_train, df_x_test = (
    pd.read_csv(museum_sequence_path[key], index_col=0)
    for key in ('x_train', 'x_test')
)

# Code matrices stored as .npy files: artwork codes first, then artist codes.
(
    x_train_matrix,
    x_test_matrix,
    artist_code_train_matrix,
    artist_code_test_matrix,
) = (
    np.load(museum_sequence_path[key])
    for key in (
        'x_train_matrix',
        'x_test_matrix',
        'artist_code_train_matrix',
        'artist_code_test_matrix',
    )
)
df_x_train.head()

In [None]:
x_train_matrix.shape

**Reduce artist code matrix**

In [None]:
# Collapse axis 1 of each artist code matrix by averaging over it.
artist_code_train_matrix = artist_code_train_matrix.mean(axis=1)
artist_code_test_matrix = artist_code_test_matrix.mean(axis=1)

## Reset Tensorflow session

In [None]:
tf.keras.backend.clear_session()

## Config data to fit with the model input

**Define timeline**

In [None]:
timeline = np.arange(x_test_matrix.shape[0])
timeline.shape

In [None]:
# Series fed to the models in this notebook: the test matrix.
X = x_test_matrix

# Number of rows of the training matrix; handed to the models as split_time.
split_time = x_train_matrix.shape[0]

# Length of the history window.
window_size = window_index

# Number of artwork features (columns of X).
n_features = X.shape[1]

# Number of features to take into account.
n_influence_features = 10

# Batching and shuffling settings passed to the models.
batch_size, shuffle_buffer_size = 128, 300

## Create and Load model

**Get artwork's models**

In [None]:
from Sequence_prediction_factory import Sequence_prediction_multivariate, Sequence_prediction_univariate

In [None]:
# Univariate sequence predictor from the project-local Sequence_prediction_factory module,
# configured with the univariate configuration folder.
# NOTE(review): X is the test matrix while split_time is the row count of the training
# matrix — confirm how the class uses split_time on this input.
model_univariate = Sequence_prediction_univariate(
    X=X, 
    shuffle_buffer_size=shuffle_buffer_size, 
    split_time=split_time, 
    train_batch_size=batch_size, 
    val_batch_size=batch_size,
    CONFIG_PATH=UNI_CONFIG_PATH)

In [None]:
# Multivariate sequence predictor from the project-local Sequence_prediction_factory module,
# configured with the multivariate configuration folder and n_influence_features.
# NOTE(review): X is the test matrix while split_time is the row count of the training
# matrix — confirm how the class uses split_time on this input.
model_multivariate = Sequence_prediction_multivariate(
    X=X, 
    shuffle_buffer_size=shuffle_buffer_size, 
    split_time=split_time, 
    train_batch_size=batch_size, 
    val_batch_size=batch_size,  
    n_influence_features=n_influence_features,
    CONFIG_PATH=MULTI_CONFIG_PATH)

In [None]:
model_prediction = model_univariate

In [None]:
# Number of artwork features (columns of X).
n_features = X.shape[1]
# Configure the selected predictor with the window size, fetch its model wrapper
# and define the network with the given filter sizes and a prediction length of 1.
model_prediction.set_window_size(window_size)
model = model_prediction.get_model()
model.define_model(conv_filter=16, lstm_filter=32, dense_filter=16, prediction_length=1)
# model.get_model() exposes the object whose summary() is printed here
# (presumably the underlying Keras model — confirm in Sequence_prediction_factory).
model.get_model().summary()

**Get artist model**

In [None]:
from Prediction_artist_model import  Prediction_artist_model

In [None]:
# Artist-code predictor from the project-local Prediction_artist_model module, built on the
# reduced artist code matrix of the test split with the same window and batch settings
# as the artwork models.
# NOTE(review): split_time is the row count of the training matrix while X is test data —
# confirm how the class uses it.
artist_model = Prediction_artist_model(
            X=artist_code_test_matrix,
            split_time=split_time,
            train_batch_size=batch_size, 
            val_batch_size=batch_size, 
            window_size=window_size, 
            shuffle_buffer=shuffle_buffer_size)

In [None]:
artist_model.define_model(conv_filter=20, lstm_filter=40, dense_filter=20, prediction_length=1)
artist_model.get_model().summary()

## Predict tour

In [None]:
from utils_plot import plot_series, plot_train_history, plot_prediction

**Select a tour**

In [None]:
df_x_test['tour_path'].values[0]

In [None]:
images_path = os.path.join(df_x_test['tour_path'].values[0], 'images')
tour_length = len(os.listdir(images_path))
X_tour = x_test_matrix[:tour_length]
X_tour.shape

In [None]:
metadata_path = os.path.join(df_x_test['tour_path'].values[0], 'metadata.csv')
df_X_tour = pd.read_csv(metadata_path)
df_X_tour.head()

**Function to predict the tour**

In [None]:
def model_forecast(model, series, window_size, batch_size):
    """Run ``model`` over every sliding window of ``series``.

    Args:
        model: Object exposing ``predict`` that accepts a ``tf.data.Dataset``.
        series: Array-like sliced along its first axis. A 1-D input gets a
            trailing axis of size 1 so every step is a 1-element vector.
        window_size: Number of consecutive steps in each window.
        batch_size: Number of windows per batch handed to ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns for the batched windows. Windows
        advance one step at a time and incomplete trailing windows are
        dropped, so a series with at least ``window_size`` steps yields
        ``len(series) - window_size + 1`` windows.
    """
    if len(series.shape) == 1:
        series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_size, shift=1, drop_remainder=True)
    # Each window is itself a dataset; batching it by its full length turns it
    # into a single (window_size, ...) tensor.
    ds = ds.flat_map(lambda w: w.batch(window_size))
    ds = ds.batch(batch_size)
    return model.predict(ds)

### Predict artists

In [None]:
# Restore the artist model weights stored for this window size, then forecast over
# every sliding window of the reduced artist codes of the test split.
artist_model.load_weights(get_trained_weights_path(ARTIST_CODE_CONFIG_PATH, window_size))
rnn_artist_forecast = model_forecast(artist_model.get_model(), artist_code_test_matrix, window_size, batch_size)
# Drop the first window's prediction and keep the last output column.
# NOTE(review): window i covers steps i..i+window_size-1. If the model predicts the step
# right after its window, [:-1] rather than [1:] would line up with
# artist_code_test_matrix[window_size:], which is used as the validation series —
# confirm against the training code.
rnn_artist_forecast = rnn_artist_forecast[1:,-1]
rnn_artist_forecast.shape

In [None]:
# Validation series: the artist codes that follow the first full window.
x_valid = artist_code_test_matrix[window_size:]
plot = plot_series(timeline[window_size:], [(x_valid, 'x_valid'), (rnn_artist_forecast, 'rnn')])
# Title fixed: it was misspelled and said "artwork" although this plot shows artist codes.
plot.title('Forecast artist sequence')

### Predict artworks 

**Predict feature tours**

In [None]:
df_evaluation = pd.DataFrame({'feature' : [],
                             'forecast': [],
                             'x_valid':[],
                             'mae':[]})
df_evaluation

In [None]:
# Fetch the model wrapper and define the network once; only its weights change per feature.
model = model_prediction.get_model()
model.define_model(conv_filter=16, lstm_filter=32, dense_filter=16, prediction_length=1)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
evaluation_rows = []

for feature in range(n_features):

    # Load weights for feature i
    model.set_index(feature)
    model.load_weights(get_trained_weights_path(UNI_CONFIG_PATH, window_size))

    # Columns used as input: the target feature first, then the indexes reported by the model
    x_influence_features = model.get_indexes_features()
    x_influence_features = np.insert(arr=x_influence_features, obj=0, values=int(feature))
    x_feature = X[:, x_influence_features.astype(int)]

    # Predict feature i: drop the first window's prediction, keep the last output column.
    # NOTE(review): if the model predicts the step right after its window, [:-1] would be
    # the slice aligned with x_valid below — confirm against the training code.
    rnn_forecast = model_forecast(model.get_model(), x_feature, window_size, batch_size)
    rnn_forecast = rnn_forecast[1:, -1]

    # Validation series: the target feature after the first full window
    x_valid = x_feature[window_size:, 0]

    # Compute MAE
    mae = tf.keras.metrics.mean_absolute_error(x_valid, rnn_forecast).numpy().mean()

    evaluation_rows.append({
        'feature': feature,
        'forecast': rnn_forecast,
        'x_valid': x_valid,
        'mae': mae,
    })

df_evaluation = pd.DataFrame(evaluation_rows, columns=['feature', 'forecast', 'x_valid', 'mae'])
df_evaluation.head()

In [None]:
df_evaluation['mae'].mean()

**Plot forecast for a single feature**

In [None]:
feature = 4
rnn_forecast = df_evaluation.loc[feature,'forecast']
#discard the first artworks
x_valid = df_evaluation.loc[feature,'x_valid']
timeline = np.arange(rnn_forecast.shape[0])

In [None]:
# Overlay the validation values and the forecast of the selected feature.
plot = plot_series(timeline, [(x_valid, 'x_valid'), (rnn_forecast, 'rnn')])
# Title fixed: it was misspelled as "Forescast".
plot.title('Forecast artwork sequence')
#plot.savefig(os.path.join(PLOT_PATH, 'forescast_sequence_2.png'))
plot.show()

## Find most similar artwork

**Load artwork's repository**

In [None]:
# Folder holding the full artwork/artist repository.
# NOTE: this rebinds BASE_PATH, which earlier pointed at the train/test configuration
# folder; earlier cells that read BASE_PATH will resolve against this folder if they are
# re-run after this cell.
BASE_PATH = '/root/work/datasets/artwork_sequence/'

In [None]:
df_all_metadata = pd.read_csv(os.path.join(BASE_PATH, 'all_metadata.csv'))
all_data_matrix = np.load(os.path.join(BASE_PATH, 'all_code_matrix.npy' ))

df_all_artists = pd.read_csv(os.path.join(BASE_PATH, 'all_artists.csv'))
all_artists_code_matrix = np.load(os.path.join(BASE_PATH, 'all_artists_code_matrix.npy' ))
all_artists_code_matrix_reduce = np.mean(all_artists_code_matrix, axis=1)

In [None]:
print(df_all_metadata.shape)
print(all_data_matrix.shape)

print(df_all_artists.shape)
print(all_artists_code_matrix_reduce.shape)

### Most similar artist

In [None]:
def get_most_similar_artist(p, df_all_artists, all_artists_code_matrix_reduce, n_artists=2):
    """Return the authors whose reduced code is closest to ``p``.

    Args:
        p: Predicted (scalar) artist code.
        df_all_artists: DataFrame with an ``'author'`` column; row ``k``
            must describe the artist at position ``k`` of the code array.
        all_artists_code_matrix_reduce: Array with one reduced code per
            artist.
        n_artists: How many nearest artists to return. Defaults to 2, the
            value that used to be hard-coded.

    Returns:
        A list with the ``'author'`` values of the ``n_artists`` artists
        whose code has the smallest absolute difference to ``p``, nearest
        first.
    """
    # Rank every artist by the absolute distance between its code and p.
    distances = np.abs(all_artists_code_matrix_reduce - p)
    nearest_positions = distances.argsort()[:n_artists]

    return list(df_all_artists.iloc[nearest_positions]['author'].values)
    

In [None]:
predicted_artists = pd.Series(rnn_artist_forecast, name="prediction")
predicted_artists = predicted_artists.to_frame()
predicted_artists.head()

In [None]:
predicted_artists['author'] = predicted_artists['prediction'].apply(get_most_similar_artist, args=(df_all_artists, all_artists_code_matrix_reduce,))
predicted_artists.head()

In [None]:
predicted_artists.shape

### Most similar artwork

**Reconstruct predicted codes**

In [None]:
forescast = df_evaluation['forecast']
forescast.head()

In [None]:
forescast[0].shape

In [None]:
feature_list = list(forescast.values)

In [None]:
forecast_matrix = np.stack(feature_list)
forecast_matrix = forecast_matrix.T
forecast_matrix.shape

**Helper function to avoid duplicated artworks in a recommendation**

In [None]:
def drop_selected_artwork(indexes, df_all_metadata, all_data_matrix):
    """Return copies of the metadata and code matrix without the given rows.

    Neither input is modified: ``DataFrame.drop`` and ``np.delete`` already
    return new objects, so no explicit copy is made beforehand.

    Args:
        indexes: Rows to remove. They are treated as index *labels* for the
            DataFrame and as row *positions* for the matrix, so both stay
            aligned only when ``df_all_metadata`` has the default
            ``0..n-1`` index.
        df_all_metadata: Metadata table, one row per artwork.
        all_data_matrix: Code matrix, one row per artwork.

    Returns:
        ``(df_removed, code_matrix)``: the metadata with the rows dropped and
        its index reset to ``0..m-1``, and the matrix with the rows deleted
        along axis 0.
    """
    # Remove from metadata
    df_removed = df_all_metadata.drop(indexes).reset_index(drop=True)

    # Remove from code matrix
    code_matrix = np.delete(all_data_matrix, indexes, axis=0)

    return df_removed, code_matrix

**Compute cosine similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances,euclidean_distances

def get_predicted_tour(forecast_matrix, all_data_matrix, df_all_metadata, predicted_artists):
    """Pick, for every forecast code, the most similar artwork of the predicted artists.

    Args:
        forecast_matrix: 2-D array with one predicted artwork code per row.
        all_data_matrix: Code matrix of the artwork repository; it is indexed
            with the index values of ``df_all_metadata``, so that index must
            hold valid row positions of this matrix.
        df_all_metadata: Repository metadata with the columns ``'author'``,
            ``'title'``, ``'tour_path'`` and ``'image_url'``.
        predicted_artists: DataFrame whose ``'author'`` column holds, at
            position ``i``, the collection of candidate authors for forecast
            row ``i``.

    Returns:
        ``(df_predicted_tour, predicted_tour_code_list)``: a DataFrame with
        the columns ``title``, ``author``, ``sim_value``, ``tour_path`` and
        ``image_url`` (one row per forecast), and the list of the selected
        artworks' code rows in the same order.

    Notes:
        Selected artworks are not removed from the repository, so the same
        artwork can be recommended more than once.
        NOTE(review): a forecast whose candidate authors have no artwork in
        ``df_all_metadata`` is not handled and will raise.
    """
    columns = ['title', 'author', 'sim_value', 'tour_path', 'image_url']
    # Rows are accumulated in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    predicted_rows = []
    predicted_tour_code_list = []

    for i, forecast_code in enumerate(forecast_matrix):
        # Predicted code as a single-row matrix
        code = forecast_code.reshape((1, -1))

        # Restrict the candidates to the artworks of the predicted artists.
        # Forecast row i is paired with the i-th prediction by position.
        artists = predicted_artists['author'].iloc[i]
        df_artist_work = df_all_metadata[df_all_metadata['author'].isin(artists)]
        artist_work_matrix = all_data_matrix[df_artist_work.index]

        # Cosine similarity between the predicted code and every candidate
        sim_values = cosine_similarity(code, artist_work_matrix).reshape((-1,))

        # Most similar candidate: last position of the ascending sort
        sim_artwork_index = np.argsort(sim_values)[-1]

        artwork = df_artist_work.iloc[sim_artwork_index]
        predicted_rows.append({
            'title': artwork['title'],
            'author': artwork['author'],
            'tour_path': artwork['tour_path'],
            'image_url': artwork['image_url'],
            'sim_value': sim_values[sim_artwork_index],
        })

        # Save predicted artwork's code
        predicted_tour_code_list.append(artist_work_matrix[sim_artwork_index])

    df_predicted_tour = pd.DataFrame(predicted_rows, columns=columns)

    return (df_predicted_tour, predicted_tour_code_list)

In [None]:
df_predicted_tour, predicted_tour_code_list = get_predicted_tour(forecast_matrix, all_data_matrix, df_all_metadata, predicted_artists)
df_predicted_tour.shape

In [None]:
predicted_tour_code_matrix = np.vstack(predicted_tour_code_list)
predicted_tour_code_matrix.shape

### Compute MAE between the predicted tour and the validation tour

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
x_valid_artwork = df_evaluation['x_valid']
x_valid_artwork.head()

In [None]:
artwork_feature_list = list(x_valid_artwork.values)
artwork_matrix = np.stack(artwork_feature_list)
artwork_matrix = artwork_matrix.T
artwork_matrix.shape

In [None]:
np.mean(mean_absolute_error(predicted_tour_code_matrix, artwork_matrix, multioutput='raw_values' ))

In [None]:
np.mean(mean_absolute_error(forecast_matrix.T, artwork_matrix.T, multioutput='raw_values' ))

In [None]:
df_X_tour

In [None]:
df_predicted_tour.head(12)