# Prepare artist dataset

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Directory holding the train/test split CSVs; the artist code matrices
# produced by this notebook are saved here as well.
BASE_PATH = '/root/work/artwork_sequence/train_test_configuration'
# Directory holding the artist table and its code matrix.
DATASET_PATH = '/root/work/datasets/artwork_sequence/'

In [None]:
# Every input file this notebook reads, keyed by a short name.
museum_sequence_path = dict(
    x_train=os.path.join(BASE_PATH, 'X_train.csv'),
    x_test=os.path.join(BASE_PATH, 'X_test.csv'),
    all_artists_code_matrix=os.path.join(DATASET_PATH, 'all_artists_code_matrix.npy'),
    all_artists=os.path.join(DATASET_PATH, 'all_artists.csv'),
)
# Echo the mapping so the resolved paths appear in the cell output.
museum_sequence_path

## Load train/test datasets and embeddings

In [None]:
# Both split files are read the same way: their first column becomes the index.
df_x_train, df_x_test = (
    pd.read_csv(museum_sequence_path[split], index_col=0)
    for split in ('x_train', 'x_test')
)

# Artist table and its code matrix. get_artist_code pairs them by using the
# index label of a table row as the row number in the matrix.
df_all_artists = pd.read_csv(museum_sequence_path['all_artists'])
all_artists_code_matrix = np.load(museum_sequence_path['all_artists_code_matrix'])

# Preview the first rows of the training split.
df_x_train.head()

In [None]:
# Preview the first rows of the artist table.
df_all_artists.head()

## Define artist code matrix

In [None]:
def get_artist_code(row, df_all_artists, all_artists_code_matrix):
    """Return the code of the artist named in ``row['author']``.

    The author is searched in ``df_all_artists['author']``; the index label of
    the first matching row is then used as the row number in
    ``all_artists_code_matrix``.

    NOTE(review): this pairing is only right when the index labels of
    ``df_all_artists`` are the row positions of the matrix (true for the
    default index ``pd.read_csv`` creates) -- confirm for other callers.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an ``'author'`` key.
        df_all_artists: DataFrame with an ``'author'`` column.
        all_artists_code_matrix: Array indexed by the labels of
            ``df_all_artists``.

    Returns:
        The entry of ``all_artists_code_matrix`` selected for the author.

    Raises:
        IndexError: If the author does not appear in ``df_all_artists``.
    """
    author = row['author']
    is_author = (df_all_artists['author'] == author).values
    matching_labels = df_all_artists.index[is_author]

    # Fail with the offending name instead of NumPy's generic
    # "index 0 is out of bounds" message; the exception type is unchanged.
    if len(matching_labels) == 0:
        raise IndexError(
            "author {!r} not found in df_all_artists".format(author)
        )

    # When an author is listed several times, the first listing wins.
    return all_artists_code_matrix[matching_labels[0]]

In [None]:
def get_artist_matrix(df, df_all_artists, all_artists_code_matrix):
    """Build the artist-code matrix for every artwork of every tour in ``df``.

    For each directory listed in ``df['tour_path']`` the file ``metadata.csv``
    inside it is read, and every row of that file is mapped to an artist code
    with ``get_artist_code``. Codes are kept in tour order and, within a tour,
    in the row order of its metadata file.

    Args:
        df: DataFrame with a ``'tour_path'`` column of tour directories.
        df_all_artists: Artist table forwarded to ``get_artist_code``.
        all_artists_code_matrix: Artist code matrix forwarded to
            ``get_artist_code``.

    Returns:
        Array with one entry per artwork across all tours, stacked along the
        first axis. When no tour contributes any artwork, an empty array of
        shape ``(0,) + all_artists_code_matrix.shape[1:]``.

    Raises:
        IndexError: Propagated from ``get_artist_code`` when an artwork's
            author is missing from ``df_all_artists``.
    """
    artist_codes = []
    for tour_path in df['tour_path']:
        # Get the metadata of the tour. It is bound to its own name so the
        # ``df`` argument is never overwritten inside the loop.
        tour_metadata = pd.read_csv(os.path.join(tour_path, 'metadata.csv'))

        # Find the artist code for each artwork. Collecting the codes directly
        # avoids storing arrays in DataFrame cells through ``apply``.
        artist_codes.extend(
            get_artist_code(artwork, df_all_artists, all_artists_code_matrix)
            for _, artwork in tour_metadata.iterrows()
        )

    if not artist_codes:
        # np.stack rejects an empty list; return a well-formed empty matrix
        # with the same trailing shape and dtype as the code matrix.
        code_matrix = np.asarray(all_artists_code_matrix)
        return np.empty((0,) + code_matrix.shape[1:], dtype=code_matrix.dtype)

    # Stacking all codes at once equals stacking per tour and concatenating.
    return np.stack(artist_codes)

In [None]:
# Encode both splits against the same artist table and code matrix.
artist_code_train_matrix = get_artist_matrix(
    df=df_x_train,
    df_all_artists=df_all_artists,
    all_artists_code_matrix=all_artists_code_matrix,
)
artist_code_test_matrix = get_artist_matrix(
    df=df_x_test,
    df_all_artists=df_all_artists,
    all_artists_code_matrix=all_artists_code_matrix,
)

In [None]:
# Show the dimensions of the training artist-code matrix.
artist_code_train_matrix.shape

**Save artist codes**

In [None]:
# The target names carry no extension, so np.save appends ".npy" to each.
np.save(
    file=os.path.join(BASE_PATH, 'artist_code_train_matrix'),
    arr=artist_code_train_matrix,
)
np.save(
    file=os.path.join(BASE_PATH, 'artist_code_test_matrix'),
    arr=artist_code_test_matrix,
)