# Prepare Word Embedding dataset

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Directory holding the train/test split files (X_train.csv, X_test.csv);
# the embedding matrices produced by this notebook are saved here as well.
BASE_PATH = '/root/work/artwork_sequence/train_test_configuration'
# Directory holding the full embedding matrix (all_embedding_matrix.npy)
# and its metadata table (all_metadata.csv).
DATASET_PATH = '/root/work/datasets/artwork_sequence/'

In [None]:
# Central lookup of every input file used below, keyed by a short name.
museum_sequence_path = {
    # Train/test split tables (read later with the first column as index)
    'x_train' : os.path.join(BASE_PATH, 'X_train.csv'),
    'x_test' : os.path.join(BASE_PATH, 'X_test.csv'),
    # Embedding matrix and the metadata table used to look rows up in it
    'all_embedding_matrix' : os.path.join(DATASET_PATH, 'all_embedding_matrix.npy'),
    'all_metadata' : os.path.join(DATASET_PATH, 'all_metadata.csv')
}
# Bare expression: shows the resolved paths as the notebook cell output.
museum_sequence_path

## Load the train/test datasets and the embeddings

In [None]:
# Train/test split tables; index_col=0 uses the first CSV column as the index.
df_x_train = pd.read_csv(museum_sequence_path['x_train'], index_col=0)
df_x_test = pd.read_csv(museum_sequence_path['x_test'], index_col=0)
# Metadata is read with the default integer index. get_embedding uses that
# index to select rows of all_embedding_matrix, so the two are assumed to be
# row-aligned -- TODO confirm against how the .npy file was produced.
df_all_metadata = pd.read_csv(museum_sequence_path['all_metadata'])
all_embedding_matrix = np.load(museum_sequence_path['all_embedding_matrix'])
# Preview the first rows of the training table.
df_x_train.head()

In [None]:
# Preview the first rows of the metadata table.
df_all_metadata.head()

In [None]:
# (rows, columns) of the metadata table.
df_all_metadata.shape

## Build the embedding matrices for the train and test datasets

In [None]:
def get_embedding(row):
    """Return the embedding of the artwork described by ``row``.

    The artwork is looked up in the module-level ``df_all_metadata`` by an
    exact match on both ``author`` and ``title``. The position of the first
    matching metadata row selects the entry of ``all_embedding_matrix`` that
    is returned.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide ``'author'`` and ``'title'`` entries.

    Returns
    -------
    The entry of ``all_embedding_matrix`` at the matched position.

    Raises
    ------
    IndexError
        If no metadata row has the given author and title.
    """
    author, title = row['author'], row['title']
    is_match = (df_all_metadata['author'] == author) & (df_all_metadata['title'] == title)

    # Use the *position* of the match rather than its index label: the
    # embedding matrix is a plain array and can only be indexed positionally.
    positions = np.flatnonzero(is_match.values)
    if positions.size == 0:
        raise IndexError(
            'No metadata entry found for author={!r}, title={!r}'.format(author, title)
        )

    # When several rows match, the first one wins.
    return all_embedding_matrix[positions[0]]

In [None]:
def get_embedding_matrix(df):
    """Build one embedding matrix covering every artwork of every tour in ``df``.

    For each entry of ``df['tour_path']`` the file ``metadata.csv`` inside that
    directory is read, each of its rows is resolved to an embedding with
    ``get_embedding``, and the embeddings are stacked into a per-tour matrix.
    The per-tour matrices are then concatenated along the first axis, in the
    order the tours appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'tour_path'`` column of tour directory paths.

    Returns
    -------
    numpy.ndarray
        Concatenation of the per-tour embedding matrices.

    Raises
    ------
    ValueError
        If ``df`` lists no tours, or a tour's ``metadata.csv`` has no rows.
    """
    tour_matrices = []
    for tour_path in df['tour_path'].values:
        # Each tour directory ships its own metadata file listing its artworks.
        tour_metadata = pd.read_csv(os.path.join(tour_path, 'metadata.csv'))

        # Resolve every artwork of the tour to its embedding vector.
        tour_embeddings = [get_embedding(artwork) for _, artwork in tour_metadata.iterrows()]
        if not tour_embeddings:
            raise ValueError('Tour metadata has no artworks: {}'.format(tour_path))

        # One matrix per tour, one row per artwork.
        tour_matrices.append(np.stack(tour_embeddings))

    if not tour_matrices:
        raise ValueError("No tours to process: 'tour_path' column is empty")

    return np.concatenate(tour_matrices)

In [None]:
# Build the stacked embedding matrices for the tours listed in each split.
embedding_train_matrix = get_embedding_matrix(df_x_train)
embedding_test_matrix = get_embedding_matrix(df_x_test)

**Save embedding matrix**

In [None]:
# Persist both matrices next to the train/test split files. np.save appends
# the '.npy' extension because the given file names do not already have it.
np.save(os.path.join(BASE_PATH, 'embedding_train_matrix' ), embedding_train_matrix)
np.save(os.path.join(BASE_PATH, 'embedding_test_matrix' ), embedding_test_matrix)