In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Root directory of the dataset; its sub-folders are listed as museums further down.
BASE_PATH = '/root/work/datasets/artwork_sequence/'
# Project directory under which the train/test configuration folder is created.
CONFIGURATION_PATH = '/root/work/artwork_sequence/'

## Map tours to unique id

In [None]:
# Assign a unique integer id to every tour folder found under BASE_PATH.
# Layout walked here: BASE_PATH/<museum>/<tour>/
folders_dict = {}
index = 0
# Both listings are sorted: os.listdir returns entries in arbitrary order, and
# the ids must be stable across runs so a saved split keeps pointing at the
# same tours.
for folder in sorted(os.listdir(BASE_PATH)):
    museum_path = os.path.join(BASE_PATH, folder)
    # Only directories are museums; stray files in BASE_PATH are ignored.
    if not os.path.isdir(museum_path):
        continue
    for tour_folder in sorted(os.listdir(museum_path)):
        # Skip hidden entries such as the .ipynb_checkpoints folder.
        if tour_folder.startswith('.'):
            continue
        tour_path = os.path.join(museum_path, tour_folder)
        # A tour is a folder; a stray file here would only break later loads.
        if not os.path.isdir(tour_path):
            continue
        folders_dict[index] = tour_path
        index += 1

print(len(folders_dict))

**Put the data into a DataFrame**

In [None]:
# One-column frame: the row index is the tour id, 'tour_path' is the tour folder.
df_tours = pd.DataFrame.from_dict(folders_dict, orient='index', columns=['tour_path'])
df_tours.head()

In [None]:
# Show the folder registered under tour id 0 (label-based lookup).
df_tours.loc[0, 'tour_path']

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# X is the tour table itself (the same object as df_tours, not a copy).
X = df_tours
# Constant placeholder labels, one per tour, passed as y to train_test_split.
y = [1]*len(X)

In [None]:
# Fix the seed so the partition (and every file saved from it) is reproducible
# across kernel restarts; without it each run yields a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

**Save the train/test split configuration**

In [None]:
# Tag every tour with its partition: ids present in the test split get 'Test',
# all remaining rows get 'Train'.
df_tours['train_test'] = np.where(df_tours.index.isin(X_test.index), 'Test', 'Train')
df_tours.head(10)

In [None]:
# Folder that holds this train/test split configuration.
config_folder = os.path.join(CONFIGURATION_PATH, 'train_test_configuration/config_0')
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(config_folder, exist_ok=True)

In [None]:
# Full tour table including the 'train_test' label column; the row index is not written.
df_tours.to_csv(os.path.join(config_folder, 'config.csv'), index=False)

# The partitions are written with their index: the split shuffles the rows, so
# the index is what ties each row back to its tour id.
X_train.to_csv(os.path.join(config_folder, 'X_train.csv'), index=True)
X_test.to_csv(os.path.join(config_folder, 'X_test.csv'), index=True)

**Save the train and test code matrices**

In [None]:
def get_code_matrix(X):
    """Load and stack the code matrices of the tours listed in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose first column holds the tour folder paths; each folder
        must contain a ``code_matrix.npy`` file.

    Returns
    -------
    numpy.ndarray
        The per-tour matrices concatenated along the first axis, in the row
        order of ``X``.
    """
    # The first value of every row is the tour folder.
    tour_dirs = (row[0] for row in X.values)
    matrices = [
        np.load(os.path.join(tour_dir, 'code_matrix.npy'))
        for tour_dir in tour_dirs
    ]
    return np.concatenate(matrices)

In [None]:
# Stack the code matrices of all training tours and show the resulting shape.
X_train_matrix = get_code_matrix(X_train)
X_train_matrix.shape

In [None]:
# Stack the code matrices of all test tours and show the resulting shape.
X_test_matrix = get_code_matrix(X_test)
X_test_matrix.shape

In [None]:
# Persist both stacked matrices next to the split configuration.
# NOTE: the file names carry no extension; np.save is documented to append '.npy'.
np.save(os.path.join(config_folder, 'X_train_matrix' ), X_train_matrix)
np.save(os.path.join(config_folder, 'X_test_matrix' ), X_test_matrix)

## Merge all data from the tours

In [None]:
# Load the metadata table of every tour and tag each row with its tour folder.
# Layout walked here: BASE_PATH/<museum>/<tour>/metadata.csv
metadata_list = []
# Both listings are sorted: os.listdir returns entries in arbitrary order, and
# the order of metadata_list decides which row survives when duplicates are
# dropped further down (the first occurrence is kept).
for folder in sorted(os.listdir(BASE_PATH)):
    museum_path = os.path.join(BASE_PATH, folder)
    # Only directories are museums; stray files in BASE_PATH are ignored.
    if not os.path.isdir(museum_path):
        continue
    for tour_folder in sorted(os.listdir(museum_path)):
        # Skip hidden entries such as the .ipynb_checkpoints folder.
        if tour_folder.startswith('.'):
            continue
        tour_path = os.path.join(museum_path, tour_folder)
        # A tour is a folder; a stray file here cannot contain metadata.csv.
        if not os.path.isdir(tour_path):
            continue
        df = pd.read_csv(os.path.join(tour_path, 'metadata.csv'))
        df['tour_path'] = tour_path
        metadata_list.append(df)

print(len(metadata_list))


**Mean tour length**

In [None]:
# Number of metadata rows in each tour, followed by their average.
length_tours = [frame.shape[0] for frame in metadata_list]
np.mean(length_tours)

In [None]:
# Directory where the figures produced by this notebook are written.
PLOT_PATH = '/root/work/artwork_sequence/plots'

In [None]:
import matplotlib.pyplot as plt

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_PATH, exist_ok=True)

# Histogram of the number of metadata rows per tour.
plot = pd.Series(length_tours).hist()
plt.title('Tour length distribution')
plt.xticks(np.arange(0, 61, 5))
plt.yticks(np.arange(0, 22, 2))
plt.xlabel('Length')
# Each entry of length_tours is one tour, so the bar heights count tours.
plt.ylabel('Number of tours')
plt.savefig(os.path.join(PLOT_PATH, 'tour_length_hist.png'))

**Merge unique artworks**

In [None]:
# Stack the per-tour metadata frames into a single table.
all_metadata = pd.concat(metadata_list)
all_metadata.head()

In [None]:
# Table size before duplicates are dropped.
all_metadata.shape

In [None]:
# Keep a single row per (author, title) pair so every artwork appears once.
all_metadata = all_metadata.drop_duplicates(subset=['author', 'title'])
all_metadata.head()

In [None]:
# Tour folder recorded on the first remaining row (positional lookup).
all_metadata['tour_path'].iloc[0]

In [None]:
# Table size after duplicates were dropped.
all_metadata.shape

In [None]:
# Write the merged table into BASE_PATH itself, without the row index.
# The museum listing loops only descend into directories, so this file is not
# picked up as a museum on a re-run.
all_metadata.to_csv(os.path.join(BASE_PATH, 'all_metadata.csv'), index=False)