In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
BASE_PATH = '/root/work/datasets/artwork_sequence/'
CONFIGURATION_PATH = '/root/work/artwork_sequence/'

## Map tours to unique id

In [3]:
folders_dict = {}
index = 0
#List museums
for folder in os.listdir(BASE_PATH):
    museum_path = os.path.join(BASE_PATH, folder)
    #List tours inside the museum
    if os.path.isdir(museum_path):
        for tour_folder in os.listdir(museum_path):
            #skip .ipynb checkpoint folder
            if (not tour_folder.startswith('.')) or (not os):

                tour_path = os.path.join(museum_path, tour_folder)
                folders_dict[index] = tour_path
                index +=1

print(len(folders_dict))

52


**Put data into a Dataframe**

In [4]:
df_tours = pd.DataFrame.from_dict(folders_dict, orient='index', columns=['tour_path'])
df_tours.head()

Unnamed: 0,tour_path
0,/root/work/datasets/artwork_sequence/rijksmuse...
1,/root/work/datasets/artwork_sequence/rijksmuse...
2,/root/work/datasets/artwork_sequence/rijksmuse...
3,/root/work/datasets/artwork_sequence/rijksmuse...
4,/root/work/datasets/artwork_sequence/rijksmuse...


In [5]:
df_tours['tour_path'][0]

'/root/work/datasets/artwork_sequence/rijksmuseum_crawler/tour_21'

## Generate K-fold validation data

In [6]:
from sklearn.model_selection import KFold

In [7]:
X = df_tours
y = [1]*len(X)

In [8]:
kf = KFold(n_splits=10)

In [9]:
kf.get_n_splits(X)

10

**Save k-fold data configuration**

In [10]:
config_folder = os.path.join(CONFIGURATION_PATH, 'kfold')
if not os.path.exists(config_folder):
    os.makedirs(config_folder)

In [11]:
def get_code_matrix(X):
    X_list = []
    for path in X.values:
        matrix_path = os.path.join(path[0], 'code_matrix.npy')
        code_matrix = np.load(matrix_path)
        X_list.append(code_matrix)

    X_matrix = np.concatenate(X_list)    
    return X_matrix

In [12]:
def save_data_configuration(df, train_indexes, test_indexes, path):
    
    #Define train and test data
    X_train = df.loc[train_indexes]
    X_test = df.loc[test_indexes]
    
    #X_train_matrix
    X_train_matrix = get_code_matrix(X_train)
    X_test_matrix = get_code_matrix(X_test)

    #Save data
    X_train.to_csv(os.path.join(path, 'X_train.csv'), index=True)
    X_test.to_csv(os.path.join(path, 'X_test.csv'), index=True)
    
    np.save(os.path.join(path, 'X_train_matrix' ), X_train_matrix)
    np.save(os.path.join(path, 'X_test_matrix' ), X_test_matrix)

In [13]:
k = 0
for train_index, test_index in kf.split(X):
    #Create folder
    k_folder_path = os.path.join(config_folder, 'folder_' + str(k))
    if not os.path.exists(k_folder_path):
        os.makedirs(k_folder_path)
    
    #Save data
    save_data_configuration(X, train_index, test_index, k_folder_path)
    k += 1 
    

## Test data generation

In [15]:
## Define path ##
k_folder_path = os.path.join(config_folder, 'folder_' + str(0))

In [17]:
df_x_train = pd.read_csv(os.path.join(k_folder_path, 'X_train.csv'), index_col=0)
df_x_test = pd.read_csv(os.path.join(k_folder_path, 'X_test.csv'), index_col=0)

x_train_matrix = np.load(os.path.join(k_folder_path, 'X_train_matrix.npy'))
x_test_matrix = np.load(os.path.join(k_folder_path, 'X_test_matrix.npy'))

In [23]:
df_x_train.head()

Unnamed: 0,tour_path
6,/root/work/datasets/artwork_sequence/rijksmuse...
7,/root/work/datasets/artwork_sequence/rijksmuse...
8,/root/work/datasets/artwork_sequence/rijksmuse...
9,/root/work/datasets/artwork_sequence/rijksmuse...
10,/root/work/datasets/artwork_sequence/rijksmuse...


In [21]:
df_x_train.shape

(46, 1)

In [22]:
x_train_matrix.shape

(747, 300)