In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Dataset root; it is listed below as <museum>/<tour> folders.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
BASE_PATH = '/root/work/datasets/artwork_sequence/'
# Root under which the 'kfold' configuration folder is created.
CONFIGURATION_PATH = '/root/work/artwork_sequence/'

## Map tours to unique ids

In [None]:
# Map every tour folder to a unique integer id: {id: tour_path}.
# os.listdir returns entries in arbitrary order, so both listings are sorted
# to keep the ids (and the K-fold splits built on them) stable across runs.
folders_dict = {}
index = 0
# List museums
for folder in sorted(os.listdir(BASE_PATH)):
    museum_path = os.path.join(BASE_PATH, folder)
    # Only directories can hold tours; ignore stray files at the museum level.
    if not os.path.isdir(museum_path):
        continue
    # List tours inside the museum
    for tour_folder in sorted(os.listdir(museum_path)):
        # Skip hidden entries such as the .ipynb_checkpoints folder
        if tour_folder.startswith('.'):
            continue
        tour_path = os.path.join(museum_path, tour_folder)
        # A tour is a folder (its code_matrix.npy is read from inside it later),
        # so plain files must not be registered as tours.
        if not os.path.isdir(tour_path):
            continue
        folders_dict[index] = tour_path
        index += 1

print(len(folders_dict))

**Put data into a DataFrame**

In [None]:
# One row per tour: the index is the tour id, 'tour_path' holds its folder.
df_tours = pd.DataFrame.from_dict(folders_dict, orient='index', columns=['tour_path'])
df_tours.head()

In [None]:
# Show the folder registered under tour id 0 (single label-based lookup).
df_tours.loc[0, 'tour_path']

## Generate K-fold validation data

In [None]:
from sklearn.model_selection import KFold

In [None]:
# The samples to split are the tours themselves; y is a constant placeholder
# label list with one entry per tour.
X = df_tours
y = [1 for _ in range(len(X))]

In [None]:
# 10-fold splitter over the tours.
# NOTE(review): shuffle is left at its default, so folds are presumably
# contiguous blocks in row order - confirm this is intended, otherwise pass
# shuffle=True together with a fixed random_state.
kf = KFold(n_splits=10)

In [None]:
# Display the number of folds the splitter will produce.
kf.get_n_splits(X)

**Save k-fold data configuration**

In [None]:
# Folder that receives one sub-folder per fold.
config_folder = os.path.join(CONFIGURATION_PATH, 'kfold')
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, so the cell can be re-run without a separate existence check.
os.makedirs(config_folder, exist_ok=True)

In [None]:
def get_code_matrix(X):
    """Load and stack the code matrix of every tour listed in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose first column holds tour folder paths; each folder must
        contain a ``code_matrix.npy`` file.

    Returns
    -------
    numpy.ndarray
        The per-tour matrices concatenated along the first axis, in row order.
    """
    # Only the first column of each row is used: it is the tour folder.
    tour_dirs = (row[0] for row in X.values)
    matrices = [
        np.load(os.path.join(tour_dir, 'code_matrix.npy'))
        for tour_dir in tour_dirs
    ]
    return np.concatenate(matrices)

In [None]:
def save_data_configuration(df, train_indexes, test_indexes, path):
    """Persist one train/test split of ``df`` into the folder ``path``.

    Four files are written: ``X_train.csv`` and ``X_test.csv`` (the selected
    rows, index included) plus ``X_train_matrix.npy`` and
    ``X_test_matrix.npy`` (the arrays returned by ``get_code_matrix`` for
    each subset).

    Parameters
    ----------
    df : pandas.DataFrame
        Table of tours, in the layout ``get_code_matrix`` expects.
    train_indexes, test_indexes : array-like of int
        Row positions of the train and test subsets, as yielded by
        ``KFold.split``.
    path : str
        Existing folder in which the files are written.
    """
    # KFold.split yields row positions, not index labels, so select with
    # .iloc; .loc only gave the same rows while the index was exactly 0..n-1.
    X_train = df.iloc[train_indexes]
    X_test = df.iloc[test_indexes]

    # Stack the code matrices of each subset.
    X_train_matrix = get_code_matrix(X_train)
    X_test_matrix = get_code_matrix(X_test)

    # Save the tour tables, keeping the index so tour ids survive the round trip.
    X_train.to_csv(os.path.join(path, 'X_train.csv'), index=True)
    X_test.to_csv(os.path.join(path, 'X_test.csv'), index=True)

    # np.save appends the '.npy' extension to these names.
    np.save(os.path.join(path, 'X_train_matrix'), X_train_matrix)
    np.save(os.path.join(path, 'X_test_matrix'), X_test_matrix)

In [None]:
# Write one configuration folder per fold: <config_folder>/folder_<k>.
for k, (train_index, test_index) in enumerate(kf.split(X)):
    # Create the fold folder; exist_ok keeps the cell safe to re-run.
    k_folder_path = os.path.join(config_folder, 'folder_' + str(k))
    os.makedirs(k_folder_path, exist_ok=True)

    # Save the train/test tables and matrices of this fold.
    save_data_configuration(X, train_index, test_index, k_folder_path)
    

## Test the data generation (reload fold 0)

In [None]:
## Define path ##
# Point at the first fold's folder (folder_0) so its saved files can be inspected.
k_folder_path = os.path.join(config_folder, 'folder_' + str(0))

In [None]:
# Reload the tour tables; index_col=0 restores the index column that was
# written alongside the data.
df_x_train = pd.read_csv(os.path.join(k_folder_path, 'X_train.csv'), index_col=0)
df_x_test = pd.read_csv(os.path.join(k_folder_path, 'X_test.csv'), index_col=0)

# Reload the stacked code matrices saved for the same fold.
x_train_matrix = np.load(os.path.join(k_folder_path, 'X_train_matrix.npy'))
x_test_matrix = np.load(os.path.join(k_folder_path, 'X_test_matrix.npy'))

In [None]:
# Preview the first rows of the reloaded training table.
df_x_train.head()

In [None]:
# (rows, columns) of the reloaded training table.
df_x_train.shape

In [None]:
# Shape of the stacked training code matrix.
x_train_matrix.shape