In [1]:
import pandas as pd
import numpy as np
import os

## Load Data

In [2]:
CONFIG_PATH = '/root/work/artwork_sequence/train_test_configuration'
DATASET_PATH = '/root/work/datasets'

**Load metadata**

In [3]:
df_artworks = pd.read_csv(os.path.join(DATASET_PATH,'train_mayors_style_encoded_with_url.csv'))
df_artworks.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
0,70944.jpg,Forbidden Land,Surrealism,landscape,1937.0,Wolfgang Paalen,https://uploads6.wikiart.org/images/wolfgang-p...
1,99442.jpg,Storm at sea,Romanticism,marina,1873.0,Ivan Aivazovsky,https://uploads4.wikiart.org/images/ivan-aivaz...
2,28908.jpg,Yachting in the Mediterranean,Realism,genre painting,1896.0,Julius LeBlanc Stewart,https://uploads4.wikiart.org/images/julius-leb...
3,7486.jpg,Death of Eleazer,Romanticism,religious painting,1866.0,Gustave Dore,https://uploads5.wikiart.org/images/gustave-do...
4,35766.jpg,The-Deluge,Romanticism,religious painting,,Gustave Dore,https://uploads3.wikiart.org/images/gustave-do...


In [4]:
df_artworks.shape

(46010, 7)

**Load code matrix**

In [5]:
artwork_code_matrix = np.load(os.path.join(DATASET_PATH, 'train_mayors_style_encode.npy'))
artwork_code_matrix.shape

(46010, 300)

**Load embedding matrix**

In [6]:
artwork_embedding_matrix = np.load(os.path.join(DATASET_PATH, 'train_mayors_style_embedding.npy'))
artwork_embedding_matrix.shape

(46010, 100)

### Get artworks by style and sorted by date

**Normalize artist name**

In [9]:
import unicodedata

def normalize_title(title):
    return unicodedata.normalize('NFKD', title.lower()).encode('ASCII', 'ignore').decode('utf8')

**Normalize date**

In [10]:
def normalize_date(dates):
    dates_norm = dates.str.extract(r'(?P<begining>.+)?(?P<date>\d\d\d\d)(?P<end>.+)?')
    return dates_norm['date']
    

**Drop corrupt data**

In [11]:
def drop_corrupt_data(df):
     #Drop NaN date
    df = df.dropna(subset=['date'])
    #Drop NaN image url
    df = df.dropna(subset=['imageUrl'])
    
    return df
    

In [12]:
df_artworks_clean = drop_corrupt_data(df_artworks)
#Normalize artist name
df_artworks_clean['artist'] = df_artworks_clean['artist'].apply(normalize_title)
#Normalize date
df_artworks_clean['date'] = normalize_date(df_artworks_clean['date'])

df_artworks_clean.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
0,70944.jpg,Forbidden Land,Surrealism,landscape,1937,wolfgang paalen,https://uploads6.wikiart.org/images/wolfgang-p...
1,99442.jpg,Storm at sea,Romanticism,marina,1873,ivan aivazovsky,https://uploads4.wikiart.org/images/ivan-aivaz...
2,28908.jpg,Yachting in the Mediterranean,Realism,genre painting,1896,julius leblanc stewart,https://uploads4.wikiart.org/images/julius-leb...
3,7486.jpg,Death of Eleazer,Romanticism,religious painting,1866,gustave dore,https://uploads5.wikiart.org/images/gustave-do...
5,28363.jpg,Still Life with Flowers and Gold Cups of Honor,Baroque,still life,1612,clara peeters,https://uploads0.wikiart.org/images/clara-peet...


In [13]:
df_artworks_clean.shape

(30862, 7)

**Get artworks by style**

In [14]:
df_style_grouped = df_artworks_clean.groupby('style')

**Get genres presents in all periods**

In [15]:
df_grouped = df_artworks_clean.groupby('style')
genres_set = []

for name, group in df_grouped:
    genres_set.append(set(group['genre'].unique()))
    
ref_set = genres_set[0]

for gen_set in genres_set:
    ref_set = ref_set.intersection(gen_set)

common_genres = np.array(list(ref_set))
common_genres

array(['flower painting', 'still life', 'interior', 'portrait', 'marina',
       'history painting', 'self-portrait', 'sketch and study',
       'nude painting (nu)', 'allegorical painting',
       'mythological painting', 'religious painting', 'genre painting',
       'animal painting', 'illustration', 'landscape', 'cityscape',
       'literary painting'], dtype='<U21')

## Generate random samples

In [17]:
samples_tour_list = []

for i in range(1000):
    df_tour_lists = []
    #choose a genre
    genre = np.random.choice(common_genres, 1)[0]
    for name, group in df_style_grouped:
        group_genre = group[group['genre'] == genre]
        df_tour_lists.append(group_genre.sample(3, replace=True))
        
    df_tours = pd.concat(df_tour_lists)
    df_tours = df_tours.sort_values(by=['date'])
    samples_tour_list.append(df_tours)

df_style_tours = pd.concat(samples_tour_list)
df_style_tours.shape

(27000, 7)

### Get artwork's code and embedding

In [None]:
matrix_sorted = artwork_code_matrix[df_style_tours.index]
matrix_sorted.shape

In [None]:
embedding_matrix_sorted = artwork_embedding_matrix[df_style_tours.index]
embedding_matrix_sorted.shape

### Save data

In [None]:
df_tours.to_csv(os.path.join(CONFIG_PATH, 'style_tours.csv'), index=True)

In [None]:
np.save(os.path.join(CONFIG_PATH, 'style_tours_matrix'),matrix_sorted)

In [None]:
np.save(os.path.join(CONFIG_PATH, 'style_tours_embedding_matrix'),embedding_matrix_sorted)