In [1]:
import pandas as pd
import numpy as np
import os

## Load Data

In [2]:
CONFIG_PATH = '/root/work/artwork_sequence/train_test_configuration'
DATASET_PATH = '/root/work/datasets'
CLEAN_DATASET_PATH = os.path.join(DATASET_PATH, 'clean_data')

**Load metadata**

In [3]:
df_artworks = pd.read_csv(os.path.join(DATASET_PATH,'train_mayors_style_encoded_with_url.csv'))
df_artworks.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
0,70944.jpg,Forbidden Land,Surrealism,landscape,1937.0,Wolfgang Paalen,https://uploads6.wikiart.org/images/wolfgang-p...
1,99442.jpg,Storm at sea,Romanticism,marina,1873.0,Ivan Aivazovsky,https://uploads4.wikiart.org/images/ivan-aivaz...
2,28908.jpg,Yachting in the Mediterranean,Realism,genre painting,1896.0,Julius LeBlanc Stewart,https://uploads4.wikiart.org/images/julius-leb...
3,7486.jpg,Death of Eleazer,Romanticism,religious painting,1866.0,Gustave Dore,https://uploads5.wikiart.org/images/gustave-do...
4,35766.jpg,The-Deluge,Romanticism,religious painting,,Gustave Dore,https://uploads3.wikiart.org/images/gustave-do...


In [4]:
df_artworks.shape

(46010, 7)

**Load code matrix**

In [5]:
artwork_code_matrix = np.load(os.path.join(DATASET_PATH, 'train_mayors_style_encode.npy'))
artwork_code_matrix.shape

(46010, 300)

**Load embedding matrix**

In [6]:
artwork_embedding_matrix = np.load(os.path.join(DATASET_PATH, 'train_mayors_style_embedding.npy'))
artwork_embedding_matrix.shape

(46010, 100)

**Load artist code matrix**

In [7]:
artist_code_matrix = np.load(os.path.join(DATASET_PATH, 'train_mayors_style_artist_code_matrix.npy'))
artist_code_matrix.shape

(46010, 300)

**Load artist data and matrix**

In [8]:
all_artist_data = pd.read_csv(os.path.join(DATASET_PATH, 'all_artists.csv'))
all_artist_data.shape

(1168, 1)

In [9]:
all_artists_matrix = np.load(os.path.join(DATASET_PATH, 'all_artist_code_matrix.npy'))
all_artists_matrix.shape

(1168, 300)

**Load tfidf matrix**

In [10]:
tfidf_matrix = np.load(os.path.join(DATASET_PATH, 'tfidf_matrix.npy'),  allow_pickle = True)
tfidf_matrix = tfidf_matrix.reshape((-1))[0]

tfidf_matrix.shape

(46010, 24068)

### Get artworks by style and sorted by date

**Normalize artist name**

In [11]:
import unicodedata

def normalize_title(title):
    return unicodedata.normalize('NFKD', title.lower()).encode('ASCII', 'ignore').decode('utf8')

**Normalize date**

In [12]:
def normalize_date(dates):
    dates_norm = dates.str.extract(r'(?P<begining>.+)?(?P<date>\d\d\d\d)(?P<end>.+)?')
    return dates_norm['date']
    

**Drop corrupt data**

In [13]:
def drop_corrupt_data(df):
     #Drop NaN date
    df = df.dropna(subset=['date'])
    #Drop NaN image url
    df = df.dropna(subset=['imageUrl'])
    
    return df
    

In [14]:
df_artworks_clean = drop_corrupt_data(df_artworks)
#Normalize artist name
df_artworks_clean['artist'] = df_artworks_clean['artist'].apply(normalize_title)
#Normalize date
df_artworks_clean['date'] = normalize_date(df_artworks_clean['date'])

df_artworks_clean.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
0,70944.jpg,Forbidden Land,Surrealism,landscape,1937,wolfgang paalen,https://uploads6.wikiart.org/images/wolfgang-p...
1,99442.jpg,Storm at sea,Romanticism,marina,1873,ivan aivazovsky,https://uploads4.wikiart.org/images/ivan-aivaz...
2,28908.jpg,Yachting in the Mediterranean,Realism,genre painting,1896,julius leblanc stewart,https://uploads4.wikiart.org/images/julius-leb...
3,7486.jpg,Death of Eleazer,Romanticism,religious painting,1866,gustave dore,https://uploads5.wikiart.org/images/gustave-do...
5,28363.jpg,Still Life with Flowers and Gold Cups of Honor,Baroque,still life,1612,clara peeters,https://uploads0.wikiart.org/images/clara-peet...


In [15]:
df_artworks_clean.shape

(30862, 7)

**Look for unique artists**

In [16]:
all_artist_data['author'] = all_artist_data['author'].apply(normalize_title)
all_artist_data.head()

Unnamed: 0,author
0,wolfgang paalen
1,ivan aivazovsky
2,julius leblanc stewart
3,gustave dore
4,clara peeters


In [17]:
artist_clean_list = list(df_artworks_clean['artist'].unique())

In [18]:
all_artist_data = all_artist_data[all_artist_data['author'].isin(artist_clean_list)]
all_artist_data.shape

(845, 1)

In [19]:
all_artists_matrix = all_artists_matrix[all_artist_data.index]
all_artists_matrix.shape

(845, 300)

**Save clean data**

In [20]:
code_matrix_clean = artwork_code_matrix[df_artworks_clean.index]
code_matrix_clean.shape

(30862, 300)

In [21]:
embedding_matrix_clean = artwork_embedding_matrix[df_artworks_clean.index]
embedding_matrix_clean.shape

(30862, 100)

In [22]:
artist_code_matrix_clean = artist_code_matrix[df_artworks_clean.index]
artist_code_matrix_clean.shape

(30862, 300)

In [24]:
artist_code_matrix_clean = np.mean(artist_code_matrix_clean, axis=1)
artist_code_matrix_clean.shape

IndexError: tuple index out of range

In [25]:
all_artists_matrix = np.mean(all_artists_matrix, axis = 1)
all_artists_matrix.shape

(845,)

In [26]:
tfidf_matrix_clean = tfidf_matrix[df_artworks_clean.index]
tfidf_matrix_clean.shape

(30862, 24068)

In [None]:
df_artworks_clean.to_csv(os.path.join(CLEAN_DATASET_PATH, 'train_mayors_style_encoded_with_url.csv'), index=True)
all_artist_data.to_csv(os.path.join(CLEAN_DATASET_PATH, 'all_artists.csv'))
np.save(os.path.join(CLEAN_DATASET_PATH, 'train_mayors_style_encode'),code_matrix_clean)
np.save(os.path.join(CLEAN_DATASET_PATH, 'train_mayors_style_embedding'),embedding_matrix_clean)
np.save(os.path.join(CLEAN_DATASET_PATH, 'train_mayors_style_artist_code_matrix'),artist_code_matrix_clean)
np.save(os.path.join(CLEAN_DATASET_PATH, 'all_artists_code_matrix'),all_artists_matrix)

#np.save(os.path.join(CLEAN_DATASET_PATH, 'tfidf_matrix'),tfidf_matrix_clean)

### Group data by style an period

**Get artworks by style**

In [27]:
df_style_grouped = df_artworks_clean.groupby('style')

**Get genres presents in all periods**

In [28]:
df_grouped = df_artworks_clean.groupby('style')
genres_set = []

for name, group in df_grouped:
    genres_set.append(set(group['genre'].unique()))
    
ref_set = genres_set[0]

for gen_set in genres_set:
    ref_set = ref_set.intersection(gen_set)

common_genres = np.array(list(ref_set))
common_genres

array(['illustration', 'still life', 'self-portrait', 'cityscape',
       'mythological painting', 'genre painting', 'history painting',
       'landscape', 'literary painting', 'flower painting', 'portrait',
       'sketch and study', 'religious painting', 'nude painting (nu)',
       'interior', 'animal painting', 'marina', 'allegorical painting'],
      dtype='<U21')

## Generate random samples

In [29]:
samples_tour_list = []

for i in range(1000):
    df_tour_lists = []
    #choose a genre
    genre = np.random.choice(common_genres, 1)[0]
    for name, group in df_style_grouped:
        group_genre = group[group['genre'] == genre]
        df_tour_lists.append(group_genre.sample(3, replace=True))
        
    df_tours = pd.concat(df_tour_lists)
    df_tours = df_tours.sort_values(by=['date'])
    samples_tour_list.append(df_tours)

df_style_tours = pd.concat(samples_tour_list)
df_style_tours.shape

(27000, 7)

### Get artwork's code and embedding

In [30]:
matrix_sorted = artwork_code_matrix[df_style_tours.index]
matrix_sorted.shape

(27000, 300)

In [31]:
embedding_matrix_sorted = artwork_embedding_matrix[df_style_tours.index]
embedding_matrix_sorted.shape

(27000, 100)

In [32]:
artist_code_matrix_sorted = artist_code_matrix[df_style_tours.index]
artist_code_matrix_sorted.shape

(27000, 300)

### Save data

In [None]:
df_tours.to_csv(os.path.join(CONFIG_PATH, 'style_tours.csv'), index=True)

In [None]:
np.save(os.path.join(CONFIG_PATH, 'style_tours_matrix'),matrix_sorted)

In [None]:
np.save(os.path.join(CONFIG_PATH, 'style_tours_embedding_matrix'),embedding_matrix_sorted)

In [None]:
np.save(os.path.join(CONFIG_PATH, 'style_tours_artist_code_matrix'),artist_code_matrix_sorted)