In [2]:
import pandas as pd
import numpy as np
import unicodedata
import os

In [3]:
BASE_PATH = '/root/work/datasets/'
BASE_SEQUENCE_PATH = '/root/work/datasets/artwork_sequence/'

**Normalize data function**

In [4]:
def normalize_title(title):
    return unicodedata.normalize('NFKD', title.lower()).encode('ASCII', 'ignore').decode('utf8')

## Load data

**Load tours metadata**

In [5]:
df_tours = pd.read_csv(os.path.join(BASE_SEQUENCE_PATH, 'all_metadata.csv'))
df_tours.head()

Unnamed: 0,id,author,data,image_url,title,tour_path
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...


In [8]:
df_tours.shape

(633, 6)

**Load wikiArts metadata**

In [9]:
df_all_data = pd.read_csv(os.path.join(BASE_PATH,'all_data_info.csv'))
df_all_data = df_all_data[['artist', 'date', 'genre', 'style', 'title']]
df_all_data = df_all_data.dropna()
df_all_data.head()

Unnamed: 0,artist,date,genre,style,title
0,Barnett Newman,1955.0,abstract,Color Field Painting,Uriel
1,Barnett Newman,1950.0,abstract,Color Field Painting,Vir Heroicus Sublimis
7,Hiroshige,1838.0,bird-and-flower painting,Ukiyo-e,Small Bird on a Branch of Kaidozakura
8,Barnett Newman,1963.0,abstract,Color Field Painting,Black Fire I
10,Hiroshige,1844.0,bird-and-flower painting,Ukiyo-e,Camellia and Bush Warbler


In [10]:
df_all_data.shape

(75858, 5)

**Normalize data**

In [11]:
df_all_data['artist'] = df_all_data['artist'].apply(normalize_title)
df_all_data['genre'] = df_all_data['genre'].apply(normalize_title)
df_all_data['style'] = df_all_data['style'].apply(normalize_title)
df_all_data['title'] = df_all_data['title'].apply(normalize_title)
df_all_data.head()

Unnamed: 0,artist,date,genre,style,title
0,barnett newman,1955.0,abstract,color field painting,uriel
1,barnett newman,1950.0,abstract,color field painting,vir heroicus sublimis
7,hiroshige,1838.0,bird-and-flower painting,ukiyo-e,small bird on a branch of kaidozakura
8,barnett newman,1963.0,abstract,color field painting,black fire i
10,hiroshige,1844.0,bird-and-flower painting,ukiyo-e,camellia and bush warbler


In [12]:
df_all_artists = df_all_data['artist'].drop_duplicates()
df_all_artists.shape

(2084,)

In [13]:
df_all_artists.head()

0      barnett newman
7           hiroshige
11    wolfgang paalen
14    ivan aivazovsky
15      paul serusier
Name: artist, dtype: object

## Find artist name in WikiArt

In [14]:
import re

'''
reverse name transform last name, first name -> first name last name
'''


def reverse_name(name):
    
    name_cleaned = re.sub(r'\(.+\)', '', name)
    name_cleaned = re.sub(r' (de|y)( |$)', ' ', name_cleaned)
    #name_cleaned = re.sub(r'(\(|\))', '', name_cleaned)
    name_splitted = name_cleaned.split(', ')
    name_splitted.reverse()
    return ' '.join(name_splitted)
    

In [15]:
df_tours['name reverse'] = df_tours['author'].apply(reverse_name)

df_tours.head()

Unnamed: 0,id,author,data,image_url,title,tour_path,name reverse
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...,theo colenbrander


In [16]:
from itertools import combinations


def name_in_wikiart(name_reverse):
    
    name_splitted = name_reverse.split(' ')
    for n in range(len(name_splitted),0,-1):
        posible_combs = list(combinations(name_splitted, n))
        for c in posible_combs:
            regex = '.*'.join(c)
            regex = re.sub(r'(\(|\))', '', regex)
            wikiarts_names = df_all_artists[df_all_artists.str.contains(regex)].values
            #Find one example
            if len(wikiarts_names) == 1:
                return wikiarts_names[0]
            #Find different examples
            elif len(wikiarts_names) > 1:
                for name in wikiarts_names:
                    if name == name_splitted[0] :
                        return name
                return wikiarts_names[0]
    return 'Anonimous'

In [17]:
df_tours['wikiart name'] = df_tours['name reverse'].apply(name_in_wikiart)
df_tours.head()

Unnamed: 0,id,author,data,image_url,title,tour_path,name reverse,wikiart name
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...,theo colenbrander,theo van rysselberghe


## Find artwork metadata

In [18]:
from itertools import combinations

def get_style_genre_artwork(row, df_all_data, feature):
    
    #Anonimous artist
    if row['wikiart name'] == 'Anonimous':
        return'No value'

    #Get all the artworks related to the artist
    df_metadata_artist = df_all_data[(df_all_data['artist']==row['wikiart name'])]
    
    title = row['title']
    
    title_split = title.split(' ')

    for n in range(len(title_split),0,-1):
        posible_combs = list(combinations(title_split, n))
        for c in posible_combs:
            regex = '.*'.join(c)
            regex = re.sub(r'(\(|\))', '', regex)
            wikiart_feature = df_metadata_artist[df_metadata_artist['title'].str.contains(regex)][feature].values
            if len(wikiart_feature) >0:
                return wikiart_feature[0]
            
    return 'No value'


In [20]:
df_tours['style'] = df_tours.apply(func=get_style_genre_artwork, axis=1, args=[df_all_data, 'style'])
df_tours.head()

Unnamed: 0,id,author,data,image_url,title,tour_path,name reverse,wikiart name,style
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,No value
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...,theo colenbrander,theo van rysselberghe,impressionism
5,7,manufacture de sevres,1908,https://lh3.googleusercontent.com/LK59Nk7jj7dP...,vase,/root/work/datasets/artwork_sequence/rijksmuse...,manufacture sevres,Anonimous,No value
6,8,rene lalique,c. 1897 - c. 1899,https://lh5.ggpht.com/vdDfw4waIjM2hN1z80G2uMvY...,broche in de vorm van een narrenkop,/root/work/datasets/artwork_sequence/rijksmuse...,rene lalique,rene duvillier,lyrical abstraction
7,9,lucien gaillard,c. 1904,https://lh3.ggpht.com/kbDB0ey1eBLjgJwySFSKIkRH...,comb in the form of two dragonflies,/root/work/datasets/artwork_sequence/rijksmuse...,lucien gaillard,Anonimous,No value
8,10,rene lalique,c. 1902 - c. 1903,https://lh6.ggpht.com/aIO2fdxvbJJ58jvLa_5lzhJ2...,haarkam in de vorm van twee takken viburnum,/root/work/datasets/artwork_sequence/rijksmuse...,rene lalique,rene duvillier,lyrical abstraction
9,11,rene lalique,c. 1904 - c. 1906,https://lh3.ggpht.com/NfRVQNTHfMA_9GUMzYGzVGS2...,broche met korenbloemen,/root/work/datasets/artwork_sequence/rijksmuse...,rene lalique,rene duvillier,No value


In [22]:
df_tours['genre'] = df_tours.apply(func=get_style_genre_artwork, axis=1, args=[df_all_data, 'genre'])
df_tours.head()

Unnamed: 0,id,author,data,image_url,title,tour_path,name reverse,wikiart name,style,genre
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance,portrait
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,No value,No value
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance,portrait
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...,n.v. haagsche plateelfabriek rozenburg,jan van eyck,northern renaissance,portrait
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...,theo colenbrander,theo van rysselberghe,impressionism,portrait


In [None]:
df_tours.to_csv(os.path.join())