In [None]:
import pandas as pd
import numpy as np
import unicodedata
import os

In [None]:
# Root folder that holds every dataset used by this notebook.
BASE_PATH = '/root/work/datasets/'
# Folder with the artwork-sequence (tours) files, located under BASE_PATH.
BASE_SEQUENCE_PATH = os.path.join(BASE_PATH, 'artwork_sequence/')

**Normalize data function**

In [None]:
def normalize_title(title):
    """Return ``title`` lower-cased and reduced to plain ASCII characters.

    The text is lower-cased, decomposed with Unicode NFKD so that accented
    letters split into a base letter plus combining marks, and every
    character that cannot be encoded as ASCII is then discarded.

    Parameters
    ----------
    title : str
        Text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    decomposed = unicodedata.normalize('NFKD', title.lower())
    # 'ignore' silently drops the combining marks and any other non-ASCII character.
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf8')

## Load data

**Load tours metadata**

In [None]:
# Read the tours metadata CSV from the artwork-sequence folder and preview it.
tours_metadata_path = os.path.join(BASE_SEQUENCE_PATH, 'all_metadata.csv')
df_tours = pd.read_csv(tours_metadata_path)
df_tours.head()

In [None]:
# (rows, columns) of the tours metadata.
df_tours.shape

**Load WikiArt metadata**

In [None]:
# Read the WikiArt metadata, keep only the columns used below and discard
# every row that has a missing value in any of them.
wikiart_columns = ['artist', 'date', 'genre', 'style', 'title']
df_all_data = (
    pd.read_csv(os.path.join(BASE_PATH, 'all_data_info.csv'))[wikiart_columns]
    .dropna()
)
df_all_data.head()

In [None]:
# (rows, columns) left after the column selection and dropna.
df_all_data.shape

**Normalize data**

In [None]:
# Apply the same text normalization to every text column used for matching.
for text_column in ('artist', 'genre', 'style', 'title'):
    df_all_data[text_column] = df_all_data[text_column].apply(normalize_title)
df_all_data.head()

In [None]:
# Series with one entry per distinct normalized artist name (first occurrence kept).
df_all_artists = df_all_data['artist'].drop_duplicates()
df_all_artists.shape

In [None]:
# Preview the first unique artist names.
df_all_artists.head()

## Find artist name in WikiArt

In [None]:
import re

def reverse_name(name):
    """Turn ``"last name, first name"`` into ``"first name last name"``.

    Steps:

    1. every parenthesised remark is removed (each one on its own, so text
       sitting between two remarks is kept);
    2. the standalone particles ``de`` and ``y`` are dropped;
    3. the comma-separated parts are reversed and joined with single spaces.

    Parameters
    ----------
    name : str
        Author name, typically written as ``"last name, first name"``.

    Returns
    -------
    str
        The reordered name without leading, trailing or repeated spaces.
        An empty string is returned when ``name`` is not a string (for
        example a missing value read from a CSV file).
    """
    if not isinstance(name, str):
        return ''

    # Non-greedy by construction: "[^)]*" cannot run past the closing parenthesis.
    without_remarks = re.sub(r'\([^)]*\)', '', name)
    without_particles = re.sub(r' (?:de|y)( |$)', ' ', without_remarks)

    parts = [part.strip() for part in without_particles.split(',')]
    reordered = ' '.join(reversed(parts))
    # Collapse runs of whitespace so later token splitting never sees empty tokens.
    return ' '.join(reordered.split())
    

In [None]:
# Store the reordered author name of each tour row in a new 'name reverse' column.
df_tours['name reverse'] = df_tours['author'].apply(reverse_name)

df_tours.head()

In [None]:
from itertools import combinations


def name_in_wikiart(name_reverse, artists=None):
    """Find the WikiArt artist name that best matches ``name_reverse``.

    The name is split into whitespace-separated tokens. Starting with all
    tokens and then with ever smaller subsets (token order preserved), the
    tokens are joined with ``.*`` and searched in the artist names. The
    first subset that matches at least one artist decides the result.

    Tokens are matched literally (regex metacharacters are escaped) and the
    search is case-sensitive, so both sides should be normalized the same way.

    Parameters
    ----------
    name_reverse : str
        Artist name written as "first name last name".
    artists : pandas.Series of str, optional
        Candidate artist names. Defaults to the notebook-level
        ``df_all_artists`` series.

    Returns
    -------
    str
        The matching artist name, or ``'anonimous'`` when nothing matches or
        ``name_reverse`` holds no usable token. When several artists match,
        the one equal to the first token is preferred, otherwise the first
        match is returned.

    Notes
    -----
    Up to ``2**n - 1`` scans of ``artists`` are performed for a name with
    ``n`` tokens.
    """
    if artists is None:
        # Backward-compatible default: the series built earlier in the notebook.
        artists = df_all_artists

    if not isinstance(name_reverse, str):
        return 'anonimous'

    # Strip parentheses and drop empty tokens: an empty token would produce
    # an empty pattern, which matches every artist.
    tokens = [re.sub(r'[()]', '', token) for token in name_reverse.split()]
    tokens = [token for token in tokens if token]
    if not tokens:
        return 'anonimous'

    for size in range(len(tokens), 0, -1):
        for subset in combinations(tokens, size):
            # Escape so characters such as '?', '+' or '[' are taken literally.
            pattern = '.*'.join(re.escape(token) for token in subset)
            matches = artists[artists.str.contains(pattern, na=False)].values
            if len(matches) == 0:
                continue
            if len(matches) > 1:
                # Several candidates: prefer the one equal to the first token.
                for candidate in matches:
                    if candidate == tokens[0]:
                        return candidate
            return matches[0]
    return 'anonimous'

In [None]:
# Map every reordered author name to its WikiArt artist name ('anonimous' when none is found).
df_tours['wikiart name'] = df_tours['name reverse'].apply(name_in_wikiart)
df_tours.head()

## Find artwork metadata

In [None]:
from itertools import combinations

def get_style_genre_artwork(row, df_all_data, feature):
    """Look up a WikiArt feature (e.g. style or genre) for one tour artwork.

    Only artworks of the row's WikiArt artist are considered. The title is
    split into whitespace-separated tokens. Starting with all tokens and
    then with ever smaller subsets (token order preserved), the tokens are
    joined with ``.*`` and searched in the artist's artwork titles. The
    first artwork found supplies the value.

    Tokens are matched literally (regex metacharacters are escaped) and the
    search is case-sensitive.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'wikiart name'`` and ``'title'``.
    df_all_data : pandas.DataFrame
        WikiArt metadata with at least the columns ``'artist'``,
        ``'title'`` and ``feature``.
    feature : str
        Name of the column whose value is returned.

    Returns
    -------
    object
        The value of ``feature`` for the first matching artwork, or
        ``'no value'`` when the artist is ``'anonimous'``, the artist has no
        artworks, the title is not a usable string, or nothing matches.

    Notes
    -----
    Up to ``2**n - 1`` scans of the artist's titles are performed for a
    title with ``n`` tokens.
    """
    # Unknown artist: nothing to look up.
    if row['wikiart name'] == 'anonimous':
        return 'no value'

    title = row['title']
    if not isinstance(title, str):
        return 'no value'

    # Strip parentheses and drop empty tokens: an empty token would produce
    # an empty pattern, which matches every title.
    tokens = [re.sub(r'[()]', '', token) for token in title.split()]
    tokens = [token for token in tokens if token]
    if not tokens:
        return 'no value'

    # All artworks of the artist; skip the combinatorial search when there are none.
    artist_artworks = df_all_data[df_all_data['artist'] == row['wikiart name']]
    if artist_artworks.empty:
        return 'no value'
    artist_titles = artist_artworks['title']

    for size in range(len(tokens), 0, -1):
        for subset in combinations(tokens, size):
            # Escape so characters such as '?', '*' or '[' are taken literally.
            pattern = '.*'.join(re.escape(token) for token in subset)
            matched = artist_titles.str.contains(pattern, na=False)
            values = artist_artworks.loc[matched, feature].values
            if len(values) > 0:
                return values[0]

    return 'no value'


In [None]:
# Row by row, look up the WikiArt style of every tour artwork.
df_tours['style'] = df_tours.apply(
    get_style_genre_artwork,
    axis=1,
    args=(df_all_data, 'style'),
)
df_tours.head()

In [None]:
# Row by row, look up the WikiArt genre of every tour artwork.
df_tours['genre'] = df_tours.apply(
    get_style_genre_artwork,
    axis=1,
    args=(df_all_data, 'genre'),
)
df_tours.head()

In [None]:
# Write the enriched tours metadata next to the input file, without the index column.
enriched_metadata_path = os.path.join(BASE_SEQUENCE_PATH, 'all_metadata_with_style_genre.csv')
df_tours.to_csv(enriched_metadata_path, index=False)