In [1]:
import pandas as pd
import numpy as np

## Load Data

**Load metadata**

In [2]:
# Read the artwork metadata CSV into a DataFrame and preview its first rows.
df_artworks = pd.read_csv(
    filepath_or_buffer='/root/work/datasets/train_mayors_style_encoded_with_url.csv'
)
df_artworks.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
0,70944.jpg,Forbidden Land,Surrealism,landscape,1937.0,Wolfgang Paalen,https://uploads6.wikiart.org/images/wolfgang-p...
1,99442.jpg,Storm at sea,Romanticism,marina,1873.0,Ivan Aivazovsky,https://uploads4.wikiart.org/images/ivan-aivaz...
2,28908.jpg,Yachting in the Mediterranean,Realism,genre painting,1896.0,Julius LeBlanc Stewart,https://uploads4.wikiart.org/images/julius-leb...
3,7486.jpg,Death of Eleazer,Romanticism,religious painting,1866.0,Gustave Dore,https://uploads5.wikiart.org/images/gustave-do...
4,35766.jpg,The-Deluge,Romanticism,religious painting,,Gustave Dore,https://uploads3.wikiart.org/images/gustave-do...


In [3]:
# (rows, columns) of the full metadata table.
df_artworks.shape

(46010, 7)

**Load matrix**

In [4]:
# Load the encoded-artwork matrix from its .npy file and report its dimensions.
artwork_code_matrix = np.load(
    file='/root/work/datasets/train_mayors_style_w_encoded.npy'
)
artwork_code_matrix.shape

(46010, 100)

### Get Impressionist artworks

In [5]:
# Keep only the rows whose `style` label is exactly 'Impressionism'.
df_impressionist = df_artworks.loc[df_artworks['style'].eq('Impressionism')]
df_impressionist.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
48,95360.jpg,Interior (The Rape),Impressionism,interior,1869,Edgar Degas,https://uploads5.wikiart.org/images/edgar-dega...
80,49757.jpg,"The Manneport, Rock Arch West of Etretat",Impressionism,landscape,1883,Claude Monet,https://uploads5.wikiart.org/images/claude-mon...
95,53841.jpg,Rodez,Impressionism,cityscape,c.1923,Paul Signac,https://uploads6.wikiart.org/images/paul-signa...
125,81912.jpg,The Look at Montauban in rain,Impressionism,landscape,c.1922,Paul Signac,https://uploads3.wikiart.org/images/paul-signa...
129,17146.jpg,Prayer in the Saint Pierre Cathedral in Geneva,Impressionism,genre painting,1882,Ferdinand Hodler,https://uploads3.wikiart.org/images/ferdinand-...


In [6]:
# (rows, columns) of the Impressionism-only subset.
df_impressionist.shape

(8220, 7)

**NaN values**

In [7]:
# Tally rows with a missing date (True) against rows that have one (False).
pd.isnull(df_impressionist['date']).value_counts()

False    6355
True     1865
Name: date, dtype: int64

In [8]:
# Discard every row that has no value in the `date` column.
df_impressionist = df_impressionist.dropna(axis=0, subset=['date'])
df_impressionist.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
48,95360.jpg,Interior (The Rape),Impressionism,interior,1869,Edgar Degas,https://uploads5.wikiart.org/images/edgar-dega...
80,49757.jpg,"The Manneport, Rock Arch West of Etretat",Impressionism,landscape,1883,Claude Monet,https://uploads5.wikiart.org/images/claude-mon...
95,53841.jpg,Rodez,Impressionism,cityscape,c.1923,Paul Signac,https://uploads6.wikiart.org/images/paul-signa...
125,81912.jpg,The Look at Montauban in rain,Impressionism,landscape,c.1922,Paul Signac,https://uploads3.wikiart.org/images/paul-signa...
129,17146.jpg,Prayer in the Saint Pierre Cathedral in Geneva,Impressionism,genre painting,1882,Ferdinand Hodler,https://uploads3.wikiart.org/images/ferdinand-...


**Normalize artist name**

In [9]:
import unicodedata

def normalize_title(title):
    """Return ``title`` lower-cased and reduced to plain ASCII.

    The text is lower-cased, decomposed with Unicode NFKD, and then every
    character that cannot be encoded as ASCII (including the combining
    accents produced by the decomposition) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', title.lower())
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf8')

In [10]:
# Replace each artist name with its lower-cased, ASCII-only form.
df_impressionist['artist'] = df_impressionist['artist'].map(normalize_title)

**Normalize date**

In [11]:
# Pull the raw date column out on its own for inspection and parsing.
dates = df_impressionist.loc[:, 'date']
dates.head()

48       1869
80       1883
95     c.1923
125    c.1922
129      1882
Name: date, dtype: object

In [12]:
# Pull the year out of free-form date strings such as 'c.1923'.
# The optional leading group is greedy, so `date` captures the right-most run of
# four digits; whatever precedes or follows it lands in `begining` / `end`.
# Values containing no four consecutive digits produce NaN in every column.
# expand=True is passed explicitly so the call always returns a DataFrame and does
# not rely on the version-dependent default (a FutureWarning on older pandas).
dates_normalize = dates.str.extract(
    r'(?P<begining>.+)?(?P<date>\d\d\d\d)(?P<end>.+)?',
    expand=True,
)
dates_normalize.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,begining,date,end
48,,1869,
80,,1883,
95,c.,1923,
125,c.,1922,
129,,1882,


In [13]:
# Overwrite the raw date strings with the extracted four-digit year
# (the assignment aligns on the row index).
df_impressionist['date'] = dates_normalize.loc[:, 'date']
df_impressionist.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
48,95360.jpg,Interior (The Rape),Impressionism,interior,1869,edgar degas,https://uploads5.wikiart.org/images/edgar-dega...
80,49757.jpg,"The Manneport, Rock Arch West of Etretat",Impressionism,landscape,1883,claude monet,https://uploads5.wikiart.org/images/claude-mon...
95,53841.jpg,Rodez,Impressionism,cityscape,1923,paul signac,https://uploads6.wikiart.org/images/paul-signa...
125,81912.jpg,The Look at Montauban in rain,Impressionism,landscape,1922,paul signac,https://uploads3.wikiart.org/images/paul-signa...
129,17146.jpg,Prayer in the Saint Pierre Cathedral in Geneva,Impressionism,genre painting,1882,ferdinand hodler,https://uploads3.wikiart.org/images/ferdinand-...


### Sort values

In [14]:
# Order rows by date, then by artist, both ascending (the default direction).
# `date` holds four-digit year strings at this point, so string comparison
# gives the same order as numeric comparison.
df_impressionist_sorted = df_impressionist.sort_values(['date', 'artist'])
df_impressionist_sorted.head()

Unnamed: 0,filename,title,style,genre,date,artist,imageUrl
13855,44196.jpg,"La Hacienda Buenavista en Ponce, Puerto Rico",Impressionism,landscape,1840,francisco oller,https://uploads8.wikiart.org/images/francisco-...
36603,50608.jpg,Woman and Child on a Bridge,Impressionism,genre painting,1848,honore daumier,https://uploads4.wikiart.org/images/honore-dau...
1313,27526.jpg,On the Shore,Impressionism,genre painting,1853,honore daumier,https://uploads7.wikiart.org/images/honore-dau...
35617,102063.jpg,Bathers,Impressionism,genre painting,1853,honore daumier,https://uploads4.wikiart.org/images/honore-dau...
1502,66688.jpg,Study for the Self Portrait,Impressionism,self-portrait,1855,edgar degas,https://uploads6.wikiart.org/images/edgar-dega...


### Get artwork's code

In [15]:
# Select the matrix rows for the sorted artworks, in sorted order.
# The DataFrame index labels are used directly as row positions in the matrix;
# this assumes metadata row i corresponds to matrix row i (TODO confirm).
matrix_sorted = artwork_code_matrix[df_impressionist_sorted.index.values]
matrix_sorted.shape

(6355, 100)

In [16]:
# Inspect the code vector of the first artwork in sorted order.
matrix_sorted[0]

array([-0.18733734, -0.7107996 ,  0.2520979 , -0.17679693, -0.60448617,
        0.10720313,  2.1172245 ,  1.2336361 , -0.23950386, -0.4110453 ,
       -0.2546481 ,  0.36813372, -0.29154047, -0.41856167, -0.03077362,
       -0.18250334, -0.65798277,  0.76718646,  0.9617817 , -0.17786421,
       -0.16553105, -0.47963133,  0.11038253, -0.39510176, -0.2166926 ,
       -0.39048964,  0.04394746, -0.48526707,  0.25912058,  1.3549267 ,
        0.06388462, -0.1721779 , -0.16312343, -0.20455246, -0.55944425,
       -0.23970237, -0.31799254, -0.33271417, -0.08630779,  0.46668637,
       -0.46433797, -0.08583858,  1.2002717 , -0.40400964, -0.21918252,
       -0.01095881, -0.15159182, -0.21558967, -0.1024729 , -0.14038943,
       -0.40729037,  0.06979251, -0.2389057 , -0.3315314 , -0.19975021,
        1.3818963 , -0.19873004, -0.19333676, -0.35796562, -0.39401227,
       -0.15066484,  0.5918576 , -0.21019451, -0.24923305,  1.0728569 ,
       -0.14889121, -0.29143173, -0.09621882,  0.08412094, -0.46

### Save data

In [18]:
# Write the sorted metadata to CSV; index=False omits the original row labels from the file.
# NOTE(review): the filename is spelled 'impressionsist' (extra 's'), possibly a typo.
# It is left unchanged in case other code reads this exact name.
df_impressionist_sorted.to_csv('impressionsist_sorted.csv', index=False)

In [19]:
# Persist the sorted code matrix in NumPy binary format
# (np.save appends '.npy' when the given name lacks that extension).
np.save('impressionist_sorted_matrix',matrix_sorted)