# Word Embedding Vectors

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import os

In [2]:
# Root folder that holds every dataset used by this notebook.
BASE_PATH = '/root/work/datasets/'
# Sub-folders under the root: artwork sequence data and the pre-trained word vectors.
BASE_SEQUENCE_PATH, EMBEDDINGS_PATH = (
    os.path.join(BASE_PATH, folder)
    for folder in ('artwork_sequence', 'Word Embeddings Pretrained Vectors')
)

## Load data

In [6]:
# Read the sequence metadata CSV and discard its 'name reverse' column in one chain.
df_metadata = (
    pd.read_csv(os.path.join(BASE_SEQUENCE_PATH, 'all_metadata_with_style_genre.csv'))
    .drop(columns=['name reverse'])
)
df_metadata.head()

Unnamed: 0,id,author,data,image_url,title,tour_path,wikiart name,style,genre
0,1,n.v. haagsche plateelfabriek rozenburg,1914,https://lh3.googleusercontent.com/IJn7rB4WvYvv...,vaas beschilderd met paarse seringen en op een...,/root/work/datasets/artwork_sequence/rijksmuse...,jan van eyck,northern renaissance,portrait
1,2,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/l0ccWh5aCgP5...,vaas,/root/work/datasets/artwork_sequence/rijksmuse...,jan van eyck,no value,no value
2,3,n.v. haagsche plateelfabriek rozenburg,1900,https://lh3.googleusercontent.com/uNQWFg-BhiPZ...,vase with lily decoration,/root/work/datasets/artwork_sequence/rijksmuse...,jan van eyck,northern renaissance,portrait
3,4,n.v. haagsche plateelfabriek rozenburg,1902,https://lh3.googleusercontent.com/QRdRjQDGyvDp...,vaas beschilderd met takken met seringen en ee...,/root/work/datasets/artwork_sequence/rijksmuse...,jan van eyck,northern renaissance,portrait
4,6,theo colenbrander,1886,https://lh3.googleusercontent.com/TZqVQVxb-1kl...,garniture of five vases,/root/work/datasets/artwork_sequence/rijksmuse...,theo van rysselberghe,impressionism,portrait


In [4]:
# Display the (rows, columns) dimensions of df_metadata.
df_metadata.shape

(633, 9)

In [7]:
# Read the training metadata CSV and keep only the five listed columns, in this order.
df_all_metadata = pd.read_csv(
    os.path.join(BASE_PATH, 'train_mayors_style_encoded.csv')
)[['title', 'style', 'genre', 'date', 'artist']]
df_all_metadata.head()

Unnamed: 0,title,style,genre,date,artist
0,Forbidden Land,Surrealism,landscape,1937.0,Wolfgang Paalen
1,Storm at sea,Romanticism,marina,1873.0,Ivan Aivazovsky
2,Yachting in the Mediterranean,Realism,genre painting,1896.0,Julius LeBlanc Stewart
3,Death of Eleazer,Romanticism,religious painting,1866.0,Gustave Dore
4,The-Deluge,Romanticism,religious painting,,Gustave Dore


In [None]:
# Display the (rows, columns) dimensions of df_all_metadata.
df_all_metadata.shape

In [None]:
# Load the array stored in 'train_mayors_style_encode.npy' and display its dimensions.
# NOTE(review): all_data_matrix is not referenced again in the visible cells — confirm it is still needed.
all_data_matrix = np.load(os.path.join(BASE_PATH, 'train_mayors_style_encode.npy'))
all_data_matrix.shape

In [None]:
# Rebind df_metadata to the training metadata frame. This is an alias, not a copy:
# both names refer to the same DataFrame, so any column later assigned through
# df_metadata is also visible through df_all_metadata. The frame previously bound to
# df_metadata (read from 'all_metadata_with_style_genre.csv') is no longer reachable
# under this name.
df_metadata = df_all_metadata

## Load Pre-trained embeddings

In [None]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, KeyedVectors

**Load the pre-trained GloVe vectors (stored in word2vec text format)**

In [None]:
# Load the word vectors from a text file in word2vec format (binary=False).
# NOTE(review): the file name suggests 100-dimensional GloVe 6B vectors that were
# converted to word2vec format beforehand — confirm how the file was produced.
word2vect_model = KeyedVectors.load_word2vec_format(os.path.join(EMBEDDINGS_PATH, 'GLoVe/glove-w2v.6B.100d.txt'), binary=False)

In [None]:
# List of every word known to the embedding model.
# gensim >= 4 exposes the vocabulary mapping as `key_to_index`; gensim 3.x uses `vocab`
# (accessing `vocab` on gensim >= 4 raises AttributeError), so support both.
_vocab_mapping = getattr(word2vect_model, 'key_to_index', None)
if _vocab_mapping is None:
    _vocab_mapping = word2vect_model.vocab
vocab = list(_vocab_mapping)

**Functions to prepare the data and compute the embedding vector**

In [None]:
def get_w2vec(soap, word2vect_model):
    """Return the mean embedding vector of the tokens in ``soap``.

    Args:
        soap: Sized collection of tokens (e.g. a list of words). Every token
            must be known to ``word2vect_model``.
        word2vect_model: Object exposing ``vector_size`` and
            ``get_vector(token)`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        A 1-D float64 array of length ``word2vect_model.vector_size`` holding
        the element-wise average of the token vectors. When ``soap`` is empty
        the zero vector is returned instead of dividing by zero, which would
        otherwise yield an all-NaN vector.

    Raises:
        Whatever ``get_vector`` raises for an unknown token (``KeyError`` for
        gensim models) propagates to the caller.
    """
    # Accumulator in the embedding space (float64 by default).
    w2v = np.zeros((word2vect_model.vector_size,))
    if len(soap) == 0:
        # No tokens to average: avoid 0/0 -> NaN.
        return w2v
    for token in soap:
        w2v += word2vect_model.get_vector(token)
    return w2v / len(soap)

In [None]:
def process_feature(feature, word2vect_model):
    """Split a feature value into the words known to the embedding model.

    Args:
        feature: Feature value, usually a string such as ``'genre painting'``.
            Non-string values are converted with ``str``; missing values
            (``None`` or a float NaN) produce an empty list.
        word2vect_model: Embedding model exposing its vocabulary either as
            ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim 3.x).

    Returns:
        List of tokens, in their original order, that exist in the model's
        vocabulary. A token that is not found verbatim is retried in lower
        case, and the lower-cased form is returned when that one is known.
    """
    # A missing value carries no text. Without this guard str(nan) == 'nan'
    # would be looked up (and possibly matched) as if it were a real word.
    if feature is None or (isinstance(feature, float) and feature != feature):
        return []
    if not isinstance(feature, str):
        feature = str(feature)

    # Support both gensim APIs: `vocab` raises AttributeError on gensim >= 4.
    known_words = getattr(word2vect_model, 'key_to_index', None)
    if known_words is None:
        known_words = word2vect_model.vocab

    feature_clean = []
    # split() without arguments splits on any whitespace run (spaces, tabs, newlines).
    for token in feature.split():
        if token in known_words:
            feature_clean.append(token)
        else:
            # Capitalised labels (e.g. 'Surrealism') would otherwise be dropped
            # when the vocabulary only contains lower-case entries.
            lowered = token.lower()
            if lowered in known_words:
                feature_clean.append(lowered)
    return feature_clean

**Compute one embedding vector per row from its style and genre words**

In [None]:
# Token list per row: the in-vocabulary words of 'style' followed by those of 'genre'.
df_metadata['w2v data'] = (
    df_metadata['style'].apply(process_feature, args=(word2vect_model,))
    + df_metadata['genre'].apply(process_feature, args=(word2vect_model,))
)

# One averaged vector per row, stacked into a single 2-D array.
target_embeddings = np.stack(
    [get_w2vec(tokens, word2vect_model) for tokens in df_metadata['w2v data']]
)
target_embeddings.shape

In [None]:
# Persist the stacked embedding array under BASE_PATH.
# Per the NumPy docs, np.save appends '.npy' when the file name lacks that extension,
# so the file written is 'train_mayors_style_embedding.npy'.
np.save(os.path.join(BASE_PATH, 'train_mayors_style_embedding' ), target_embeddings)