## Artwork Recommendation

In [2]:
import csv
import re
import string
from collections import Counter
from collections import defaultdict
from typing import Dict, NamedTuple

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from IPython.display import Image
from IPython.core.display import HTML 
from IPython.core.display import Image, display
from matplotlib import pyplot
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [455]:
class Artwork:
    """One artwork record built from a CSV row.

    Every column named in ``header`` becomes an attribute holding the matching
    cell of ``row``.  ``object_id`` is stored as given.  ``metadata`` is a
    single flat list of processed tokens from the ``title``, ``material``,
    ``artist`` and ``description`` columns, concatenated in that order.

    ``__repr__`` additionally reads ``original_title``, ``startYear``,
    ``endYear``, ``museum`` and ``url``, so those columns must be present.

    NOTE(review): ``compute_tf`` in this notebook indexes ``metadata[0..4]``
    as if it held one token list per field; that does not match the flat list
    built here -- confirm which layout is intended.
    """

    def __init__(self, row, header, object_id):
        # Replace the instance dict wholesale so each column is an attribute.
        self.__dict__ = dict(zip(header, row))
        self.object_id = object_id
        tokens = []
        for text in (self.title, self.material, self.artist, self.description):
            tokens += process_doc(text)
        self.metadata = tokens

    def __repr__(self):
        """Return a multi-line summary; empty fields are left out."""
        line_end = "\n"
        # Prefer the original-language title when one is recorded.
        shown_title = self.original_title if self.original_title != '' else self.title
        lines = ["title: " + shown_title + line_end]
        for label in ("startYear", "endYear", "material", "artist", "museum"):
            value = getattr(self, label)
            if value != '':
                lines.append(label + ": " + value + line_end)
        if self.url != '':
            lines.append("url: " + str(self.url) + line_end)
        # Joining newline-terminated pieces with a space is what gives every
        # line after the first its leading blank.
        return " ".join(lines)

In [456]:
# Shared text-normalisation helpers used when processing words:
# an English Snowball stemmer and the NLTK English stop-word list, held as a
# set so that membership tests are cheap.
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

def process_doc(doc):
    """Tokenize ``doc`` and return its lower-cased, stemmed content words.

    Tokens are produced by ``word_tokenize``; a token is dropped when it is
    an English stop word or when it occurs inside the punctuation string
    below.  Remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        doc: Text to process.

    Returns:
        List of stemmed tokens in their original order.
    """
    # Raw string: the backslash is meant literally.  Written without the
    # ``r`` prefix, ``\,`` is an invalid escape sequence, which current
    # Python versions warn about.
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~='''
    lowered = (token.lower() for token in word_tokenize(doc))
    # ``token not in punc`` is a substring test against the whole string, so
    # single punctuation characters (and any run that appears verbatim in
    # ``punc``) are filtered out.
    return [
        stemmer.stem(token)
        for token in lowered
        if token not in stop_words and token not in punc
    ]

def load_data(path):
    """Read a CSV file and build one ``Artwork`` per data row.

    The first row is taken as the header; each following row becomes an
    ``Artwork`` whose ``object_id`` is its 0-based position among the data
    rows.

    Args:
        path: Location of the CSV file.

    Returns:
        List of ``Artwork`` objects; empty when the file has no rows at all.
    """
    # ``newline=''`` is what the csv module asks for so that newlines inside
    # quoted fields are parsed correctly; the ``with`` block closes the file.
    with open(path, newline='') as handle:
        reader = csv.reader(handle)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to build (previously IndexError).
            return []
        return [Artwork(row, header, k) for k, row in enumerate(reader)]

def recommendations(artid, data, tfidf_vectors, top_n=10):
    """Return the row positions of the artworks most similar to ``artid``.

    Args:
        artid: Value from ``data['index']`` identifying the query artwork.
        data: DataFrame with an ``'index'`` column; its own index gives the
            row position used to look up ``tfidf_vectors``.
        tfidf_vectors: Sequence of equal-length vectors, one per row of
            ``data``.
        top_n: Maximum number of neighbours to return (default 10).

    Returns:
        List of row positions ordered by decreasing cosine similarity, never
        containing the query artwork itself.
    """
    indices = pd.Series(data.index, index=data['index']).drop_duplicates()
    idx = indices[artid]

    # Only the query row of the similarity matrix is needed, so compare that
    # single vector against all others instead of building the full N x N
    # matrix on every call.
    query = np.asarray(tfidf_vectors[idx]).reshape(1, -1)
    scores = cosine_similarity(query, tfidf_vectors)[0]

    # Drop the query explicitly.  Skipping the first sorted entry is not
    # enough: an exact duplicate ties with it, and an all-zero vector scores
    # 0 even against itself, so the query is not guaranteed to sort first.
    candidates = [(pos, score) for pos, score in enumerate(scores) if pos != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [pos for pos, _ in candidates[:top_n]]

### I. TF-IDF + Word2Vec for artwork recommendation

In [None]:
def get_recommendations(path):
    """Compute 10 recommendations per artwork and write them next to ``path``.

    Steps, all on the CSV at ``path``:
      1. Load the artworks and join each one's ``metadata`` tokens into a
         space-separated document.
      2. Train a skip-gram Word2Vec model on those documents.
      3. Fit a TF-IDF vectorizer (1- to 3-grams) and read its IDF weights.
      4. Represent each document as the TF-IDF-weighted mean of its word
         vectors (zero vector when no word is known to both models).
      5. Call ``recommendations`` for every artwork and merge the result as
         columns ``art_id_1`` .. ``art_id_10``.
      6. Write the merged frame to ``<path without .csv>_with_rec.csv``.

    Args:
        path: Location of the input CSV file.

    Returns:
        None; the result is written to disk.
    """
    artworks = load_data(path)
    df = pd.read_csv(path).reset_index()
    cleaned = [' '.join(a.metadata) for a in artworks]

    # Word2Vec expects each document as a list of tokens.
    corpus = [text.split() for text in cleaned]

    model = Word2Vec(window=10, sg=1, hs=0, negative=10, alpha=0.03)
    model.build_vocab(corpus, progress_per=200)
    model.train(corpus, total_examples=model.corpus_count, epochs=10)

    # Get tfidf features.  ``vocabulary_`` maps each term to its column, so
    # the IDF lookup does not need ``get_feature_names()``, which newer
    # scikit-learn releases no longer provide.
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')
    tfidf.fit(cleaned)
    idf_by_term = {term: tfidf.idf_[col] for term, col in tfidf.vocabulary_.items()}

    # Dict lookups instead of scanning the vocabulary lists for every word.
    known_words = model.wv.key_to_index
    vector_size = model.wv.vector_size

    tfidf_vectors = []
    for desc in corpus:
        vec1 = np.zeros(vector_size)
        weight_sum = 0
        for word in desc:
            if word in known_words and word in idf_by_term:
                tf_idf = idf_by_term[word] * (desc.count(word) / len(desc))
                vec1 += model.wv[word] * tf_idf
                weight_sum += tf_idf
        if weight_sum != 0:
            vec1 /= weight_sum
        tfidf_vectors.append(vec1)

    recommend_dict = defaultdict(list)
    art_ids = df['index'].tolist()
    for artid in tqdm(art_ids):
        recommend_dict[artid] = recommendations(artid, df, tfidf_vectors)

    # One row per artwork, one column per recommendation rank.
    features = ['art_id_' + str(i) for i in range(1, 11)]
    recommend = pd.DataFrame(recommend_dict).T.reset_index()
    recommend.rename(columns=dict(zip(range(10), features)), inplace=True)

    df = pd.merge(df, recommend, how='inner', on='index').set_index('index')
    path_1 = path.split('.csv')[0]
    path_out = path_1 + '_with_rec.csv'
    df.to_csv(path_out)

In [None]:
# Run the recommendation pipeline once per CSV file listed here;
# get_recommendations writes its result to '<name>_with_rec.csv'.
path = ['nga.csv', 'louvre.csv', 'cleveland.csv', 'rijks.csv']
for p in path:
    get_recommendations(p)

### II. Random Query Search

In [457]:
# Term weights: how much one occurrence of a word counts, per field.
class TermWeights(NamedTuple):
    """Per-field weights added to a term's score by ``compute_tf``.

    The field order (title, material, artist, museum, description) matches
    the order in which ``compute_tf`` reads ``metadata[0]`` .. ``metadata[4]``.
    """
    title: float
    material: float
    artist: float
    museum: float
    description: float

def compute_doc_freqs(arts):
    """Count, for every word, the number of artworks it appears in.

    ``art.metadata`` may be either a list of token lists (one per field) or
    a flat list of tokens.  A string entry is treated as one token; iterating
    over it would count its characters instead of the word.

    Args:
        arts: Iterable of objects exposing a ``metadata`` attribute.

    Returns:
        ``Counter`` mapping each word to its document frequency.
    """
    freq = Counter()
    for art in arts:
        words = set()
        for entry in art.metadata:
            if isinstance(entry, str):
                words.add(entry)
            else:
                words.update(entry)
        # Each artwork contributes at most 1 per distinct word.
        freq.update(words)
    return freq

def compute_tf(art, doc_freqs, weights):
    """Build the weighted term-frequency vector of one artwork.

    Each occurrence of a word adds the weight of the field it occurs in
    (title, material, artist, museum, description).

    When ``art.metadata`` holds at least five token lists, its first five
    entries are used as those fields, in that order.  When it is a flat token
    list instead, the field tokens are rebuilt by running ``process_doc`` on
    ``art.title``, ``art.material``, ``art.artist``, ``art.museum`` and
    ``art.description``; indexing a flat list per field would weight single
    characters rather than words.

    Args:
        art: Artwork-like object.
        doc_freqs: Unused; kept so the signature stays compatible.
        weights: Object with ``title``, ``material``, ``artist``, ``museum``
            and ``description`` attributes.

    Returns:
        Plain dict mapping each word to its accumulated weight.
    """
    field_names = ('title', 'material', 'artist', 'museum', 'description')
    meta = art.metadata
    per_field = len(meta) >= 5 and not any(isinstance(m, str) for m in meta[:5])
    if per_field:
        field_tokens = list(meta[:5])
    else:
        field_tokens = [process_doc(getattr(art, name)) for name in field_names]

    vec = defaultdict(float)
    for name, tokens in zip(field_names, field_tokens):
        weight = getattr(weights, name)
        for word in tokens:
            vec[word] += weight
    return dict(vec)

def compute_tfidf(art, doc_freqs, weights):
    """Scale an artwork's weighted term frequencies by an IDF factor.

    The factor for a term is ``log(n / doc_freqs[term])`` where ``n`` is the
    largest document frequency found in ``doc_freqs``; a term missing from
    ``doc_freqs`` gets the value 0.

    Args:
        art: Artwork passed on to ``compute_tf``.
        doc_freqs: Mapping from term to document frequency (must be
            non-empty, since its maximum is taken).
        weights: Field weights passed on to ``compute_tf``.

    Returns:
        Dict mapping each term of the artwork to its TF-IDF value.
    """
    term_freqs = compute_tf(art, doc_freqs, weights)
    n = max(doc_freqs.values())
    return {
        term: (tf * np.log(n / doc_freqs[term]) if term in doc_freqs else 0)
        for term, tf in term_freqs.items()
    }

In [458]:
def dictdot(x: Dict[str, float], y: Dict[str, float]):
    """Return the dot product of two sparse vectors stored as dicts.

    Only the keys of the shorter dict are visited (``y`` on a tie); a key
    missing from either side contributes 0.
    """
    shorter = x if len(x) < len(y) else y
    total = 0
    for key in tuple(shorter):
        total += x.get(key, 0) * y.get(key, 0)
    return total

def cosine_sim(x, y):
    """Return the cosine similarity of two dict-encoded sparse vectors.

    The result is 0 whenever the dot product is 0, which also keeps the
    division away from zero-length vectors.
    """
    dot = dictdot(x, y)
    if dot != 0:
        magnitude = norm(list(x.values())) * norm(list(y.values()))
        return dot / magnitude
    return 0

In [470]:
# Document frequencies and field weights for the query search.
# NOTE(review): `artworks` is only assigned in a later cell
# (load_data('museum.csv')), and that cell also reassigns both names below --
# this cell presumably relies on notebook execution order; confirm.
doc_freqs = compute_doc_freqs(artworks)
term_weights = TermWeights(title=3, artist=2, material=1, museum=3, description=1)

def process_query(query, doc_freqs):
    """Turn a free-text query into a TF-IDF weighted term dict.

    The query is tokenized, lower-cased, stripped of English stop words and
    stemmed.  Each stem's count is multiplied by
    ``log(n / doc_freqs[stem])`` with ``n`` the largest document frequency;
    stems absent from ``doc_freqs`` get 0.

    Args:
        query: Search text.
        doc_freqs: Mapping from term to document frequency (non-empty).

    Returns:
        Dict mapping each query stem to its weight.
    """
    counts = defaultdict(float)
    for token in word_tokenize(query):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        counts[stemmer.stem(lowered)] += 1

    n = max(doc_freqs.values())
    weighted = {}
    for term, tf in counts.items():
        if term in doc_freqs:
            weighted[term] = tf * np.log(n / doc_freqs[term])
        else:
            weighted[term] = 0
    return weighted

In [469]:
def search(query, doc_vectors, doc_freqs):
    """Rank all documents against ``query`` by cosine similarity.

    Args:
        query: Search text.
        doc_vectors: Iterable of dict-encoded document vectors.
        doc_freqs: Document frequencies used to weight the query.

    Returns:
        List of document positions, best match first; equal scores keep
        their original relative order.
    """
    query_vec = process_query(query, doc_freqs)
    scores = [cosine_sim(query_vec, doc_vec) for doc_vec in doc_vectors]
    # Negated score as the key gives a descending, stable ordering.
    return sorted(range(len(scores)), key=lambda doc_id: -scores[doc_id])

In [473]:
# Construct the artwork matrix: load every artwork from museum.csv, count
# document frequencies over them, and build one TF-IDF dict per artwork
# using the field weights below (these replace any earlier term_weights).
artworks = load_data('museum.csv')
doc_freqs = compute_doc_freqs(artworks)
term_weights = TermWeights(title=10, artist=5, material=2, museum=80, description=1)
art_vec = [compute_tfidf(art, doc_freqs, term_weights) for art in artworks]

In [477]:
def search_result(artworks, query, doc_vectors, doc_freqs):
    """Display the image and print the summary of the 10 best matches.

    Args:
        artworks: Sequence of artworks, indexable by the positions that
            ``search`` returns; each must expose an ``image`` URL.
        query: Search text.
        doc_vectors: Document vectors aligned with ``artworks``.
        doc_freqs: Document frequencies used to weight the query.
    """
    top_hits = search(query, doc_vectors, doc_freqs)[:10]
    for doc_id in top_hits:
        hit = artworks[doc_id]
        display(Image(url=hit.image, width=250, height=250))
        print(hit)

In [480]:
# Example: show the ten best matches for the query "tiger".
query = 'tiger'
search_result(artworks, query, art_vec, doc_freqs)

title: Un Tigre et quatre têtes de tigres
 startYear: 1600.0
 endYear: 1700.0
 material: cloth
 artist: Boel, Pieter ; Pays-Bas du Sud
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010054231



title: Jeune tigre jouant avec sa mère, dit aussi Étude de deux tigres
 startYear: 1830.0
 material: oil on cloth
 artist: Delacroix, Eugène ; France
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010059639



title: Tigre couché.
 startYear: 1833.0
 material: oil on cloth
 artist: Barye, Antoine Louis ; France
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010059910



title: Bacchus et un tigre
 startYear: 1834.0
 material:  fresco  pigment diluted on coated to the lime
 artist: Delacroix, Eugène
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010455673



title: Combat d'ours et de tigres
 startYear: 1600.0
 endYear: 1700.0
 material: cloth
 artist: Pays-Bas du Sud ; Vos, Paul de
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010054469



title: Éléphants, lion et tigres morts
 startYear: 1600.0
 endYear: 1700.0
 material: oil on cloth
 artist: Anonyme ; France
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010067226



title: Amours jouant avec un tigre
 startYear: 1800.0
 endYear: 1900.0
 material: cloth
 artist: Pays-Bas du Sud ; Sauvage, Piat Joseph
 museum: louvre museum
 url: https://collections.louvre.fr/ark:/53355/cl010056904



title: Twelve Views of Tiger Hill, Suzhou: The Sword Spring, Tiger Hill
 startYear: 1491.0
 endYear: 1509.0
 artist: Shen Zhou
 museum: cleveland museum of art
 url: https://clevelandart.org/art/1964.371.6



title: Twelve Views of Tiger Hill, Suzhou: Bamboo Pavilion, Tiger Hill
 startYear: 1491.0
 endYear: 1509.0
 artist: Shen Zhou
 museum: cleveland museum of art
 url: https://clevelandart.org/art/1964.371.11



title: Twelve Views of Tiger Hill, Suzhou: Distant View of Tiger Hill from the Canal Mooring
 startYear: 1491.0
 endYear: 1509.0
 artist: Shen Zhou
 museum: cleveland museum of art
 url: https://clevelandart.org/art/1964.371.1



## Search by color

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import cv2
from collections import Counter
from skimage.color import rgb2lab, deltaE_cie76
from urllib.request import urlopen

In [None]:
def url_to_image(url):
    """Download ``url`` and decode it into an OpenCV colour image.

    Args:
        url: Address of the image to fetch.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.
        OpenCV documents the channel order of decoded colour images as
        B, G, R, and ``imdecode`` yields an empty result when the data
        cannot be decoded -- callers should be prepared for both.
    """
    # Close the HTTP response as soon as the payload has been read.
    with urlopen(url) as resp:
        payload = resp.read()
    image = np.asarray(bytearray(payload), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

def get_colors(image, number_of_colors, show_chart):
    """Extract the dominant colours of ``image`` with k-means clustering.

    The image is resized to 600x400, flattened to a list of 3-channel pixels
    and clustered into ``number_of_colors`` groups.

    Args:
        image: 3-channel image array accepted by ``cv2.resize``.
        number_of_colors: Number of clusters requested.
        show_chart: Unused; kept so the signature stays compatible.

    Returns:
        List of cluster centres (one 3-element array each, in the channel
        order of ``image``), for every cluster that received at least one
        pixel, ordered by cluster label.
    """
    resized = cv2.resize(image, (600, 400), interpolation=cv2.INTER_AREA)
    pixels = resized.reshape(resized.shape[0] * resized.shape[1], 3)

    clf = KMeans(n_clusters=number_of_colors)
    labels = clf.fit_predict(pixels)

    centers = clf.cluster_centers_
    # Index the centres by label exactly once.  Indexing an already filtered
    # list by label a second time fails as soon as some label is missing.
    used_labels = sorted(set(labels.tolist()))
    return [centers[label] for label in used_labels]

# Reference colours used for labelling, keyed by label name.  Each value is a
# 3-element list that match_image_by_color passes to rgb2lab, i.e. it is
# interpreted as an (R, G, B) triple.
COLORS = {
    'GREEN': [0, 128, 0],
    'BLUE': [0, 0, 128],
    'YELLOW': [255, 255, 0],
    'RED': [255, 0, 0]
}

def match_image_by_color(image, color, threshold = 60, number_of_colors = 10):
    """Tell whether any dominant colour of ``image`` is close to ``color``.

    Both colours are converted with ``rgb2lab`` and compared with the CIE76
    colour difference.

    Args:
        image: Image array handed to ``get_colors``.
        color: 3-element colour, interpreted as RGB by ``rgb2lab``.
        threshold: A dominant colour matches when its difference to
            ``color`` is below this value.
        number_of_colors: Number of dominant colours to extract.

    Returns:
        True if at least one dominant colour matches, else False.
    """
    target_lab = rgb2lab(np.uint8(np.asarray([[color]])))
    # Walk the palette that was actually returned rather than assuming it
    # has exactly ``number_of_colors`` entries, and stop at the first match.
    for center in get_colors(image, number_of_colors, False):
        center_lab = rgb2lab(np.uint8(np.asarray([[center]])))
        if deltaE_cie76(target_lab, center_lab) < threshold:
            return True
    return False

# Colour labels collected so far, one string per processed URL.
image_colors = []
def label_color(urls, start=None):
    """Append one colour-label string per URL to ``image_colors``.

    Each label string is the concatenation of ``',' + name`` for every entry
    of ``COLORS`` that matches the image (for example ``',GREEN,RED'``), or
    an empty string when nothing matches.

    Args:
        urls: List of image URLs.
        start: Position in ``urls`` to begin at.  By default processing
            resumes after the URLs already labelled, i.e. at
            ``len(image_colors)``, so a fresh run covers every URL and a
            re-run after a failure continues where it stopped.  (A fixed
            offset would leave ``image_colors`` shorter than ``urls`` on a
            fresh run.)
    """
    if start is None:
        start = len(image_colors)
    for url in tqdm(urls[start:]):
        # url_to_image decodes with cv2.imdecode, which yields B, G, R
        # channel order, while COLORS holds RGB triples; convert first so
        # red and blue are not swapped in the comparison.
        img = cv2.cvtColor(url_to_image(url), cv2.COLOR_BGR2RGB)
        label_colors = ''
        for color in COLORS:
            matched = match_image_by_color(img, COLORS[color], threshold = 60, number_of_colors = 6)
            if matched:
                label_colors = label_colors + ',' + color
        image_colors.append(label_colors)

In [None]:
# Label every image listed in nga.csv and store the labels in a new 'color'
# column.  The assignment needs image_colors to hold exactly one entry per
# row of df.
df = pd.read_csv('nga.csv')
urls = df['image'].tolist()
label_color(urls)
df['color'] = image_colors

In [12]:
def search_by_color(color, df):
    """Display the image of every row whose colour labels contain ``color``.

    Args:
        color: Colour name; compared in upper case against the row's
            ``'color'`` string.
        df: DataFrame with ``'color'`` and ``'image'`` columns.

    Rows are visited positionally, so the frame may carry any index, and
    rows whose ``'color'`` value is not a string (for example a missing
    value) are skipped.
    """
    wanted = color.upper()
    for labels, artwork_image in zip(df['color'], df['image']):
        if isinstance(labels, str) and wanted in labels:
            display(Image(url=artwork_image, width=250, height=250))


In [13]:
# Example: display every labelled image whose colour labels include GREEN.
search_by_color('green', df)