In [19]:
from gensim.models import KeyedVectors
import pandas as pd
from transformers import pipeline
import torch
import numpy as np

from tqdm import tqdm


In [20]:
# Read the Wikidata paintings export into a DataFrame and preview it.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# pandas warnings do -- possibly a DtypeWarning (mixed-type columns). Confirm,
# and if so consider passing dtype= or low_memory=False.
paintings_csv_path = 'data/wikidata_all_paintings.csv'
data = pd.read_csv(paintings_csv_path)
data

  data = pd.read_csv('data/wikidata_all_paintings.csv')


Unnamed: 0,item,creation_date,display_country,type,school,image_url,depicts,origin_country,time_period
0,http://www.wikidata.org/entity/Q724861,1612-01-01T00:00:00Z,United States of America,portrait,,http://commons.wikimedia.org/wiki/Special:File...,"book, man, skull, English people, White people...",,
1,http://www.wikidata.org/entity/Q727875,1538-01-01T00:00:00Z,Italy,mythological painting,Venetian school,http://commons.wikimedia.org/wiki/Special:File...,"dog, sky, twilight, earring, flower, nipple, t...",Republic of Venice,
2,http://www.wikidata.org/entity/Q727875,1538-01-01T00:00:00Z,Italy,nude,Venetian school,http://commons.wikimedia.org/wiki/Special:File...,"dog, sky, twilight, earring, flower, nipple, t...",Republic of Venice,
3,http://www.wikidata.org/entity/Q727875,1538-01-01T00:00:00Z,Italy,mythological painting,High Renaissance,http://commons.wikimedia.org/wiki/Special:File...,"dog, sky, twilight, earring, flower, nipple, t...",Republic of Venice,
4,http://www.wikidata.org/entity/Q727875,1538-01-01T00:00:00Z,Italy,nude,High Renaissance,http://commons.wikimedia.org/wiki/Special:File...,"dog, sky, twilight, earring, flower, nipple, t...",Republic of Venice,
...,...,...,...,...,...,...,...,...,...
658425,http://www.wikidata.org/entity/Q130724770,1851-01-01T00:00:00Z,,,,http://commons.wikimedia.org/wiki/Special:File...,,,
658426,http://www.wikidata.org/entity/Q130724778,1874-01-01T00:00:00Z,,,,http://commons.wikimedia.org/wiki/Special:File...,,,
658427,http://www.wikidata.org/entity/Q130724781,1646-01-01T00:00:00Z,,mythological painting,,http://commons.wikimedia.org/wiki/Special:File...,,,
658428,http://www.wikidata.org/entity/Q130724839,1928-01-01T00:00:00Z,Sweden,self-portrait,,http://commons.wikimedia.org/wiki/Special:File...,Ivan Ivarson,,


In [21]:
# Tokenise the comma-separated 'depicts' labels: drop missing rows, split on
# commas, put one label per row and trim the surrounding whitespace.
all_words = (
    data['depicts']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)

# Frequency of every distinct label, most common first
word_counts = all_words.value_counts()

# Two-column frame: the label ('word') and how often it occurs ('count')
unique_words_df = word_counts.rename_axis('word').reset_index(name='count')

# Preview the result
unique_words_df

Unnamed: 0,word,count
0,woman,13296
1,man,12781
2,tree,5178
3,Mary,4873
4,Christ Child,4868
...,...,...
30849,Aniela Radziwiłł,1
30850,Blue Palace,1
30851,Høje Tåstrup Church,1
30852,Ilarie Voronca,1


In [22]:
# Load the pre-trained FastText vectors (word2vec text format) from disk
model = KeyedVectors.load_word2vec_format('data/fasttext/wiki.simple.vec', binary=False)

# Small hand-picked keyword sample, kept for quick manual experiments.
# It is bound to its own name: it used to be assigned to `data`, which
# clobbered the paintings DataFrame loaded earlier and broke re-running the
# cell that reads data['depicts'].
sample_keywords = {
    'word': [
        'picador', 'butter', 'spear', 'bullfighting', 'man', 
        'Saint François d‘Assise', 'Césarine d\'Houdetot', 
        'Paul et Virginie', 'apple', 'Scipion Pinel'
    ]
}

# Work on a copy so the columns added further down do not touch unique_words_df
df = unique_words_df.copy()

# Reference vocabulary: a word counts as food-related when its embedding is
# close to at least one of these terms
food_terms = ['food','wine', 'beverage', 'meat', 'fruit', 'vegetable', 'bread', 'dairy', 'dessert', 'seafood', 'meal', 'cookware', 'cheese']

# Use the MPS backend when it is available, otherwise fall back to the CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Keep only the reference terms the model knows. The names are computed once
# and reused for the embeddings so both stay aligned index-for-index.
food_term_names = [term for term in food_terms if term in model]
if not food_term_names:
    # torch.stack would fail on an empty list with a far less helpful message
    raise ValueError("None of the food reference terms are present in the embedding vocabulary")

# One row per known reference term, stored on the selected device
food_embeddings = torch.stack(
    [torch.tensor(model[term], device=device) for term in food_term_names]
)

# Function to compute closest food term and its similarity score for a batch of words
def get_closest_food_terms_batch(words, threshold=0.5):
    """Find the closest food reference term for each word in a batch.

    Relies on the module-level ``model`` (word -> vector lookup), ``device``,
    ``food_embeddings`` (one row per reference term) and ``food_term_names``
    (names aligned with the rows of ``food_embeddings``).

    Args:
        words: List of words to score.
        threshold: Minimum cosine similarity (exclusive) for a match to be
            reported. Defaults to 0.5.

    Returns:
        A list with one tuple per input word, in input order:
        ``(closest_term, similarity)`` when the word is in the model
        vocabulary and its best similarity exceeds ``threshold``, otherwise
        the sentinel ``(False, 0.0)``.
    """
    no_match = (False, 0.0)

    # Look every word up once and remember the outcome for the mapping step
    in_vocabulary = [word in model for word in words]
    valid_words = [word for word, known in zip(words, in_vocabulary) if known]
    if not valid_words:
        return [no_match] * len(words)

    # Embed the known words as one tensor on the selected device
    word_embeddings = torch.stack(
        [torch.tensor(model[word], device=device) for word in valid_words]
    )

    # Broadcast to a (known words x food terms) matrix of cosine similarities
    similarities = torch.cosine_similarity(
        word_embeddings[:, None, :], food_embeddings[None, :, :], dim=-1
    )

    # Best reference term per word; convert each tensor to a Python list once
    # instead of calling .item() per element
    best_similarities, best_indices = similarities.max(dim=1)
    scored = iter(zip(best_similarities.tolist(), best_indices.tolist()))

    # Re-align the scores with the original word order, applying the threshold
    results = []
    for known in in_vocabulary:
        if not known:
            results.append(no_match)
            continue
        best_similarity, best_index = next(scored)
        if best_similarity > threshold:
            results.append((food_term_names[best_index], best_similarity))
        else:
            results.append(no_match)
    return results

# Score every word in fixed-size chunks, tracking progress with tqdm
batch_size = 5  # Adjust batch size for your needs

results = []
# The context manager closes the progress bar even if a batch raises
with tqdm(total=len(df), desc="Processing Batches", unit="words") as progress_bar:
    for i in range(0, len(df), batch_size):
        # .iloc makes it explicit that the chunk is selected by position
        batch_words = df['word'].iloc[i:i + batch_size].tolist()
        batch_results = get_closest_food_terms_batch(batch_words)
        results.extend(batch_results)
        progress_bar.update(len(batch_results))  # Advance by the number of words scored

# Build each column explicitly from the (term, score) pairs; unlike unpacking
# zip(*results), this also works when df has no rows
df['closest_food_term'] = [term for term, _score in results]
df['food_similarity_score'] = [score for _term, score in results]

# Display the results
display(df)

Processing Batches: 100%|██████████| 30854/30854 [00:04<00:00, 6914.20words/s] 


Unnamed: 0,word,count,closest_food_term,food_similarity_score
0,woman,13296,False,0.0
1,man,12781,False,0.0
2,tree,5178,False,0.0
3,Mary,4873,False,0.0
4,Christ Child,4868,False,0.0
...,...,...,...,...
30849,Aniela Radziwiłł,1,False,0.0
30850,Blue Palace,1,False,0.0
30851,Høje Tåstrup Church,1,False,0.0
30852,Ilarie Voronca,1,False,0.0


In [23]:
# Keep only the rows that were assigned a food term; rows without a match
# carry the False sentinel in 'closest_food_term'
has_food_term = df['closest_food_term'].ne(False)
food_related_words = df[has_food_term]

# Show the matches twice: most frequent first, then most similar first
display(
    food_related_words.sort_values('count', ascending=False),
    food_related_words.sort_values('food_similarity_score', ascending=False),
)

Unnamed: 0,word,count,closest_food_term,food_similarity_score
26,fruit,925,fruit,1.000000
46,cattle,561,dairy,0.525084
137,bread,242,bread,1.000000
260,tableware,140,cookware,0.623472
272,fish,132,seafood,0.570744
...,...,...,...,...
16812,distillation,1,beverage,0.528565
16117,catfish,1,seafood,0.514933
15035,cinnamon,1,dessert,0.568082
14117,treacle,1,bread,0.525488


Unnamed: 0,word,count,closest_food_term,food_similarity_score
137,bread,242,bread,1.000000
8871,dessert,2,dessert,1.000000
551,food,63,food,1.000000
26,fruit,925,fruit,1.000000
1279,meat,23,meat,1.000000
...,...,...,...,...
2169,lunch,11,meal,0.502950
4299,absinthe,5,wine,0.502482
28775,lemonade,1,beverage,0.502042
5015,biscuit,4,cheese,0.501518


In [26]:
# Words that passed the similarity threshold but should be left out of the
# final list
blacklist = ['cattle', 'vegetation', 'pasture', 'livestock', 'warehouse', 'twine']

# Drop every row whose word appears in the blacklist
is_blacklisted = food_related_words['word'].isin(blacklist)
filtered_df = food_related_words[~is_blacklisted]

# Preview what remains
display(filtered_df)

Unnamed: 0,word,count,closest_food_term,food_similarity_score
26,fruit,925,fruit,1.000000
137,bread,242,bread,1.000000
260,tableware,140,cookware,0.623472
272,fish,132,seafood,0.570744
280,wine,127,wine,1.000000
...,...,...,...,...
28789,manure,1,vegetable,0.583847
28939,oats,1,dairy,0.515907
30192,veal,1,dairy,0.568498
30312,legume,1,vegetable,0.551566


In [27]:
# Persist the curated food-related words as CSV, without the DataFrame index
food_words_csv_path = 'data/food_related_words.csv'
filtered_df.to_csv(food_words_csv_path, index=False)