In [8]:
from gensim.models import KeyedVectors
import pandas as pd
from transformers import pipeline
import torch
import numpy as np

from tqdm import tqdm


In [9]:
# Prepare the input data
data = pd.read_csv('data/wikidata_all_paintings.csv') 
data

Unnamed: 0,item,title,creation_date,origin_country,display_country,type,school,image_url,depicts,time_period
0,http://www.wikidata.org/entity/Q605863,Portrait of a gentleman in his studio,1530-01-01T00:00:00Z,,Italy,portrait,,http://commons.wikimedia.org/wiki/Special:File...,"book, man, table, mandolin, lizard",
1,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,1445-01-01T00:00:00Z,,Spain,religious art,Gothic painting,http://commons.wikimedia.org/wiki/Special:File...,"Mary, Andrew the Apostle, throne, angel, Eulal...",
2,http://www.wikidata.org/entity/Q607761,The Death of the Picador,1793-01-01T00:00:00Z,,,genre art,Romanticism,http://commons.wikimedia.org/wiki/Special:File...,"death, horse, man, bullfighting, spear, stadiu...",
3,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,1892-01-01T00:00:00Z,,United States of America,genre art,Impressionism,http://commons.wikimedia.org/wiki/Special:File...,"woman, ear, nudity, waist-length hair, buttock...",
4,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,1892-01-01T00:00:00Z,,United States of America,portrait,Impressionism,http://commons.wikimedia.org/wiki/Special:File...,"woman, ear, nudity, waist-length hair, buttock...",
...,...,...,...,...,...,...,...,...,...,...
3047,http://www.wikidata.org/entity/Q3922605,,1458-01-01T00:00:00Z,Italy,Italy,religious art,Early Renaissance,http://commons.wikimedia.org/wiki/Special:File...,,
3048,http://www.wikidata.org/entity/Q3922606,,1458-01-01T00:00:00Z,Italy,Italy,religious art,Early Renaissance,http://commons.wikimedia.org/wiki/Special:File...,,
3049,http://www.wikidata.org/entity/Q3922632,Profile of a Man (Mantegna),1460-01-01T00:00:00Z,,Italy,portrait,,http://commons.wikimedia.org/wiki/Special:File...,man,
3050,http://www.wikidata.org/entity/Q3924387,Ballet Rehearsal,1874-01-01T00:00:00Z,,United Kingdom,,,http://commons.wikimedia.org/wiki/Special:File...,,


In [10]:
# Split the 'depicts' column into individual words and create a list of all words
all_words = data['depicts'].dropna().str.split(',').explode().str.strip()

# Count the occurrences of each word
word_counts = all_words.value_counts()

# Create a new dataframe with the unique words and their counts
unique_words_df = pd.DataFrame(word_counts).reset_index()
unique_words_df.columns = ['word', 'count']

# Display the dataframe
unique_words_df

Unnamed: 0,word,count
0,woman,1277
1,Mary,914
2,Christ Child,694
3,boy,645
4,man,623
...,...,...
3060,Georg Friedrich Kersting,1
3061,waiter,1
3062,sunglasses,1
3063,Hurzuf,1


In [11]:
# Load the FastText model available here : https://fasttext.cc/docs/en/pretrained-vectors.html
model = KeyedVectors.load_word2vec_format('labels/wiki.simple.vec', binary=False)

df = unique_words_df.copy()

# Define a list of food-related terms to calculate similarity
food_terms = ['food','wine', 'beverage', 'meat', 'fruit', 'vegetable', 'bread', 'dairy', 'dessert', 'seafood', 'meal', 'cookware', 'cheese']

# Check if MPS is available, otherwise fallback to CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Prepare food term embeddings and move them to MPS
food_embeddings = torch.stack(
    [torch.tensor(model[term], device=device) for term in food_terms if term in model]
)
food_term_names = [term for term in food_terms if term in model]

# Function to compute closest food term and its similarity score for a batch of words
def get_closest_food_terms_batch(words):
    # Filter out words not in model vocabulary
    valid_words = [word for word in words if word in model]
    if not valid_words:
        return [(False, 0.0)] * len(words)

    # Move word embeddings to MPS in batch
    word_embeddings = torch.stack([torch.tensor(model[word], device=device) for word in valid_words])

    # Calculate cosine similarities in a batched way
    similarities = torch.cosine_similarity(word_embeddings[:, None, :], food_embeddings[None, :, :], dim=-1)

    # Get the best matching food term and its similarity score for each word in the batch
    best_similarities, best_indices = similarities.max(dim=1)

    # Map results back to the original words list with threshold filtering
    results = []
    j = 0
    for word in words:
        if word in model:
            best_similarity = best_similarities[j].item()
            best_term = food_term_names[best_indices[j].item()]
            if best_similarity > 0.5:  # Adjust threshold as needed
                results.append((best_term, best_similarity))
            else:
                results.append((False, 0.0))
            j += 1
        else:
            results.append((False, 0.0))
    return results

# Apply batched function in chunks with tqdm progress tracking
batch_size = 5  # Adjust batch size for your needs
progress_bar = tqdm(total=len(df), desc="Processing Batches", unit="words")

results = []
for i in range(0, len(df), batch_size):
    batch_results = get_closest_food_terms_batch(df['word'][i:i + batch_size].tolist())
    results.extend(batch_results)
    progress_bar.update(len(batch_results))  # Update progress bar by batch size

progress_bar.close()

# Split the results into two columns in the DataFrame
df['closest_food_term'], df['food_similarity_score'] = zip(*results)

# Display the results
display(df)

Processing Batches: 100%|██████████| 3065/3065 [00:01<00:00, 2620.36words/s]


Unnamed: 0,word,count,closest_food_term,food_similarity_score
0,woman,1277,False,0.0
1,Mary,914,False,0.0
2,Christ Child,694,False,0.0
3,boy,645,False,0.0
4,man,623,False,0.0
...,...,...,...,...
3060,Georg Friedrich Kersting,1,False,0.0
3061,waiter,1,False,0.0
3062,sunglasses,1,False,0.0
3063,Hurzuf,1,False,0.0


In [12]:
food_related_words = df[~(df['closest_food_term'] == False)]
display(food_related_words.sort_values('count', ascending=False))
display(food_related_words.sort_values('food_similarity_score', ascending=False))

Unnamed: 0,word,count,closest_food_term,food_similarity_score
60,fruit,38,fruit,1.0
109,bread,26,bread,1.0
204,meal,16,meal,1.0
245,fish,13,seafood,0.570744
341,food,9,food,1.0
348,wine,9,wine,1.0
401,tableware,8,cookware,0.623472
507,pasture,6,dairy,0.505263
676,eating,5,meat,0.551564
675,drink,5,beverage,0.539468


Unnamed: 0,word,count,closest_food_term,food_similarity_score
109,bread,26,bread,1.0
341,food,9,food,1.0
60,fruit,38,fruit,1.0
2457,cheese,1,cheese,1.0
648,meat,5,meat,1.0
204,meal,16,meal,1.0
348,wine,9,wine,1.0
1386,vegetation,2,vegetable,0.646728
401,tableware,8,cookware,0.623472
2949,pancake,1,dessert,0.62053


In [13]:
# Define a blacklist of words to exclude
blacklist = ['cattle', 'vegetation', 'pasture', 'livestock', 'warehouse', 'twine','appliance','manure']

# Filter the dataframe to exclude the blacklisted words
filtered_df = food_related_words[~food_related_words['word'].isin(blacklist)]

# Display the filtered dataframe
display(filtered_df)

Unnamed: 0,word,count,closest_food_term,food_similarity_score
60,fruit,38,fruit,1.0
109,bread,26,bread,1.0
204,meal,16,meal,1.0
245,fish,13,seafood,0.570744
341,food,9,food,1.0
348,wine,9,wine,1.0
401,tableware,8,cookware,0.623472
648,meat,5,meat,1.0
675,drink,5,beverage,0.539468
676,eating,5,meat,0.551564


In [None]:
filtered_df.to_csv('labels/food_related_keywords.csv', index=False)