In [4]:
from gensim.models import KeyedVectors
import pandas as pd
from transformers import pipeline
import torch
import numpy as np

from tqdm import tqdm


In [5]:
# Prepare the input data
data = pd.read_csv('data/wikidata_all_paintings.csv') 
data

  data = pd.read_csv('data/wikidata_all_paintings.csv')


Unnamed: 0,item,title,creation_date,origin_country,display_country,type,image_url,depicts,school,time_period
0,http://www.wikidata.org/entity/Q724861,Ashbourne portrait,1612-01-01T00:00:00Z,,United States of America,portrait,http://commons.wikimedia.org/wiki/Special:File...,"book, man, skull, English people, White people...",,
1,http://www.wikidata.org/entity/Q727875,Venus of Urbino,1538-01-01T00:00:00Z,Republic of Venice,Italy,mythological painting,http://commons.wikimedia.org/wiki/Special:File...,"dog, flower, nipple, tapestry, sky, bracelet, ...",Venetian school,
2,http://www.wikidata.org/entity/Q727875,Venus of Urbino,1538-01-01T00:00:00Z,Republic of Venice,Italy,nude,http://commons.wikimedia.org/wiki/Special:File...,"dog, flower, nipple, tapestry, sky, bracelet, ...",Venetian school,
3,http://www.wikidata.org/entity/Q727875,Venus of Urbino,1538-01-01T00:00:00Z,Republic of Venice,Italy,mythological painting,http://commons.wikimedia.org/wiki/Special:File...,"dog, flower, nipple, tapestry, sky, bracelet, ...",High Renaissance,
4,http://www.wikidata.org/entity/Q727875,Venus of Urbino,1538-01-01T00:00:00Z,Republic of Venice,Italy,nude,http://commons.wikimedia.org/wiki/Special:File...,"dog, flower, nipple, tapestry, sky, bracelet, ...",High Renaissance,
...,...,...,...,...,...,...,...,...,...,...
666277,http://www.wikidata.org/entity/Q130724770,,1851-01-01T00:00:00Z,,,,http://commons.wikimedia.org/wiki/Special:File...,,,
666278,http://www.wikidata.org/entity/Q130724778,,1874-01-01T00:00:00Z,,,,http://commons.wikimedia.org/wiki/Special:File...,,,
666279,http://www.wikidata.org/entity/Q130724781,Atlas holding up the celestial globe,1646-01-01T00:00:00Z,,,mythological painting,http://commons.wikimedia.org/wiki/Special:File...,,,
666280,http://www.wikidata.org/entity/Q130724839,Self Portrait,1928-01-01T00:00:00Z,,Sweden,self-portrait,http://commons.wikimedia.org/wiki/Special:File...,Ivan Ivarson,,


In [6]:
# Split the 'depicts' column into individual words and create a list of all words
all_words = data['depicts'].dropna().str.split(',').explode().str.strip()

# Count the occurrences of each word
word_counts = all_words.value_counts()

# Create a new dataframe with the unique words and their counts
unique_words_df = pd.DataFrame(word_counts).reset_index()
unique_words_df.columns = ['word', 'count']

# Display the dataframe
unique_words_df

Unnamed: 0,word,count
0,woman,17670
1,man,17459
2,Mary,7916
3,Christ Child,7694
4,Jesus,6640
...,...,...
39254,Bruce Kent,1
39255,Willie Whitelaw,1
39256,Harold Pinter,1
39257,John Mortimer,1


In [7]:
# Load the FastText model
model = KeyedVectors.load_word2vec_format('data/fasttext/wiki.simple.vec', binary=False)



df = unique_words_df.copy()

# Define a list of food-related terms to calculate similarity
food_terms = ['food','wine', 'beverage', 'meat', 'fruit', 'vegetable', 'bread', 'dairy', 'dessert', 'seafood', 'meal', 'cookware', 'cheese']

# Check if MPS is available, otherwise fallback to CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Prepare food term embeddings and move them to MPS
food_embeddings = torch.stack(
    [torch.tensor(model[term], device=device) for term in food_terms if term in model]
)
food_term_names = [term for term in food_terms if term in model]

# Function to compute closest food term and its similarity score for a batch of words
def get_closest_food_terms_batch(words):
    # Filter out words not in model vocabulary
    valid_words = [word for word in words if word in model]
    if not valid_words:
        return [(False, 0.0)] * len(words)

    # Move word embeddings to MPS in batch
    word_embeddings = torch.stack([torch.tensor(model[word], device=device) for word in valid_words])

    # Calculate cosine similarities in a batched way
    similarities = torch.cosine_similarity(word_embeddings[:, None, :], food_embeddings[None, :, :], dim=-1)

    # Get the best matching food term and its similarity score for each word in the batch
    best_similarities, best_indices = similarities.max(dim=1)

    # Map results back to the original words list with threshold filtering
    results = []
    j = 0
    for word in words:
        if word in model:
            best_similarity = best_similarities[j].item()
            best_term = food_term_names[best_indices[j].item()]
            if best_similarity > 0.5:  # Adjust threshold as needed
                results.append((best_term, best_similarity))
            else:
                results.append((False, 0.0))
            j += 1
        else:
            results.append((False, 0.0))
    return results

# Apply batched function in chunks with tqdm progress tracking
batch_size = 5  # Adjust batch size for your needs
progress_bar = tqdm(total=len(df), desc="Processing Batches", unit="words")

results = []
for i in range(0, len(df), batch_size):
    batch_results = get_closest_food_terms_batch(df['word'][i:i + batch_size].tolist())
    results.extend(batch_results)
    progress_bar.update(len(batch_results))  # Update progress bar by batch size

progress_bar.close()

# Split the results into two columns in the DataFrame
df['closest_food_term'], df['food_similarity_score'] = zip(*results)

# Display the results
display(df)

Processing Batches: 100%|██████████| 39259/39259 [00:05<00:00, 7351.51words/s] 


Unnamed: 0,word,count,closest_food_term,food_similarity_score
0,woman,17670,False,0.0
1,man,17459,False,0.0
2,Mary,7916,False,0.0
3,Christ Child,7694,False,0.0
4,Jesus,6640,False,0.0
...,...,...,...,...
39254,Bruce Kent,1,False,0.0
39255,Willie Whitelaw,1,False,0.0
39256,Harold Pinter,1,False,0.0
39257,John Mortimer,1,False,0.0


In [8]:
food_related_words = df[~(df['closest_food_term'] == False)]
display(food_related_words.sort_values('count', ascending=False))
display(food_related_words.sort_values('food_similarity_score', ascending=False))

Unnamed: 0,word,count,closest_food_term,food_similarity_score
27,fruit,1112,fruit,1.000000
36,cattle,802,dairy,0.525084
96,bread,375,bread,1.000000
185,fish,240,seafood,0.570744
283,tableware,162,cookware,0.623472
...,...,...,...,...
20201,passionfruit,1,fruit,0.686316
20018,sheepskin,1,meat,0.505855
19345,distillation,1,beverage,0.528565
18679,catfish,1,seafood,0.514933


Unnamed: 0,word,count,closest_food_term,food_similarity_score
592,food,73,food,1.000000
96,bread,375,bread,1.000000
10287,dessert,2,dessert,1.000000
27,fruit,1112,fruit,1.000000
1136,meat,35,meat,1.000000
...,...,...,...,...
2379,lunch,13,meal,0.502950
4478,absinthe,6,wine,0.502482
29283,lemonade,1,beverage,0.502042
4966,biscuit,5,cheese,0.501518


In [11]:
# Define a blacklist of words to exclude
blacklist = ['cattle', 'vegetation', 'pasture', 'livestock', 'warehouse', 'twine','appliance','manure']

# Filter the dataframe to exclude the blacklisted words
filtered_df = food_related_words[~food_related_words['word'].isin(blacklist)]

# Display the filtered dataframe
display(filtered_df)

Unnamed: 0,word,count,closest_food_term,food_similarity_score
27,fruit,1112,fruit,1.000000
96,bread,375,bread,1.000000
185,fish,240,seafood,0.570744
283,tableware,162,cookware,0.623472
318,wine,139,wine,1.000000
...,...,...,...,...
33486,cinnamon,1,dessert,0.568082
35660,grapefruit,1,fruit,0.784507
35975,condiment,1,dessert,0.632016
37188,veal,1,dairy,0.568498


In [12]:
filtered_df.to_csv('data/food_related_keywords.csv', index=False)