In [None]:
from gensim.models import KeyedVectors
import pandas as pd
from transformers import pipeline
import torch



In [None]:
# Read the paintings CSV into `df` and show it as the cell output
df = pd.read_csv(filepath_or_buffer='data/paintings_with_descriptions.csv')
df

In [None]:
# Break every comma-separated 'depicts' entry into one row per word,
# trim surrounding whitespace, and keep each distinct word once.
all_words = (
    df['depicts']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
)

# Wrap the distinct words in a single-column dataframe
unique_words_df = pd.DataFrame(data=all_words, columns=['word'])

# Show the dataframe as the cell output
unique_words_df

In [None]:
import pandas as pd
import torch
from gensim.models import KeyedVectors
from tqdm import tqdm  # Import tqdm for the progress bar

# Load the FastText model (text word2vec format)
model = KeyedVectors.load_word2vec_format('data/fasttext/wiki.simple.vec', binary=False)

# Sample DataFrame with keywords.
# NOTE: this rebinds `df`, replacing the frame read from the paintings CSV earlier in the notebook.
data = {
    'word': [
        'picador', 'stadium', 'spear', 'bullfighting', 'man',
        'Saint François d‘Assise', 'Césarine d\'Houdetot',
        'Paul et Virginie', 'Pamplemousses', 'Scipion Pinel'
    ]
}
df = pd.DataFrame(data)

# Define a list of food-related terms to calculate similarity
food_terms = ["food", "fruit", "vegetable", "meat", "ingredient"]

# Check if MPS is available, otherwise fallback to CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Keep only the food terms the model knows. The names are derived once so that
# row i of `food_embeddings` is guaranteed to correspond to `food_term_names[i]`.
food_term_names = [term for term in food_terms if term in model]
if not food_term_names:
    # torch.stack on an empty list fails with an unhelpful message; fail clearly instead
    raise ValueError("None of the food terms are present in the model vocabulary")

# Stack the vectors on the CPU, then move the whole matrix to the selected device in one transfer
food_embeddings = torch.stack(
    [torch.tensor(model[term]) for term in food_term_names]
).to(device)

def get_closest_food_terms_batch(words, threshold=0.3):
    """Label each word with its most similar food term, or "Not Food-Related".

    Relies on the module-level ``model`` (word -> vector lookup supporting
    ``in``), ``device``, ``food_embeddings`` (one row per food term) and
    ``food_term_names`` (names aligned with the rows of ``food_embeddings``).

    Args:
        words: Sequence of candidate words.
        threshold: A word receives a food term only when its best cosine
            similarity is strictly greater than this value (default 0.3).

    Returns:
        A list with one label per input word, in input order. Words missing
        from ``model``, or whose best similarity does not exceed
        ``threshold``, are labelled "Not Food-Related".
    """
    labels = ["Not Food-Related"] * len(words)

    # Look every word up once and remember where the in-vocabulary ones sit,
    # so the results can be written straight back to the right positions.
    known_positions = [pos for pos, word in enumerate(words) if word in model]
    if not known_positions:
        return labels

    # Stack on the CPU, then move the whole batch to the device in one transfer
    word_embeddings = torch.stack(
        [torch.tensor(model[words[pos]]) for pos in known_positions]
    ).to(device)

    # Pairwise cosine similarity via broadcasting: (n_words, 1, dim) vs (1, n_food, dim)
    similarities = torch.cosine_similarity(
        word_embeddings[:, None, :], food_embeddings[None, :, :], dim=-1
    )

    # Best-matching food term (and its score) for each known word
    best_similarities, best_indices = similarities.max(dim=1)

    # One .tolist() per tensor instead of an .item() call (device sync) per word
    for pos, similarity, term_index in zip(
        known_positions, best_similarities.tolist(), best_indices.tolist()
    ):
        if similarity > threshold:
            labels[pos] = food_term_names[term_index]
    return labels

# Apply the batched function in chunks with tqdm progress tracking
batch_size = 5  # Adjust batch size for your needs

# Materialize the column once so each batch is a plain positional list slice
word_list = df['word'].tolist()

results = []
# Using the bar as a context manager closes it even if a batch raises
with tqdm(total=len(word_list), desc="Processing Batches", unit="words") as progress_bar:
    for i in range(0, len(word_list), batch_size):
        batch_results = get_closest_food_terms_batch(word_list[i:i + batch_size])
        results.extend(batch_results)
        progress_bar.update(len(batch_results))  # Advance by the number of words just labelled

# One label per row, in row order
df['closest_food_term'] = results

# Display the results
print(df)