In [151]:

import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re


In [152]:
# Load the paintings table that every later cell reads and extends.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests
# the CSV was saved together with its index - confirm whether that column is wanted.
df = pd.read_csv(filepath_or_buffer='data/paintings_with_filenames.csv')

In [153]:

def extract_relevant_sections(text):
    """Keep the lead text plus the Description and Interpretations sections.

    Sections are recognised by ``== Heading ==`` marker lines inside ``text``.

    Parameters
    ----------
    text : object
        The article text. Any value that is not a ``str`` is treated as missing.

    Returns
    -------
    str or None
        ``None`` when ``text`` is not a string. Otherwise the parts that were
        found, separated by blank lines and stripped of surrounding whitespace;
        this is the empty string when nothing was found.
    """
    if not isinstance(text, str):
        return None

    # Every body runs lazily up to the next heading line or the end of the text.
    # The lead needs the end-of-text fallback as well: without it an article that
    # has no heading at all yields no match and its whole text is dropped.
    intro_pattern = r'^(.*?)(?=\n==|\Z)'
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # DOTALL lets '.' cross newlines so that a body may span several lines.
    intro = re.search(intro_pattern, text, re.DOTALL)
    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    # Assemble the pieces in document order, re-adding a heading for the two
    # named sections.
    result = ""
    if intro:
        result += intro.group(1).strip() + "\n\n"
    if description:
        result += "== Description ==\n" + description.group(1).strip() + "\n\n"
    if interpretation:
        # NOTE(review): the Interpretations body is emitted under a 'Subject'
        # heading. Kept unchanged here - confirm the relabelling is intended.
        result += "== Subject ==\n" + interpretation.group(1).strip()

    return result.strip()


In [154]:

# Reduce each article to the parts selected by extract_relevant_sections.
df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

# Join the text columns into one description per row. Missing values have to be
# skipped *before* converting to str: a blanket astype(str) turns NaN/None into the
# literal words 'nan'/'None', which are truthy and would end up in the joined text.
text_columns = ['title', 'depicts', 'wga_description', 'wiki_description']
df['full_description'] = df[text_columns].apply(
    lambda row: ' '.join(
        str(value) for value in row if pd.notna(value) and str(value)
    ),
    axis=1,
)

df.head()

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,...,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename,wiki_description,full_description
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",,,the death of the picador,,,La_muerte_del_picador.jpg,,"The Death of the Picador picador, stadium, spe..."
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,manaò tupapaú,,,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,"Manaò tupapaú gaze, lying, intergluteal cleft,..."
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...,The Virgin of the Councillors is a panel paint...,virgin of the councillors,,,Dalmau_Mare_de_Deu_dels_Consellers.jpg,The Virgin of the Councillors is a panel paint...,"Virgin of the Councillors Madonna and Child, E..."
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,...",The Regatta at Sainte-Adresse is an oil-on-can...,"Regatta at Sainte-Adresse parasol, sailboat, S..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...,By the Seashore is a painting by Pierre-August...,"By the Seashore portrait, Saint Peter Port, co..."


In [155]:

def create_food_keywords():
    """Build the vocabulary of food-related terms.

    Returns
    -------
    set of str
        Lowercase terms collected from the themed groups below. A term that is
        listed in more than one group appears only once in the result.
    """
    meals = (
        'breakfast', 'lunch', 'dinner', 'feast', 'banquet', 'brunch', 'snack',
        'supper', 'tea', 'buffet', 'picnic', 'potluck', 'appetizer', 'dessert',
    )
    common_foods = (
        'bread', 'fruit', 'meat', 'fish', 'cheese', 'wine', 'vegetables', 'salad',
        'pasta', 'soup', 'stew', 'rice', 'cereal', 'nuts', 'juice', 'milk',
        'honey', 'egg', 'butter', 'yogurt', 'coffee', 'cake', 'pie', 'ice cream',
        'chocolate', 'sugar', 'spice', 'olive oil', 'vinegar', 'salt', 'pepper',
        'biscuit',
    )
    common_fruits = (
        'apple', 'banana', 'orange', 'grape', 'lemon', 'lime', 'cherry', 'peach',
        'pear', 'plum', 'watermelon', 'melon', 'kiwi', 'mango', 'pineapple',
        'strawberry', 'blueberry', 'raspberry', 'blackberry', 'grapefruit',
        'pomegranate', 'papaya', 'coconut', 'avocado', 'figs', 'apricot',
    )
    common_vegetables = (
        'tomato', 'potato', 'carrot', 'onion', 'garlic', 'pepper', 'broccoli',
        'cabbage', 'spinach', 'lettuce', 'cucumber', 'zucchini', 'squash', 'celery',
        'mushroom', 'eggplant', 'cauliflower', 'bean', 'peas', 'asparagus',
        'artichoke', 'leek', 'beet', 'radish', 'turnip', 'parsnip',
    )
    categories = (
        'food', 'meal', 'dish', 'cuisine', 'ingredient', 'beverage', 'dessert',
        'appetizer',
    )
    common_verbs = (
        'eating', 'dining', 'cooking', 'baking', 'frying', 'grilling', 'roasting',
        'boiling', 'steaming', 'toasting', 'serving', 'tasting',
    )
    # Extra entries the original author grouped under "Tests".
    test_terms = ('burgers', 'soup', 'canned')

    groups = (
        meals, common_foods, common_fruits, common_vegetables,
        categories, common_verbs, test_terms,
    )
    # Flattening into a set collapses the terms repeated across groups.
    return {term for group in groups for term in group}


# Fetch the NLTK resources used below: WordNet backs the lemmatizer, and the
# Punkt data ('punkt' / 'punkt_tab') backs word_tokenize.
for nltk_resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def check_food_mentions(df: pd.DataFrame,
                        description_column: str) -> pd.DataFrame:
    """Flag the rows whose description mentions a food keyword.

    The text is lower-cased, tokenized with ``word_tokenize`` and lemmatized with
    ``WordNetLemmatizer`` before being compared with the vocabulary returned by
    ``create_food_keywords``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to annotate. It is modified in place.
    description_column : str
        Name of the column holding the text to scan. Cells that are not strings
        (for example NaN) are treated as containing no food words.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with two added columns: ``food_words`` (sorted
        list of matched keywords, spelled as in the vocabulary) and
        ``contains_food`` (``True`` when that list is non-empty).
    """
    lemmatizer = WordNetLemmatizer()

    keywords = set(create_food_keywords())
    # Multi-word keywords such as 'ice cream' can never equal a single token, so
    # they are matched separately against the token sequence.
    phrase_keywords = sorted(kw for kw in keywords if len(kw.split()) > 1)
    word_keywords = sorted(keywords.difference(phrase_keywords))

    # Tokens are lemmatized before lookup, so a keyword must also be reachable
    # through its own lemma (otherwise a plural entry like 'figs' never matches,
    # because the token 'figs' has already become 'fig'). Exact spellings are
    # registered first so that they always win over a lemma alias; iterating in
    # sorted order keeps the alias choice deterministic.
    keyword_by_form = {kw: kw for kw in word_keywords}
    for kw in word_keywords:
        keyword_by_form.setdefault(lemmatizer.lemmatize(kw), kw)

    def extract_food_words(text):
        """Return the sorted food keywords found in one description."""
        if not isinstance(text, str):
            return []

        tokens = word_tokenize(text.lower())
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]

        found = {keyword_by_form[lemma] for lemma in lemmas if lemma in keyword_by_form}

        # Pad with spaces so that a phrase only matches on whole-token boundaries.
        padded = ' ' + ' '.join(lemmas) + ' '
        found.update(
            phrase for phrase in phrase_keywords
            if ' ' + ' '.join(phrase.split()) + ' ' in padded
        )

        # Sorted so that the column (and any file written from it) is identical
        # from run to run; plain set iteration order is not.
        return sorted(found)

    # Add 'food_words' column
    df['food_words'] = df[description_column].apply(extract_food_words)

    # Add 'contains_food' column
    df['contains_food'] = df['food_words'].map(len).gt(0)

    return df


[nltk_data] Downloading package wordnet to /Users/jpl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jpl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/jpl/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Annotate every painting with the food keywords found in its combined description.
result = check_food_mentions(df=df, description_column='full_description')
result

In [157]:
# Persist the annotated table; index=False keeps the row index out of the file.
result.to_csv(path_or_buf='data/paintings_with_food_mentions.csv', index=False)