In [49]:
import re
from typing import List, Optional, Set

import pandas as pd


In [50]:
# Read the paintings metadata CSV into the working DataFrame and preview it.
df = pd.read_csv(filepath_or_buffer='data/paintings_with_filenames.csv')
df

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",,,the death of the picador,,,La_muerte_del_picador.jpg
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,manaò tupapaú,,,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...,The Virgin of the Councillors is a panel paint...,virgin of the councillors,,,Dalmau_Mare_de_Deu_dels_Consellers.jpg
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121662,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,musical entertainment,,,Jakob_Emanuel_Gaisser_-_Musical_Entertainment.jpg
121663,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ...",,,césarine de houdetot baronne de barante readin...,,,Portrait_of_Cesarine_de_Houdetot_by_Louise_Bou...
121664,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,the broken jug,,,The_Broken_Jug_by_Jenny_Berger-Desoras.jpg
121665,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil...",,,dr philippe pinel 17451826 and his family,,,Philippe_Pinel_and_his_family_by_Julie_Foresti...


In [51]:

def extract_relevant_sections(text):
    """Condense an article to its lead text plus two selected sections.

    The article is expected to use wiki-style ``== Heading ==`` lines. Three
    pieces are kept, in this order, when present:

    * the lead text that precedes the first heading (the whole article when
      it has no headings at all),
    * the body of ``== Description ==`` (re-emitted under the same heading),
    * the body of ``== Interpretations ==`` (re-emitted under the heading
      ``== Subject ==``).

    Args:
        text: Article text. Anything that is not a ``str`` (e.g. NaN/None
            coming from a DataFrame column) is treated as "no article".

    Returns:
        The condensed text with surrounding whitespace stripped, an empty
        string when nothing could be extracted, or ``None`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return None

    # Lead text: from the start up to the first heading, or up to the end of
    # the article when there is no heading (previously such articles produced
    # an empty result). The (?!==) guard keeps an article that opens directly
    # with a heading from having its first section mistaken for the lead.
    intro_pattern = r'^(?!==)(.*?)(?=\n==|\Z)'
    # Section bodies run until the next heading line or the end of the text.
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # DOTALL lets '.' span newlines so multi-paragraph sections are captured.
    intro = re.search(intro_pattern, text, re.DOTALL)
    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    result = ""
    if intro:
        result += intro.group(1).strip() + "\n\n"
    if description:
        result += "== Description ==\n" + description.group(1).strip() + "\n\n"
    if interpretation:
        # The Interpretations section is deliberately relabelled as "Subject"
        # in the output; the label is kept as in the original implementation.
        result += "== Subject ==\n" + interpretation.group(1).strip()

    return result.strip()


In [52]:

# Reduce each Wikipedia article to the sections relevant for description text.
df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

# Build one free-text field per painting from all descriptive columns.
# Missing values must be blanked *before* the string cast: astype(str) on its
# own turns NaN/None into the literal words 'nan'/'None', which are truthy and
# would therefore survive the empty-string filter and pollute the text.
text_columns = ['title', 'depicts', 'wga_description', 'wiki_description']
df['full_description'] = (
    df[text_columns]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(filter(None, row)), axis=1)
)

df.head()

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,...,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename,wiki_description,full_description
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",,,the death of the picador,,,La_muerte_del_picador.jpg,,"The Death of the Picador picador, stadium, spe..."
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,manaò tupapaú,,,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,"Manaò tupapaú gaze, lying, intergluteal cleft,..."
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...,The Virgin of the Councillors is a panel paint...,virgin of the councillors,,,Dalmau_Mare_de_Deu_dels_Consellers.jpg,The Virgin of the Councillors is a panel paint...,"Virgin of the Councillors Madonna and Child, E..."
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,...",The Regatta at Sainte-Adresse is an oil-on-can...,"Regatta at Sainte-Adresse parasol, sailboat, S..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...,By the Seashore is a painting by Pierre-August...,"By the Seashore portrait, Saint Peter Port, co..."


In [None]:
def create_food_keywords() -> Set[str]:
    """Return the built-in vocabulary of lowercase food-related terms.

    The vocabulary is assembled from four themed groups (meals, common
    foods, general categories, and cooking/eating verbs). Some entries are
    multi-word phrases such as ``'main course'``.

    Returns:
        A new set containing every keyword from all groups.
    """
    meals = (
        'breakfast', 'lunch', 'dinner', 'feast', 'banquet', 'brunch',
        'snack', 'supper', 'tea', 'buffet', 'picnic', 'potluck',
        'appetizer', 'dessert', 'main course',
    )
    common_foods = (
        'bread', 'fruit', 'meat', 'fish', 'cheese', 'wine', 'vegetables',
        'salad', 'pasta', 'soup', 'stew', 'rice', 'cereal', 'nuts', 'seeds',
        'juice', 'milk', 'honey', 'eggs', 'butter', 'yogurt', 'coffee',
        'cake', 'pie', 'ice cream', 'chocolate', 'sugar', 'spices', 'herbs',
        'olive oil', 'vinegar', 'salt', 'pepper',
    )
    categories = (
        'food', 'meal', 'dish', 'cuisine', 'ingredient', 'beverage',
        'desserts', 'appetizers',
    )
    verbs = (
        'eating', 'dining', 'cooking', 'baking', 'frying', 'grilling',
        'roasting', 'boiling', 'steaming', 'toasting', 'preparing',
        'serving', 'tasting', 'slicing', 'chopping', 'whisking',
        'seasoning', 'marinating', 'garnishing',
    )
    return {*meals, *common_foods, *categories, *verbs}


def check_food_mentions(df: pd.DataFrame,
                        description_column: str,
                        keywords: Optional[Set[str]] = None) -> pd.DataFrame:
    """Annotate ``df`` with the food keywords found in a text column.

    Matching is case-insensitive and whole-word: a keyword counts when it is
    delimited by word boundaries, so ``'bread.'`` and ``'fruit,'`` match
    while ``'fruitcake'`` does not match ``'fruit'``. The same pattern feeds
    all three output columns, so they always agree with each other.

    Args:
        df: Frame holding the text to scan. It is modified in place: the
            three result columns are added to it (or overwritten).
        description_column: Name of the column containing the text. Missing
            values are treated as empty text.
        keywords: Words or phrases to look for. When omitted, the vocabulary
            from ``create_food_keywords()`` is used.

    Returns:
        The same ``df`` object with these columns set:

        * ``contains_food`` - True when at least one keyword was found.
        * ``food_words`` - alphabetically sorted list of the distinct
          keywords found in the row.
        * ``food_word_count`` - number of distinct keywords found.
    """
    vocabulary = create_food_keywords() if keywords is None else keywords
    # Descriptions are lowercased below, so keywords must be lowercase too;
    # empty keywords are dropped because they would match everywhere.
    vocabulary = {word.lower() for word in vocabulary if word}

    # Blank out missing text first so every row is a real (lowercase) string.
    descriptions_lower = df[description_column].fillna('').astype(str).str.lower()

    if vocabulary:
        # Longest alternatives first so multi-word phrases win over any shorter
        # keyword starting at the same position; re.escape keeps keywords that
        # contain regex metacharacters literal.
        ordered = sorted(vocabulary, key=lambda word: (-len(word), word))
        pattern = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in ordered) + r')\b')
        found = descriptions_lower.apply(lambda text: sorted(set(pattern.findall(text))))
    else:
        found = descriptions_lower.apply(lambda text: [])

    counts = found.map(len)

    # Add columns to DataFrame (same names and order as before).
    df['contains_food'] = counts > 0
    df['food_words'] = found
    df['food_word_count'] = counts

    return df


# NOTE: the block below is a disabled usage example kept as a bare string
# literal; none of the code inside it runs.
""" # Example usage
if __name__ == "__main__":
    # Sample data
    data = {
        'title': ['Still Life with Fruit', 'Portrait of a Lady', 'The Last Supper'],
        'description': [
            'A beautiful arrangement of fresh fruits and bread on a table.',
            'A woman in a blue dress standing by a window.',
            'Jesus and disciples sharing their final meal together.'
        ]
    }
    df = pd.DataFrame(data)


    # Process the descriptions
    result = check_food_mentions(df, 'description') """

In [None]:
# Scan the combined description text for food keywords. check_food_mentions
# writes its result columns onto df itself and returns that same frame, so
# `result` and `df` refer to one object after this call.
result = check_food_mentions(df, 'full_description')

In [None]:
# Show the annotated frame as this cell's output.
result