In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from tqdm import tqdm
import re


In [2]:
# Prepare the input data
df = pd.read_csv('data/paintings_with_filenames.csv') 
df

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",,,the death of the picador,,,La_muerte_del_picador.jpg
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,manaò tupapaú,,,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...,The Virgin of the Councillors is a panel paint...,virgin of the councillors,,,Dalmau_Mare_de_Deu_dels_Consellers.jpg
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121662,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,musical entertainment,,,Jakob_Emanuel_Gaisser_-_Musical_Entertainment.jpg
121663,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ...",,,césarine de houdetot baronne de barante readin...,,,Portrait_of_Cesarine_de_Houdetot_by_Louise_Bou...
121664,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,the broken jug,,,The_Broken_Jug_by_Jenny_Berger-Desoras.jpg
121665,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil...",,,dr philippe pinel 17451826 and his family,,,Philippe_Pinel_and_his_family_by_Julie_Foresti...


In [3]:

def extract_relevant_sections(text):
   
    if not isinstance(text, str):
        return None

    # Define regular expression patterns to match each section
    intro_pattern = r'^(.*?)(?=\n==)'
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # Search for each section
    intro = re.search(intro_pattern, text, re.DOTALL)
    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    # Extract the sections, if found
    result = ""
    if intro:
        result += intro.group(1).strip() + "\n\n"
    if description:
        result += "== Description ==\n" + description.group(1).strip() + "\n\n"
    if interpretation:
        result += "== Subject ==\n" + interpretation.group(1).strip()

    return result.strip()


In [4]:

df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

df['full_description'] = df['title'] + ' ' + df['depicts'] + ' ' + df['wga_description'] + ' ' + df['wiki_description']

# Filter out NaN values from 'full_description'
df = df.dropna(subset=['full_description'])
df.head()

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,...,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename,wiki_description,full_description
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,...",The Regatta at Sainte-Adresse is an oil-on-can...,"Regatta at Sainte-Adresse parasol, sailboat, S..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...,By the Seashore is a painting by Pierre-August...,"By the Seashore portrait, Saint Peter Port, co..."
8,http://www.wikidata.org/entity/Q877191,The Three Philosophers,http://www.wikidata.org/entity/Q8459,Giorgione,1500-01-01T00:00:00Z,,Austria,Kunsthistorisches Museum,landscape painting,,...,https://commons.wikimedia.org/wiki/Special:Fil...,"philosopher, landscape",https://en.wikipedia.org/wiki/The_Three_Philos...,The Three Philosophers is an oil painting on c...,the three philosophers,https://www.wga.hu/html/g/giorgion/various/thr...,The Three Philosophers must be a work of the l...,Giorgione_-_Three_Philosophers_-_Google_Art_Pr...,The Three Philosophers is an oil painting on c...,"The Three Philosophers philosopher, landscape ..."
9,http://www.wikidata.org/entity/Q878981,The Mocking of Christ,http://www.wikidata.org/entity/Q154338,Matthias Grünewald,1504-01-01T00:00:00Z,,Germany,Alte Pinakothek,religious art,,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Mocking of Jesus, Jesus",https://en.wikipedia.org/wiki/The_Mocking_of_C...,The Mocking of Christ (German: Die Verspottung...,the mocking of christ,https://www.wga.hu/html/g/grunewal/1/04mock.html,Grünewald's earliest datable work is the Mocki...,Mathis_Gothart_Grünewald_062.jpg,The Mocking of Christ (German: Die Verspottung...,"The Mocking of Christ Mocking of Jesus, Jesus ..."
28,http://www.wikidata.org/entity/Q212616,The Raft of the Medusa,http://www.wikidata.org/entity/Q184212,Théodore Géricault,1819-01-01T00:00:00Z,,France,Room 700,marine art,Romanticism,...,https://commons.wikimedia.org/wiki/Special:Fil...,"sitting, lying, standing, Méduse, agony, raft,...",https://en.wikipedia.org/wiki/The_Raft_of_the_...,The Raft of the Medusa (French: Le Radeau de l...,the raft of the medusa,https://www.wga.hu/html/g/gericaul/1/105geric....,In expressing the predicament of the shipwreck...,JEAN_LOUIS_THÉODORE_GÉRICAULT_-_La_Balsa_de_la...,The Raft of the Medusa (French: Le Radeau de l...,"The Raft of the Medusa sitting, lying, standin..."


In [5]:
# tests
df = df[df['full_description'].str.contains('supper', case=False, na=False)]
df = df.head(128)
df

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,...,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename,wiki_description,full_description
190,http://www.wikidata.org/entity/Q318947,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1601-01-01T00:00:00Z,,United Kingdom,National Gallery,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"man, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,The Supper at Emmaus is a painting by the Ita...,supper at emmaus,https://www.wga.hu/html/c/caravagg/06/35emmau....,The gospel according to St Luke (24:13-32) tel...,Supper_at_Emmaus-Caravaggio_(1601).jpg,The Supper at Emmaus is a painting by the Ita...,"Supper at Emmaus man, Jesus The gospel accordi..."
191,http://www.wikidata.org/entity/Q318947,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1601-01-01T00:00:00Z,,United Kingdom,National Gallery,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"man, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,The Supper at Emmaus is a painting by the Ita...,supper at emmaus,https://www.wga.hu/html/c/caravagg/06/35emmau....,The gospel according to St Luke (24:13-32) tel...,Supper_at_Emmaus-Caravaggio_(1601).jpg,The Supper at Emmaus is a painting by the Ita...,"Supper at Emmaus man, Jesus The gospel accordi..."
192,http://www.wikidata.org/entity/Q318947,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1601-01-01T00:00:00Z,,United Kingdom,National Gallery,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"man, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,The Supper at Emmaus is a painting by the Ita...,supper at emmaus,https://www.wga.hu/html/c/caravagg/08/47emmau....,This later version of the subject is more rest...,Supper_at_Emmaus-Caravaggio_(1601).jpg,The Supper at Emmaus is a painting by the Ita...,"Supper at Emmaus man, Jesus This later version..."
193,http://www.wikidata.org/entity/Q318947,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1601-01-01T00:00:00Z,,United Kingdom,National Gallery,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"man, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,The Supper at Emmaus is a painting by the Ita...,supper at emmaus,https://www.wga.hu/html/c/caravagg/08/47emmau....,This later version of the subject is more rest...,Supper_at_Emmaus-Caravaggio_(1601).jpg,The Supper at Emmaus is a painting by the Ita...,"Supper at Emmaus man, Jesus This later version..."
330,http://www.wikidata.org/entity/Q25207,The Last Supper,http://www.wikidata.org/entity/Q25200,Daniele Crespi,1624-01-01T00:00:00Z,,Italy,Pinacoteca di Brera,religious art,,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Last Supper, man, Jesus",https://en.wikipedia.org/wiki/The_Last_Supper_...,The Last Supper is a painting by Daniele Cresp...,the last supper,https://www.wga.hu/html/c/crespi/daniele/lasts...,The Last Supper by Daniele Crespi comes from t...,Dcrespi.jpg,The Last Supper is a painting by Daniele Cresp...,"The Last Supper Last Supper, man, Jesus The La..."
705,http://www.wikidata.org/entity/Q3208041,The Last Supper,http://www.wikidata.org/entity/Q301,El Greco,1568-01-01T00:00:00Z,,Italy,Pinacoteca Nazionale di Bologna,religious art,,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Last Supper, table, man",https://en.wikipedia.org/wiki/Last_Supper_(El_...,Last Supper is a 1568 painting by Greek painte...,the last supper,https://www.wga.hu/html/g/greco_el/02/0207grec...,This unsigned painting evinces a similar techn...,El_Greco_020.jpg,Last Supper is a 1568 painting by Greek painte...,"The Last Supper Last Supper, table, man This u..."
3479,http://www.wikidata.org/entity/Q5823887,Apostle St James the Less,http://www.wikidata.org/entity/Q301,El Greco,1609-01-01T00:00:00Z,,Spain,Toledo Cathedral,religious art,,...,https://commons.wikimedia.org/wiki/Special:Fil...,"James, son of Alphaeus",https://en.wikipedia.org/wiki/Saint_James_the_...,Saint James the Less is a 1609 painting by El ...,apostle st james the less,https://www.wga.hu/html/g/greco_el/18/1809grec...,Apostle St James the Less is generally regarde...,El_Greco_-_St._James_the_Lesser_-_Cathedral_of...,Saint James the Less is a 1609 painting by El ...,"Apostle St James the Less James, son of Alphae..."
3993,http://www.wikidata.org/entity/Q2270291,Basket of Fruit,http://www.wikidata.org/entity/Q42207,Caravaggio,1600-01-01T00:00:00Z,,Italy,Pinacoteca Ambrosiana,still life,Italian Baroque painting,...,https://commons.wikimedia.org/wiki/Special:Fil...,"pear, fruit, fig, basket, leaf, grape, apple",https://en.wikipedia.org/wiki/Basket_of_Fruit_...,Basket of Fruit (c.1599) is a still life paint...,basket of fruit,https://www.wga.hu/html/c/caravagg/02/14basket...,Caravaggio is reported to have claimed that he...,Caravaggio_-_Natura_morta_con_frutta_(collezio...,Basket of Fruit (c.1599) is a still life paint...,"Basket of Fruit pear, fruit, fig, basket, leaf..."
6575,http://www.wikidata.org/entity/Q2097931,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1606-01-01T00:00:00Z,,Italy,Pinacoteca di Brera,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Eucharist in the Catholic Church, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,Supper at Emmaus (1606) is a painting by the I...,supper at emmaus,https://www.wga.hu/html/c/caravagg/06/35emmau....,The gospel according to St Luke (24:13-32) tel...,Supper_at_Emmaus-Caravaggio_(1606).jpg,Supper at Emmaus (1606) is a painting by the I...,Supper at Emmaus Eucharist in the Catholic Chu...
6576,http://www.wikidata.org/entity/Q2097931,Supper at Emmaus,http://www.wikidata.org/entity/Q42207,Caravaggio,1606-01-01T00:00:00Z,,Italy,Pinacoteca di Brera,religious art,Baroque,...,https://commons.wikimedia.org/wiki/Special:Fil...,"Eucharist in the Catholic Church, Jesus",https://en.wikipedia.org/wiki/Supper_at_Emmaus...,Supper at Emmaus (1606) is a painting by the I...,supper at emmaus,https://www.wga.hu/html/c/caravagg/06/35emmau....,The gospel according to St Luke (24:13-32) tel...,Supper_at_Emmaus-Caravaggio_(1606).jpg,Supper at Emmaus (1606) is a painting by the I...,Supper at Emmaus Eucharist in the Catholic Chu...


In [6]:


# Check if MPS is available and set the device
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Define NER pipeline with batch processing
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if device.type == "mps" else -1  # 0 for GPU/MPS, -1 for CPU
)

# Sample dataframe with descriptions
df = pd.DataFrame({
    'full_description': [
        "There are herring busses in front of the Rotterdam Gate.",
        "People are eating fruit.",
        "The painting shows a cityscape with a harbor."
    ]
})

# Define set of food entities for faster lookup
food_entities = {'food','wine', 'beverage', 'meat', 'fruit', 'vegetable', 'bread', 
                'dairy', 'dessert', 'seafood', 'eggs', 'fish'}

# Function to check for food entities in a list of NER results
def mentions_food_batch(results_batch):
    mentions = []
    for results in results_batch:
        found = any(entity['word'].lower() in food_entities for entity in results)
        mentions.append(found)
    return mentions

# Process descriptions in batches
batch_size = 16 
descriptions = df['full_description'].tolist()
mentions_food = []

num_batches = (len(descriptions) + batch_size - 1) // batch_size

for i in tqdm(range(0, len(descriptions), batch_size), total=num_batches, desc="Processing Batches", unit="batch", ncols=80, leave=True):
    batch_texts = descriptions[i:i + batch_size]
    results_batch = ner_pipeline(batch_texts)
    mentions_food.extend(mentions_food_batch(results_batch))

# Assign the results to the dataframe
df['mentions_food'] = mentions_food



Using MPS device


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing Batches: 100%|██████████████████████| 1/1 [00:04<00:00,  4.43s/batch]


In [7]:
display(df)

Unnamed: 0,full_description,mentions_food
0,There are herring busses in front of the Rotte...,False
1,People are eating fruit.,False
2,The painting shows a cityscape with a harbor.,False


In [8]:
df.to_csv('data/mentions_food.csv', index=False)

In [9]:
display(df[df['mentions_food'] == True])

Unnamed: 0,full_description,mentions_food
