In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from tqdm import tqdm
import re


In [2]:
# Load the paintings table that the rest of the notebook works on
df = pd.read_csv(filepath_or_buffer='data/paintings_with_filenames.csv')
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,item,title,author_wikidata,author_name,creation_date,origin_country,display_country,display_location,type,school,time_period,wiki_url,image_url,depicts,wikipedia_url,article_text,title_clean,wga_url,wga_description,filename
0,http://www.wikidata.org/entity/Q607761,The Death of the Picador,http://www.wikidata.org/entity/Q5432,Francisco Goya,1793-01-01T00:00:00Z,,,,genre art,Romanticism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"picador, stadium, spear, bullfighting, man, ho...",,,the death of the picador,,,La_muerte_del_picador.jpg
1,http://www.wikidata.org/entity/Q609572,Manaò tupapaú,http://www.wikidata.org/entity/Q37693,Paul Gauguin,1892-01-01T00:00:00Z,,United States of America,Buffalo AKG Art Museum,genre art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"gaze, lying, intergluteal cleft, sole, barefoo...",https://en.wikipedia.org/wiki/Spirit_of_the_De...,Spirit of the Dead Watching (Manao tupapau) is...,manaò tupapaú,,,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...
2,http://www.wikidata.org/entity/Q607598,Virgin of the Councillors,http://www.wikidata.org/entity/Q723863,Lluís Dalmau,1445-01-01T00:00:00Z,,Spain,Museu Nacional d'Art de Catalunya,religious art,Gothic painting,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Madonna and Child, Eulalia of Barcelona, Joan ...",https://en.wikipedia.org/wiki/Virgin_of_the_Co...,The Virgin of the Councillors is a panel paint...,virgin of the councillors,,,Dalmau_Mare_de_Deu_dels_Consellers.jpg
3,http://www.wikidata.org/entity/Q734082,Regatta at Sainte-Adresse,http://www.wikidata.org/entity/Q296,Claude Monet,1867-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,marine art,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"parasol, sailboat, Sainte-Adresse, church, mar...",https://en.wikipedia.org/wiki/Regatta_at_Saint...,The Regatta at Sainte-Adresse is an oil-on-can...,regatta at sainteadresse,https://www.wga.hu/html/m/monet/01/early16.html,"In the summer of 1867, Monet painted a number ...","Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,..."
4,http://www.wikidata.org/entity/Q472037,By the Seashore,http://www.wikidata.org/entity/Q39931,Pierre-Auguste Renoir,1883-01-01T00:00:00Z,,United States of America,Metropolitan Museum of Art,portrait,Impressionism,,,https://commons.wikimedia.org/wiki/Special:Fil...,"portrait, Saint Peter Port, coast, chair, woman",https://en.wikipedia.org/wiki/By_the_Seashore,By the Seashore is a painting by Pierre-August...,by the seashore,https://www.wga.hu/html/r/renoir/3/3renoi20.html,This canvas was painted in the artist's studio...,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121662,http://www.wikidata.org/entity/Q98966261,Musical Entertainment,http://www.wikidata.org/entity/Q18613400,Jakob Emanuel Gaisser,1899-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,musical entertainment,,,Jakob_Emanuel_Gaisser_-_Musical_Entertainment.jpg
121663,http://www.wikidata.org/entity/Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",http://www.wikidata.org/entity/Q51077254,Louise Bouteiller,1818-01-01T00:00:00Z,France,France,Château de Barante,portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Saint François d‘Assise, Césarine d'Houdetot, ...",,,césarine de houdetot baronne de barante readin...,,,Portrait_of_Cesarine_de_Houdetot_by_Louise_Bou...
121664,http://www.wikidata.org/entity/Q99025930,The Broken Jug,http://www.wikidata.org/entity/Q97477673,Jenny Berger-Désoras,1847-01-01T00:00:00Z,,,,genre art,,,,https://commons.wikimedia.org/wiki/Special:Fil...,,,,the broken jug,,,The_Broken_Jug_by_Jenny_Berger-Desoras.jpg
121665,http://www.wikidata.org/entity/Q98970362,Dr Philippe Pinel (1745-1826) and his family,http://www.wikidata.org/entity/Q3291501,Marie-Anne-Julie Forestier,1807-01-01T00:00:00Z,,,,family portrait,,,,https://commons.wikimedia.org/wiki/Special:Fil...,"Scipion Pinel, Philippe Pinel, physician, chil...",,,dr philippe pinel 17451826 and his family,,,Philippe_Pinel_and_his_family_by_Julie_Foresti...


In [4]:
# Collect every distinct entry of the comma-separated 'depicts' column:
# drop missing rows, split on commas, flatten, trim whitespace, de-duplicate.
all_words = (
    df['depicts']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
)

# One-column frame holding the distinct entries
unique_words_df = pd.DataFrame({'word': all_words})

# Render the result
unique_words_df

Unnamed: 0,word
0,picador
1,stadium
2,spear
3,bullfighting
4,man
...,...
23322,Saint François d‘Assise
23323,Césarine d'Houdetot
23324,Paul et Virginie
23325,Pamplemousses


In [None]:

def extract_relevant_sections(text):
    """Extract the intro, Description and Interpretations parts of an article.

    Parameters
    ----------
    text : Any
        Article text using ``== Heading ==`` section markers. Anything that
        is not a ``str`` (e.g. NaN for a missing article) is rejected.

    Returns
    -------
    str or None
        ``None`` when ``text`` is not a string. Otherwise a stripped string
        made of, in order and only when present: the intro (everything before
        the first heading), the ``== Description ==`` section, and the
        ``== Interpretations ==`` section emitted under a ``== Subject ==``
        heading. May be the empty string.

    Notes
    -----
    A section ends at the next line starting with ``==``; that also matches
    ``===`` sub-headings, so a section is cut at its first sub-section.
    """
    if not isinstance(text, str):
        return None

    # The intro runs up to the first heading, or to the end of the text when
    # the article has no headings at all. Without the `\Z` alternative a
    # heading-less article would not match and its whole text would be lost.
    intro_pattern = r'^(.*?)(?=\n==|\Z)'
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # DOTALL lets `.` span newlines so a section can cover several paragraphs
    intro = re.search(intro_pattern, text, re.DOTALL)
    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    # Assemble the sections that were found
    result = ""
    if intro:
        result += intro.group(1).strip() + "\n\n"
    if description:
        result += "== Description ==\n" + description.group(1).strip() + "\n\n"
    if interpretation:
        # NOTE(review): the Interpretations section is emitted under a
        # "== Subject ==" heading -- confirm this relabelling is intended.
        result += "== Subject ==\n" + interpretation.group(1).strip()

    return result.strip()


In [None]:

# Pull the intro / Description / Interpretations text out of each article
df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

# Build the NER input from every text field that is actually present.
# Plain `+` concatenation propagates NaN, so a single missing field (for
# example 'wga_description', which is empty for most rows in the preview
# above) would turn the whole description into NaN and the row would be
# dropped. Joining only the non-empty string parts keeps those rows.
text_columns = ['title', 'depicts', 'wga_description', 'wiki_description']
df['full_description'] = [
    ' '.join(part for part in parts if isinstance(part, str) and part.strip())
    for parts in df[text_columns].itertuples(index=False, name=None)
]

# Keep only rows that ended up with some text. `.copy()` detaches the result
# from the parent frame so later column assignments act on an independent frame.
df = df[df['full_description'] != ''].copy()
df.head()

In [None]:
# tests
# Optional filter for spot-checking a single theme:
#df = df[df['full_description'].str.contains('supper', case=False, na=False)]
# Work on a small sample while testing. `.copy()` detaches the sample from
# the parent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df.head(128).copy()
df

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm

# Set the minimum confidence score threshold
score_threshold = 0.5

# Check if MPS is available and set the device
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "chambliss/distilbert-for-food-extraction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Print available labels in the model
labels = model.config.id2label
print("Available labels:", labels)

# Define the NER pipeline.
# Pass the torch.device itself: an integer `device=0` is an accelerator
# ordinal that older transformers releases always resolve to "cuda:0", which
# does not match a model that was moved to MPS.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Sample dataframe with descriptions (handy for manual spot checks; the batch
# loop in this cell reads from `df`, not from `df_test`)
df_test = pd.DataFrame({
    'full_description': [
        "There are herring busses in front of the Rotterdam Gate.",
        "People are eating fruits.",
        "People are eating apples.",
        "People are eating dirt.",
        "People are feasting wines.",
        "The painting shows a cityscape with a harbor."
    ]
})

# Function to organize entities by label, filtering by score threshold
def _resolve_entity_label(entity_group, label_map):
    """Map a pipeline ``entity_group`` value onto a label name.

    A group that already is one of the label names is returned unchanged.
    Otherwise the full numeric suffix after the last underscore ("LABEL_11"
    -> 11) is looked up in ``label_map``. A group that matches neither rule
    is returned unchanged so the caller can still record it.
    """
    if entity_group in label_map.values():
        return entity_group
    suffix = str(entity_group).rsplit('_', 1)[-1]
    if suffix.isdecimal() and int(suffix) in label_map:
        return label_map[int(suffix)]
    return entity_group


def organize_entities_by_label(results_batch, score_threshold, label_map=None):
    """Group the entity words of each description by label.

    Parameters
    ----------
    results_batch : list[list[dict]]
        One list of entity dicts per description; each dict is read for its
        'score', 'entity_group' and 'word' keys.
    score_threshold : float
        Entities with a score below this value are skipped.
    label_map : dict[int, str], optional
        Mapping from label id to label name. Defaults to the module-level
        ``labels`` mapping.

    Returns
    -------
    list[dict[str, list[str]]]
        One dict per description, holding a (possibly empty) word list for
        every label name in ``label_map``, plus an entry for any entity group
        that could not be mapped onto one of those names.
    """
    if label_map is None:
        label_map = labels  # id2label mapping defined at notebook level

    label_names = list(label_map.values())
    organized_entities_list = []

    # Process each description in the batch
    for results in results_batch:
        entity_dict = {label: [] for label in label_names}
        for entity in results:
            # Only consider entities at or above the score threshold
            if entity['score'] < score_threshold:
                continue
            # Reading only the last character of the group would mis-map ids
            # >= 10 and fail on named groups, so resolve the whole value.
            label = _resolve_entity_label(entity['entity_group'], label_map)
            entity_dict.setdefault(label, []).append(entity['word'])
        organized_entities_list.append(entity_dict)

    return organized_entities_list

# Feed the descriptions to the NER pipeline in fixed-size chunks and collect
# one label -> words dict per description
batch_size = 16
descriptions = df['full_description'].tolist()
all_organized_entities = []

# Ceiling division, so a trailing partial chunk is counted as a batch too
num_batches = -(-len(descriptions) // batch_size)

for i in tqdm(
    range(0, len(descriptions), batch_size),
    total=num_batches,
    desc="Processing Batches",
    unit="batch",
    ncols=80,
    leave=True,
):
    batch_texts = descriptions[i:i + batch_size]
    results_batch = ner_pipeline(batch_texts)

    # Group this chunk's entities by label, applying the score filter
    all_organized_entities += organize_entities_by_label(results_batch, score_threshold)

# Add one list-valued column per label to the DataFrame
for label in labels.values():
    df[label] = [per_description[label] for per_description in all_organized_entities]

# Show the updated dataframe
display(df)

In [None]:


# Check if MPS is available and set the device
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "chambliss/distilbert-for-food-extraction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Print available labels in the model
labels = model.config.id2label
print("Available labels:", labels)


# Define the NER pipeline.
# Pass the torch.device itself: an integer `device=0` is an accelerator
# ordinal that older transformers releases always resolve to "cuda:0", which
# does not match a model that was moved to MPS.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Sample dataframe with descriptions (handy for manual spot checks; the batch
# loop in this cell reads from `df`, not from `df_test`)
df_test = pd.DataFrame({
    'full_description': [
        "There are herring busses in front of the Rotterdam Gate.",
        "People are eating fruit.",
        "People are eating apples.",
        "People are eating so many apples.",
        "People are eating so many biscuits.",
        "People are moving apples.",
        "People are eating dirt.",
        "People are feasting wines.",
        "The painting shows a cityscape with a harbor."
    ]
})

# Function to organize entities by label
def _resolve_entity_label(entity_group, label_map):
    """Map a pipeline ``entity_group`` value onto a label name.

    A group that already is one of the label names is returned unchanged.
    Otherwise the full numeric suffix after the last underscore ("LABEL_11"
    -> 11) is looked up in ``label_map``. A group that matches neither rule
    is returned unchanged so the caller can still record it.
    """
    if entity_group in label_map.values():
        return entity_group
    suffix = str(entity_group).rsplit('_', 1)[-1]
    if suffix.isdecimal() and int(suffix) in label_map:
        return label_map[int(suffix)]
    return entity_group


def organize_entities_by_label(results_batch, score_threshold=0.0, label_map=None):
    """Group the entity words of each description by label.

    Parameters
    ----------
    results_batch : list[list[dict]]
        One list of entity dicts per description; each dict is read for its
        'entity_group' and 'word' keys, and for 'score' when filtering.
    score_threshold : float, optional
        Entities with a score below this value are skipped. The default of
        0.0 applies no effective filter (assuming non-negative scores), and it
        also keeps two-argument calls written for the thresholded variant of
        this function working.
    label_map : dict[int, str], optional
        Mapping from label id to label name. Defaults to the module-level
        ``labels`` mapping.

    Returns
    -------
    list[dict[str, list[str]]]
        One dict per description, holding a (possibly empty) word list for
        every label name in ``label_map``, plus an entry for any entity group
        that could not be mapped onto one of those names.
    """
    if label_map is None:
        label_map = labels  # id2label mapping defined at notebook level

    label_names = list(label_map.values())
    organized_entities_list = []

    # Process each description in the batch
    for results in results_batch:
        entity_dict = {label: [] for label in label_names}
        for entity in results:
            if entity.get('score', score_threshold) < score_threshold:
                continue
            # Reading only the last character of the group would mis-map ids
            # >= 10 and fail on named groups, so resolve the whole value.
            label = _resolve_entity_label(entity['entity_group'], label_map)
            entity_dict.setdefault(label, []).append(entity['word'])
        organized_entities_list.append(entity_dict)

    return organized_entities_list

# Food-related terms kept in a set for fast membership checks
food_entities = {
    'food', 'wine', 'beverage', 'meat', 'fruit', 'vegetable',
    'bread', 'dairy', 'dessert', 'seafood', 'eggs', 'fish',
}

# Feed the descriptions to the NER pipeline in fixed-size chunks and collect
# one label -> words dict per description
batch_size = 16
descriptions = df['full_description'].tolist()
all_organized_entities = []

# Ceiling division, so a trailing partial chunk is counted as a batch too
num_batches = -(-len(descriptions) // batch_size)

for i in tqdm(
    range(0, len(descriptions), batch_size),
    total=num_batches,
    desc="Processing Batches",
    unit="batch",
    ncols=80,
    leave=True,
):
    batch_texts = descriptions[i:i + batch_size]
    results_batch = ner_pipeline(batch_texts)

    # Group this chunk's entities by label
    all_organized_entities += organize_entities_by_label(results_batch)

# Add one list-valued column per label to the DataFrame
for label in labels.values():
    df[label] = [per_description[label] for per_description in all_organized_entities]




In [None]:
# Render the frame, including the per-label entity columns added by the NER cells
display(df)

In [None]:
# Persist the annotated frame; index=False leaves the row index out of the file.
# The per-label columns hold Python lists, which end up in the CSV as their
# string form.
df.to_csv('data/mentions_food.csv', index=False)