In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from tqdm import tqdm
import re


In [None]:
# Read the paintings table from CSV into `df` and show it as the cell output.
paintings_csv_path = 'data/paintings_with_filenames.csv'
df = pd.read_csv(paintings_csv_path)
df

In [None]:

def extract_relevant_sections(text):
    """Pull the lead, "Description" and "Interpretations" parts out of an article.

    The article is expected to mark sections with wiki-style headings, i.e.
    lines that start with ``==``.

    Args:
        text: Raw article text. Any value that is not a ``str`` is treated as
            "no article".

    Returns:
        ``None`` when ``text`` is not a string. Otherwise a string made of, in
        this order and separated by blank lines:

        * the lead, i.e. everything before the first heading line (the whole
          text when the article has no headings);
        * the body found under ``== Description ==``, prefixed with that heading;
        * the body found under ``== Interpretations ==``, prefixed with the
          label ``== Subject ==``.

        Parts that are absent are skipped, so the result may be ``""``.
    """
    if not isinstance(text, str):
        return None

    # A section body runs until the next heading line or the end of the text.
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # The lead is cut at the first line that starts with "==". Doing this by
    # position (instead of requiring a "\n==" to exist) keeps heading-free
    # articles instead of discarding them, and yields an empty lead when the
    # article opens with a heading, so that first section is not mistaken for
    # the lead and emitted a second time.
    first_heading = re.search(r'^==', text, re.MULTILINE)
    intro = text[:first_heading.start()] if first_heading else text

    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    parts = []
    if intro.strip():
        parts.append(intro.strip())
    if description:
        parts.append("== Description ==\n" + description.group(1).strip())
    if interpretation:
        # NOTE(review): the Interpretations body is emitted under a "Subject"
        # label; confirm the relabelling is intentional.
        parts.append("== Subject ==\n" + interpretation.group(1).strip())

    return "\n\n".join(parts).strip()


In [None]:

# Reduce each article to its lead / Description / Interpretations text
# (extract_relevant_sections returns None for non-string cells).
df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

# Build one free-text field from every available description source.
# Chaining `a + ' ' + b` on Series yields NaN as soon as a single operand is
# missing, which would discard a row that lacks just one of the columns even
# though the others carry text. Missing or blank parts are skipped instead.
description_columns = ['title', 'depicts', 'wga_description', 'wiki_description']
df['full_description'] = [
    ' '.join(str(part) for part in row if pd.notna(part) and str(part).strip())
    for row in df[description_columns].itertuples(index=False, name=None)
]

# Keep only the rows that ended up with some text. The copy makes the result
# an independent frame, so adding columns to it later does not write into a
# slice of the loaded table.
df = df[df['full_description'] != ''].copy()
df.head()

In [None]:
# tests
# Narrow `df` to the descriptions containing "supper" (case-insensitive,
# missing values count as no match), keep at most the first 128 of them and
# show the result.
supper_mask = df['full_description'].str.contains('supper', case=False, na=False)
df = df[supper_mask].head(128)
df

In [None]:


# Pick the compute device: MPS when PyTorch reports it as available,
# otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Define the NER pipeline. The torch.device chosen above is handed over as-is:
# an integer such as 0 is only a device ordinal, which transformers may resolve
# to "cuda:0" - a device that does not exist on an MPS-only machine.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Small hand-written sample for smoke-testing the pipeline. It is bound to its
# own name so that it does not replace `df`, the frame prepared by the earlier
# cells that the batch loop in this cell reads its descriptions from.
df_sample = pd.DataFrame({
    'full_description': [
        "There are herring busses in front of the Rotterdam Gate.",
        "People are eating fruit.",
        "The painting shows a cityscape with a harbor."
    ]
})

# Lower-case food vocabulary, kept in a set for fast membership lookup.
food_entities = {
    'food', 'wine', 'beverage', 'meat', 'fruit', 'vegetable',
    'bread', 'dairy', 'dessert', 'seafood', 'eggs', 'fish',
}


def mentions_food_batch(results_batch):
    """Flag, per text, whether any recognised entity is a known food term.

    Args:
        results_batch: One item per input text; each item is an iterable of
            entity dicts that carry a ``'word'`` string.

    Returns:
        A list of booleans parallel to ``results_batch``. An entry is ``True``
        when the lower-cased ``'word'`` of at least one entity is exactly one
        of the terms in ``food_entities`` (a multi-word entity therefore never
        matches a single-word term).
    """
    return [
        any(hit['word'].lower() in food_entities for hit in hits)
        for hits in results_batch
    ]

# Feed the descriptions to the NER pipeline `batch_size` texts at a time and
# collect one food flag per description.
batch_size = 16
descriptions = df['full_description'].tolist()
mentions_food = []

# Ceiling division: how many slices are needed to cover every description.
num_batches = -(-len(descriptions) // batch_size)

batch_starts = range(0, len(descriptions), batch_size)
for start in tqdm(batch_starts, total=num_batches, desc="Processing Batches", unit="batch", ncols=80, leave=True):
    batch_texts = descriptions[start:start + batch_size]
    results_batch = ner_pipeline(batch_texts)
    mentions_food += mentions_food_batch(results_batch)

# Store the flags next to the descriptions they were computed from.
df['mentions_food'] = mentions_food



In [None]:
# Show the current `df` as rich notebook output.
display(df)

In [None]:
# Write the current `df` to CSV, leaving the index out of the file.
output_csv_path = 'data/mentions_food.csv'
df.to_csv(output_csv_path, index=False)

In [None]:
# Show only the rows whose `mentions_food` value equals True.
is_food_row = df['mentions_food'] == True
display(df[is_food_row])

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
from tqdm import tqdm

# 1. Device Configuration: Use MPS if available, else CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# 2. Load the Model and Tokenizer
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# 3. Define the NER Pipeline with Batch Processing
# The torch.device chosen in step 1 is handed over as-is: an integer such as 0
# is only a device ordinal, which transformers may resolve to "cuda:0" - a
# device that does not exist on an MPS-only machine.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# 4. Sample DataFrame with Descriptions
# Ten hand-written sentences used as input for the NER run in this cell.
sample_sentences = [
    "There are herring busses in front of the Rotterdam Gate.",
    "The painting shows a cityscape with a harbor.",
    "She enjoys a glass of wine with her dinner.",
    "Fresh vegetables are essential for a healthy diet.",
    "The conference was attended by experts in marine biology.",
    "They served delicious seafood at the restaurant.",
    "He bought eggs and bread from the store.",
    "A variety of fruits are available at the market.",
    "The dairy products were of high quality.",
    "She prepared a vegetable salad for lunch.",
]
df_test = pd.DataFrame({'description': sample_sentences})

# 5. Get and Print Entity Labels from the Model
# `model.config.id2label` is iterated as (id, label) pairs, one line per pair.
labels = model.config.id2label
print("\nEntity labels in the model:")
# The loop variable is deliberately not called `id`: names bound at the top
# level of a notebook cell become kernel globals, so `id` would shadow the
# built-in `id()` for every cell executed afterwards.
for label_id, label in labels.items():
    print(f"{label_id}: {label}")

# 6. Define Entity Labels for Food
# NOTE(review): a flag can only become True if the loaded model emits an
# entity group spelled exactly like one of these labels - check them against
# the label list printed in step 5.
food_entity_labels = {'FOOD'}  # Adjust based on model's entity_group labels


# 7. Function to Check for Food Entities in a Batch
def mentions_food_batch(results_batch):
    """Flag, per text, whether any entity belongs to a food entity group.

    Args:
        results_batch: One item per input text; each item is an iterable of
            entity dicts that carry an ``'entity_group'`` key.

    Returns:
        A list of booleans parallel to ``results_batch``; an entry is ``True``
        when at least one entity's ``'entity_group'`` is in
        ``food_entity_labels``.
    """
    flags = []
    for entities in results_batch:
        is_food = False
        for entity in entities:
            if entity['entity_group'] in food_entity_labels:
                is_food = True
                break  # one hit is enough for this text
        flags.append(is_food)
    return flags

# 8. Function to Print NER Results for Each Description
def print_ner_results(descriptions, results_batch, start_index):
    """Print each description of a batch followed by its recognised entities.

    Args:
        descriptions: Texts of the batch.
        results_batch: Entity lists, paired with ``descriptions`` by position;
            each entity is a dict with ``'word'``, ``'entity_group'`` and
            ``'score'``.
        start_index: Offset added to the position within the batch to form the
            index shown in the printed header.

    Returns:
        None; the function only writes to standard output.
    """
    separator = "-" * 60
    paired = zip(descriptions, results_batch)
    for offset, (description, entities) in enumerate(paired):
        print(f"\nDescription [{start_index + offset}]: {description}")
        if not entities:
            print("NER Entities: None")
        else:
            print("NER Entities:")
            for entity in entities:
                print(f"  - {entity['word']} ({entity['entity_group']}) [Score: {entity['score']:.3f}]")
        print(separator)

# 9. Process Descriptions in Batches with Progress Tracking and NER Results Printing
batch_size = 4  # Adjust based on memory constraints and readability
descriptions = df_test['description'].tolist()
mentions_food = []

# Calculate the number of batches (ceiling division)
num_batches = (len(descriptions) + batch_size - 1) // batch_size

# Flag to control verbosity (set to True to print all results)
verbose = True

# Use tqdm to display a progress bar. `i` is the offset of the batch's first
# description; no separate batch counter is kept because nothing reads one.
for i in tqdm(range(0, len(descriptions), batch_size), total=num_batches, desc="Processing Batches"):
    batch_texts = descriptions[i:i + batch_size]
    results_batch = ner_pipeline(batch_texts)

    # Append the mention_food results
    mentions_food.extend(mentions_food_batch(results_batch))

    # Print NER results for the current batch, indexed from the batch offset
    if verbose:
        print_ner_results(batch_texts, results_batch, i)

# 10. Assign the Results to the DataFrame
df_test['mentions_food'] = mentions_food

# 11. Display the Final DataFrame
print("\nFinal Results:")
print(df_test)