In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from tqdm import tqdm
import re


In [None]:
# Load the paintings table that every later cell works on
csv_path = 'data/paintings_with_filenames.csv'
df = pd.read_csv(csv_path)
df

In [None]:

def extract_relevant_sections(text):
    """Pull the lead text plus the Description and Interpretations sections out of an article.

    Args:
        text: Article text in which section headers look like ``== Title ==``
            on their own line. Anything that is not a ``str`` (e.g. NaN/None)
            is rejected.

    Returns:
        ``None`` when ``text`` is not a string; otherwise a string made of the
        sections that were found, separated by blank lines (possibly empty).
    """
    if not isinstance(text, str):
        return None

    # Each body runs lazily up to the next "\n==" header or the end of the text.
    # The intro also accepts end-of-text so that an article without any section
    # header keeps its lead paragraph instead of yielding an empty result.
    intro_pattern = r'^(.*?)(?=\n==|\Z)'
    description_pattern = r'== Description ==\n(.*?)(?=\n==|\Z)'
    interpretation_pattern = r'== Interpretations ==\n(.*?)(?=\n==|\Z)'

    # DOTALL lets "." cross newlines so a section can span several paragraphs
    intro = re.search(intro_pattern, text, re.DOTALL)
    description = re.search(description_pattern, text, re.DOTALL)
    interpretation = re.search(interpretation_pattern, text, re.DOTALL)

    parts = []
    if intro:
        parts.append(intro.group(1).strip())
    if description:
        parts.append("== Description ==\n" + description.group(1).strip())
    if interpretation:
        # NOTE(review): the Interpretations body is emitted under a "Subject"
        # heading — confirm this relabelling is intended.
        parts.append("== Subject ==\n" + interpretation.group(1).strip())

    return "\n\n".join(parts).strip()


In [None]:

# Reduce each article to the sections extract_relevant_sections keeps
df['wiki_description'] = df['article_text'].apply(extract_relevant_sections)

# Glue the text fields together, space-separated, into one description per row.
# NOTE(review): `+` on columns yields NaN as soon as one piece is missing, so the
# dropna below removes every row lacking any of these fields — confirm intended.
text_columns = ['depicts', 'wga_description', 'wiki_description']
full_description = df['title']
for column in text_columns:
    full_description = full_description + ' ' + df[column]
df['full_description'] = full_description

# Discard rows whose combined description came out as NaN
df = df.dropna(subset=['full_description'])
df.head()

In [None]:
# tests: work on a small subset so the NER cells below run quickly
# Optional keyword filter for spot checks:
#df = df[df['full_description'].str.contains('supper', case=False, na=False)]
SAMPLE_SIZE = 128  # number of leading rows to keep
# .copy() detaches the subset from its parent frame, so the columns assigned to
# `df` in later cells do not raise SettingWithCopyWarning
df = df.head(SAMPLE_SIZE).copy()
df

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm

# Minimum confidence score an entity needs in order to be kept
score_threshold = 0.5

# Use the MPS backend when torch reports it as available, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "chambliss/distilbert-for-food-extraction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# id2label maps a class index to its label name; later cells use it for column names
labels = model.config.id2label
print("Available labels:", labels)

# Define the NER pipeline.
# The torch.device is handed over as-is: an integer index such as 0 is treated
# as a CUDA ordinal by some transformers releases, which breaks on MPS-only machines.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Hand-written sentences for spot-checking the tagger
sample_sentences = [
    "There are herring busses in front of the Rotterdam Gate.",
    "People are eating fruits.",
    "People are eating apples.",
    "People are eating dirt.",
    "People are feasting wines.",
    "The painting shows a cityscape with a harbor.",
]
df_test = pd.DataFrame({'full_description': sample_sentences})

# Function to organize entities by label, filtering by score threshold
def organize_entities_by_label(results_batch, score_threshold):
    """Group the detected entity words of each text by label.

    Args:
        results_batch: One list of entity dicts per input text. Each entity is
            read through its 'entity_group', 'score' and 'word' keys.
        score_threshold: Entities whose 'score' is below this value are skipped.

    Returns:
        A list with one dict per input text, mapping every label name from the
        module-level ``labels`` mapping to the list of kept entity words
        (empty when nothing was kept for that label).
    """
    organized_entities_list = []

    for results in results_batch:
        # Start from every known label so each text exposes the same keys
        entity_dict = {label: [] for label in labels.values()}
        for entity in results:
            if entity['score'] >= score_threshold:
                group = entity['entity_group']
                # Parse the whole numeric suffix ("LABEL_12" -> 12); reading only
                # the last character would map it to label 2.
                suffix = group.rpartition('_')[2]
                if suffix.isdecimal() and int(suffix) in labels:
                    label = labels[int(suffix)]
                else:
                    # Group names without a numeric id are used as they are
                    label = group
                entity_dict.setdefault(label, []).append(entity['word'])
        organized_entities_list.append(entity_dict)

    return organized_entities_list

# Feed the descriptions to the NER pipeline chunk by chunk and collect one
# label -> words dict per description
batch_size = 16
descriptions = df['full_description'].tolist()
num_batches = -(-len(descriptions) // batch_size)  # ceiling division
batch_starts = range(0, len(descriptions), batch_size)

all_organized_entities = []
progress = tqdm(batch_starts, total=num_batches, desc="Processing Batches", unit="batch", ncols=80, leave=True)
for start in progress:
    batch_texts = descriptions[start:start + batch_size]
    results_batch = ner_pipeline(batch_texts)

    # Keep only entities that reach the score threshold, grouped by label
    all_organized_entities += organize_entities_by_label(results_batch, score_threshold)

# One DataFrame column per label, each cell holding that row's entity words
for label in labels.values():
    df[label] = [per_text[label] for per_text in all_organized_entities]

# Show the frame with the new label columns
display(df)

In [None]:


# NOTE(review): this cell repeats the device/model/pipeline setup of the previous cell.
# Use the MPS backend when torch reports it as available, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

# Load the model and tokenizer
model_name = "chambliss/distilbert-for-food-extraction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# id2label maps a class index to its label name; later code uses it for column names
labels = model.config.id2label
print("Available labels:", labels)


# Define the NER pipeline.
# The torch.device is handed over as-is: an integer index such as 0 is treated
# as a CUDA ordinal by some transformers releases, which breaks on MPS-only machines.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

# Hand-written sentences for spot-checking the tagger
sample_sentences = [
    "There are herring busses in front of the Rotterdam Gate.",
    "People are eating fruit.",
    "People are eating apples.",
    "People are eating so many apples.",
    "People are eating so many biscuits.",
    "People are moving apples.",
    "People are eating dirt.",
    "People are feasting wines.",
    "The painting shows a cityscape with a harbor.",
]
df_test = pd.DataFrame({'full_description': sample_sentences})

# Function to organize entities by label
def organize_entities_by_label(results_batch, score_threshold=None, verbose=False):
    """Group the detected entity words of each text by label.

    Args:
        results_batch: One list of entity dicts per input text. Each entity is
            read through its 'entity_group' and 'word' keys (and 'score' when a
            threshold is given).
        score_threshold: Optional minimum 'score'; ``None`` (default) keeps
            every entity.
        verbose: When true, print the raw entity list of each text (debug aid).

    Returns:
        A list with one dict per input text, mapping every label name from the
        module-level ``labels`` mapping to the list of kept entity words
        (empty when nothing was kept for that label).
    """
    organized_entities_list = []

    for results in results_batch:
        if verbose:
            print(results)
        # Start from every known label so each text exposes the same keys
        entity_dict = {label: [] for label in labels.values()}
        for entity in results:
            if score_threshold is not None and not entity['score'] >= score_threshold:
                continue
            group = entity['entity_group']
            # Parse the whole numeric suffix ("LABEL_12" -> 12); reading only
            # the last character would map it to label 2.
            suffix = group.rpartition('_')[2]
            if suffix.isdecimal() and int(suffix) in labels:
                label = labels[int(suffix)]
            else:
                # Group names without a numeric id are used as they are
                label = group
            entity_dict.setdefault(label, []).append(entity['word'])
        organized_entities_list.append(entity_dict)

    return organized_entities_list

# Food-related category names, kept in a set for constant-time membership tests.
# NOTE(review): not referenced anywhere else in the visible notebook.
food_entities = {
    'food',
    'wine',
    'beverage',
    'meat',
    'fruit',
    'vegetable',
    'bread',
    'dairy',
    'dessert',
    'seafood',
    'eggs',
    'fish',
}

# Feed the descriptions to the NER pipeline chunk by chunk and collect one
# label -> words dict per description
batch_size = 16
descriptions = df['full_description'].tolist()
num_batches = -(-len(descriptions) // batch_size)  # ceiling division
batch_starts = range(0, len(descriptions), batch_size)

all_organized_entities = []
progress = tqdm(batch_starts, total=num_batches, desc="Processing Batches", unit="batch", ncols=80, leave=True)
for start in progress:
    batch_texts = descriptions[start:start + batch_size]
    results_batch = ner_pipeline(batch_texts)

    # Group this chunk's entities by label
    all_organized_entities += organize_entities_by_label(results_batch)

# One DataFrame column per label, each cell holding that row's entity words
for label in labels.values():
    df[label] = [per_text[label] for per_text in all_organized_entities]




In [None]:
# Render the frame, including the per-label entity columns assigned in the previous cell
display(df)

In [None]:
# Write the annotated frame to disk without the index column.
# NOTE(review): the per-label columns hold Python lists, which CSV stores as
# their string representation — confirm downstream readers parse them back.
df.to_csv('data/mentions_food.csv', index=False)