In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification


In [None]:
# Prepare the input data
df = pd.read_csv('data/paintings_with_filenames.csv')

# Build one free-text field per painting from its descriptive columns.
# Concatenating string columns with `+` yields NaN for the whole row as soon
# as a single column is missing, so a painting lacking just one field (for
# example the article text) would lose its entire description. Joining only
# the values that are present keeps those paintings; rows where all four
# fields are present get exactly the same string as plain concatenation.
text_columns = ['title', 'depicts', 'wga_description', 'article_text']
df['full_description'] = [
    ' '.join(str(value) for value in row if pd.notna(value))
    for row in df[text_columns].itertuples(index=False, name=None)
]

# Keep only paintings that ended up with some text to classify. The original
# row index is preserved so results can still be traced back to the CSV, and
# `.copy()` makes the filtered frame independent of the unfiltered one so new
# columns can be added to it safely.
has_text = df['full_description'].map(lambda text: bool(text.strip())).astype(bool)
df = df[has_text].copy()
df.head()


In [None]:

# Define the classes
# Labels are scored independently (sigmoid per label, see below), so one
# painting may receive several labels or none at all.
classes = ['wine', 'beverage', 'meat', 'fruit', 'vegetable', 'bread', 'dairy', 'dessert', 'seafood']

# Tunables for this inference run.
max_length = 512  # token budget per description; longer texts are truncated
batch_size = 64  # You can adjust this based on your system's memory
threshold = 0.5  # a label is reported when its sigmoid probability exceeds this

# Load TinyBERT model and tokenizer
# NOTE(review): the checkpoint name suggests a general-purpose (not
# fine-tuned) model. If so, the classification head created for
# `num_labels=len(classes)` is randomly initialised and the predictions
# below are not meaningful until the model is fine-tuned on labelled
# data -- confirm before relying on the output.
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=len(classes)
)

# Set device to MPS if available
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)

# Batch tokenize the text
texts = df['full_description'].tolist()
if not texts:
    # Fail early with a readable message instead of an obscure error from
    # the tokenizer or from torch.cat on an empty list further down.
    raise ValueError("No descriptions to classify: 'full_description' is empty.")

# Calling the tokenizer directly is the supported replacement for the
# deprecated `batch_encode_plus`; the arguments are unchanged.
encoded_dict = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']

# Create DataLoader for batch processing
# SequentialSampler keeps the batches in DataFrame row order, which is what
# lets the predictions be assigned back to `df` positionally afterwards.
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(
    dataset, sampler=SequentialSampler(dataset), batch_size=batch_size
)

# Classify the paintings
model.eval()
predictions = []

with torch.no_grad():
    for b_input_ids, b_attention_masks in dataloader:
        outputs = model(
            b_input_ids.to(device),
            attention_mask=b_attention_masks.to(device),
        )
        # Move logits back to the CPU so they can be concatenated and
        # converted to NumPy regardless of the inference device.
        predictions.append(outputs.logits.cpu())

# Concatenate all predictions
predictions = torch.cat(predictions, dim=0)

# Apply sigmoid to get probabilities
probabilities = torch.sigmoid(predictions).numpy()
assert len(probabilities) == len(df), "expected one prediction row per painting"

# Apply a threshold to get the predicted classes
df['predicted_classes'] = [
    [label for label, probability in zip(classes, row) if probability > threshold]
    for row in probabilities
]

# Display the results
df[['full_description', 'predicted_classes']]