# PV056 project

### Follow the instructions and run the cells in this notebook to reproduce all the results.

In [None]:
!pip3 install requests
!pip3 install matplotlib
!pip3 install pandas
!pip3 install datasets
!pip3 install torch
!pip install transformers[torch]
!pip3 install matplotlib
!pip3 install ipywidgets
!pip install scikit-learn

Collecting torch
  Using cached torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1

In [None]:
import os
import random

import matplotlib.pyplot as plt
import pandas as pd
import requests
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


# Switch off the Weights & Biases reporting integration for the training runs in this notebook.
os.environ["WANDB_DISABLED"] = "true"

## Load the datasets

In [None]:
# Download the three incident CSV files next to the notebook, then load each one into
# its own DataFrame (the first CSV column becomes the index).
dataset_parts = ["train", "test", "valid"]
for dataset_part in dataset_parts:
    url = f"https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_{dataset_part}.csv"
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an HTTP error page as if it were a CSV file.
    response.raise_for_status()

    with open(f"incidents_{dataset_part}.csv", "wb") as f:
        f.write(response.content)

trainset = pd.read_csv('incidents_train.csv', index_col=0)
validset = pd.read_csv('incidents_valid.csv', index_col=0)
# The held-out split must come from the test file; reading the validation file here
# would make `testset` a second copy of `validset`.
testset = pd.read_csv('incidents_test.csv', index_col=0)

In [None]:
# Independent working copies for the BERT experiments: one per (split, target) pair.
# They keep the raw, hyphenated column names that the later preprocessing cells expect.
trainset_mirek_hazard, trainset_mirek_product = trainset.copy(), trainset.copy()
validset_mirek_hazard, validset_mirek_product = validset.copy(), validset.copy()
testset_mirek_hazard, testset_mirek_product = testset.copy(), testset.copy()

In [None]:
# Give the two category columns identifier-friendly names in all three splits.
# `DataFrame.rename` returns a new frame, so the results are bound back to the
# notebook-level names; rebinding only a loop variable would leave the originals
# untouched and the later `trainset['hazard_category']` lookups would fail.
trainset, validset, testset = (
    frame.rename(columns={"hazard-category": "hazard_category", "product-category": "product_category"})
    for frame in (trainset, validset, testset)
)

## Explore the data

In [None]:
# Display one randomly chosen row of the training split.
trainset.sample()

In [None]:
# Print the column names, dtypes and non-null counts of the training split.
trainset.info()

In [None]:
# Display the first rows of the training split.
trainset.head()

In [None]:
# Print ten randomly chosen training texts, each followed by an "XXX" separator.
for i in range(10):
    # randrange excludes len(trainset); randint's upper bound is inclusive and could
    # point one past the last row.
    x = random.randrange(len(trainset))
    # Positional lookup: the index comes from the CSV, so its labels need not be 0..n-1.
    print(trainset["text"].iloc[x])       # change the column name to view another column data
    print()
    print("XXX")
    print()

In [None]:
# Distribution of hazard categories in the training split (horizontal bar chart).
hazard_counts = trainset['hazard_category'].value_counts()

fig, ax = plt.subplots()
ax.barh(hazard_counts.index.to_list(), hazard_counts.values)

ax.set_xlabel('Frequency')
ax.set_ylabel('Type of hazard')
ax.set_title('Distribution of hazard category')
plt.show()

In [None]:
# Distribution of product categories in the training split (horizontal bar chart).
product_counts = trainset['product_category'].value_counts()

fig, ax = plt.subplots()
ax.barh(product_counts.index.to_list(), product_counts.values)

# With horizontal bars the counts run along x and the category names along y.
ax.set_xlabel('Frequency')
ax.set_ylabel('Type of product')
ax.set_title('Distribution of product category')
plt.show()


## Generate synthetic data for rare product and hazard categories

In [None]:
from food_hazard_detection.balance_dataset import (generate_prompt_triplets_by_hazard, generate_prompt_triplets_by_product,
                             generate_synthetic_data)

from food_hazard_detection import settings
from food_hazard_detection.settings import FILES_DIR, SYNTHETIC_DATA_DIR

# Rare hazard categories for which synthetic incidents are generated.
rare_hazard_categories = [
    "migration",
    "food additives and flavourings",
    "organoleptic aspects",
    "packaging defect",
]

# Rare product categories for which synthetic incidents are generated.
rare_product_categories = [
    "sugars and syrups",
    "feed materials",
    "food contact materials",
    "honey and royal jelly",
    "food additives and flavourings",
    "fats and oils",
    "pet feed",
    "other food product / mixed",
    "alcoholic beverages",
]

In [None]:
# Synthetic incidents for the rare hazard categories: build the prompt triplets from the
# training split, then write the generated rows to synthetic_data_hazard.csv.
combinations = generate_prompt_triplets_by_hazard(rare_hazard_categories, trainset)
generate_synthetic_data(
    SYNTHETIC_DATA_DIR / "synthetic_data_hazard.csv",
    FILES_DIR / "prompts/generate_synthetic_data.md",
    combinations,
)

# Same procedure for the rare product categories, using the same prompt template file.
combinations = generate_prompt_triplets_by_product(rare_product_categories, trainset)
generate_synthetic_data(
    SYNTHETIC_DATA_DIR / "synthetic_data_product.csv",
    FILES_DIR / "prompts/generate_synthetic_data.md",
    combinations,
)

The data produced by Mistral are not perfect, so some manual curation is needed at this point. For that reason, the rest of the notebook uses synthetic data that have already been preprocessed.

Load and check the generated data.

In [None]:
# Load the curated synthetic datasets; lines that cannot be parsed are skipped with a warning.
synthetic_hazard = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_hazard_processed.csv", sep=settings.SEPARATOR, engine='python', on_bad_lines='warn')
synthetic_product = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_product_processed.csv", sep=settings.SEPARATOR, engine='python', on_bad_lines='warn')
# `DataFrame.info()` prints its summary itself and returns None, so wrapping it in
# print() would add a stray "None" line after each summary.
synthetic_hazard.info()
synthetic_product.info()

# Mirek - training with BERT

In [None]:
# Hazard view of the training split: "title: text" is the model input, the hazard
# category becomes `label`, and every other column is dropped.
trainset_mirek_hazard = (
    trainset_mirek_hazard
    .rename(columns={"hazard-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "product-category", "hazard", "product", "title"])
)
trainset_mirek_hazard.head()

In [None]:
# Product view of the training split: "title: text" is the model input, the product
# category becomes `label`, and every other column is dropped.
trainset_mirek_product = (
    trainset_mirek_product
    .rename(columns={"product-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "hazard-category", "hazard", "product", "title"])
)
trainset_mirek_product.head()

In [None]:
# Hazard view of the validation split, prepared the same way as the training split.
validset_mirek_hazard = (
    validset_mirek_hazard
    .rename(columns={"hazard-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "product-category", "hazard", "product", "title"])
)
validset_mirek_hazard.head()

In [None]:
# Product view of the validation split, prepared the same way as the training split.
validset_mirek_product = (
    validset_mirek_product
    .rename(columns={"product-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "hazard-category", "hazard", "product", "title"])
)
validset_mirek_product.head()

In [None]:
# Hazard view of the test split, prepared the same way as the training split.
testset_mirek_hazard = (
    testset_mirek_hazard
    .rename(columns={"hazard-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "product-category", "hazard", "product", "title"])
)
testset_mirek_hazard.head()

In [None]:
# Product view of the test split, prepared the same way as the training split.
testset_mirek_product = (
    testset_mirek_product
    .rename(columns={"product-category": "label"})
    .assign(text=lambda frame: frame["title"] + ": " + frame["text"])
    .drop(columns=["day", "month", "year", "country", "hazard-category", "hazard", "product", "title"])
)
testset_mirek_product.head()

In [None]:
# Hazard category name -> integer class id (ids follow the order of the tuple).
label2id_hazards = {
    name: idx
    for idx, name in enumerate((
        "biological",
        "allergens",
        "foreign bodies",
        "fraud",
        "chemical",
        "other hazard",
        "packaging defect",
        "organoleptic aspects",
        "food additives and flavourings",
        "migration",
    ))
}

# Inverse lookup for the hazard classes: integer class id -> category name.
id2label = {idx: name for name, idx in label2id_hazards.items()}

# Product category name -> integer class id (ids follow the order of the tuple).
label2id_products = {
    name: idx
    for idx, name in enumerate((
        'meat, egg and dairy products',
        'ices and desserts',
        'cereals and bakery products',
        'alcoholic beverages',
        'prepared dishes and snacks',
        'seafood',
        'soups, broths, sauces and condiments',
        'fats and oils',
        'non-alcoholic beverages',
        'confectionery',
        'other food product / mixed',
        'fruits and vegetables',
        'herbs and spices',
        'nuts, nut products and seeds',
        'cocoa and cocoa preparations, coffee and tea',
        'dietetic foods, food supplements, fortified foods',
        'feed materials',
        'pet feed',
        'food additives and flavourings',
        'honey and royal jelly',
        'sugars and syrups',
        'food contact materials',
    ))
}

In [None]:
# Swap the category names in the `label` column for their integer class ids.
# The nested dict restricts the replacement to that one column.

# Hazard target
trainset_mirek_hazard = trainset_mirek_hazard.replace(to_replace={"label": label2id_hazards})
validset_mirek_hazard = validset_mirek_hazard.replace(to_replace={"label": label2id_hazards})
testset_mirek_hazard = testset_mirek_hazard.replace(to_replace={"label": label2id_hazards})

# Product target
trainset_mirek_product = trainset_mirek_product.replace(to_replace={"label": label2id_products})
validset_mirek_product = validset_mirek_product.replace(to_replace={"label": label2id_products})
testset_mirek_product = testset_mirek_product.replace(to_replace={"label": label2id_products})

In [None]:
# Wrap the prepared frames as Hugging Face datasets.
# Note the key names: "test" holds the validation frame and "unsupervised" holds the test frame.
dataset_hazard = DatasetDict({
    "train": Dataset.from_pandas(trainset_mirek_hazard),
    "test": Dataset.from_pandas(validset_mirek_hazard),
    "unsupervised": Dataset.from_pandas(testset_mirek_hazard),
})
dataset_product = DatasetDict({
    "train": Dataset.from_pandas(trainset_mirek_product),
    "test": Dataset.from_pandas(validset_mirek_product),
    "unsupervised": Dataset.from_pandas(testset_mirek_product),
})

In [None]:
# Tokenizer of the DistilBERT checkpoint that the classifiers in this notebook are built on.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to a fixed maximum length, so every
    example ends up with the same number of tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize every split of both tasks in batches.
tokenized_hazard = dataset_hazard.map(tokenize_function, batched=True)
tokenized_product = dataset_product.map(tokenize_function, batched=True)

In [None]:
# Collator passed to both Trainer instances in this notebook: it assembles tokenized
# examples into batches and pads them with the tokenizer's padding settings.
# (This cell previously repeated the tokenization cell verbatim, while `data_collator`
# was referenced by the trainers without being defined anywhere.)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Hazard classifier: the DistilBERT checkpoint with a sequence-classification head that
# has one output per hazard category; the id/name maps are stored in the model config.
# (This cell previously repeated the tokenization cell verbatim, while `model_hazard`
# was referenced by the hazard trainer without being defined anywhere.)
model_hazard = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(label2id_hazards),
    id2label=id2label,
    label2id=label2id_hazards,
)

In [None]:
# Fine-tune the hazard classifier for three epochs, evaluating after every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
)

trainer_hazard = Trainer(
    model=model_hazard,
    args=training_args,
    train_dataset=tokenized_hazard["train"],
    eval_dataset=tokenized_hazard["test"],  # the "test" key holds the validation split
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer_hazard.train()

In [None]:
# Predict on the held-out split. In `dataset_hazard` that split is stored under
# "unsupervised" (built from `testset_mirek_hazard`); the "test" key holds the validation
# data that was already used for the per-epoch evaluation during training.
predictions_hazard = trainer_hazard.predict(tokenized_hazard["unsupervised"])

In [None]:
# Accuracy of the hazard classifier on the predicted split.
# `label_ids` are the gold labels of exactly the rows that were predicted, so the two
# arrays always line up; scikit-learn expects the arguments as (y_true, y_pred).
accuracy_score(predictions_hazard.label_ids, predictions_hazard.predictions.argmax(-1))

In [None]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores) of the hazard classifier.
# `label_ids` are the gold labels of exactly the rows that were predicted; scikit-learn
# expects the arguments as (y_true, y_pred).
f1_score(predictions_hazard.label_ids, predictions_hazard.predictions.argmax(-1), average='macro')

In [None]:
# Product classifier: the same DistilBERT checkpoint with a 22-way classification head.
model_product = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=22,
)

# Same training recipe as for the hazard model: three epochs, evaluation after each one.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
)

trainer_product = Trainer(
    model=model_product,
    args=training_args,
    train_dataset=tokenized_product["train"],
    eval_dataset=tokenized_product["test"],  # the "test" key holds the validation split
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer_product.train()

In [None]:
# Predict product categories with the product trainer on the product dataset.
# (`trainer` is not defined in this notebook, and the hazard dataset carries hazard labels.)
# "unsupervised" is the key holding the held-out split built from `testset_mirek_product`.
predictions_product = trainer_product.predict(tokenized_product["unsupervised"])

In [None]:
# Accuracy of the product classifier on the predicted split.
# `label_ids` are the gold labels of exactly the rows that were predicted, so the two
# arrays always line up; scikit-learn expects the arguments as (y_true, y_pred).
accuracy_score(predictions_product.label_ids, predictions_product.predictions.argmax(-1))

In [None]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores) of the product classifier.
# `label_ids` are the gold labels of exactly the rows that were predicted; scikit-learn
# expects the arguments as (y_true, y_pred).
f1_score(predictions_product.label_ids, predictions_product.predictions.argmax(-1), average='macro')