# PV056 project

### Follow the instructions and run the cells in this notebook to reproduce all the results.

In [None]:
!pip3 install -r requirements.txt

In [None]:
import os
import random

import requests
import pandas as pd
import matplotlib.pyplot as plt


# Set before any training code runs; presumably turns off Weights & Biases
# logging in the Hugging Face Trainer used later — TODO confirm.
os.environ["WANDB_DISABLED"] = "true"

## Load the datasets

In [None]:
# Names of the dataset splits; each one corresponds to a file `incidents_<split>.csv`.
dataset_parts = ["train", "test", "valid"]
# Uncomment the loop below if you want to download the dataset yourself.
# for dataset_part in dataset_parts:
#     url = f"https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_{dataset_part}.csv"
#     response = requests.get(url)
#
#     with open(f"incidents_{dataset_part}.csv", "wb") as f:
#         f.write(response.content)

# The first CSV column is used as the row index of each frame.
trainset = pd.read_csv('incidents_train.csv', index_col=0)
validset = pd.read_csv('incidents_valid.csv', index_col=0)
# Load the dedicated test split. Previously this re-read the validation file,
# which made every "test" evaluation a repeat of the validation one.
# NOTE(review): assumes incidents_test.csv carries the label columns — confirm.
testset = pd.read_csv('incidents_test.csv', index_col=0)


## Explore the data

In [None]:
# Show one randomly chosen training row.
trainset.sample()

In [None]:
# Column names, dtypes and non-null counts of the training set.
trainset.info()

In [None]:
# First rows of the training set.
trainset.head()

In [None]:
# Print ten randomly chosen values of one column for a quick manual inspection.
for _ in range(10):
    # randrange excludes the upper bound, so the position is always valid
    # (randint(0, len(trainset)) could return len(trainset) itself).
    row_position = random.randrange(len(trainset))
    # .iloc selects by position; the frame's index labels come from the CSV
    # and are not guaranteed to be 0..n-1.
    print(trainset["text"].iloc[row_position])       # change the column name to view another column data
    print()
    print("XXX")
    print()

In [None]:
# Distribution of hazard categories in the training set.
hazard_counts = trainset['hazard_category'].value_counts()

fig, ax = plt.subplots()
ax.barh(hazard_counts.index.to_list(), hazard_counts.values, orientation='horizontal')

# Horizontal bars: counts run along x, categories along y.
ax.set_xlabel('Frequency')
ax.set_ylabel('Type of hazard')
ax.set_title('Distribution of hazard category')
plt.show()

In [None]:
# Distribution of product categories in the training set.
product_counts = trainset['product_category'].value_counts()

fig, ax = plt.subplots()
ax.barh(product_counts.index.to_list(), product_counts.values, orientation='horizontal')

# Horizontal bars: counts run along x, categories along y
# (the two axis labels were previously swapped).
ax.set_xlabel('Frequency')
ax.set_ylabel('Type of product')
ax.set_title('Distribution of product category')
plt.show()


## Generate synthetic data for rare product and hazard categories

In [None]:
from food_hazard_detection.balance_dataset import (generate_prompt_triplets_by_hazard, generate_prompt_triplets_by_product,
                             generate_synthetic_data)

from food_hazard_detection.settings import FILES_DIR, SYNTHETIC_DATA_DIR

# Hazard categories treated as rare; synthetic samples are generated for them.
rare_hazard_categories = [
    "migration",
    "food additives and flavourings",
    "organoleptic aspects",
    "packaging defect",
]

# Product categories treated as rare; synthetic samples are generated for them.
rare_product_categories = [
    "sugars and syrups",
    "feed materials",
    "food contact materials",
    "honey and royal jelly",
    "food additives and flavourings",
    "fats and oils",
    "pet feed",
    "other food product / mixed",
    "alcoholic beverages",
]

In [None]:
# Build the prompt inputs for the rare hazard categories from the training set.
combinations_hazard = generate_prompt_triplets_by_hazard(rare_hazard_categories, trainset)
# Uncomment if you really want to generate the data. It takes some time.
# generate_synthetic_data(SYNTHETIC_DATA_DIR / "synthetic_data_hazard.csv",
#                         FILES_DIR / "prompts/generate_synthetic_data.md", combinations_hazard)

# Same for the rare product categories.
combinations_product = generate_prompt_triplets_by_product(rare_product_categories, trainset)
# generate_synthetic_data(SYNTHETIC_DATA_DIR / "synthetic_data_product.csv",
#                         FILES_DIR / "prompts/generate_synthetic_data.md", combinations_product)

print("Number of to-be generated synthetic data points:", len(combinations_hazard)+len(combinations_product))

Notes:

The data produced by Mistral are not perfect, so at this point some manual curation is needed (e.g. quoting the text column so that the CSV parses properly, or dropping rows with missing values). Because of that, the code below uses synthetic data that has already been preprocessed.

If you want to generate more synthetic data, change the code following the notes in `_generate_triplets` function in `balance_dataset.py` script.

Load and check the generated data.
We generated a smaller and a bigger set of synthetic data, and we want to test whether the amount has any impact on performance.

In [None]:
synthetic_data_small = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_small.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
synthetic_data_small.info()

In [None]:
synthetic_data_big = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_big.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
synthetic_data_big.info()

In [None]:
# Training set extended with the small / big synthetic data set, respectively.
# NOTE(review): the original index labels of both inputs are kept, so labels
# may repeat in the concatenated frames — confirm this is intended.
train_with_small = pd.concat([trainset, synthetic_data_small])
train_with_big = pd.concat([trainset, synthetic_data_big])
train_with_big.info()
train_with_small.info()

### Preprocess the data


1. As we intend to use, among other models, a TF-IDF-based model, it is necessary to remove stop words and punctuation, then apply tokenization and lemmatization.

In [None]:
from food_hazard_detection.preprocessing import preprocessing

In [None]:
import nltk

# Fetch the WordNet corpus; presumably needed by the lemmatization step inside
# `preprocessing` — TODO confirm.
nltk.download("wordnet")

# Add a `text_preprocessed` column to every split, derived from its `text` column.
for split_frame in (trainset, validset, testset):
    split_frame['text_preprocessed'] = split_frame["text"].apply(preprocessing)

### Load high-level features from text with LLM (gpt-4o-mini)

In [None]:
from food_hazard_detection.preprocessing import process_txt_files

In [None]:
# Folder with the stored LLM outputs, parsed into one feature frame.
# NOTE(review): the meaning of the "hazard" argument is defined by
# `process_txt_files` and is not visible here — confirm.
folder_path = FILES_DIR / "datasets/llm_features/outputs"
df_llm_feats = process_txt_files(folder_path, "hazard")
df_llm_feats

In [None]:
# Columns removed before encoding; missing ones are ignored.
dropped_llm_columns = ["id", "custom_id", "recall_date", "company_name", "product_batch_code", 'product_size']
df_llm_feats = df_llm_feats.drop(columns=dropped_llm_columns, errors='ignore')


def _list_to_str(value):
    """Return the string form of list values; leave everything else untouched."""
    if isinstance(value, list):
        return str(value)
    return value


# List-valued cells are turned into strings before the one-hot encoding below.
for col in df_llm_feats.columns:
    df_llm_feats[col] = df_llm_feats[col].apply(_list_to_str)

# One-hot encode the remaining categorical columns into dense indicator columns.
df_llm_feats = pd.get_dummies(df_llm_feats, sparse=False, prefix_sep='_')

In [None]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Return the mean of the hazard and product macro-averaged F1 scores.

    The hazard F1 is computed over all samples. The product F1 is computed
    only over the samples whose hazard prediction is correct. Both
    intermediate scores are printed, rounded to two decimals.

    Args:
        hazards_true: Array-like of true hazard labels.
        products_true: Array-like of true product labels.
        hazards_pred: Array-like of predicted hazard labels.
        products_pred: Array-like of predicted product labels.

    Returns:
        float: ``(f1_hazards + f1_products) / 2``.
    """
    import numpy as np  # local import so the function works wherever the cell is run

    # Convert everything to arrays: with plain lists `==` compares the two
    # lists as a whole (yielding a single bool) and lists cannot be indexed
    # with a boolean mask. Arrays make both operations element-wise.
    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    # compute f1 for hazards:
    f1_hazards = f1_score(
        hazards_true,
        hazards_pred,
        average='macro'
    )
    print(f"F1 for hazard_category: {round(f1_hazards, 2)}")

    # compute f1 for products, restricted to correctly predicted hazards:
    hazard_correct = hazards_pred == hazards_true
    f1_products = f1_score(
        products_true[hazard_correct],
        products_pred[hazard_correct],
        average='macro'
    )
    print(f"F1 for product_category: {round(f1_products, 2)}")
    return (f1_hazards + f1_products) / 2.


# Sub-Task 1 - LLM features only
This task consists of predicting 2 category labels:
- hazard_category: the type of hazard (e.g. microbiological, chemical, etc.)
- product_category: the type of product (e.g. meat, fish, etc.)

Observed metric: weighted F1 score - hazard_category is preferred

### hazard_category

In [None]:
#features
X_train = df_llm_feats.iloc[:trainset.shape[0]]
X_val = df_llm_feats.iloc[trainset.shape[0]:trainset.shape[0] + validset.shape[0]]
X_test = df_llm_feats.tail(testset.shape[0])
#labels
y_train = trainset['hazard_category']
y_val = validset['hazard_category']
y_test = testset['hazard_category']

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Base estimator whose hyper-parameters are tuned below.
model = RandomForestClassifier(random_state=42)

# Candidate hyper-parameter values sampled by the randomized search.
param_dist = {
    "n_estimators": [50, 100, 200, 400],      # Number of trees in the forest
    "max_depth": [3, 5, 10, None],            # Max depth of the tree
    "min_samples_split": [2, 5, 10],          # Minimal number of samples for splitting
    "min_samples_leaf": [1, 2, 5, 10],        # Minimal number of samples in one leaf
}

# 20 sampled configurations, each scored by macro F1 with 10-fold cross-validation.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best paramaters: ", random_search.best_params_)
print("Best score on training CV: ", random_search.best_score_)
# Keep the validation hazard labels for the final score computation.
hazard_true = y_val

In [None]:
# Predict hazard categories with the best estimator found by the search.
# Note: the report below is computed on the validation split (X_val / y_val).
best_model = random_search.best_estimator_
hazard_pred = best_model.predict(X_val)
print("Classification report na testu:")
print(classification_report(hazard_true ,hazard_pred, zero_division=0))

### product_category

In [None]:
# Switch the targets to the product category; the features stay unchanged.
y_train = trainset['product_category']
y_val = validset['product_category']

In [None]:
# Same randomized search as for the hazard category, now fitted on the
# product category labels (reuses `model` and `param_dist`).
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best paramaters: ", random_search.best_params_)
print("Best score on training CV: ", random_search.best_score_)
# Keep the validation product labels for the final score computation.
product_true = y_val

In [None]:
# Predict product categories with the best estimator found by the search.
# Note: the report below is computed on the validation split (X_val / y_val).
best_model = random_search.best_estimator_
product_pred = best_model.predict(X_val)
print("Classification report na testu:")
print(classification_report(product_true, product_pred, zero_division=0))

### Sub-Task 1 results:

In [None]:
# Combined Sub-Task 1 score from the validation labels and predictions above.
print(f"Final ST1 F1 score: {round(compute_score(hazard_true, product_true, hazard_pred, product_pred), 2)}")

# Sub-Task 2 - LLM features only
This task consists of predicting 2 concrete labels:
- hazard - the type of hazard (e.g. salmonella, etc.)
- product -  the type of product (e.g. chicken, etc.)

Observed metric: weighted F1 score - hazard is preferred

### hazard

In [None]:
#features
X_train = df_llm_feats.iloc[:trainset.shape[0]]
X_val = df_llm_feats.iloc[trainset.shape[0]:trainset.shape[0] + validset.shape[0]]
X_test = df_llm_feats.tail(testset.shape[0])
#labels
y_train = trainset['hazard']
y_val = validset['hazard']
y_test = testset['hazard']


model = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best paramaters: ", random_search.best_params_)
print("Best score on training CV: ", random_search.best_score_)
hazard_true = y_val

In [None]:
# Predict hazards with the best estimator found by the search.
# Note: the report below is computed on the validation split (X_val / y_val).
best_model = random_search.best_estimator_
hazard_pred = best_model.predict(X_val)
print("Classification report na testu:")
print(classification_report(hazard_true ,hazard_pred, zero_division=0))

### product


In [None]:
# Switch the targets to the concrete product; the features stay unchanged.
y_train = trainset['product']
y_val = validset['product']
# Same randomized search as above, fitted on the product labels.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best paramaters: ", random_search.best_params_)
print("Best score on training CV: ", random_search.best_score_)
# Keep the validation product labels for the final score computation.
product_true = y_val

In [None]:
# Predict products with the best estimator found by the search.
# Note: the report below is computed on the validation split (X_val / y_val).
best_model = random_search.best_estimator_
product_pred = best_model.predict(X_val)
print("Classification report na testu:")
print(classification_report(product_true, product_pred, zero_division=0))

### Sub-Task 2 results:

In [None]:
# Combined Sub-Task 2 score from the validation labels and predictions above.
print(f"Final ST2 F1 score: {round(compute_score(hazard_true, product_true, hazard_pred, product_pred), 2)}")

## BERT and RoBERTa

In this part, we use two BERT-based models - DistilBERT and RoBERTa - to predict categories. We use the "title" and "text" columns together as the input. Since a BERT classifier can only predict a single category, we must train each model once per target column.

Warning: the models take a very long time to fine-tune.

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
def assign_labels(dataset, column, label_dict):
    """Extend ``label_dict`` in place with integer ids for the values of a column.

    Every distinct value of ``dataset[column]`` that is not yet a key of
    ``label_dict`` receives the next free id (the current size of the
    dictionary). Existing entries are left unchanged. Nothing is returned.
    """
    for category in dataset[column].unique():
        # setdefault evaluates len(label_dict) before inserting, so a new
        # key gets the id equal to the number of labels already assigned.
        label_dict.setdefault(category, len(label_dict))


def transform_dataset(dataset):
    """Prepare every split of ``dataset`` for the transformer models.

    ``dataset`` maps split names to DataFrames. For each split the label
    columns are replaced through the module-level label dictionaries, the
    title is prepended to the text (separated by ": "), and the date,
    country and title columns are dropped. The dictionary is updated in
    place and also returned.
    """
    label_maps = {
        "hazard_category": labels_hazard_categories,
        "product_category": labels_product_categories,
        "product": labels_products,
        "hazard": labels_hazards
    }
    unused_columns = ["day", "month", "year", "country", "title"]

    for split in list(dataset):
        frame = dataset[split].replace(label_maps)
        # The title has to be merged into the text before it is dropped.
        frame["text"] = frame["title"] + ": " + frame["text"]
        dataset[split] = frame.drop(columns=unused_columns)

    return dataset

In [None]:
# Mapping from label string to integer id, one dictionary per target column.
labels_hazard_categories = {}
labels_hazards = {}
labels_product_categories = {}
labels_products = {}

# Hazard, hazard-category and product-category ids come from the augmented training set.
assign_labels(train_with_big, "hazard_category", labels_hazard_categories)
assign_labels(train_with_big, "hazard", labels_hazards)
assign_labels(train_with_big, "product_category", labels_product_categories)

# Product ids cover the products of all three splits. Going through
# assign_labels assigns ids in order of first appearance; iterating a set of
# strings gave an order (and therefore ids) that changed between kernel restarts.
for split in [train_with_big, validset, testset]:
    assign_labels(split, "product", labels_products)

# Kept for backward compatibility with code that reads the set of all products.
unique_products = set(labels_products)

In [None]:
# DistilBERT input: augmented training set plus validation and test splits.
# Copies are passed so the original frames stay unmodified.
bert_dataset = transform_dataset({"train": train_with_big.copy(), "valid": validset.copy(), "test": testset.copy()})

In [None]:
# Convert each pandas split into a Hugging Face Dataset.
bert_dataset = DatasetDict({
    split_name: Dataset.from_pandas(bert_dataset[split_name])
    for split_name in ("train", "valid", "test")
})

In [None]:
# Tokenizer matching the DistilBERT checkpoint fine-tuned below.
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def bert_tokenize_function(examples):
    """Tokenize the "text" field of a batch with truncation and max-length padding."""
    return bert_tokenizer(examples["text"], truncation=True, padding="max_length")

In [None]:
# Tokenize all splits in batches.
bert_tokenized = bert_dataset.map(bert_tokenize_function, batched=True)

In [None]:
# Collator that pads each batch using the DistilBERT tokenizer.
bert_data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

# Training configuration shared by all DistilBERT runs; evaluation runs once per epoch.
# NOTE(review): the output directory "test_trainer" is shared by every trainer
# in this notebook — confirm that overwriting checkpoints is acceptable.
bert_training_args = TrainingArguments("test_trainer",
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch")

In [None]:
# Fine-tuned DistilBERT trainers, keyed by target column.
bert_trainers = {}

# Number of classes for each target column (size of its label dictionary).
length = {
    'hazard': len(labels_hazards),
    'product': len(labels_products),
    'hazard_category': len(labels_hazard_categories),
    'product_category': len(labels_product_categories),
}

for label in ('product', 'hazard', 'product_category', 'hazard_category'):
    # Expose the current target column under the name "label".
    train_dataset = bert_tokenized["train"].rename_column(label, "label")
    eval_dataset = bert_tokenized["valid"].rename_column(label, "label")

    # A fresh classifier per target, sized to that target's number of classes.
    bert_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased",
        num_labels=length[label],
    )

    trainer = Trainer(
        model=bert_model,
        args=bert_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=bert_tokenizer,
        data_collator=bert_data_collator,
    )
    trainer.train()

    bert_trainers[label] = trainer

In [None]:
# Test-split predictions of every DistilBERT trainer, keyed by target column.
devset_bert = {
    category: category_trainer.predict(bert_tokenized['test'])
    for category, category_trainer in bert_trainers.items()
}

In [None]:
import numpy

# compute_score compares labels element-wise and indexes them with a boolean
# mask, so every argument is passed as a NumPy array (plain lists support neither).
bert_test = bert_tokenized['test']

bert_st1 = compute_score(
    numpy.array(bert_test['hazard_category']),
    numpy.array(bert_test['product_category']),
    devset_bert['hazard_category'].predictions.argmax(-1),
    devset_bert['product_category'].predictions.argmax(-1),
)
print(f"Score Sub-Task 1: {bert_st1:.3f}")

bert_st2 = compute_score(
    numpy.array(bert_test['hazard']),
    numpy.array(bert_test['product']),
    devset_bert['hazard'].predictions.argmax(-1),
    devset_bert['product'].predictions.argmax(-1),
)
print(f"Score Sub-Task 2: {bert_st2:.3f}")

F1 score after training was:

Hazard score 0.8453172117313383

Product score 0.646217380479502

Hazard Category score 0.45974956313754045

Product Category score 0.11395288958485314

DistilBERT is thus better at predicting than the baseline.

### Roberta

In [None]:
# RoBERTa input: the original training set (without synthetic data) plus the
# validation and test splits. Copies are passed so the original frames stay unmodified.
# NOTE(review): DistilBERT was trained on train_with_big instead — confirm the
# difference is intended.
roberta_dataset = transform_dataset({"train": trainset.copy(), "valid": validset.copy(), "test": testset.copy()})

In [None]:
# Convert each pandas split into a Hugging Face Dataset.
roberta_dataset = DatasetDict({
    split_name: Dataset.from_pandas(roberta_dataset[split_name])
    for split_name in ("train", "valid", "test")
})

In [None]:
# Tokenizer matching the RoBERTa checkpoint fine-tuned below.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def roberta_tokenize_function(examples):
    """Tokenize the "text" field of a batch with truncation and max-length padding."""
    return roberta_tokenizer(examples["text"], truncation=True, padding="max_length")

In [None]:
# Tokenize all splits in batches.
roberta_tokenized = roberta_dataset.map(roberta_tokenize_function, batched=True)

In [None]:
# Collator that pads each batch using the RoBERTa tokenizer.
roberta_data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)

# Training configuration shared by all RoBERTa runs; evaluation runs once per epoch.
# NOTE(review): the output directory "test_trainer" is the same one used by the
# DistilBERT runs — confirm that overwriting checkpoints is acceptable.
roberta_training_args = TrainingArguments("test_trainer",
                                  num_train_epochs=3,
                                  weight_decay=0.01,
                                  eval_strategy="epoch")

In [None]:
# Fine-tuned RoBERTa trainers, keyed by target column.
roberta_trainers = dict()

for label in ['product', 'hazard', 'product_category', 'hazard_category']:
    # Expose the current target column under the name "label".
    train_dataset = roberta_tokenized["train"].rename_column(label, "label")
    eval_dataset = roberta_tokenized["valid"].rename_column(label, "label")

    # A fresh classifier per target; `length` holds the number of classes per column.
    roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=length[label])

    trainer = Trainer(
        model=roberta_model,
        args=roberta_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=roberta_tokenizer,
        data_collator=roberta_data_collator,
    )

    trainer.train()

    # Store the trainer with the RoBERTa trainers. It was previously written
    # into `bert_trainers`, which overwrote the DistilBERT trainers and left
    # `roberta_trainers` empty.
    roberta_trainers[label] = trainer

In [None]:
# Test-split predictions of every RoBERTa trainer, keyed by target column.
devset_roberta = dict()

# Iterate over the RoBERTa target columns directly rather than over the keys
# of the DistilBERT trainer dictionary.
for category in ['product', 'hazard', 'product_category', 'hazard_category']:
    devset_roberta[category] = roberta_trainers[category].predict(roberta_tokenized['test'])

In [None]:
import numpy

# Score the RoBERTa predictions; this cell previously re-printed the DistilBERT
# scores because it read `bert_tokenized` and `devset_bert`.
# compute_score compares labels element-wise and indexes them with a boolean
# mask, so every argument is passed as a NumPy array.
roberta_test = roberta_tokenized['test']

roberta_st1 = compute_score(
    numpy.array(roberta_test['hazard_category']),
    numpy.array(roberta_test['product_category']),
    devset_roberta['hazard_category'].predictions.argmax(-1),
    devset_roberta['product_category'].predictions.argmax(-1),
)
print(f"Score Sub-Task 1: {roberta_st1:.3f}")

roberta_st2 = compute_score(
    numpy.array(roberta_test['hazard']),
    numpy.array(roberta_test['product']),
    devset_roberta['hazard'].predictions.argmax(-1),
    devset_roberta['product'].predictions.argmax(-1),
)
print(f"Score Sub-Task 2: {roberta_st2:.3f}")