# PV056 project

### Follow the instructions and run the cells in this notebook to reproduce all the results.

In [40]:
!pip3 install requests
!pip3 install matplotlib
!pip3 install pandas



In [60]:
import os
import random

import requests
import pandas as pd
import matplotlib.pyplot as plt


os.environ["WANDB_DISABLED"] = "true"

## Load the datasets

In [61]:
dataset_parts = ["train", "test", "valid"]
for dataset_part in dataset_parts:
    url = f"https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_{dataset_part}.csv"
    response = requests.get(url)

    with open(f"incidents_{dataset_part}.csv", "wb") as f:
        f.write(response.content)

trainset = pd.read_csv('incidents_train.csv', index_col=0)
validset = pd.read_csv('incidents_valid.csv', index_col=0)
testset = pd.read_csv('incidents_valid.csv', index_col=0)

for dataset in [trainset, validset, testset]:
    dataset.rename(columns={"hazard-category": "hazard_category", "product-category": "product_category"}, inplace=True)

## Explore the data

In [56]:
trainset.sample()

NameError: name 'trainset' is not defined

In [44]:
trainset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5082 entries, 0 to 5983
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              5082 non-null   int64 
 1   month             5082 non-null   int64 
 2   day               5082 non-null   int64 
 3   country           5082 non-null   object
 4   title             5082 non-null   object
 5   text              5082 non-null   object
 6   hazard_category   5082 non-null   object
 7   product_category  5082 non-null   object
 8   hazard            5082 non-null   object
 9   product           5082 non-null   object
dtypes: int64(3), object(7)
memory usage: 436.7+ KB


In [62]:
trainset.head()

Unnamed: 0,year,month,day,country,title,text,hazard_category,product_category,hazard,product
0,1994,1,7,us,Recall Notification: FSIS-024-94,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage
1,1994,3,10,us,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage
2,1994,3,28,us,Recall Notification: FSIS-014-94,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices
3,1994,4,3,us,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat
4,1994,7,1,us,Recall Notification: FSIS-001-94,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast


In [46]:
for i in range(10):
    x = random.randint(0, len(trainset))
    print(trainset["text"][x])       # change the column name to view another column data
    print()
    print("XXX")
    print()

Simplot Australia has recalled John West Tuna Tempters Sweet Seeded Mustard flavour (with a batch code 4ER12) from Coles, Woolworths, IGA and other independent supermarkets nationally due to foreign matter (glass fragments). Food products containing glass fragments may cause injury if consumed. Consumers should not eat this product and should return the product to the place of purchase for a full refund.


XXX

Precautionary Withdrawal of The Happy Pear Hero Sprouts Due to Possible Presence of Harmful Bacteria Tweet Friday, 29 April 2016 Summary Category 2: For Information Alert Notification: 2016.15 Product: The Happy Pear Hero Sprouts; pack size: 80 g Batch Code: Use by date: 10.05.2016 Country Of Origin: Ireland Message: As a precaution, The Happy Pear is voluntarily withdrawing the above batch of Hero Sprouts due to the possible presence of pathogenic bacteria. The Hero Sprouts product contains a mix of sprouted alfafa, broccoli, clover and radish seeds. Further testing is underway

KeyError: 3041

In [None]:
#DISTRIBUTION OF HAZARDS IN DATASET
fig, ax = plt.subplots()

ax.barh(trainset['hazard_category'].value_counts().index.to_list(), trainset['hazard_category'].value_counts().values, orientation='horizontal')

plt.xlabel('Frequency')
plt.ylabel('Type of hazard')
plt.title('Distribution of hazard category')
plt.show()

In [None]:
#DISTRIBUTION OF PRODUCT TYPES IN DATASET
fig, ax = plt.subplots()

ax.barh(trainset['product_category'].value_counts().index.to_list(), trainset['product_category'].value_counts().values, orientation='horizontal')

plt.xlabel('Type of product')
plt.ylabel('Frequency')
plt.title('Distribution of product category')
plt.show()


## Generate synthetic data for rare product and hazard categories

In [63]:
from food_hazard_detection.balance_dataset import (generate_prompt_triplets_by_hazard, generate_prompt_triplets_by_product,
                             generate_synthetic_data)

from food_hazard_detection import settings
from food_hazard_detection.settings import FILES_DIR, SYNTHETIC_DATA_DIR

rare_hazard_categories = ["migration", "food additives and flavourings",
                              "organoleptic aspects", "packaging defect"]
rare_product_categories = ["sugars and syrups", "feed materials", "food contact materials",
                           "honey and royal jelly", "food additives and flavourings", "fats and oils",
                           "pet feed", "other food product / mixed", "alcoholic beverages"]

In [70]:
combinations_hazard = generate_prompt_triplets_by_hazard(rare_hazard_categories, trainset)
generate_synthetic_data(SYNTHETIC_DATA_DIR / "synthetic_data_hazard.csv",
                        FILES_DIR / "prompts/generate_synthetic_data.md", combinations_hazard)

combinations_product = generate_prompt_triplets_by_product(rare_product_categories, trainset)
generate_synthetic_data(SYNTHETIC_DATA_DIR / "synthetic_data_product.csv",
                        FILES_DIR / "prompts/generate_synthetic_data.md", combinations_product)

print("Number of generated synthetic data points:", len(combinations_hazard)+len(combinations_product))

The data produced by Mistral are not perfect. So at this point some manual curration is needed (e.g. quote the text column to parse the csv properly or drop some rows with missing values). Because of that, we use later in the code already preprocessed synthetic data.

Load and check the generated data.
We generated smaller and bigger amount of synthetic data and we want to test whether it has some impact on performance.

In [73]:
synthetic_data_small = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_small.csv")
print(synthetic_data_small.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              83 non-null     int64 
 1   month             83 non-null     int64 
 2   day               83 non-null     int64 
 3   country           83 non-null     object
 4   title             83 non-null     object
 5   text              83 non-null     object
 6   hazard_category   83 non-null     object
 7   product_category  83 non-null     object
 8   hazard            83 non-null     object
 9   product           83 non-null     object
dtypes: int64(3), object(7)
memory usage: 6.6+ KB
None
   year  month  day country  \
0  2021      9   15      de   
1  2021      3   12      us   
2  2022     10   15      uk   
3  2023      1   25      uk   
4  2023     10    5      us   

                                               title  \
0  Glass Fragments Found in Sugar Syrup Prompts

In [None]:
synthetic_data_big = pd.read_csv(SYNTHETIC_DATA_DIR / "synthetic_data_big.csv")
print(synthetic_data_big.info())

In [None]:
train_small = pd.concat([trainset, synthetic_data_small], ignore_index=True)
train_big = pd.concat([trainset, synthetic_data_big], ignore_index=True)