<a href="https://colab.research.google.com/github/jerryk42/SemEval-Food-Hazard-Detection-Challenge/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cloning into 'My-Colab'...


In [14]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm  # Import tqdm for progress bars
import matplotlib.pyplot as plt  # Import matplotlib for plotting


In [15]:
config = {
    'max_len': 256,
    'batch_size': 16,
    'learning_rate': 0.00005,
    'epochs': 100,
    'model_name': "dmis-lab/biobert-base-cased-v1.1"  # BioBERT model name
}


In [16]:
# Set device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [17]:
import pandas as pd

# URL of the raw file
url = "https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_train.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(url)

# Display the first few rows of the DataFrame
print(df.head())


   Unnamed: 0  year  month  day country                             title  \
0           0  1994      1    7      us  Recall Notification: FSIS-024-94   
1           1  1994      3   10      us  Recall Notification: FSIS-033-94   
2           2  1994      3   28      us  Recall Notification: FSIS-014-94   
3           3  1994      4    3      us  Recall Notification: FSIS-009-94   
4           4  1994      7    1      us  Recall Notification: FSIS-001-94   

                                                text hazard-category  \
0  Case Number: 024-94   \n            Date Opene...      biological   
1  Case Number: 033-94   \n            Date Opene...      biological   
2  Case Number: 014-94   \n            Date Opene...      biological   
3  Case Number: 009-94   \n            Date Opene...  foreign bodies   
4  Case Number: 001-94   \n            Date Opene...  foreign bodies   

               product-category                  hazard  \
0  meat, egg and dairy products  listeria mon

In [18]:
# Drop the first column
df = df.drop(df.columns[0], axis=1)

In [19]:
# Check the structure of the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              5082 non-null   int64 
 1   month             5082 non-null   int64 
 2   day               5082 non-null   int64 
 3   country           5082 non-null   object
 4   title             5082 non-null   object
 5   text              5082 non-null   object
 6   hazard-category   5082 non-null   object
 7   product-category  5082 non-null   object
 8   hazard            5082 non-null   object
 9   product           5082 non-null   object
dtypes: int64(3), object(7)
memory usage: 397.2+ KB
None


In [20]:
# Custom Dataset for Text Data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [21]:
# Function to clean text (title or text) and remove stopwords
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text


In [22]:
# Load tokenizer for Microsoft PubMedBERT model
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

# Assuming df is your DataFrame
df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [23]:
# Define relevant features and targets
features = ['year', 'month', 'day', 'country']
targets_subtask1 = ['hazard-category','product-category']
targets_subtask2 = ['hazard','product']


In [24]:
# Encode target labels to numeric values
label_encoders = {}
for target in targets_subtask1 + targets_subtask2:
    le = LabelEncoder()
    df[target] = le.fit_transform(df[target])
    label_encoders[target] = le


In [25]:
# Prepare data for both title and text
def prepare_data(text_column):
    X = df[features + [text_column]]
    y_subtask1 = df[targets_subtask1]
    y_subtask2 = df[targets_subtask2]

    data_splits = {}
    for target in targets_subtask1 + targets_subtask2:
        X_train, X_test, y_train, y_test = train_test_split(
            X, df[target], test_size=0.1, random_state=42
        )

        # Reset indices to ensure matching
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        data_splits[target] = (X_train, X_test, y_train, y_test)

    return data_splits


In [26]:
# Prepare data for title and text
title_splits = prepare_data('title')
text_splits = prepare_data('text')


In [27]:
def train_and_evaluate_nn(data_splits, targets, model_type='title', early_stopping_patience=6, lr_reduce_factor=0.1):
    f1_scores = []  # List to store F1 scores for each task

    for target in targets:
        print(f"\nStarting training for task: {target}")  # Print task message

        X_train, X_test, y_train, y_test = data_splits[target]

        # Prepare text data using the tokenizer
        if model_type == 'title':
            texts_train = X_train['title'].values
            texts_test = X_test['title'].values
        else:
            texts_train = X_train['text'].values
            texts_test = X_test['text'].values

        # Create DataLoader for training and testing
        train_dataset = TextDataset(texts_train, y_train, tokenizer, config['max_len'])
        test_dataset = TextDataset(texts_test, y_test, tokenizer, config['max_len'])

        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

        # Model setup
        num_labels = len(label_encoders[target].classes_)
        model = AutoModelForSequenceClassification.from_pretrained(config['model_name'], num_labels=num_labels).to(device)

        optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_reduce_factor, patience=3, verbose=True)
        criterion = nn.CrossEntropyLoss()

        # Training process
        model.train()
        best_loss = float('inf')
        early_stop_counter = 0

        for epoch in range(config['epochs']):
            print(f"Epoch {epoch+1}/{config['epochs']} - Training: {target}")
            progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", total=len(train_loader), leave=True)
            epoch_loss = 0.0

            for batch in progress_bar:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                progress_bar.set_postfix(loss=loss.item())

            # Average loss for the epoch
            avg_epoch_loss = epoch_loss / len(train_loader)
            print(f"Average Training Loss for Epoch {epoch+1}: {avg_epoch_loss}")

            # Evaluate on the test set to compute validation loss
            val_loss = 0.0
            model.eval()
            with torch.no_grad():
                for batch in test_loader:
                    input_ids = batch['input_ids'].squeeze(1).to(device)
                    attention_mask = batch['attention_mask'].squeeze(1).to(device)
                    labels = batch['label'].to(device)

                    outputs = model(input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)
                    val_loss += loss.item()

            avg_val_loss = val_loss / len(test_loader)
            print(f"Validation Loss after Epoch {epoch+1}: {avg_val_loss}")

            # Step the scheduler with the validation loss
            scheduler.step(avg_val_loss)

            # Early stopping check
            if avg_val_loss < best_loss:
                best_loss = avg_val_loss
                early_stop_counter = 0
                torch.save(model.state_dict(), f"best_model_{target}.pt")  # Save the best model
            else:
                early_stop_counter += 1
                print(f"Early stopping counter: {early_stop_counter}/{early_stopping_patience}")

            if early_stop_counter >= early_stopping_patience:
                print("Early stopping triggered.")
                break

        # Load the best model for evaluation
        model.load_state_dict(torch.load(f"best_model_{target}.pt"))

        # Evaluation process
        print(f"Evaluating model for task: {target}")
        model.eval()
        y_preds = []
        y_true = []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Evaluating", total=len(test_loader), leave=True):
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs.logits, dim=1)
                y_preds.extend(preds.cpu().numpy())
                y_true.extend(labels.cpu().numpy())

        # Decode labels back to original categories using the label encoder
        decoded_preds = label_encoders[target].inverse_transform(y_preds)
        decoded_true = label_encoders[target].inverse_transform(y_true)

        # Calculate F1 score for the task
        f1 = f1_score(decoded_true, decoded_preds, average='weighted')
        f1_scores.append(f1)
        print(f"F1-Score for {target}: {f1}")

        # Print classification report
        print(f"Classification Report for {target}:\n")
        print(classification_report(decoded_true, decoded_preds, zero_division=0))

    return f1_scores  # Return the list of F1 scores for plotting


In [None]:
# Train and evaluate for both title and text
print("\nTraining and Evaluating for Title Tasks:")
title_f1_scores = train_and_evaluate_nn(title_splits, targets_subtask1 + targets_subtask2, model_type='title')

In [30]:
print("\nTraining and Evaluating for Text Tasks:")
text_f1_scores = train_and_evaluate_nn(text_splits, targets_subtask1 + targets_subtask2, model_type='text')



Training and Evaluating for Text Tasks:

Starting training for task: hazard-category


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100 - Training: hazard-category


Training Epoch 1: 100%|██████████| 255/255 [01:23<00:00,  3.06it/s, loss=2.59]


Average Training Loss for Epoch 1: 0.6798259653589305
Validation Loss after Epoch 1: 0.33254388661589473
Epoch 2/100 - Training: hazard-category


Training Epoch 2: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0155]


Average Training Loss for Epoch 2: 0.27947679690140137
Validation Loss after Epoch 2: 0.25995737037737854
Epoch 3/100 - Training: hazard-category


Training Epoch 3: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00392]


Average Training Loss for Epoch 3: 0.17962386618897902
Validation Loss after Epoch 3: 0.2506352998316288
Epoch 4/100 - Training: hazard-category


Training Epoch 4: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.0217]


Average Training Loss for Epoch 4: 0.13146512895165122
Validation Loss after Epoch 4: 0.2559465115045896
Early stopping counter: 1/9
Epoch 5/100 - Training: hazard-category


Training Epoch 5: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00293]


Average Training Loss for Epoch 5: 0.10796547501494048
Validation Loss after Epoch 5: 0.22673909986770013
Epoch 6/100 - Training: hazard-category


Training Epoch 6: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00293]


Average Training Loss for Epoch 6: 0.08250430733440262
Validation Loss after Epoch 6: 0.2552821682402282
Early stopping counter: 1/9
Epoch 7/100 - Training: hazard-category


Training Epoch 7: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00261]


Average Training Loss for Epoch 7: 0.06817066462107879
Validation Loss after Epoch 7: 0.2755258594661427
Early stopping counter: 2/9
Epoch 8/100 - Training: hazard-category


Training Epoch 8: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.713]


Average Training Loss for Epoch 8: 0.05193269004104842
Validation Loss after Epoch 8: 0.26822977215852006
Early stopping counter: 3/9
Epoch 9/100 - Training: hazard-category


Training Epoch 9: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00152]


Average Training Loss for Epoch 9: 0.0782761278804209
Validation Loss after Epoch 9: 0.26615375583060086
Early stopping counter: 4/9
Epoch 10/100 - Training: hazard-category


Training Epoch 10: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00901]


Average Training Loss for Epoch 10: 0.032497320988891173
Validation Loss after Epoch 10: 0.26557707077881787
Early stopping counter: 5/9
Epoch 11/100 - Training: hazard-category


Training Epoch 11: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.000774]


Average Training Loss for Epoch 11: 0.01998385538954251
Validation Loss after Epoch 11: 0.27503021656593774
Early stopping counter: 6/9
Epoch 12/100 - Training: hazard-category


Training Epoch 12: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.000895]


Average Training Loss for Epoch 12: 0.014619499716369033
Validation Loss after Epoch 12: 0.2888569576643931
Early stopping counter: 7/9
Epoch 13/100 - Training: hazard-category


Training Epoch 13: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00098]


Average Training Loss for Epoch 13: 0.010073064479922109
Validation Loss after Epoch 13: 0.30030168795019563
Early stopping counter: 8/9
Epoch 14/100 - Training: hazard-category


Training Epoch 14: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00237]


Average Training Loss for Epoch 14: 0.007186391522797445
Validation Loss after Epoch 14: 0.3008114660260617
Early stopping counter: 9/9
Early stopping triggered.


  model.load_state_dict(torch.load(f"best_model_{target}.pt"))


Evaluating model for task: hazard-category


Evaluating: 100%|██████████| 64/64 [00:06<00:00,  9.43it/s]


F1-Score for hazard-category: 0.9433545110826577
Classification Report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.95      0.98      0.97       377
                    biological       0.99      0.98      0.99       339
                      chemical       0.93      0.96      0.94        68
food additives and flavourings       0.30      0.60      0.40         5
                foreign bodies       0.96      0.99      0.97       111
                         fraud       0.82      0.72      0.77        68
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.64      0.70      0.67        10
                  other hazard       0.82      0.67      0.73        27
              packaging defect       1.00      0.64      0.78        11

                      accuracy                           0.94      1017
                     macro avg       0.74

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100 - Training: product-category


Training Epoch 1: 100%|██████████| 255/255 [01:23<00:00,  3.05it/s, loss=0.664]


Average Training Loss for Epoch 1: 1.860068087718066
Validation Loss after Epoch 1: 1.254404972307384
Epoch 2/100 - Training: product-category


Training Epoch 2: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.929]


Average Training Loss for Epoch 2: 1.040272042739625
Validation Loss after Epoch 2: 0.9498994033783674
Epoch 3/100 - Training: product-category


Training Epoch 3: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0185]


Average Training Loss for Epoch 3: 0.6437753416378709
Validation Loss after Epoch 3: 0.9160427339375019
Epoch 4/100 - Training: product-category


Training Epoch 4: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0336]


Average Training Loss for Epoch 4: 0.3833046844776939
Validation Loss after Epoch 4: 0.9890541359782219
Early stopping counter: 1/9
Epoch 5/100 - Training: product-category


Training Epoch 5: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00402]


Average Training Loss for Epoch 5: 0.24658244466226475
Validation Loss after Epoch 5: 0.9926937860436738
Early stopping counter: 2/9
Epoch 6/100 - Training: product-category


Training Epoch 6: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0252]


Average Training Loss for Epoch 6: 0.1438320073792163
Validation Loss after Epoch 6: 1.0674756047083065
Early stopping counter: 3/9
Epoch 7/100 - Training: product-category


Training Epoch 7: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0351]


Average Training Loss for Epoch 7: 0.08668091528336791
Validation Loss after Epoch 7: 1.1298464983701706
Early stopping counter: 4/9
Epoch 8/100 - Training: product-category


Training Epoch 8: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00417]


Average Training Loss for Epoch 8: 0.04252996213515015
Validation Loss after Epoch 8: 1.0658981469459832
Early stopping counter: 5/9
Epoch 9/100 - Training: product-category


Training Epoch 9: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00183]


Average Training Loss for Epoch 9: 0.02400453854580501
Validation Loss after Epoch 9: 1.0747963403118774
Early stopping counter: 6/9
Epoch 10/100 - Training: product-category


Training Epoch 10: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0033]


Average Training Loss for Epoch 10: 0.018851797365784354
Validation Loss after Epoch 10: 1.0872080671833828
Early stopping counter: 7/9
Epoch 11/100 - Training: product-category


Training Epoch 11: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0418]


Average Training Loss for Epoch 11: 0.016374234013332455
Validation Loss after Epoch 11: 1.0988473910838366
Early stopping counter: 8/9
Epoch 12/100 - Training: product-category


Training Epoch 12: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00163]


Average Training Loss for Epoch 12: 0.014261760258608881
Validation Loss after Epoch 12: 1.100366580300033
Early stopping counter: 9/9
Early stopping triggered.


  model.load_state_dict(torch.load(f"best_model_{target}.pt"))


Evaluating model for task: product-category


Evaluating: 100%|██████████| 64/64 [00:06<00:00,  9.38it/s]


F1-Score for product-category: 0.7327494946372082
Classification Report for product-category:

                                                   precision    recall  f1-score   support

                              alcoholic beverages       1.00      0.57      0.73         7
                      cereals and bakery products       0.75      0.69      0.72       123
     cocoa and cocoa preparations, coffee and tea       0.70      0.80      0.74        49
                                    confectionery       0.53      0.45      0.49        40
dietetic foods, food supplements, fortified foods       0.64      0.75      0.69        24
                                    fats and oils       1.00      0.50      0.67         4
                                   feed materials       0.00      0.00      0.00         3
                           food contact materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.84      0.68      0.75     

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100 - Training: hazard


Training Epoch 1: 100%|██████████| 255/255 [01:23<00:00,  3.05it/s, loss=3.17]


Average Training Loss for Epoch 1: 2.56149242438522
Validation Loss after Epoch 1: 1.4875082005746663
Epoch 2/100 - Training: hazard


Training Epoch 2: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0122]


Average Training Loss for Epoch 2: 1.210525272666093
Validation Loss after Epoch 2: 1.1105729159899056
Epoch 3/100 - Training: hazard


Training Epoch 3: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=5.5]


Average Training Loss for Epoch 3: 0.866402884850315
Validation Loss after Epoch 3: 0.9031850447645411
Epoch 4/100 - Training: hazard


Training Epoch 4: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0206]


Average Training Loss for Epoch 4: 0.6304155398963713
Validation Loss after Epoch 4: 0.8385995001299307
Epoch 5/100 - Training: hazard


Training Epoch 5: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0136]


Average Training Loss for Epoch 5: 0.4661100546409394
Validation Loss after Epoch 5: 0.7624372491845861
Epoch 6/100 - Training: hazard


Training Epoch 6: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.103]


Average Training Loss for Epoch 6: 0.3352111782221233
Validation Loss after Epoch 6: 0.7656751431350131
Early stopping counter: 1/9
Epoch 7/100 - Training: hazard


Training Epoch 7: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0265]


Average Training Loss for Epoch 7: 0.2542060794783573
Validation Loss after Epoch 7: 0.7850925178208854
Early stopping counter: 2/9
Epoch 8/100 - Training: hazard


Training Epoch 8: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00573]


Average Training Loss for Epoch 8: 0.1944229768497833
Validation Loss after Epoch 8: 0.774550450121751
Early stopping counter: 3/9
Epoch 9/100 - Training: hazard


Training Epoch 9: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0276]


Average Training Loss for Epoch 9: 0.15537802213985547
Validation Loss after Epoch 9: 0.7779683887347346
Early stopping counter: 4/9
Epoch 10/100 - Training: hazard


Training Epoch 10: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00507]


Average Training Loss for Epoch 10: 0.10635463304306363
Validation Loss after Epoch 10: 0.7947975320566911
Early stopping counter: 5/9
Epoch 11/100 - Training: hazard


Training Epoch 11: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.206]


Average Training Loss for Epoch 11: 0.0792879583387106
Validation Loss after Epoch 11: 0.7890628517925506
Early stopping counter: 6/9
Epoch 12/100 - Training: hazard


Training Epoch 12: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0441]


Average Training Loss for Epoch 12: 0.06803843196043197
Validation Loss after Epoch 12: 0.7960599010402802
Early stopping counter: 7/9
Epoch 13/100 - Training: hazard


Training Epoch 13: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0222]


Average Training Loss for Epoch 13: 0.05999292516898291
Validation Loss after Epoch 13: 0.7953772420005407
Early stopping counter: 8/9
Epoch 14/100 - Training: hazard


Training Epoch 14: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.252]


Average Training Loss for Epoch 14: 0.05462691542491609
Validation Loss after Epoch 14: 0.7962887064059032
Early stopping counter: 9/9
Early stopping triggered.


  model.load_state_dict(torch.load(f"best_model_{target}.pt"))


Evaluating model for task: hazard


Evaluating: 100%|██████████| 64/64 [00:06<00:00,  9.37it/s]


F1-Score for hazard: 0.8221973906841757
Classification Report for hazard:

                                                   precision    recall  f1-score   support

                                        Aflatoxin       1.00      1.00      1.00         4
                                  alcohol content       0.00      0.00      0.00         1
                                        alkaloids       0.00      0.00      0.00         2
                                        allergens       0.00      0.00      0.00         4
                                           almond       0.93      0.93      0.93        14
             altered organoleptic characteristics       0.00      0.00      0.00         2
                                        amygdalin       0.00      0.00      0.00         2
                           antibiotics, vet drugs       1.00      1.00      1.00         1
                                    bacillus spp.       1.00      1.00      1.00         1
              

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100 - Training: product


Training Epoch 1: 100%|██████████| 255/255 [01:23<00:00,  3.05it/s, loss=6.05]


Average Training Loss for Epoch 1: 6.430973843967213
Validation Loss after Epoch 1: 6.279362641274929
Epoch 2/100 - Training: product


Training Epoch 2: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=3.4]


Average Training Loss for Epoch 2: 5.873840578864603
Validation Loss after Epoch 2: 5.775948576629162
Epoch 3/100 - Training: product


Training Epoch 3: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=2.99]


Average Training Loss for Epoch 3: 4.985759925842285
Validation Loss after Epoch 3: 5.043755181133747
Epoch 4/100 - Training: product


Training Epoch 4: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=5.12]


Average Training Loss for Epoch 4: 4.144231489592907
Validation Loss after Epoch 4: 4.64018539339304
Epoch 5/100 - Training: product


Training Epoch 5: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=4.64]


Average Training Loss for Epoch 5: 3.4366704753800934
Validation Loss after Epoch 5: 4.2881148904562
Epoch 6/100 - Training: product


Training Epoch 6: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=4.41]


Average Training Loss for Epoch 6: 2.8240125081118417
Validation Loss after Epoch 6: 4.066495042294264
Epoch 7/100 - Training: product


Training Epoch 7: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.679]


Average Training Loss for Epoch 7: 2.288476370830162
Validation Loss after Epoch 7: 3.841326478868723
Epoch 8/100 - Training: product


Training Epoch 8: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.204]


Average Training Loss for Epoch 8: 1.8376911141124426
Validation Loss after Epoch 8: 3.7146271392703056
Epoch 9/100 - Training: product


Training Epoch 9: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=3.97]


Average Training Loss for Epoch 9: 1.4633474535801831
Validation Loss after Epoch 9: 3.6377406418323517
Epoch 10/100 - Training: product


Training Epoch 10: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=3.64]


Average Training Loss for Epoch 10: 1.1793136149644852
Validation Loss after Epoch 10: 3.569000944495201
Epoch 11/100 - Training: product


Training Epoch 11: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0886]


Average Training Loss for Epoch 11: 0.9222025815470546
Validation Loss after Epoch 11: 3.551720470190048
Epoch 12/100 - Training: product


Training Epoch 12: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.5]


Average Training Loss for Epoch 12: 0.7322027014166701
Validation Loss after Epoch 12: 3.49192688241601
Epoch 13/100 - Training: product


Training Epoch 13: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0622]


Average Training Loss for Epoch 13: 0.5790062916045096
Validation Loss after Epoch 13: 3.495384454727173
Early stopping counter: 1/9
Epoch 14/100 - Training: product


Training Epoch 14: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0537]


Average Training Loss for Epoch 14: 0.4576020341731754
Validation Loss after Epoch 14: 3.5044474937021732
Early stopping counter: 2/9
Epoch 15/100 - Training: product


Training Epoch 15: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.0526]


Average Training Loss for Epoch 15: 0.3676407421774724
Validation Loss after Epoch 15: 3.483444821089506
Epoch 16/100 - Training: product


Training Epoch 16: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0855]


Average Training Loss for Epoch 16: 0.29396467586504477
Validation Loss after Epoch 16: 3.4846994262188673
Early stopping counter: 1/9
Epoch 17/100 - Training: product


Training Epoch 17: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.275]


Average Training Loss for Epoch 17: 0.23235210139377444
Validation Loss after Epoch 17: 3.508937168866396
Early stopping counter: 2/9
Epoch 18/100 - Training: product


Training Epoch 18: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0238]


Average Training Loss for Epoch 18: 0.191148871490184
Validation Loss after Epoch 18: 3.5136890467256308
Early stopping counter: 3/9
Epoch 19/100 - Training: product


Training Epoch 19: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.0397]


Average Training Loss for Epoch 19: 0.15074771335896323
Validation Loss after Epoch 19: 3.5524194538593292
Early stopping counter: 4/9
Epoch 20/100 - Training: product


Training Epoch 20: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.00333]


Average Training Loss for Epoch 20: 0.11106671730659026
Validation Loss after Epoch 20: 3.5382126942276955
Early stopping counter: 5/9
Epoch 21/100 - Training: product


Training Epoch 21: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0509]


Average Training Loss for Epoch 21: 0.09880056541193934
Validation Loss after Epoch 21: 3.5408432744443417
Early stopping counter: 6/9
Epoch 22/100 - Training: product


Training Epoch 22: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.044]


Average Training Loss for Epoch 22: 0.09195403958506444
Validation Loss after Epoch 22: 3.545782368630171
Early stopping counter: 7/9
Epoch 23/100 - Training: product


Training Epoch 23: 100%|██████████| 255/255 [01:22<00:00,  3.09it/s, loss=0.0699]


Average Training Loss for Epoch 23: 0.08702425911438232
Validation Loss after Epoch 23: 3.5480423625558615
Early stopping counter: 8/9
Epoch 24/100 - Training: product


Training Epoch 24: 100%|██████████| 255/255 [01:22<00:00,  3.10it/s, loss=0.00883]


Average Training Loss for Epoch 24: 0.08267992053034844
Validation Loss after Epoch 24: 3.548329433426261
Early stopping counter: 9/9
Early stopping triggered.


  model.load_state_dict(torch.load(f"best_model_{target}.pt"))


Evaluating model for task: product


Evaluating: 100%|██████████| 64/64 [00:06<00:00,  9.43it/s]


F1-Score for product: 0.46133704490940275
Classification Report for product:

                                                   precision    recall  f1-score   support

                           Catfishes (freshwater)       0.83      1.00      0.91         5
                                  Dried pork meat       0.00      0.00      0.00         0
                            Fishes not identified       0.43      0.50      0.46         6
                         Not classified pork meat       0.00      0.00      0.00         3
                       Pangas catfishes (generic)       0.00      0.00      0.00         1
              Precooked cooked pork meat products       0.00      0.00      0.00         1
                                    Veggie Burger       0.50      0.50      0.50         2
                               after dinner mints       0.00      0.00      0.00         1
                                  alfalfa sprouts       0.00      0.00      0.00         0
           

In [31]:
# Create DataFrames for F1 scores for title and text
f1_scores_title_df = pd.DataFrame({
    'Task': targets_subtask1 + targets_subtask2,
    'F1-Score': title_f1_scores
})

f1_scores_text_df = pd.DataFrame({
    'Task': targets_subtask1 + targets_subtask2,
    'F1-Score': text_f1_scores
})

# Print the collected F1-scores for title
print("\nCollected F1-Scores for Title-Focused Classification (BioBert):")
print(f1_scores_title_df)

# Print the collected F1-scores for text
print("\nCollected F1-Scores for Text-Focused Classification (BioBert):")
print(f1_scores_text_df)

# Plot F1-scores for visual comparison
plt.figure(figsize=(10, 6))
plt.bar(f1_scores_title_df['Task'], f1_scores_title_df['F1-Score'], alpha=0.7, label='Title-Focused')
plt.bar(f1_scores_text_df['Task'], f1_scores_text_df['F1-Score'], alpha=0.7, label='Text-Focused')
plt.ylabel("F1-Score")
plt.title("F1-Score Comparison Between Title and Text Tasks (BioBert)")
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()


NameError: name 'title_f1_scores' is not defined

In [33]:
f1_scores_text_df = pd.DataFrame({
    'Task': targets_subtask1 + targets_subtask2,
    'F1-Score': text_f1_scores
})

# Print the collected F1-scores for text
print("\nCollected F1-Scores for Text-Focused Classification (BioBert):")
print(f1_scores_text_df)


Collected F1-Scores for Text-Focused Classification (BioBert):
               Task  F1-Score
0   hazard-category  0.943355
1  product-category  0.732749
2            hazard  0.822197
3           product  0.461337
