<a href="https://colab.research.google.com/github/jerryk42/SemEval-Food-Hazard-Detection-Challenge/blob/main/SimpleML_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Raw CSV of the training incidents, hosted in the public
# food-hazard-detection-semeval-2025 GitHub repository.
url = "https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_train.csv"

# Download and parse the CSV into a DataFrame.
df = pd.read_csv(url)

# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a table; print() falls back to the plain-text repr,
# which wraps and truncates wide frames.
df.head()


   Unnamed: 0  year  month  day country                             title  \
0           0  1994      1    7      us  Recall Notification: FSIS-024-94   
1           1  1994      3   10      us  Recall Notification: FSIS-033-94   
2           2  1994      3   28      us  Recall Notification: FSIS-014-94   
3           3  1994      4    3      us  Recall Notification: FSIS-009-94   
4           4  1994      7    1      us  Recall Notification: FSIS-001-94   

                                                text hazard-category  \
0  Case Number: 024-94   \n            Date Opene...      biological   
1  Case Number: 033-94   \n            Date Opene...      biological   
2  Case Number: 014-94   \n            Date Opene...      biological   
3  Case Number: 009-94   \n            Date Opene...  foreign bodies   
4  Case Number: 001-94   \n            Date Opene...  foreign bodies   

               product-category                  hazard  \
0  meat, egg and dairy products  listeria mon

In [None]:
# Drop the leftover index column that pandas names "Unnamed: 0" when reading the CSV.
# Dropping by name instead of by position keeps this cell idempotent: with
# errors="ignore" a second run is a no-op, whereas df.columns[0] would remove
# the next real column ('year') on every re-run.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Check the structure of the DataFrame (columns, non-null counts, dtypes).
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              5082 non-null   int64 
 1   month             5082 non-null   int64 
 2   day               5082 non-null   int64 
 3   country           5082 non-null   object
 4   title             5082 non-null   object
 5   text              5082 non-null   object
 6   hazard-category   5082 non-null   object
 7   product-category  5082 non-null   object
 8   hazard            5082 non-null   object
 9   product           5082 non-null   object
dtypes: int64(3), object(7)
memory usage: 397.2+ KB
None


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

# Expects the DataFrame to be already loaded as `df` by an earlier cell.

# Step 1: Encode the four target columns as integer class ids.
# Each target gets its own encoder so predictions can be decoded later with
# the matching inverse_transform.
label_encoder_product_category = LabelEncoder()
label_encoder_hazard_category = LabelEncoder()
label_encoder_product = LabelEncoder()
label_encoder_hazard = LabelEncoder()

y_product_category = label_encoder_product_category.fit_transform(df['product-category'])
y_hazard_category = label_encoder_hazard_category.fit_transform(df['hazard-category'])
y_product = label_encoder_product.fit_transform(df['product'])
y_hazard = label_encoder_hazard.fit_transform(df['hazard'])

# Step 2: Split the dataset (80% train / 20% test).
# A single call with all arrays guarantees that the raw text and every label
# vector are cut along the same rows, instead of relying on four separate
# calls that happen to share a random_state.
(
    text_train, text_test,
    y_train_pc, y_test_pc,
    y_train_hc, y_test_hc,
    y_train_p, y_test_p,
    y_train_h, y_test_h,
) = train_test_split(
    df['text'],
    y_product_category,
    y_hazard_category,
    y_product,
    y_hazard,
    test_size=0.2,
    random_state=42,
)

# Step 3: Encode the 'text' column using TF-IDF.
# The vectorizer is fit on the training rows only and then applied to the test
# rows, so vocabulary and IDF weights are not influenced by held-out documents
# (fitting on the full corpus before splitting leaks test-set statistics).
tfidf = TfidfVectorizer(max_features=3500)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# Full feature matrix, kept under its original name for any code that still
# refers to `X`; it is expressed in the same train-fitted feature space.
X = tfidf.transform(df['text']).toarray()

# Step 4: Train SVM models
def _fit_linear_svc(features, labels):
    """Build a linear-kernel SVC (C=1.0, random_state=42), fit it, and return it."""
    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(features, labels)
    return classifier


# One independent classifier per target, all trained on the same feature matrix.
svm_product_category = _fit_linear_svc(X_train, y_train_pc)  # product-category
svm_hazard_category = _fit_linear_svc(X_train, y_train_hc)   # hazard-category
svm_product = _fit_linear_svc(X_train, y_train_p)            # product
svm_hazard = _fit_linear_svc(X_train, y_train_h)             # hazard

# Step 5: Evaluate the models
def _print_evaluation(heading, y_true, y_pred):
    """Print a heading, the per-class classification report, and the accuracy.

    Parameters:
        heading: Text printed above the report.
        y_true: Encoded ground-truth labels of the test rows.
        y_pred: Encoded labels predicted for the same rows.

    zero_division=0 reports 0.0 for classes whose precision/recall is
    undefined (e.g. a class that is never predicted). That is the same value
    the default produces, but without an UndefinedMetricWarning per report.
    """
    print(heading)
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Accuracy:", accuracy_score(y_true, y_pred))


# Product-Category
y_pred_pc = svm_product_category.predict(X_test)
_print_evaluation("Product-Category Classification Report:", y_test_pc, y_pred_pc)

# Hazard-Category
y_pred_hc = svm_hazard_category.predict(X_test)
_print_evaluation("\nHazard-Category Classification Report:", y_test_hc, y_pred_hc)

# Product
y_pred_p = svm_product.predict(X_test)
_print_evaluation("\nProduct Classification Report:", y_test_p, y_pred_p)

# Hazard
y_pred_h = svm_hazard.predict(X_test)
_print_evaluation("\nHazard Classification Report:", y_test_h, y_pred_h)

# Optional: Decode predictions back to original labels
decoded_product_predictions = label_encoder_product.inverse_transform(y_pred_p)
decoded_hazard_predictions = label_encoder_hazard.inverse_transform(y_pred_h)


Product-Category Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.43      0.76      0.55       123
           2       0.78      0.63      0.70        49
           3       0.91      0.25      0.39        40
           4       0.88      0.58      0.70        24
           5       1.00      0.50      0.67         4
           6       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       0.62      0.62      0.62       112
          10       0.57      0.25      0.35        16
          11       0.00      0.00      0.00         1
          12       0.95      0.66      0.78        56
          13       0.74      0.94      0.83       282
          14       0.90      0.61      0.73        31
          15       0.72      0.54      0.62        63
          16       1.00      0.22      0.36         9
          17       1.00      0.83      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Hazard-Category Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       377
           1       0.89      0.99      0.94       339
           2       0.86      0.79      0.82        68
           3       1.00      0.20      0.33         5
           4       0.89      0.93      0.91       111
           5       0.79      0.54      0.64        68
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00        10
           8       0.75      0.33      0.46        27
           9       1.00      0.18      0.31        11

    accuracy                           0.89      1017
   macro avg       0.71      0.49      0.54      1017
weighted avg       0.88      0.89      0.88      1017

Accuracy: 0.8938053097345132


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Product Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           2       0.19      0.67      0.30         6
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          19       1.00      0.67      0.80         3
          20       0.00      0.00      0.00         3
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         2
          25       0.00      0.00      0.00         1
          27       0.00      0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Hazard Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         4
           5       0.58      0.50      0.54        14
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       1.00      0.25      0.40         4
          14       0.75      0.38      0.50         8
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         1
          17       0.72      0.84      0.77       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
