<a href="https://colab.research.google.com/github/karthiksairam01/SemEval-Task9/blob/main/root/notebooks/NLP__Final_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-processing and Importing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone https://github.com/karthiksairam01/SemEval-Task9.git
!git clone https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io.git

Cloning into 'SemEval-Task9'...
remote: Enumerating objects: 145, done.[K
remote: Counting objects: 100% (145/145), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 145 (delta 39), reused 48 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (145/145), 1.14 MiB | 5.73 MiB/s, done.
Resolving deltas: 100% (39/39), done.
Cloning into 'food-hazard-detection-semeval-2025.github.io'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 125 (delta 57), reused 58 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (125/125), 3.80 MiB | 5.23 MiB/s, done.
Resolving deltas: 100% (57/57), done.


In [None]:
#!rm -rf /content/SemEval-Task9/

In [None]:
# Install necessary libraries
!pip install networkx pandas tqdm

#!pip install -U pip setuptools wheel



In [None]:
import numpy as np
import networkx as nx
import pandas as pd
from tqdm import tqdm
import json
from sklearn.metrics import classification_report

# Functions (deprecated)

In [None]:
def get_titles_dict(incidents: pd.DataFrame, column: str) -> dict:
    """
    Map every unique label in a column to the nouns and adjectives it contains.

    Parameters:
    - incidents (pd.DataFrame): Frame holding the label column.
    - column (str): Name of the column whose unique values are analysed.

    Returns:
    - result (dict): {label: {"words": [token texts tagged NOUN or ADJ]}}.

    NOTE(review): relies on a module-level `nlp` callable that is not defined in
    the visible notebook cells -- presumably a spaCy pipeline; confirm before
    reviving this deprecated helper.
    """
    labels = incidents[column].values
    try:
        # Cells holding sequences of labels are flattened before de-duplication.
        values = np.unique(np.concatenate(labels))
    except ValueError:
        # Scalar cells cannot be concatenated; de-duplicate them directly.
        values = np.unique(labels)

    # The per-label occurrence counts that used to be computed here were never
    # read and cost O(len(values) * len(rows)), so they have been dropped.
    wanted_pos = {"NOUN", "ADJ"}
    return {
        v: {"words": [token.text for token in nlp(v) if token.pos_ in wanted_pos]}
        for v in values
    }

def get_nodes_dict(incidents: pd.DataFrame, column: str) -> dict:
    """
    Map every unique label in a column to its content words.

    Parameters:
    - incidents (pd.DataFrame): Frame holding the label column.
    - column (str): Name of the column whose unique values are analysed.

    Returns:
    - result (dict): {label: {"words": [token texts tagged NOUN, PROPN, VERB or ADJ]}}.

    NOTE(review): relies on a module-level `nlp` callable that is not defined in
    the visible notebook cells -- presumably a spaCy pipeline; confirm before
    reviving this deprecated helper.
    """
    labels = incidents[column].values
    try:
        # Cells holding sequences of labels are flattened before de-duplication.
        values = np.unique(np.concatenate(labels))
    except ValueError:
        # Scalar cells cannot be concatenated; de-duplicate them directly.
        values = np.unique(labels)

    # The per-label occurrence counts that used to be computed here were never
    # read and cost O(len(values) * len(rows)), so they have been dropped.
    wanted_pos = {"NOUN", "PROPN", "VERB", "ADJ"}
    return {
        v: {"words": [token.text for token in nlp(v) if token.pos_ in wanted_pos]}
        for v in values
    }

def print_nodes(incidents: pd.DataFrame, column: str):
    """
    Print every unique label in a column followed by its content words.

    Parameters:
    - incidents (pd.DataFrame): Frame holding the label column.
    - column (str): Name of the column whose unique values are analysed.

    Returns:
    - None (one line per label is written to stdout).

    NOTE(review): relies on a module-level `nlp` callable that is not defined in
    the visible notebook cells -- presumably a spaCy pipeline; confirm before
    reviving this deprecated helper.
    """
    labels = incidents[column].values
    try:
        # Cells holding sequences of labels are flattened before de-duplication.
        values = np.unique(np.concatenate(labels))
    except ValueError:
        # Scalar cells cannot be concatenated; de-duplicate them directly.
        values = np.unique(labels)

    # The per-label occurrence counts that used to be computed here were never
    # read and cost O(len(values) * len(rows)), so they have been dropped.
    wanted_pos = {"NOUN", "PROPN", "VERB", "ADJ"}
    for v in values:
        words = [token.text for token in nlp(v) if token.pos_ in wanted_pos]
        print(f'  {v}: {words}')

# Functions
## 1. Preprocess ConceptNet URI
## 2. Convert ConceptNet CSV to a graph

In [None]:
def canonicalize_node(uri):
    """
    Reduce a ConceptNet URI to its first three path segments.

    Everything after the lemma (part-of-speech tag, sense labels, ...) is
    discarded:
    "/c/en/condiment/n"  -> "/c/en/condiment"
    "/c/en/condiments/n" -> "/c/en/condiments"
    "/c/en/apple"        -> "/c/en/apple"

    A URI with fewer than three segments is returned exactly as it was given.
    """
    segments = uri.strip('/').split('/')

    if len(segments) < 3:
        # Nothing beyond the lemma to trim; hand the input back untouched.
        return uri

    kind, language, lemma = segments[:3]
    return f"/{kind}/{language}/{lemma}"

def load_conceptnet_lite_csv(file_path):
    """
    Loads the filtered ConceptNet Lite CSV into a NetworkX graph.

    The CSV must provide 'start', 'end' and 'rel' columns. Both endpoints are
    canonicalized with canonicalize_node, so several raw edges can collapse onto
    the same node pair; in that case the graph keeps a single edge whose
    'relation' attribute is the one from the last such row.

    Parameters:
    - file_path (str): Path to the filtered CSV file.

    Returns:
    - G (networkx.Graph): The constructed undirected graph with canonicalized nodes.
      If the CSV cannot be read, the error is printed and an empty graph is returned.
    """
    print("Loading ConceptNet Lite into NetworkX...")
    G = nx.Graph()

    try:
        df = pd.read_csv(file_path, sep=',', encoding='utf-8')
    except Exception as e:
        # Best effort by design: report the problem and return the empty graph.
        print(f"Error reading filtered CSV: {e}")
        return G

    print("Adding edges to NetworkX...")
    # Walk the three columns in lockstep instead of DataFrame.iterrows(): iterrows
    # builds a Series object for every row, which is needlessly slow for a file
    # with millions of edges.
    edge_rows = zip(df['start'], df['end'], df['rel'])
    for start, end, rel in tqdm(edge_rows, total=len(df), desc="Adding edges"):
        G.add_edge(canonicalize_node(start), canonicalize_node(end), relation=rel)

    print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    return G


# Loading ConceptNet with NetworkX

In [None]:
# Location of the filtered ConceptNet CSV on the mounted Google Drive; per its
# file name it holds only the RelatedTo, FormOf and Synonym relations.
FILTERED_CSV_FILE = "/content/drive/MyDrive/NeSym/conceptnet-lite[RelatedTo,FormOf,Synonym].csv"

# Build the global graph G used by every later cell (the recorded run took
# roughly two minutes for ~2.3M rows).
G = load_conceptnet_lite_csv(FILTERED_CSV_FILE)

Loading ConceptNet Lite into NetworkX...
Adding edges to NetworkX...


Adding edges: 100%|██████████| 2304597/2304597 [01:56<00:00, 19712.06it/s]


Graph has 948526 nodes and 2052758 edges.


# Output Labels (hazard/prod categories)

In [None]:
# Hazard categories: JSON object mapping each category label to its keyword list.
json_file_path = '/content/SemEval-Task9/root/data/extracted_keywords_hazard_category.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Round-trip through a two-column frame, then back to {category: keywords}.
df_1 = pd.DataFrame(list(data.items()), columns=['hazard_category', 'extracted_terms'])
hazards = dict(zip(df_1['hazard_category'], df_1['extracted_terms']))

print(hazards)

# Product categories: same layout; `json_file_path` and `data` are reused.
json_file_path = '/content/SemEval-Task9/root/data/extracted_keywords_product_category.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)

df_2 = pd.DataFrame(list(data.items()), columns=['product_category', 'extracted_terms'])
products = dict(zip(df_2['product_category'], df_2['extracted_terms']))
print(products)

{'allergens': ['allergens'], 'chemical': ['chemical'], 'food additives and flavourings': ['food additives', 'flavourings'], 'foreign bodies': ['foreign bodies', 'bodies'], 'fraud': ['fraud'], 'migration': ['migration'], 'organoleptic aspects': ['organoleptic', 'aspects'], 'other hazard': ['hazard'], 'packaging defect': ['defect']}
{'alcoholic beverages': ['alcoholic beverage', 'beverage', 'alcoholic', 'beverages'], 'cereals and bakery products': ['cereals', 'bakery', 'products'], 'cocoa and cocoa preparations, coffee and tea': ['cocoa', 'coffee', 'tea'], 'dietetic foods, food supplements, fortified foods': ['dietetic', 'fortified', 'food', 'supplements'], 'fats and oils': ['fats', 'oils'], 'feed materials': ['materials'], 'food additives and flavourings': ['additive', 'flavourings', 'food additives'], 'food contact materials': ['food', 'contact', 'materials'], 'fruits and vegetables': ['fruits', 'vegetables'], 'herbs and spices': ['herbs', 'spices'], 'honey and royal jelly': ['honey', 

# Input Labels (title keywords)

In [None]:
# Training titles: JSON object mapping each incident title to its extracted keywords.
json_file_path = '/content/SemEval-Task9/root/data/extracted_keywords_incidents_train.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)

# One row per title; later cells filter and rebind this global `df`.
df = pd.DataFrame(list(data.items()), columns=['title', 'extracted_terms'])

len(df)

4943

# Experimentation

## 1. Removing ban words

In [None]:
def clean_categories(categories, remove_strings):
    """
    Drop banned keywords from every category's keyword list.

    A keyword is removed only when it is exactly equal to one of the banned
    strings (no substring matching). Category labels themselves are never
    edited, but a category whose keyword list ends up empty is left out of the
    result entirely.

    Parameters:
    - categories (dict): Dictionary mapping category labels to their associated keywords.
    - remove_strings (list): Keywords to discard.

    Returns:
    - cleaned_categories (dict): New dictionary with the surviving keywords per category.
    """
    def _is_allowed(keyword):
        return not any(banned == keyword for banned in remove_strings)

    surviving = {
        label: [keyword for keyword in keywords if _is_allowed(keyword)]
        for label, keywords in categories.items()
    }

    # Categories left without any keyword are omitted altogether.
    return {label: kept for label, kept in surviving.items() if kept}


In [None]:
def remove_terms_from_extracted_terms(df, terms_to_remove):
    """Return a copy of `df` whose 'extracted_terms' lists omit the banned terms.

    Args:
        df: DataFrame with an 'extracted_terms' column holding lists of terms.
        terms_to_remove: Collection of terms to drop (exact membership test).

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    def _without_banned(terms):
        return [term for term in terms if term not in terms_to_remove]

    result = df.copy()
    result['extracted_terms'] = result['extracted_terms'].map(_without_banned)
    return result

## 2. Pruning

In [None]:
# Ban list: generic words removed from the category keywords and the title keywords.
to_remove = ['good', 'white', 'recall', 'selected', 'stores', 'sold', 'food', 'materials', 'emphasise', 'beverage', 'beverages', 'products', 'mixed', 'issues', 'mix', 'alert', 'health', 'product', 'update']
# Show the categories before pruning, then rebind the global to the pruned version.
print(hazards)
hazards = clean_categories(hazards, to_remove)


print(products)
products = clean_categories(products, to_remove)

print(df.head())

df_cleaned = remove_terms_from_extracted_terms(df, to_remove)

# NOTE: `df` is rebound here, so the unpruned title keywords are no longer
# reachable without re-running the loading cell.
df = df_cleaned
print(df.head())
len(df)

## 3. Searching for one specific useless title repeated 70 times

In [None]:
# Find titles containing the phrase (case-insensitive); rows with a missing
# title count as non-matching because of na=False.
search_string = 'recall notification'
matches = df[df['title'].str.contains(search_string, case=False, na=False)]

print("Rows matching the search string:")
print(matches)
len(matches)

## 4. Deleting said useless title

In [None]:
# Drop every row whose title contains the boilerplate phrase (case-insensitive;
# rows with a missing title are kept because na=False makes them non-matching).
# The phrase is defined before its first use so this cell no longer depends on
# the previous cell having been run, and the filter is applied only once.
search_string = 'recall notification'

df_cleaned = df[~df['title'].str.contains(search_string, case=False, na=False)]

print("DataFrame after removing matching rows:")
print(df_cleaned)

df = df_cleaned

# Function to generate the sub-graphs

In [None]:
from collections import deque

def multi_source_bfs(G, start_nodes, max_distance=3):
    """
    Breadth-first search seeded from several nodes at once, cut off at max_distance.

    Parameters:
    - G (networkx.Graph): The graph to traverse.
    - start_nodes (list): Node identifiers to seed the search with; seeds that
      are not in the graph are skipped with a printed warning.
    - max_distance (int): Nodes further than this many hops from every seed are
      not visited.

    Returns:
    - distance (dict): Dictionary mapping each reached node to its shortest
      distance from any start node.
    """
    reached = {}
    frontier = deque()

    for seed in start_nodes:
        if seed not in G:
            print(f"Warning: Start node {seed} not in graph.")
            continue
        reached[seed] = 0
        frontier.append(seed)

    while frontier:
        here = frontier.popleft()
        depth = reached[here]

        # Nodes sitting on the distance limit are kept but not expanded.
        if depth < max_distance:
            for neighbour in G.neighbors(here):
                if neighbour not in reached:
                    reached[neighbour] = depth + 1
                    frontier.append(neighbour)

    return reached

# Testing and Debugging

In [None]:
# Seed concepts for a manual BFS smoke test.
start_concepts = ["alcohol"]  # Modify this list as needed

# Lower-cased concept names prefixed with the English ConceptNet namespace.
start_uris = [f"/c/en/{concept.lower()}" for concept in start_concepts]

print(start_uris)

['/c/en/alcohol']


In [None]:
# NOTE: this global is also read by the later subgraph-generation cells, so it
# must be set before they run.
max_distance = 5  # Adjust as needed

# Distance map for everything within max_distance hops of the seed URIs.
dist_map = multi_source_bfs(G, start_uris, max_distance)

In [None]:
print(len(dist_map))

825502


## 1. Verify if node in ConceptNet

In [None]:
def verify_nodes(G, nodes):
    """
    Split a list of nodes by whether they are present in the graph.

    Parameters:
    - G (networkx.Graph or networkx.DiGraph): The graph.
    - nodes (list): List of ConceptNet URIs to verify.

    Returns:
    - existing_nodes (list): Nodes found in the graph, in input order.
    - missing_nodes (list): Nodes not found in the graph, in input order.
    """
    found, absent = [], []
    for candidate in nodes:
        bucket = found if candidate in G else absent
        bucket.append(candidate)
    return found, absent

In [None]:
# Manual check: is this concept present in the loaded graph?
# NOTE: rebinds the globals `start_concepts` and `start_uris` from the BFS test above.
start_concepts = ["dietetical"]  # Modify this list as needed

start_uris = [f"/c/en/{concept.lower()}" for concept in start_concepts]

print(start_uris)

existing, missing = verify_nodes(G, start_uris)

print("\n=== Node Verification ===")
for node in existing:
    print(f"{node} exists in the graph.")
for node in missing:
    print(f"{node} does NOT exist in the graph.")

['/c/en/dietetical']

=== Node Verification ===
/c/en/dietetical exists in the graph.


# Subgraph generation

In [None]:
#print(products)

# Convert product and hazard keywords to ConceptNet URIs
# (spaces become underscores, everything is lower-cased).
product_uris = {cat: [f"/c/en/{kw.replace(' ', '_').lower()}" for kw in kws] for cat, kws in products.items()}
hazard_uris = {cat: [f"/c/en/{kw.replace(' ', '_').lower()}" for kw in kws] for cat, kws in hazards.items()}

# Generate subgraphs for products and hazards: one {node: distance} map per category.
# NOTE: `max_distance` is the global assigned in the "Testing and Debugging"
# section, so that cell must have been run first.
product_subgraphs = {cat: multi_source_bfs(G, uris, max_distance) for cat, uris in product_uris.items()}
hazard_subgraphs = {cat: multi_source_bfs(G, uris, max_distance) for cat, uris in hazard_uris.items()}

{'alcoholic beverages': ['alcoholic beverage', 'alcoholic'], 'cereals and bakery products': ['cereals', 'bakery'], 'cocoa and cocoa preparations, coffee and tea': ['cocoa', 'coffee', 'tea'], 'dietetic foods, food supplements, fortified foods': ['dietetic', 'fortified', 'supplements'], 'fats and oils': ['fats', 'oils'], 'food additives and flavourings': ['additive', 'flavourings', 'food additives'], 'food contact materials': ['contact'], 'fruits and vegetables': ['fruits', 'vegetables'], 'herbs and spices': ['herbs', 'spices'], 'honey and royal jelly': ['honey', 'royal jelly'], 'ices and desserts': ['desserts', 'ices'], 'meat, egg and dairy products': ['meat', 'egg', 'dairy products'], 'nuts, nut products and seeds': ['nut', 'nuts', 'seeds'], 'pet feed': ['pet'], 'prepared dishes and snacks': ['dishes', 'snacks', 'prepared'], 'seafood': ['seafood'], 'soups, broths, sauces and condiments': ['broth', 'broths', 'soups', 'soup', 'sauces', 'sauce', 'condiments', 'condiment'], 'sugars and syr

# Classification Function

In [None]:
def classify_title_keywords(df, categories, max_distance=5):
    """
    Classify titles into categories based on the minimum distance between title keywords and pre-generated category subgraphs.

    For each title, every category is scored by the smallest distance any of the
    title's keywords has in that category's distance map; the category with the
    smallest score wins. Ties go to the category that comes first in
    `categories`. A title none of whose keywords appears in any distance map
    (or that has no keywords at all) is mapped to None.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'title' and 'extracted_terms' columns.
    - categories (dict): Dictionary mapping category labels to their distance dictionaries (pre-generated subgraphs).
    - max_distance (int): Currently unused -- the distance maps are already
      bounded by the BFS that produced them. Kept so existing callers that pass
      a third positional argument keep working.

    Returns:
    - classifications (dict): Dictionary mapping titles to their classified category.
      Rows sharing the same title overwrite each other (the last row wins).
    """
    unreachable = float('inf')
    classifications = {}

    # Iterate the two columns in lockstep; this avoids the per-row Series that
    # DataFrame.iterrows() would build.
    for title, keywords in zip(df['title'], df['extracted_terms']):
        # The URIs depend only on the title, so build them once instead of once
        # per category.
        keyword_uris = [f"/c/en/{keyword.replace(' ', '_').lower()}" for keyword in keywords]

        min_distance = unreachable
        assigned_category = None

        for category, distances in categories.items():
            category_min_distance = min(
                (distances.get(uri, unreachable) for uri in keyword_uris),
                default=unreachable,
            )

            # Strict '<' keeps the earliest category on ties and leaves the
            # title unassigned when every distance is infinite.
            if category_min_distance < min_distance:
                min_distance = category_min_distance
                assigned_category = category

        classifications[title] = assigned_category

    return classifications


# Classification

In [None]:
categories = product_subgraphs
# The third parameter of classify_title_keywords is max_distance, not the graph:
# the category subgraphs already carry the BFS distances, so G is not needed here.
classifications_products_cat = classify_title_keywords(df, categories, max_distance)

#classifications_products_cat

In [None]:
categories = hazard_subgraphs
# The third parameter of classify_title_keywords is max_distance, not the graph:
# the category subgraphs already carry the BFS distances, so G is not needed here.
classifications_hazards_cat = classify_title_keywords(df, categories, max_distance)

#classifications_hazards_cat

In [None]:
# Spot-check the product-category prediction for one known title. In the
# recorded run this printed None, i.e. the title is present but was left
# unclassified.
title = "2009 - deluxe ice cream co. expands recall to include additional codes of tin roof sundae, goo goo cluster, and candy bar half gallon packaged ice cream product, 3 gallon tin roof sundae and qc 24 pack sundae cone because of possible health risk"

if title in classifications_products_cat:
    print(classifications_products_cat[title])
else:
    print(f"The key '{title}' does not exist in the dictionary.")

None


# Evaluation

## 1. Preprocessing
Format the classifications for evaluation and replace unclassified titles (category `None`, i.e. no keyword was within reach of any category) with 'Unknown'.

In [None]:
# Prepare the data to evaluate

def prepare_predictions(classifications):
    """
    Copy the classifications into a fresh dictionary for evaluation.

    Parameters:
    - classifications (dict): Dictionary of titles and their classified categories.

    Returns:
    - formatted_predictions (dict): New {title: category} dictionary; the input
      is not modified, so the copy can be sanitized safely afterwards.
    """
    return dict(classifications.items())


In [None]:
def sanitize_predictions(predictions):
    """
    Make every prediction a string, in place.

    None becomes 'Unknown'; any other non-string value is converted with str().
    The same (mutated) dictionary is returned for convenience.
    """
    # Snapshot the items so values can be reassigned while looping.
    for title, label in list(predictions.items()):
        if label is None:
            predictions[title] = 'Unknown'  #replace None with 'Unknown'
        elif not isinstance(label, str):
            predictions[title] = str(label)
    return predictions

In [None]:
# Prepare the predictions
formatted_predictions_pc = prepare_predictions(classifications_products_cat)
formatted_predictions_pc = sanitize_predictions(formatted_predictions_pc)

print(formatted_predictions_pc)
len(formatted_predictions_pc)




4880

In [None]:
# Prepare the hazard-category predictions: copy, then turn None into 'Unknown'.
formatted_predictions_hc = prepare_predictions(classifications_hazards_cat)
formatted_predictions_hc = sanitize_predictions(formatted_predictions_hc)

# NOTE: this prints the whole dictionary (thousands of entries in the recorded run).
print(formatted_predictions_hc)
len(formatted_predictions_hc)



4880

## Evaluation Function

In [None]:
def evaluate_predictions(dataset_path, predictions, category):
    """
    Score predictions against the ground-truth column of a dataset and save the comparison.

    Only dataset rows whose title has a prediction are scored. A classification
    report is printed and the per-row comparison is written to
    'predictions_vs_truth_<category>.csv' in the current working directory.

    Parameters:
    - dataset_path (str): Path to the dataset containing ground truth.
    - predictions (dict): Dictionary with titles as keys and predicted categories as values.
                          Format: {title: predicted_category}
    - category (str): The ground-truth column to evaluate (e.g., 'hazard-category', 'product-category').

    Returns:
    - None (prints the evaluation results)

    Raises:
    - ValueError: If the dataset lacks the 'title' column or the `category` column.
    """
    import pandas as pd
    from sklearn.metrics import classification_report

    dataset = pd.read_csv(dataset_path)

    required_columns = ['title', category]
    if any(col not in dataset.columns for col in required_columns):
        raise ValueError(f"The dataset must contain the columns: {required_columns}")

    # Keep only the rows for which a prediction exists.
    scored = dataset[dataset['title'].isin(predictions.keys())]

    # One (title, truth, prediction) triple per scored row, in dataset order.
    examples = [
        (row_title, truth, predictions.get(row_title, 'Unknown'))
        for row_title, truth in zip(scored['title'], scored[category])
    ]
    true_labels = [truth for _, truth, _ in examples]
    predicted_labels = [guess for _, _, guess in examples]

    print(f"Evaluation of {category.replace('-', ' ').capitalize()} Predictions:")
    print(classification_report(true_labels, predicted_labels, zero_division=0))

    comparison = pd.DataFrame(examples, columns=['title', 'ground_truth', 'predicted'])
    comparison.to_csv(f"predictions_vs_truth_{category}.csv", index=False)
    print(f"\nFull results saved to 'predictions_vs_truth_{category}.csv'")


In [None]:
# Ground-truth training set from the cloned SemEval task repository.
dataset_path = "/content/food-hazard-detection-semeval-2025.github.io/data/incidents_train.csv"

# Prints the report and writes predictions_vs_truth_product-category.csv.
evaluate_predictions(dataset_path, formatted_predictions_pc, 'product-category')

Evaluation of Product category Predictions:
                                                   precision    recall  f1-score   support

                                          Unknown       0.00      0.00      0.00         0
                              alcoholic beverages       0.06      0.80      0.11        59
                      cereals and bakery products       0.40      0.30      0.35       661
     cocoa and cocoa preparations, coffee and tea       0.14      0.65      0.23       205
                                    confectionery       0.00      0.00      0.00       169
dietetic foods, food supplements, fortified foods       0.65      0.13      0.22       128
                                    fats and oils       0.14      0.05      0.08        19
                                   feed materials       0.00      0.00      0.00         6
                   food additives and flavourings       0.00      0.00      0.00         8
                           food contact mater

In [None]:
# Prints the report and writes predictions_vs_truth_hazard-category.csv.
evaluate_predictions(dataset_path, formatted_predictions_hc, 'hazard-category')

Evaluation of Hazard category Predictions:
                                precision    recall  f1-score   support

                       Unknown       0.00      0.00      0.00         0
                     allergens       0.94      0.38      0.54      1826
                    biological       0.00      0.00      0.00      1691
                      chemical       0.07      0.84      0.13       279
food additives and flavourings       0.09      0.13      0.11        23
                foreign bodies       0.11      0.02      0.03       547
                         fraud       0.21      0.11      0.15       342
                     migration       0.03      0.33      0.05         3
          organoleptic aspects       0.00      0.00      0.00        52
                  other hazard       0.04      0.17      0.07       128
              packaging defect       0.10      0.13      0.12        54

                      accuracy                           0.20      4945
                   

# Hazards

In [None]:
# Fine-grained hazard labels: JSON object mapping each hazard to its keyword list.
json_file_path = '/content/SemEval-Task9/root/data/extracted_keywords_hazards.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Round-trip through a two-column frame, then back to {hazard: keywords}.
df_3 = pd.DataFrame(list(data.items()), columns=['hazard', 'extracted_terms'])
hazards_true = dict(zip(df_3['hazard'], df_3['extracted_terms']))

print(hazards_true)

{'Aflatoxin': ['aflatoxin'], 'abnormal smell': ['smell', 'abnormal'], 'alcohol content': ['alcohol', 'content'], 'alkaloids': ['alkaloids'], 'allergens': ['allergens'], 'almond': ['almond'], 'altered organoleptic characteristics': ['altered', 'organoleptic', 'characteristics'], 'amygdalin': ['amygdalin'], 'antibiotics, vet drugs': ['antibiotics'], 'bacillus spp.': ['bacillus'], 'bad smell / off odor': ['bad', 'smell', 'odor'], 'bone fragment': ['bone', 'fragment'], 'brazil nut': ['brazil', 'nut'], 'bulging packaging': ['bulging'], 'cashew': ['cashew'], 'celery and products thereof': ['celery', 'products'], 'cereals containing gluten and products thereof': ['cereals', 'gluten', 'products'], 'cereals': ['cereals'], 'chemical compound (high content)': ['chemical', 'compound', 'high'], 'chemical compound (unauthorised)': ['chemical', 'compound', 'unauthorised'], 'chemical': ['chemical'], 'chlorine': ['chlorine'], 'clostridium botulinum': ['clostridium', 'botulinum'], 'coconut': ['coconut']

In [None]:
# Apply the same ban list as for the categories; hazards left without any
# keyword are dropped by clean_categories.
hazards_true = clean_categories(hazards_true, to_remove)
hazards_true

{'Aflatoxin': ['aflatoxin'],
 'abnormal smell': ['smell', 'abnormal'],
 'alcohol content': ['alcohol', 'content'],
 'alkaloids': ['alkaloids'],
 'allergens': ['allergens'],
 'almond': ['almond'],
 'altered organoleptic characteristics': ['altered',
  'organoleptic',
  'characteristics'],
 'amygdalin': ['amygdalin'],
 'antibiotics, vet drugs': ['antibiotics'],
 'bacillus spp.': ['bacillus'],
 'bad smell / off odor': ['bad', 'smell', 'odor'],
 'bone fragment': ['bone', 'fragment'],
 'brazil nut': ['brazil', 'nut'],
 'bulging packaging': ['bulging'],
 'cashew': ['cashew'],
 'celery and products thereof': ['celery'],
 'cereals containing gluten and products thereof': ['cereals', 'gluten'],
 'cereals': ['cereals'],
 'chemical compound (high content)': ['chemical', 'compound', 'high'],
 'chemical compound (unauthorised)': ['chemical', 'compound', 'unauthorised'],
 'chemical': ['chemical'],
 'chlorine': ['chlorine'],
 'clostridium botulinum': ['clostridium', 'botulinum'],
 'coconut': ['coconu

In [None]:
# Convert product and hazard keywords to ConceptNet URIs
hazard_true_uris = {cat: [f"/c/en/{kw.replace(' ', '_').lower()}" for kw in kws] for cat, kws in hazards_true.items()}

# Generate subgraphs for products and hazards
hazard_true_subgraphs = {cat: multi_source_bfs(G, uris, max_distance) for cat, uris in hazard_true_uris.items()}



In [None]:
categories = hazard_true_subgraphs
# The third parameter of classify_title_keywords is max_distance, not the graph:
# the hazard subgraphs already carry the BFS distances, so G is not needed here.
classifications_hazards = classify_title_keywords(df, categories, max_distance)

In [None]:
classifications_hazards

In [None]:
# Prepare the predictions
formatted_hazards = prepare_predictions(classifications_hazards)
formatted_hazards = sanitize_predictions(formatted_hazards)


print(formatted_hazards)
len(formatted_hazards)




4880

In [None]:
# Ground-truth training set from the cloned SemEval task repository.
dataset_path = "/content/food-hazard-detection-semeval-2025.github.io/data/incidents_train.csv"

# Prints the report and writes predictions_vs_truth_hazard.csv.
evaluate_predictions(dataset_path, formatted_hazards, 'hazard')

Evaluation of Hazard Predictions:
                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.07      0.60      0.12        10
                                          Unknown       0.00      0.00      0.00         0
                                   abnormal smell       0.01      0.25      0.02         4
                                  alcohol content       0.00      0.00      0.00         3
                                        alkaloids       1.00      0.33      0.50         3
                                        allergens       0.02      0.33      0.04        12
                                           almond       0.30      0.38      0.34        64
             altered organoleptic characteristics       0.00      0.00      0.00         3
                                        amygdalin       0.00      0.00      0.00         3
                           antibiotics, vet drugs      

# Products

In [None]:
# Fine-grained product labels: JSON object mapping each product to its keyword list.
# NOTE(review): unlike the other keyword files, this one is read from /content
# directly rather than from the cloned SemEval-Task9 repository -- presumably
# it was uploaded by hand; confirm where it should come from.
json_file_path = '/content/extracted_keywords_products.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Round-trip through a two-column frame, then back to {product: keywords}.
df_4 = pd.DataFrame(list(data.items()), columns=['product', 'extracted_terms'])
products_true = dict(zip(df_4['product'], df_4['extracted_terms']))

print(products_true)

{'Catfishes (freshwater)': ['catfish', 'freshwater'], 'Dried pork meat': ['meat', 'dried'], 'Fishes not identified': ['identify', 'fishes'], 'Groupers (generic)': ['groupers', 'generic'], 'Not classified pork meat': ['meat', 'pork'], 'Pangas catfishes (generic)': ['catfish', 'pangas'], 'Precooked cooked pork meat products': ['cook', 'cooked', 'products', 'meat', 'pork'], 'Torpedo-shaped catfishes (generic)': ['catfishes', 'generic'], 'Veggie Burger': ['veggie', 'burger'], 'adobo seasoning': ['adobo'], 'after dinner mints': ['mints', 'dinner'], 'alcoholic beverages': ['alcoholic beverages', 'alcoholic', 'beverages'], 'alfalfa sprouts': ['alfalfa', 'sprouts'], 'all purpose seasoning': ['purpose'], 'allspice': ['allspice'], 'almond kernels': ['almond', 'kernels'], 'almond milk': ['almond', 'milk'], 'almond powder': ['almond', 'powder'], 'almond products': ['almond', 'products'], 'almonds': ['almonds', 'almond'], 'amaranth': ['amaranth'], 'anchovies in oil': ['oil', 'anchovy'], 'anchovy pa

In [None]:
# Apply the same ban list as for the categories; products left without any
# keyword are dropped by clean_categories.
products_true = clean_categories(products_true, to_remove)
products_true

{'Catfishes (freshwater)': ['catfish', 'freshwater'],
 'Dried pork meat': ['meat', 'dried'],
 'Fishes not identified': ['identify', 'fishes'],
 'Groupers (generic)': ['groupers', 'generic'],
 'Not classified pork meat': ['meat', 'pork'],
 'Pangas catfishes (generic)': ['catfish', 'pangas'],
 'Precooked cooked pork meat products': ['cook', 'cooked', 'meat', 'pork'],
 'Torpedo-shaped catfishes (generic)': ['catfishes', 'generic'],
 'Veggie Burger': ['veggie', 'burger'],
 'adobo seasoning': ['adobo'],
 'after dinner mints': ['mints', 'dinner'],
 'alcoholic beverages': ['alcoholic beverages', 'alcoholic'],
 'alfalfa sprouts': ['alfalfa', 'sprouts'],
 'all purpose seasoning': ['purpose'],
 'allspice': ['allspice'],
 'almond kernels': ['almond', 'kernels'],
 'almond milk': ['almond', 'milk'],
 'almond powder': ['almond', 'powder'],
 'almond products': ['almond'],
 'almonds': ['almonds', 'almond'],
 'amaranth': ['amaranth'],
 'anchovies in oil': ['oil', 'anchovy'],
 'angus beef': ['angus', 'b

In [None]:
# Convert product and hazard keywords to ConceptNet URIs
product_true_uris = {cat: [f"/c/en/{kw.replace(' ', '_').lower()}" for kw in kws] for cat, kws in products_true.items()}

max_distance = 5

# Generate subgraphs for products and hazards
product_true_subgraphs = {cat: multi_source_bfs(G, uris, max_distance) for cat, uris in product_true_uris.items()}



In [None]:
categories = product_true_subgraphs
# The third parameter of classify_title_keywords is max_distance, not the graph:
# the product subgraphs already carry the BFS distances, so G is not needed here.
classifications_products = classify_title_keywords(df, categories, max_distance)

In [None]:
classifications_products

{'Vilis Lamb, Mint and Rosemary Pies 160g': 'chicken pie',
 "'Jackpot Mix' brand of Mix of Pretzels, Sticks and Salted Biscuits recalled": 'biscuits',
 '(Updated) Bradbury &amp  Son (Buxton) recalls Lo-Col because milk is not emphasised on the labelling': 'almond milk',
 '*(Updated on 2 June 2020) Not to consume a batch of bottled apple juice drink contaminated with patulin': 'apple cake',
 '*(Updated on 5 July 2022) Not to consume smoked salmon products with dill possibly contaminated with Listeria monocytogenes': 'Groupers (generic)',
 '165368 C. Corporation Recalls Pork Products Due to Possible Listeria Contamination': 'Not classified pork meat',
 '168 Express Ltd recalls various Jelly products because of a choking hazard': 'Dried pork meat',
 '2006 - Consumer Alert: Listeria Contamination in Raw Milk': 'almond milk',
 '2006 - Pinnacle Foods Corporation Issues Allergy Alert on Bakery-Style Chocolate Chip Muffin Mix': 'cereal bars with chocolate',
 '2006 - Pinnacle Foods Group Inc. I

In [None]:
# Prepare the predictions
formatted_products = prepare_predictions(classifications_products)
formatted_products = sanitize_predictions(formatted_products)


print(formatted_products)
len(formatted_products)




4943

In [None]:
# Ground-truth training set from the cloned SemEval task repository.
dataset_path = "/content/food-hazard-detection-semeval-2025.github.io/data/incidents_train.csv"

# Prints the report and writes predictions_vs_truth_product.csv.
evaluate_predictions(dataset_path, formatted_products, 'product')

Evaluation of Product Predictions:
                                                                        precision    recall  f1-score   support

                                                Catfishes (freshwater)       0.40      0.18      0.25        11
                                                       Dried pork meat       0.00      1.00      0.01         1
                                                 Fishes not identified       0.00      0.00      0.00        34
                                                    Groupers (generic)       0.00      0.00      0.00         1
                                              Not classified pork meat       0.06      0.82      0.11        11
                                            Pangas catfishes (generic)       0.00      0.00      0.00         3
                                   Precooked cooked pork meat products       0.00      0.00      0.00         9
                                    Torpedo-shaped catfishes (generi

# Miscellaneous scratch work

In [None]:
import gzip

# Define the target string: any line mentioning a sense-tagged "/c/en/oils/..." URI.
target = "/c/en/oils/"

# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this path. The name also ends in .csv while the file is opened with
# gzip.open below -- confirm whether the file is actually gzip-compressed.
file_name = "/content/conceptnet-lite.csv"

# Decompress and process line by line
output_file = "filtered_results.csv"
with gzip.open(file_name, "rt", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for line in f_in:
        if target in line:
            f_out.write(line)  # Write matching lines to a new CSV file

print(f"Filtered lines containing '{target}' are saved to {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/conceptnet-lite.csv'

In [None]:
file_name = "/content/conceptnet-lite.csv"

!grep '/c/en/oils/' /content/conceptnet-lite.csv