In [1]:
pip install dask[dataframe]


Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [2]:
import pandas as pd
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')



# Load datasets
train_data = pd.read_csv("drive/MyDrive/ColabNotebooks/incidents_train.csv")
valid_data = pd.read_csv("drive/MyDrive/ColabNotebooks/incidents_valid.csv")


# Function to check for missing and duplicate values
def check_data_quality(data, name):
    print(f"--- {name} ---")
    print(f"Shape: {data.shape}")
    print(f"Missing values:\n{data.isnull().sum()}\n")
    print(f"Duplicate rows: {data.duplicated().sum()}\n")
    print("="*50)

# Run checks on each dataset
check_data_quality(train_data, "Training Data")
check_data_quality(valid_data, "Validation Data")

Mounted at /content/drive
--- Training Data ---
Shape: (5082, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0

--- Validation Data ---
Shape: (565, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0



In [3]:
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score

# Features and labels for training set
X_train = train_data['text']  # Text column
y_hazard_category = train_data['hazard-category']  # Hazard category
y_product_category = train_data['product-category']  # Product category
y_hazard = train_data['hazard']  # Hazard
y_product = train_data['product']  # Product

# Features and labels for test set
X_test = valid_data['text']  # Text column
y_test_hazard_category = valid_data['hazard-category']  # Hazard category
y_test_product_category = valid_data['product-category']  # Product category
y_test_hazard = valid_data['hazard']  # Hazard
y_test_product = valid_data['product']  # Product

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, strip_accents='unicode', analyzer='char', ngram_range=(2, 5))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [4]:
# ST1: Train models for hazard categorie
hazard_category_model_3 = lgb.LGBMClassifier(random_state=42)
hazard_category_model_3.fit(X_train_vec, y_hazard_category)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.943293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1238306
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 5000
[LightGBM] [Info] Start training from score -1.008359
[LightGBM] [Info] Start training from score -1.071245
[LightGBM] [Info] Start training from score -2.873978
[LightGBM] [Info] Start training from score -5.355406
[LightGBM] [Info] Start training from score -2.203739
[LightGBM] [Info] Start training from score -2.617258
[LightGBM] [Info] Start training from score -7.434848
[LightGBM] [Info] Start training from score -4.563168
[LightGBM] [Info] Start training from score -3.635620
[LightGBM] [Info] Start training from score -4.544476


In [5]:
# ST1: Train models for product categorie

product_category_model_3 = lgb.LGBMClassifier(random_state=42)
product_category_model_3.fit(X_train_vec, y_product_category)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.842836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1238306
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 5000
[LightGBM] [Info] Start training from score -4.455923
[LightGBM] [Info] Start training from score -2.024691
[LightGBM] [Info] Start training from score -3.186353
[LightGBM] [Info] Start training from score -3.397662
[LightGBM] [Info] Start training from score -3.658263
[LightGBM] [Info] Start training from score -5.589021
[LightGBM] [Info] Start training from score -6.741701
[LightGBM] [Info] Start training from score -6.454019
[LightGBM] [Info] Start training from score -6.587550
[LightGBM] [Info] Start training from score -2.251193
[LightGBM] [Info] Start training from score -3.705146
[LightGBM] [Info] Start training from score -6.454019
[LightGBM] [Info] Start training from score -3.130783
[Ligh

In [6]:
# ST2: Train models for hazard
hazard_model_3 = lgb.LGBMClassifier(random_state=42)
hazard_model_3.fit(X_train_vec, y_hazard)



[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m


In [7]:
# ST2: Train models for product

product_model_3 = lgb.LGBMClassifier(random_state=42)
product_model_3.fit(X_train_vec, y_product)



[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m


In [8]:
# Predictions on the training set for ST1
hazard_category_train_preds = hazard_category_model_3.predict(X_train_vec)
product_category_train_preds = product_category_model_3.predict(X_train_vec)




In [9]:
# Predictions on the training set for ST2
hazard_train_preds = hazard_model_3.predict(X_train_vec)
product_train_preds = product_model_3.predict(X_train_vec)



In [10]:
# Predictions on the test set for ST1
hazard_category_test_preds = hazard_category_model_3.predict(X_test_vec)
product_category_test_preds = product_category_model_3.predict(X_test_vec)




In [11]:
# Predictions on the test set for ST2
hazard_test_preds = hazard_model_3.predict(X_test_vec)
product_test_preds = product_model_3.predict(X_test_vec)



In [12]:
# Function to print macro and micro F1 scores
def print_f1_scores(y_true, y_pred, label):
    macro = f1_score(y_true, y_pred, average='macro')
    micro = f1_score(y_true, y_pred, average='micro')
    print(f"{label} - Macro F1: {macro:.2f}, Micro F1: {micro:.2f}")

# Evaluation on Test Set
print("\nTest Set Evaluation:")

# Hazard Category
print("Hazard Category Classification Report (Test):")
print(classification_report(y_test_hazard_category, hazard_category_test_preds))
print_f1_scores(y_test_hazard_category, hazard_category_test_preds, "Hazard Category (Test)")

# Product Category
print("Product Category Classification Report (Test):")
print(classification_report(y_test_product_category, product_category_test_preds))
print_f1_scores(y_test_product_category, product_category_test_preds, "Product Category (Test)")

# Hazard
print("Hazard Classification Report (Test):")
print(classification_report(y_test_hazard, hazard_test_preds))
print_f1_scores(y_test_hazard, hazard_test_preds, "Hazard (Test)")

# Product
print("Product Classification Report (Test):")
print(classification_report(y_test_product, product_test_preds))
print_f1_scores(y_test_product, product_test_preds, "Product (Test)")


Test Set Evaluation:
Hazard Category Classification Report (Test):
                                precision    recall  f1-score   support

                     allergens       0.63      0.92      0.75       207
                    biological       0.30      0.04      0.06       194
                      chemical       0.14      0.18      0.16        28
food additives and flavourings       0.08      0.50      0.14         2
                foreign bodies       0.21      0.41      0.28        63
                         fraud       0.11      0.05      0.07        41
                     migration       0.00      0.00      0.00         0
          organoleptic aspects       0.00      0.00      0.00         8
                  other hazard       0.00      0.00      0.00        14
              packaging defect       0.00      0.00      0.00         8

                      accuracy                           0.41       565
                     macro avg       0.15      0.21      0.15     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [13]:
# Compute Final Scores
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    # Compute F1 for hazards:
    f1_hazards = f1_score(
        hazards_true,
        hazards_pred,
        average='macro'
    )

    # Compute F1 for products:
    f1_products = f1_score(
        products_true[hazards_pred == hazards_true],
        products_pred[hazards_pred == hazards_true],
        average='macro'
    )

    return (f1_hazards + f1_products) / 2. # Final Score for ST1
st1_score = (f1_score(y_test_hazard_category, hazard_category_test_preds, average='macro') +
             f1_score(y_test_product_category, product_category_test_preds, average='macro')) / 2.0
print(f"\nScore Sub-Task 1 (Test): {st1_score:.3f}")

# Final Score for ST2
st2_score = compute_score(y_test_hazard, y_test_product, hazard_test_preds, product_test_preds)
print(f"Score Sub-Task 2 (Test): {st2_score:.3f}")


Score Sub-Task 1 (Test): 0.089
Score Sub-Task 2 (Test): 0.008


In [14]:
import zipfile

# Combine all predictions into a DataFrame
submission_df = pd.DataFrame({
    "index": valid_data.index,  # Use the test set index
    "hazard_category": hazard_category_test_preds,  # Hazard category predictions
    "product_category": product_category_test_preds,  # Product category predictions
    "hazard": hazard_test_preds,  # Hazard predictions
    "product": product_test_preds  # Product predictions
})

# Save the DataFrame to 'submission.csv'
submission_df.to_csv("submission.csv", index=False)

# Create a zip file containing 'submission.csv'
with zipfile.ZipFile("submission.zip", "w") as zipf:
    zipf.write("submission.csv")

# Κατέβασμα του ZIP αρχείου
files.download("submission.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>