In [1]:
import pandas as pd
from google.colab import files
from google.colab import drive
# Mount Google Drive; the datasets below are read from underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)



# Load datasets
# The paths are built from the absolute mount point so that loading does not
# depend on the kernel's current working directory happening to be /content.
DATA_DIR = f"{DRIVE_MOUNT_POINT}/MyDrive/ColabNotebooks"
train_data = pd.read_csv(f"{DATA_DIR}/incidents_train.csv")
valid_data = pd.read_csv(f"{DATA_DIR}/incidents_valid.csv")


# Function to check for missing and duplicate values
def check_data_quality(data, name):
    """Print a short data-quality report for one DataFrame.

    Args:
        data: DataFrame to inspect.
        name: Human-readable dataset name shown in the report header.

    The report lists the frame's shape, the number of missing values per
    column, and the number of fully duplicated rows, followed by a separator
    line. Nothing is returned.
    """
    missing_per_column = data.isna().sum()
    duplicate_row_count = data.duplicated().sum()

    report_lines = [
        f"--- {name} ---",
        f"Shape: {data.shape}",
        f"Missing values:\n{missing_per_column}\n",
        f"Duplicate rows: {duplicate_row_count}\n",
        "=" * 50,
    ]
    # A single print of the joined lines emits exactly what one print per line would.
    print("\n".join(report_lines))

# Run the same quality report on every dataset, training split first.
for dataset, dataset_name in (
    (train_data, "Training Data"),
    (valid_data, "Validation Data"),
):
    check_data_quality(dataset, dataset_name)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Training Data ---
Shape: (5082, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0

--- Validation Data ---
Shape: (565, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0



# Classification


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Model input: the raw incident text of each split.
X_train = train_data['text']
X_test = valid_data['text']

# One LabelEncoder per target column. fit_transform learns the label
# vocabulary from the training column and returns that column's integer
# codes in a single step.
hazard_category_encoder = LabelEncoder()
y_hazard_category = hazard_category_encoder.fit_transform(train_data['hazard-category'])

product_category_encoder = LabelEncoder()
y_product_category = product_category_encoder.fit_transform(train_data['product-category'])

hazard_encoder = LabelEncoder()
y_hazard = hazard_encoder.fit_transform(train_data['hazard'])

product_encoder = LabelEncoder()
y_product = product_encoder.fit_transform(train_data['product'])

# Handle unseen labels in the test data
def transform_with_unknown(encoder, labels, fallback_value=-1):
    """Encode labels with a fitted encoder, tolerating labels it has never seen.

    Args:
        encoder: Fitted label encoder exposing ``classes_``; the position of a
            label in ``classes_`` is its integer code.
        labels: Iterable of raw labels to encode.
        fallback_value: Code assigned to any label absent from
            ``encoder.classes_`` (default ``-1``).

    Returns:
        np.ndarray with one code per input label, in input order.
    """
    # Build the label -> code table once. This replaces a linear scan of
    # ``classes_`` plus a separate ``encoder.transform`` call for every label
    # with a single dictionary lookup per label.
    label_to_code = {label: code for code, label in enumerate(encoder.classes_)}

    return np.array([label_to_code.get(label, fallback_value) for label in labels])

# TF-IDF over character n-grams (2 to 5 characters), accents stripped, capped
# at 5000 features. The vocabulary is fitted on the training text only and
# then applied unchanged to the validation text.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    strip_accents='unicode',
    max_features=5000,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Encode the validation labels; any label the encoder did not see during
# training receives the fallback code from transform_with_unknown.
(
    y_test_hazard_category,
    y_test_product_category,
    y_test_hazard,
    y_test_product,
) = (
    transform_with_unknown(label_encoder, valid_data[column])
    for label_encoder, column in (
        (hazard_category_encoder, 'hazard-category'),
        (product_category_encoder, 'product-category'),
        (hazard_encoder, 'hazard'),
        (product_encoder, 'product'),
    )
)




In [3]:
# ST1: train the hazard-category classifier on the TF-IDF training features.
# random_state=42 pins the forest's random seed.
hazard_category_model = RandomForestClassifier(random_state=42)
# Kept as the cell's last expression, so its return value is the cell's output.
hazard_category_model.fit(X_train_vec, y_hazard_category)




In [4]:
# ST1: train the product-category classifier on the TF-IDF training features.
# random_state=42 pins the forest's random seed.
product_category_model = RandomForestClassifier(random_state=42)
# Kept as the cell's last expression, so its return value is the cell's output.
product_category_model.fit(X_train_vec, y_product_category)

In [5]:
# ST2: train the exact-hazard classifier on the TF-IDF training features.
# random_state=42 pins the forest's random seed.
hazard_model = RandomForestClassifier(random_state=42)
# Kept as the cell's last expression, so its return value is the cell's output.
hazard_model.fit(X_train_vec, y_hazard)

In [6]:
# ST2: train the exact-product classifier on the TF-IDF training features.
# random_state=42 pins the forest's random seed.
product_model = RandomForestClassifier(random_state=42)
# Kept as the cell's last expression, so its return value is the cell's output.
product_model.fit(X_train_vec, y_product)

# Predictions



In [7]:
# ST1, training set: category-level predictions from both category models.
hazard_category_train_preds, product_category_train_preds = (
    category_model.predict(X_train_vec)
    for category_model in (hazard_category_model, product_category_model)
)


In [8]:
# ST2, training set: exact-label predictions from the hazard and product models.
hazard_train_preds, product_train_preds = (
    label_model.predict(X_train_vec)
    for label_model in (hazard_model, product_model)
)

In [9]:
# ST1, test set: category-level predictions from both category models.
hazard_category_test_preds, product_category_test_preds = (
    category_model.predict(X_test_vec)
    for category_model in (hazard_category_model, product_category_model)
)

In [10]:
# ST2, test set: exact-label predictions from the hazard and product models.
hazard_test_preds, product_test_preds = (
    label_model.predict(X_test_vec)
    for label_model in (hazard_model, product_model)
)


# Evaluation


In [11]:
# Function to print macro and micro F1 scores
def print_f1_scores(y_true, y_pred, label):
    """Print the macro- and micro-averaged F1 of one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        label: Name of the target, used as the prefix of the printed line.
    """
    # Macro is computed before micro, each rounded to two decimals on output.
    f1_by_average = {
        averaging: f1_score(y_true, y_pred, average=averaging)
        for averaging in ('macro', 'micro')
    }
    print(f"{label} - Macro F1: {f1_by_average['macro']:.2f}, Micro F1: {f1_by_average['micro']:.2f}")


# Evaluation on Test Set
# One row per target: (display name, encoded ground truth, model predictions).
# For a per-class breakdown of any row, print
# classification_report(y_true, y_pred) for that pair.
test_set_targets = (
    ("Hazard Category", y_test_hazard_category, hazard_category_test_preds),
    ("Product Category", y_test_product_category, product_category_test_preds),
    ("Hazard", y_test_hazard, hazard_test_preds),
    ("Product", y_test_product, product_test_preds),
)

for target_name, target_true, target_pred in test_set_targets:
    print_f1_scores(target_true, target_pred, target_name)


Hazard Category - Macro F1: 0.49, Micro F1: 0.88
Product Category - Macro F1: 0.36, Micro F1: 0.54
Hazard - Macro F1: 0.35, Micro F1: 0.75
Product - Macro F1: 0.15, Micro F1: 0.30


ST1 (Hazard and Product Categories):

The macro F1 scores for hazard category (0.49) and product category (0.36) are moderate. While not perfect, they show that the model captures some class diversity despite class imbalance.
The micro F1 for hazard category (0.88) is excellent, indicating strong overall performance across samples.

ST2 (Exact Hazard and Product Labels):

The macro F1 scores for hazard (0.35) and product (0.15) reveal that predicting rare labels is still a significant challenge.
The micro F1 for hazard (0.75) is decent, but for product (0.30), it is relatively low, suggesting issues with the high variability in product labels.

In [12]:
# Compute Final Scores
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Combine hazard and product macro-F1 into a single score.

    Args:
        hazards_true: Ground-truth hazard labels.
        products_true: Ground-truth product labels.
        hazards_pred: Predicted hazard labels.
        products_pred: Predicted product labels.

    The inputs must support element-wise ``==`` and boolean-mask indexing
    (NumPy arrays, as passed by this notebook).

    Returns:
        The mean of (a) macro-F1 over all hazard predictions and (b) macro-F1
        over product predictions, restricted to the samples whose hazard was
        predicted correctly.
    """
    hazard_f1 = f1_score(hazards_true, hazards_pred, average='macro')

    # Products are only scored where the hazard prediction was correct.
    hazard_is_correct = hazards_pred == hazards_true
    product_f1 = f1_score(
        products_true[hazard_is_correct],
        products_pred[hazard_is_correct],
        average='macro',
    )

    return (hazard_f1 + product_f1) / 2.

# Sub-Task 1: plain average of the two category-level macro-F1 scores.
hazard_category_macro_f1 = f1_score(y_test_hazard_category, hazard_category_test_preds, average='macro')
product_category_macro_f1 = f1_score(y_test_product_category, product_category_test_preds, average='macro')
st1_score = (hazard_category_macro_f1 + product_category_macro_f1) / 2.0
print(f"\nScore Sub-Task 1: {st1_score:.3f}")

# Sub-Task 2: same combination rule as compute_score, applied to the exact
# hazard and product labels.
st2_score = compute_score(
    y_test_hazard,
    y_test_product,
    hazard_test_preds,
    product_test_preds,
)
print(f"Score Sub-Task 2: {st2_score:.3f}")




Score Sub-Task 1: 0.427
Score Sub-Task 2: 0.263


ST1 Score (0.427) :

This score reflects a decent performance on the category-level classification.

ST2 Score (0.263):

This lower score is expected since predicting exact labels (vectors) is inherently more challenging.


In [14]:
import zipfile
# Map every set of integer predictions back to its original string labels,
# each with the encoder that produced the codes.
(
    hazard_category_test_preds_decoded,
    product_category_test_preds_decoded,
    hazard_test_preds_decoded,
    product_test_preds_decoded,
) = (
    label_encoder.inverse_transform(encoded_preds)
    for label_encoder, encoded_preds in (
        (hazard_category_encoder, hazard_category_test_preds),
        (product_category_encoder, product_category_test_preds),
        (hazard_encoder, hazard_test_preds),
        (product_encoder, product_test_preds),
    )
)

# Assemble the submission table: the test-set index followed by one column of
# decoded predictions per target.
# NOTE(review): the input CSV columns use hyphens ('hazard-category'), while
# these output columns use underscores. Confirm which form the submission
# format expects.
submission_columns = {
    "index": valid_data.index,
    "hazard_category": hazard_category_test_preds_decoded,
    "product_category": product_category_test_preds_decoded,
    "hazard": hazard_test_preds_decoded,
    "product": product_test_preds_decoded,
}
submission_df = pd.DataFrame(submission_columns)

# Write the table to 'submission.csv' without the DataFrame's own row index.
submission_df.to_csv("submission.csv", index=False)

# Package 'submission.csv' into 'submission.zip'.
with zipfile.ZipFile("submission.zip", "w") as archive:
    archive.write("submission.csv")

# Trigger the browser download of the zipped submission.
files.download("submission.zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>