In [1]:
import pandas as pd


# Load datasets
# Both files are read with pandas defaults (no index_col), so each frame gets
# a fresh RangeIndex and any leading unnamed CSV column is kept as data.
TRAIN_CSV = "incidents_train.csv"
VALID_CSV = "incidents_valid.csv"

train_data = pd.read_csv(TRAIN_CSV)
valid_data = pd.read_csv(VALID_CSV)


# Function to check for missing and duplicate values
def check_data_quality(data, name):
    """Print a short data-quality report for one DataFrame.

    The report contains, in order: a header with ``name``, the frame's shape,
    the per-column count of missing values, the number of fully duplicated
    rows, and a 50-character ``=`` separator line.

    Args:
        data: DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    missing_per_column = data.isnull().sum()
    duplicate_row_count = data.duplicated().sum()

    # Each entry is printed on its own line; the trailing "\n" inside two of
    # the entries produces the blank line that follows those sections.
    report_lines = [
        f"--- {name} ---",
        f"Shape: {data.shape}",
        f"Missing values:\n{missing_per_column}\n",
        f"Duplicate rows: {duplicate_row_count}\n",
        "=" * 50,
    ]
    print("\n".join(report_lines))

# Run checks on each dataset
for dataset, dataset_name in (
    (train_data, "Training Data"),
    (valid_data, "Validation Data"),
):
    check_data_quality(dataset, dataset_name)

--- Training Data ---
Shape: (5082, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0

--- Validation Data ---
Shape: (565, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score


# The four target columns, in the order they are unpacked below.
LABEL_COLUMNS = ('hazard-category', 'product-category', 'hazard', 'product')

# Features and labels for training set: raw text plus one Series per target.
X_train = train_data['text']
y_hazard_category, y_product_category, y_hazard, y_product = (
    train_data[column] for column in LABEL_COLUMNS
)

# Features and labels for test set (taken from the validation file).
X_test = valid_data['text']
y_test_hazard_category, y_test_product_category, y_test_hazard, y_test_product = (
    valid_data[column] for column in LABEL_COLUMNS
)

# TF-IDF Vectorization over character n-grams of length 2 to 5, with accents
# stripped and the vocabulary capped at 5000 features. The vocabulary is
# fitted on the training text only and then reused for the validation text.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    max_features=5000,
    strip_accents='unicode',
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [3]:
# ST1: fit a multinomial Naive Bayes classifier for the hazard category
hazard_category_model_2 = MultinomialNB()
hazard_category_model_2.fit(X=X_train_vec, y=y_hazard_category)

In [4]:
# ST1: fit a multinomial Naive Bayes classifier for the product category
product_category_model_2 = MultinomialNB()
product_category_model_2.fit(X=X_train_vec, y=y_product_category)

In [5]:
# ST2: fit a multinomial Naive Bayes classifier for the hazard label
hazard_model_2 = MultinomialNB()
hazard_model_2.fit(X=X_train_vec, y=y_hazard)


In [6]:
# ST2: fit a multinomial Naive Bayes classifier for the product label
product_model_2 = MultinomialNB()
product_model_2.fit(X=X_train_vec, y=y_product)

In [8]:
# Each fitted model predicts on the training matrix first and the
# validation ("test") matrix second.
feature_matrices = (X_train_vec, X_test_vec)

# ST1 predictions: hazard category and product category
hazard_category_train_preds, hazard_category_test_preds = (
    hazard_category_model_2.predict(matrix) for matrix in feature_matrices
)
product_category_train_preds, product_category_test_preds = (
    product_category_model_2.predict(matrix) for matrix in feature_matrices
)

# ST2 predictions: hazard and product
hazard_train_preds, hazard_test_preds = (
    hazard_model_2.predict(matrix) for matrix in feature_matrices
)
product_train_preds, product_test_preds = (
    product_model_2.predict(matrix) for matrix in feature_matrices
)


In [9]:
# Evaluation function to print F1 scores
from sklearn.metrics import classification_report, f1_score

def print_f1_scores(y_true, y_pred, label):
    """Print the macro- and micro-averaged F1 for one set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        label: Text printed in front of the two scores.

    Returns:
        None. One line is printed with both scores rounded to two decimals.
    """
    macro_f1, micro_f1 = (
        f1_score(y_true, y_pred, average=averaging)
        for averaging in ('macro', 'micro')
    )
    print(f"{label} - Macro F1: {macro_f1:.2f}, Micro F1: {micro_f1:.2f}")

# Evaluation on Test Set
# One (label, true labels, predictions) entry per target, reported in this
# order: hazard category, product category, hazard, product.
test_set_evaluations = (
    ("Hazard Category (Test)", y_test_hazard_category, hazard_category_test_preds),
    ("Product Category (Test)", y_test_product_category, product_category_test_preds),
    ("Hazard (Test)", y_test_hazard, hazard_test_preds),
    ("Product (Test)", y_test_product, product_test_preds),
)

for eval_label, eval_true, eval_pred in test_set_evaluations:
    print_f1_scores(eval_true, eval_pred, eval_label)

Hazard Category (Test) - Macro F1: 0.32, Micro F1: 0.70
Product Category (Test) - Macro F1: 0.11, Micro F1: 0.33
Hazard (Test) - Macro F1: 0.05, Micro F1: 0.42
Product (Test) - Macro F1: 0.00, Micro F1: 0.07


In [10]:
# Final Scores
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Average the hazard macro-F1 with a hazard-conditional product macro-F1.

    The hazard term is the macro-averaged F1 over all samples. The product
    term is the macro-averaged F1 computed only on the samples whose
    predicted hazard equals the true hazard.

    Args:
        hazards_true: True hazard labels.
        products_true: True product labels.
        hazards_pred: Predicted hazard labels.
        products_pred: Predicted product labels.

    Returns:
        The arithmetic mean of the two F1 values.
    """
    hazard_f1 = f1_score(hazards_true, hazards_pred, average='macro')

    # Mask of the samples with a correct hazard prediction.
    # NOTE(review): relies on the inputs supporting element-wise `==` and
    # boolean-mask indexing (e.g. arrays / Series) — confirm for other types.
    hazard_is_correct = hazards_pred == hazards_true

    product_f1 = f1_score(
        products_true[hazard_is_correct],
        products_pred[hazard_is_correct],
        average='macro'
    )

    return (hazard_f1 + product_f1) / 2.0

# Final Score for ST1: mean of the two category-level macro-F1 values
st1_hazard_category_f1 = f1_score(
    y_test_hazard_category, hazard_category_test_preds, average='macro'
)
st1_product_category_f1 = f1_score(
    y_test_product_category, product_category_test_preds, average='macro'
)
st1_score = (st1_hazard_category_f1 + st1_product_category_f1) / 2.0
print(f"\nScore Sub-Task 1 (Test): {st1_score:.3f}")

# Final Score for ST2: hazard macro-F1 averaged with the product macro-F1
# measured on the samples whose hazard was predicted correctly
st2_score = compute_score(
    hazards_true=y_test_hazard,
    products_true=y_test_product,
    hazards_pred=hazard_test_preds,
    products_pred=product_test_preds,
)
print(f"Score Sub-Task 2 (Test): {st2_score:.3f}")


Score Sub-Task 1 (Test): 0.216
Score Sub-Task 2 (Test): 0.028


In [11]:
import zipfile

# Pick the row identifiers for the submission.
# The CSVs are read without index_col, so valid_data.index is only a fresh
# 0..n-1 RangeIndex. The file's own leading index column is read in as
# "Unnamed: 0" (it appears in the data-quality report). Prefer that column so
# every submitted row carries the identifier stored in the validation file,
# and fall back to the frame index when the column is absent.
if "Unnamed: 0" in valid_data.columns:
    # to_numpy() keeps the values positional, like the prediction arrays.
    submission_index = valid_data["Unnamed: 0"].to_numpy()
else:
    submission_index = valid_data.index

# Combine all predictions into a DataFrame
# NOTE(review): the dataset's label columns use hyphens ("hazard-category",
# "product-category"); confirm the scorer expects the underscore names below.
submission_df = pd.DataFrame({
    "index": submission_index,  # Row identifiers of the validation set
    "hazard_category": hazard_category_test_preds,  # Hazard category predictions
    "product_category": product_category_test_preds,  # Product category predictions
    "hazard": hazard_test_preds,  # Hazard predictions
    "product": product_test_preds  # Product predictions
})

# Save the DataFrame to 'submission.csv' (the frame's own index is not written)
submission_df.to_csv("submission.csv", index=False)

# Create a zip file containing 'submission.csv'
with zipfile.ZipFile("submission.zip", "w") as zipf:
    zipf.write("submission.csv")