In [None]:
import pandas as pd
from google.colab import files
from google.colab import drive
# Mount Google Drive and derive every data path from the mount point, so that
# loading does not depend on the notebook's current working directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Folder on Drive that holds the incident CSV files.
DATA_DIR = f"{DRIVE_MOUNT_POINT}/MyDrive/ColabNotebooks"

# Load datasets
train_data = pd.read_csv(f"{DATA_DIR}/incidents_train.csv")
valid_data = pd.read_csv(f"{DATA_DIR}/incidents_valid.csv")


def check_data_quality(data, name):
    """Print a short data-quality report for a DataFrame.

    The report contains the frame's shape, the number of missing values per
    column, and the number of fully duplicated rows, followed by a separator
    line of 50 '=' characters.

    Args:
        data: pandas DataFrame to inspect.
        name: Human-readable label shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    missing_per_column = data.isnull().sum()
    duplicate_count = data.duplicated().sum()

    report_lines = [
        f"--- {name} ---",
        f"Shape: {data.shape}",
        f"Missing values:\n{missing_per_column}\n",
        f"Duplicate rows: {duplicate_count}\n",
        "=" * 50,
    ]
    print("\n".join(report_lines))

# Report data quality for both splits, training first.
for split_label, split_frame in (
    ("Training Data", train_data),
    ("Validation Data", valid_data),
):
    check_data_quality(split_frame, split_label)

Mounted at /content/drive
--- Training Data ---
Shape: (5082, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0

--- Validation Data ---
Shape: (565, 11)
Missing values:
Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64

Duplicate rows: 0



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

# Column with the raw incident text, and the four label columns in the order
# in which they are unpacked below.
TEXT_COLUMN = 'text'
LABEL_COLUMNS = ('hazard-category', 'product-category', 'hazard', 'product')

# Training split: input text plus one label series per target.
X_train = train_data[TEXT_COLUMN]
(
    y_hazard_category,
    y_product_category,
    y_hazard,
    y_product,
) = (train_data[column] for column in LABEL_COLUMNS)

# Validation split, used as the held-out "test" set in this notebook.
X_test = valid_data[TEXT_COLUMN]
(
    y_test_hazard_category,
    y_test_product_category,
    y_test_hazard,
    y_test_product,
) = (valid_data[column] for column in LABEL_COLUMNS)

# TF-IDF over character n-grams (2 to 5 characters), capped at 5000 features.
# The vocabulary is learned on the training text only and then reused to
# transform the held-out text.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    max_features=5000,
    strip_accents='unicode',
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


def get_classifier(name="svm"):
    """Create a new, unfitted classifier selected by its short name.

    Args:
        name: "svm" for a linear-kernel SVC with probability estimates
            enabled, "logreg" for LogisticRegression (max_iter=1000), or
            "knn" for KNeighborsClassifier with 5 neighbours.

    Returns:
        A freshly constructed scikit-learn estimator.

    Raises:
        ValueError: If ``name`` is not one of the supported names.
    """
    if name == "svm":
        return SVC(
            kernel='linear',
            probability=True,
            random_state=42,
        )
    if name == "logreg":
        return LogisticRegression(max_iter=1000, random_state=42)
    if name == "knn":
        return KNeighborsClassifier(n_neighbors=5)
    raise ValueError("Unknown classifier name")

# Select classifier type: one of "svm", "logreg" or "knn" (the names accepted
# by get_classifier). Every model trained in the cells below uses this choice.
classifier_type = "svm"

In [None]:
# Sub-task 1 (ST1): train the hazard-category model.
# A fresh classifier of the selected type is fitted on the TF-IDF features of
# the training text, with the 'hazard-category' column as the target.
hazard_category_model = get_classifier(classifier_type)
hazard_category_model.fit(X_train_vec, y_hazard_category)



In [None]:
# Sub-task 1 (ST1): train the product-category model.
# A fresh classifier of the selected type is fitted on the TF-IDF features of
# the training text, with the 'product-category' column as the target.
product_category_model = get_classifier(classifier_type)
product_category_model.fit(X_train_vec, y_product_category)


In [None]:
# Sub-task 2 (ST2): train the hazard model.
# A fresh classifier of the selected type is fitted on the TF-IDF features of
# the training text, with the 'hazard' column as the target.
hazard_model = get_classifier(classifier_type)
hazard_model.fit(X_train_vec, y_hazard)

In [None]:
# Sub-task 2 (ST2): train the product model.
# A fresh classifier of the selected type is fitted on the TF-IDF features of
# the training text, with the 'product' column as the target.

product_model = get_classifier(classifier_type)
product_model.fit(X_train_vec, y_product)

In [None]:
# Predict with each fitted model on both the training features and the
# held-out features, grouped per model.
# NOTE(review): the *_train_preds arrays are not read by any cell visible
# here - confirm whether they are still needed.

# ST1 - hazard category
hazard_category_train_preds = hazard_category_model.predict(X_train_vec)
hazard_category_test_preds = hazard_category_model.predict(X_test_vec)

# ST1 - product category
product_category_train_preds = product_category_model.predict(X_train_vec)
product_category_test_preds = product_category_model.predict(X_test_vec)

# ST2 - hazard
hazard_train_preds = hazard_model.predict(X_train_vec)
hazard_test_preds = hazard_model.predict(X_test_vec)

# ST2 - product
product_train_preds = product_model.predict(X_train_vec)
product_test_preds = product_model.predict(X_test_vec)

def print_f1_scores(y_true, y_pred, label):
    """Print the macro- and micro-averaged F1 scores on a single line.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.
        label: Text placed at the start of the printed line.

    Returns:
        None. Both scores are printed with two decimal places.
    """
    scores = {
        averaging: f1_score(y_true, y_pred, average=averaging)
        for averaging in ('macro', 'micro')
    }
    print(f"{label} - Macro F1: {scores['macro']:.2f}, Micro F1: {scores['micro']:.2f}")

# Evaluation on the held-out set: one macro/micro F1 line per target, in the
# order hazard category, product category, hazard, product.
# (For a per-class breakdown, the same label/prediction pairs can be passed
# to classification_report.)
test_evaluations = [
    (y_test_hazard_category, hazard_category_test_preds, "Hazard Category (Test)"),
    (y_test_product_category, product_category_test_preds, "Product Category (Test)"),
    (y_test_hazard, hazard_test_preds, "Hazard (Test)"),
    (y_test_product, product_test_preds, "Product (Test)"),
]

for true_labels, predicted_labels, report_label in test_evaluations:
    print_f1_scores(true_labels, predicted_labels, report_label)


Hazard Category (Test) - Macro F1: 0.53, Micro F1: 0.87
Product Category (Test) - Macro F1: 0.30, Micro F1: 0.50
Hazard (Test) - Macro F1: 0.17, Micro F1: 0.64
Product (Test) - Macro F1: 0.05, Micro F1: 0.17


In [None]:
# Compute Final Scores
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Combine hazard and product macro F1 into a single score.

    The hazard score is the macro F1 over all samples. The product score is
    the macro F1 computed only on the samples whose hazard prediction is
    correct. The result is the arithmetic mean of the two.

    Inputs may be any equally long array-likes (lists, NumPy arrays, pandas
    Series). They are converted to NumPy arrays first, so the comparison is
    always element-wise and positional, whatever the container type or the
    pandas index.

    Args:
        hazards_true: Ground-truth hazard labels.
        products_true: Ground-truth product labels.
        hazards_pred: Predicted hazard labels.
        products_pred: Predicted product labels.

    Returns:
        float: ``(f1_hazards + f1_products) / 2``.
    """
    # Local import keeps this cell self-contained; numpy is not imported at
    # the top of the notebook.
    import numpy as np

    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    # Compute F1 for hazards:
    f1_hazards = f1_score(
        hazards_true,
        hazards_pred,
        average='macro'
    )

    # Products are scored only where the hazard was predicted correctly;
    # the boolean mask is built once and reused for both arrays.
    hazard_is_correct = hazards_pred == hazards_true

    # Compute F1 for products:
    f1_products = f1_score(
        products_true[hazard_is_correct],
        products_pred[hazard_is_correct],
        average='macro'
    )

    return (f1_hazards + f1_products) / 2.

# Final score for ST1: mean of the two category-level macro F1 scores.
hazard_category_macro_f1 = f1_score(
    y_test_hazard_category, hazard_category_test_preds, average='macro'
)
product_category_macro_f1 = f1_score(
    y_test_product_category, product_category_test_preds, average='macro'
)
st1_score = (hazard_category_macro_f1 + product_category_macro_f1) / 2.0
print(f"\nScore Sub-Task 1 (Test): {st1_score:.3f}")

# Final score for ST2: hazard macro F1 combined with the product macro F1
# as defined by compute_score.
st2_score = compute_score(
    hazards_true=y_test_hazard,
    products_true=y_test_product,
    hazards_pred=hazard_test_preds,
    products_pred=product_test_preds,
)
print(f"Score Sub-Task 2 (Test): {st2_score:.3f}")


Score Sub-Task 1 (Test): 0.411
Score Sub-Task 2 (Test): 0.112


In [None]:
import zipfile

# Gather the four held-out predictions, one row per validation example.
# NOTE(review): "index" is the DataFrame's own row index, not the
# "Unnamed: 0" column read from the CSV - confirm which one the submission
# format expects.
submission_columns = {
    "index": valid_data.index,
    "hazard_category": hazard_category_test_preds,
    "product_category": product_category_test_preds,
    "hazard": hazard_test_preds,
    "product": product_test_preds,
}
submission_df = pd.DataFrame(submission_columns)

# Write the predictions to 'submission.csv' without the DataFrame index.
submission_df.to_csv("submission.csv", index=False)

# Package 'submission.csv' into 'submission.zip'.
with zipfile.ZipFile("submission.zip", mode="w") as archive:
    archive.write("submission.csv")

# Download the ZIP file through the Colab browser session.
files.download("submission.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>