In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    log_loss,
    f1_score
)
import warnings

# Install a process-wide filter that ignores every warning category.
warnings.simplefilter("ignore")

def run_ml_task():
    """
    Executes the end-to-end machine learning task for steel plate fault prediction.

    Pipeline:
        1. Load 'train.csv' from the current working directory.
        2. Separate the seven fault labels from the features (the 'id' row
           identifier is dropped when present).
        3. Hold out 20% of the rows for evaluation.
        4. Mean-impute and standardise the numerical features using statistics
           computed on the training rows only, so that nothing about the
           held-out rows leaks into the fitted pipeline.
        5. Fit one logistic regression per label via MultiOutputClassifier.
        6. Print ROC AUC (per label and averaged), exact-match accuracy,
           mean per-label log loss, and macro/weighted F1 on the held-out rows.

    Returns:
        None. All results are printed. The function returns early, after
        printing an error, when the file cannot be read or a target column
        is missing.
    """
    print("Starting the Multi-Label Steel Plate Fault Prediction task...")

    # Define the target columns
    target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

    # --- 1. Load Data ---
    try:
        df = pd.read_csv('train.csv')
    except FileNotFoundError:
        print("Error: 'train.csv' not found. Please ensure the file is in the same directory.")
        return
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    print("Data loaded successfully from 'train.csv'.")
    print(f"Dataset shape: {df.shape}")
    print("First 5 rows of the dataset:")
    print(df.head())

    # Fail with a readable message instead of a KeyError when the file does
    # not have the expected label columns.
    missing_targets = [col for col in target_columns if col not in df.columns]
    if missing_targets:
        print(f"Error: target columns missing from 'train.csv': {missing_targets}")
        return

    # Separate features (X) and targets (y).
    # 'id' is a row identifier rather than a measurement, so it is excluded
    # from the features when present.
    non_feature_columns = target_columns + [col for col in ('id',) if col in df.columns]
    X = df.drop(columns=non_feature_columns)
    y = df[target_columns]

    # Only numerical columns are imputed and scaled. Any non-numerical column
    # is passed through untouched (no encoding is implemented here).
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()

    # Cast to float up front so the scaled (float) values can be written back
    # into the same columns without integer-dtype conflicts.
    X = X.astype({col: float for col in numerical_features})

    # --- 2. Split Data ---
    # The split happens BEFORE any statistic is computed; fitting the imputer
    # or scaler on the full dataset would leak test-set information.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.copy()
    X_test = X_test.copy()
    print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples) sets.")

    # --- 3. Data Preprocessing ---
    # Handle missing values: simple mean imputation, with the means taken from
    # the training rows and applied to both partitions.
    if numerical_features:
        train_means = X_train[numerical_features].mean()
        columns_with_missing = [
            col for col in numerical_features
            if X_train[col].isnull().any() or X_test[col].isnull().any()
        ]
        if columns_with_missing:
            X_train[numerical_features] = X_train[numerical_features].fillna(train_means)
            X_test[numerical_features] = X_test[numerical_features].fillna(train_means)
            for col in columns_with_missing:
                print(f"Missing values in column '{col}' imputed with mean.")

        # Scale numerical features: fit on train, apply the same transform to test.
        scaler = StandardScaler()
        X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
        X_test[numerical_features] = scaler.transform(X_test[numerical_features])
        print("Numerical features scaled using StandardScaler.")

    # --- 4. Model Training ---
    # Logistic Regression is a baseline that provides probability estimates.
    # MultiOutputClassifier fits one independent binary classifier per label.
    # n_jobs is set on the wrapper (one job per label) because the liblinear
    # solver itself does not use n_jobs.
    base_classifier = LogisticRegression(solver='liblinear', random_state=42)
    model = MultiOutputClassifier(base_classifier, n_jobs=-1)

    print("Training the Multi-Output Logistic Regression model...")
    model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 5. Prediction ---
    # predict_proba returns one (n_samples, n_classes) array per label; keep
    # the positive-class column of each and stack them into a single
    # (n_samples, n_labels) array.
    y_pred_proba = model.predict_proba(X_test)
    y_pred_proba_combined = np.column_stack([p[:, 1] for p in y_pred_proba])

    # Metrics that need hard predictions use a 0.5 probability threshold.
    y_pred_binary = (y_pred_proba_combined > 0.5).astype(int)

    # --- 6. Evaluation ---
    print("\n--- Evaluation Metrics ---")

    # Primary metric: unweighted mean of the per-label ROC AUC values.
    auc_scores_per_label = []
    for i, label in enumerate(target_columns):
        try:
            auc_score = roc_auc_score(y_test[label], y_pred_proba_combined[:, i])
            auc_scores_per_label.append(auc_score)
            print(f"ROC AUC for {label}: {auc_score:.4f}")
        except ValueError as e:
            # Raised when the held-out rows contain a single class for this
            # label, in which case ROC AUC is undefined.
            print(f"[Warning] Could not calculate ROC AUC for '{label}': {e}. Skipping this label for individual AUC.")
            # NaN (not 0) so the label is excluded from the mean below rather
            # than dragging it down.
            auc_scores_per_label.append(np.nan)

    # Calculate the average ROC AUC (excluding NaNs)
    average_roc_auc = np.nanmean(auc_scores_per_label)
    print(f"\nAverage ROC AUC across all defect categories: {average_roc_auc:.4f}")

    # Accuracy: for multi-label input this is the exact match ratio, i.e. the
    # fraction of rows where every label is predicted correctly.
    accuracy = accuracy_score(y_test, y_pred_binary)
    print(f"Accuracy (Exact Match Ratio): {accuracy:.4f}")

    # Log Loss: computed per label and then averaged. labels=[0, 1] is passed
    # explicitly so a label with a single class in the held-out rows does not
    # make log_loss raise.
    logloss_per_label = [
        log_loss(y_test[label], y_pred_proba_combined[:, i], labels=[0, 1])
        for i, label in enumerate(target_columns)
    ]
    average_log_loss = np.mean(logloss_per_label)
    print(f"Log Loss: {average_log_loss:.4f}")

    # F1-macro and F1-weighted
    # 'macro': per-label F1, unweighted mean.
    # 'weighted': per-label F1, averaged with weights equal to label support.
    f1_macro = f1_score(y_test, y_pred_binary, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred_binary, average='weighted', zero_division=0)
    print(f"F1-macro: {f1_macro:.4f}")
    print(f"F1-weighted: {f1_weighted:.4f}")

    # Averaged ROC AUC computed directly on the multi-label indicator matrix.
    # Guarded like the per-label loop above: a label with a single class in
    # the held-out rows makes these calls raise ValueError.
    try:
        roc_auc_ovr = roc_auc_score(y_test, y_pred_proba_combined, average='macro')
        roc_auc_ovr_weighted = roc_auc_score(y_test, y_pred_proba_combined, average='weighted')
    except ValueError as e:
        print(f"[Warning] Could not calculate averaged ROC AUC: {e}")
    else:
        print(f"ROC AUC (macro average): {roc_auc_ovr:.4f}")
        print(f"ROC AUC (weighted average): {roc_auc_ovr_weighted:.4f}")

    print("\n--- Program finished successfully ---")

# Run the full pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    run_ml_task()


Starting the Multi-Label Steel Plate Fault Prediction task...
Data loaded successfully from 'train.csv'.
Dataset shape: (19219, 35)
First 5 rows of the dataset:
   id  X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0   0        584        590     909972     909977            16            8   
1   1        808        816     728350     728372           433           20   
2   2         39        192    2212076    2212144         11388          705   
3   3        781        789    3353146    3353173           210           16   
4   4       1540       1560     618457     618502           521           72   

   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  ...  \
0            5               2274                    113  ...   
1           54              44478                     70  ...   
2          420            1311391                     29  ...   
3           29               3202                    114  ...   
4           67              48231