In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e3/sample_submission.csv
/kaggle/input/playground-series-s4e3/train.csv
/kaggle/input/playground-series-s4e3/test.csv


In [4]:
# ==============================================================================
# 1. SETUP - Import Libraries and Define Configuration
# ==============================================================================
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category so the cell output stays readable.
warnings.filterwarnings(action='ignore')

# Record which library versions this run was executed with.
print(
    f"TensorFlow Version: {tf.__version__}",
    f"TensorFlow Decision Forests Version: {tfdf.__version__}",
    sep="\n",
)

class Config:
    """Single home for the file paths, column names and model settings used by this notebook."""

    # --- Input / output locations ---
    TRAIN_PATH = "/kaggle/input/playground-series-s4e3/train.csv"
    TEST_PATH = "/kaggle/input/playground-series-s4e3/test.csv"
    SAMPLE_SUBMISSION_PATH = "/kaggle/input/playground-series-s4e3/sample_submission.csv"
    SUBMISSION_PATH = "submission.csv"

    # --- Column roles ---
    ID_COLUMN = "id"
    # Label columns, one per target. 'K_Scatch' is intentionally spelled this way:
    # it has to match the column name in the training CSV.
    TARGET_COLUMNS = [
        "Pastry",
        "Z_Scratch",
        "K_Scatch",
        "Stains",
        "Dirtiness",
        "Bumps",
        "Other_Faults",
    ]

    # --- Split and batching settings ---
    VALIDATION_SET_SIZE = 0.2  # fraction of the training rows held out for validation
    RANDOM_STATE = 42          # seed passed to the train/validation split
    BATCH_SIZE = 128           # batch size used when building tf.data datasets

# One shared settings object used by every later step.
config = Config()

# ==============================================================================
# 2. DATA LOADING & INITIAL ANALYSIS
# ==============================================================================
print("\n[INFO] Loading data...")
# Read the three competition CSVs in order: train, test, sample submission.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (config.TRAIN_PATH, config.TEST_PATH, config.SAMPLE_SUBMISSION_PATH)
)

# Every training column that is neither the row id nor a label is a model input.
# Iterating over train_df.columns keeps the original column order.
non_feature_cols = {config.ID_COLUMN, *config.TARGET_COLUMNS}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Number of features: {len(feature_cols)}")
print("Features:", feature_cols)
print("Targets:", config.TARGET_COLUMNS)

# ==============================================================================
# 3. DATA SPLITTING
# ==============================================================================
print(f"\n[INFO] Splitting training data into training and validation sets (test_size={config.VALIDATION_SET_SIZE})...")

# Feature matrix and multi-column label matrix taken from the training frame.
X = train_df[feature_cols]
y = train_df[config.TARGET_COLUMNS]

# Seeded hold-out split so the same rows land in validation on every run.
split_options = {
    "test_size": config.VALIDATION_SET_SIZE,
    "random_state": config.RANDOM_STATE,
}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Glue features and labels back together side by side; the dataset conversion
# step reads both kinds of column from a single frame.
train_split_df, val_split_df = (
    pd.concat([features_part, labels_part], axis=1)
    for features_part, labels_part in ((X_train, y_train), (X_val, y_val))
)

print(f"Training split shape: {train_split_df.shape}")
print(f"Validation split shape: {val_split_df.shape}")

# ==============================================================================
# 4. DATA CONVERSION TO TENSORFLOW DATASET
# ==============================================================================
print("\n[INFO] Converting pandas DataFrames to TensorFlow Datasets...")

def to_tf_dataset(dataframe: pd.DataFrame, is_training=True):
    """Convert a DataFrame into a batched, prefetched ``tf.data.Dataset``.

    Feature columns are taken from the module-level ``feature_cols`` list and
    label columns from ``config.TARGET_COLUMNS``; each column becomes one entry
    of a dict keyed by column name.

    Args:
        dataframe: Frame holding every feature column and, when
            ``is_training`` is true, every target column as well.
        is_training: When true the dataset yields ``(features, labels)``
            pairs; when false it yields the feature dict only.

    Returns:
        A dataset batched with ``config.BATCH_SIZE`` and prefetched with
        ``tf.data.AUTOTUNE``.
    """
    def _columns_as_arrays(column_names):
        # One NumPy array per requested column, keyed by the column name.
        return {name: np.array(dataframe[name]) for name in column_names}

    tensors = _columns_as_arrays(feature_cols)
    if is_training:
        # Labelled data is sliced as a (features, labels) tuple of dicts.
        tensors = (tensors, _columns_as_arrays(config.TARGET_COLUMNS))

    batched = tf.data.Dataset.from_tensor_slices(tensors).batch(config.BATCH_SIZE)
    # Prefetch so the next batch is prepared while the current one is consumed.
    return batched.prefetch(tf.data.AUTOTUNE)

# The two labelled splits yield (features, labels) batches; the competition
# test frame is converted without labels.
train_ds, val_ds = (
    to_tf_dataset(split_df, is_training=True)
    for split_df in (train_split_df, val_split_df)
)
test_ds = to_tf_dataset(test_df, is_training=False)

# ==============================================================================
# 5. MODEL DEFINITION (CORRECTED)
# ==============================================================================
print("\n[INFO] Defining the TensorFlow Decision Forests model...")

# NOTE(review): the recorded run of this cell stops in this section with
# "AttributeError: 'FieldDescriptor' object has no attribute '_default_constructor'".
# That looks like a protobuf / environment mismatch rather than a problem with
# the arguments below - confirm by re-running on a fresh kernel.

# One classification task per target column. No `loss` is passed: per the
# earlier correction to this notebook it is not an accepted MultiTaskItem
# argument, and the loss is expected to follow from the task type.
multi_task_items = []
for label in config.TARGET_COLUMNS:
    multi_task_items.append(
        tfdf.keras.MultiTaskItem(label=label, task=tfdf.keras.Task.CLASSIFICATION)
    )

# Note: TF-DF GBT models are trained on the CPU. The GPU devices detected will not be used for training this model.
gbt_options = dict(
    verbose=0,  # Set to 1 to see detailed training logs
    growing_strategy="BEST_FIRST_GLOBAL",
    max_depth=8,
    num_trees=1000,
)
model = tfdf.keras.GradientBoostedTreesModel(multitask=multi_task_items, **gbt_options)

# Compiling only registers the metric to monitor (ROC AUC).
auc_metric = tf.keras.metrics.AUC(name="auc", curve='ROC')
model.compile(metrics=[auc_metric])

# ==============================================================================
# 6. MODEL TRAINING
# ==============================================================================
print("\n[INFO] Starting model training...")
print("This may take a few minutes...")

# A single epoch is requested: for GBT the entire forest is built in one "epoch".
history = model.fit(train_ds, validation_data=val_ds, epochs=1)

print("\n[INFO] Model training finished.")

# ==============================================================================
# 7. MODEL EVALUATION
# ==============================================================================
print("\n[INFO] Evaluating model performance on the validation set...")

# `val_predictions` is indexed by target name below, one array per target.
val_predictions = model.predict(val_ds)

y_val_true = y_val.values

# Flatten every per-target output before stacking so the result is always
# (n_samples, n_targets). np.hstack only gives that shape for (n, 1) inputs and
# would silently concatenate 1-D outputs into one long vector.
y_val_pred_probs = np.column_stack(
    [np.ravel(val_predictions[label]) for label in config.TARGET_COLUMNS]
)
if y_val_pred_probs.shape != y_val_true.shape:
    raise ValueError(
        f"Prediction matrix shape {y_val_pred_probs.shape} does not match "
        f"validation label shape {y_val_true.shape}"
    )

# Hard labels at the 0.5 probability threshold (used for F1 and accuracy).
y_val_pred_labels = (y_val_pred_probs > 0.5).astype(int)

metrics_summary = {}
# Macro average of the per-target ROC AUCs over all target columns.
overall_roc_auc = roc_auc_score(y_val_true, y_val_pred_probs, average='macro')
metrics_summary['Overall ROC AUC (Competition Metric)'] = overall_roc_auc

all_roc_aucs, all_log_losses, all_f1_macros, all_f1_weighteds, all_accuracies = [], [], [], [], []

# Score each target column independently, then average the per-target values.
for i, label in enumerate(config.TARGET_COLUMNS):
    true_col = y_val_true[:, i]
    prob_col = y_val_pred_probs[:, i]
    hard_col = y_val_pred_labels[:, i]

    all_roc_aucs.append(roc_auc_score(true_col, prob_col))
    all_log_losses.append(log_loss(true_col, prob_col))
    all_f1_macros.append(f1_score(true_col, hard_col, average='macro', zero_division=0))
    all_f1_weighteds.append(f1_score(true_col, hard_col, average='weighted', zero_division=0))
    all_accuracies.append(accuracy_score(true_col, hard_col))

metrics_summary['avg_accuracy'] = np.mean(all_accuracies)
metrics_summary['avg_log_loss'] = np.mean(all_log_losses)
metrics_summary['avg_f1_macro'] = np.mean(all_f1_macros)
metrics_summary['avg_f1_weighted'] = np.mean(all_f1_weighteds)
# Same value as the first entry; kept so the printed summary is unchanged.
metrics_summary['roc_auc_ovr'] = overall_roc_auc
metrics_summary['roc_auc_ovr_weighted'] = roc_auc_score(y_val_true, y_val_pred_probs, average='weighted')

print("\n--- Validation Metrics Summary ---")
for name, value in metrics_summary.items():
    print(f"{name:<35}: {value:.5f}")
print("----------------------------------\n")

print("--- Per-Target ROC AUC Scores ---")
for label, target_auc in zip(config.TARGET_COLUMNS, all_roc_aucs):
    print(f"{label:<15}: {target_auc:.5f}")
print("---------------------------------\n")

# ==============================================================================
# 8. PREDICTION AND SUBMISSION
# ==============================================================================
print("[INFO] Generating predictions on the test set...")
test_predictions = model.predict(test_ds)

print("[INFO] Creating submission file...")
# One flattened prediction vector per target, keyed by target name.
per_target_probs = {
    label: test_predictions[label].flatten() for label in config.TARGET_COLUMNS
}
# The id column comes first, followed by the targets in configured order.
submission_df = pd.DataFrame(
    {config.ID_COLUMN: test_df[config.ID_COLUMN], **per_target_probs}
)

submission_df.to_csv(config.SUBMISSION_PATH, index=False)

print(f"\n[SUCCESS] Submission file '{config.SUBMISSION_PATH}' created successfully!")
print("Submission file head:")
print(submission_df.head())

TensorFlow Version: 2.18.0
TensorFlow Decision Forests Version: 1.11.0

[INFO] Loading data...
Training data shape: (19219, 35)
Test data shape: (12814, 28)
Number of features: 27
Features: ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas']
Targets: ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

[INFO] Splitting training data into training and validation sets (test_size=0.2)...
Training split shape: (15375, 34)
Validation split shape: (3844, 34)

[INFO] Converting pandas DataFrames to TensorFlow Datasets...

[INFO] Defining 

AttributeError: 'FieldDescriptor' object has no attribute '_default_constructor'