In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

X_test = pd.read_csv("./X_test.csv")
df_y_test = pd.read_csv("./y_test.csv")
y_test = df_y_test.squeeze().astype(int) - 1  # Shift to 0-indexed classes
X_train = pd.read_csv("./X_train.csv")
df_y_train = pd.read_csv("./y_train.csv")
y_train = df_y_train.squeeze().astype(int) - 1  # Shift to 0-indexed classes

# One-hot encode categorical columns
categorical_cols = ['outlook', 'prioroutlook']
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both train and test have the same columns (handle unseen categories)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Handle missing values with median imputation
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Missing values in X_train: {X_train.isnull().sum().sum()}")
print(f"Missing values in X_test: {X_test.isnull().sum().sum()}")
print(f"y_train classes: {sorted(y_train.unique())}")

X_train shape: (1664, 79)
X_test shape: (290, 79)
Missing values in X_train: 0
Missing values in X_test: 0
y_train classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]


In [4]:
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report

# ----- Align columns -----
common_cols = list(set(X_train.columns) & set(X_test.columns))
X_train = X_train[common_cols].copy()
X_test = X_test[common_cols].copy()

# ----- Identify categorical columns -----
categorical_features = [
    col for col in X_train.columns 
    if X_train[col].dtype == "object" or X_train[col].dtype == "bool"
]

# ----- Use a single OrdinalEncoder for ALL categorical features -----
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train[categorical_features] = enc.fit_transform(X_train[categorical_features].astype(str))
X_test[categorical_features] = enc.transform(X_test[categorical_features].astype(str))

# Shift unknown_value -1 to 0
X_train[categorical_features] += 1
X_test[categorical_features] += 1

# ----- Fill NaNs -----
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# ----- Convert to numpy -----
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values.ravel()
y_test_np = y_test.values.ravel()

# ----- categorical indexes & dims -----
cat_idxs = [X_train.columns.get_loc(col) for col in categorical_features]
cat_dims = [int(X_train[col].max()) + 1 for col in categorical_features]

# ----- TabNet -----
clf = TabNetClassifier(
    n_d=32,
    n_a=32,
    n_steps=4,
    gamma=1.3,
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=3e-3),
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=5,   # << improved
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 20, "gamma": 0.9},
    verbose=1
)

clf.fit(
    X_train_np,
    y_train_np,
    eval_set=[(X_test_np, y_test_np)],
    eval_metric=['balanced_accuracy'],
    max_epochs=200,
    patience=30,
    batch_size=2048,
    virtual_batch_size=256
)

y_pred = clf.predict(X_test_np)

print("Accuracy:", accuracy_score(y_test_np, y_pred))
print(classification_report(y_test_np, y_pred))




epoch 0  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 1  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 2  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 3  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 4  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 5  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 6  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 7  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 8  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 9  | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 10 | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 11 | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 12 | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0:00:00s
epoch 13 | loss: 0.0     | val_0_balanced_accuracy: 0.09272 |  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Calculate within ±1 class accuracy
def tolerance_accuracy(y_true, y_pred, tolerance=1):
    """Calculate accuracy within a tolerance of the true class."""
    return np.mean(np.abs(y_true - y_pred) <= tolerance)

within_1_accuracy = tolerance_accuracy(y_test_np, y_pred, tolerance=1)
print(f"Within ±1 Class Accuracy: {within_1_accuracy:.4f} ({within_1_accuracy*100:.2f}%)")

# Calculate ROC AUC
y_pred_proba = clf.predict_proba(X_test_np)
classes = np.arange(11)  # 0-10 classes
y_test_bin = label_binarize(y_test_np, classes=classes)

roc_auc = roc_auc_score(y_test_bin, y_pred_proba, multi_class='ovr', average='macro')
print(f"ROC AUC (macro): {roc_auc:.4f}")

Within ±1 Class Accuracy: 0.3172 (31.72%)
ROC AUC (macro): 0.5812
