In [10]:
# Use the %pip magic so the package is installed into the environment of the
# kernel that is running this notebook.
%pip install pytorch-tabular

Note: you may need to restart the kernel to use updated packages.


In [61]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models.node.config import NodeConfig
from pytorch_tabular.config import DataConfig, TrainerConfig
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular.config import OptimizerConfig

from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import typing

# WDBC

In [None]:
def load_wdbc_data(path="data/wdbc/wdbc.data"):
    """Load the WDBC data file and encode the diagnosis as a binary target.

    The file is read as a header-less CSV; the 32 column names (``ID``,
    ``Diagnosis`` and 30 feature columns) are assigned here.

    Args:
        path: Location of the comma-separated data file.

    Returns:
        A tuple ``(df, feature_cols, target_col)``:
        ``df`` holds the 30 feature columns followed by the integer
        ``target`` column (1 = malignant, 0 = benign), ``feature_cols`` is
        the list of feature column names, and ``target_col`` is ``"target"``.

    Raises:
        ValueError: If ``Diagnosis`` contains anything other than ``"M"`` or
            ``"B"`` (including missing values).
    """
    # The same ten measurements are reported three times: mean, standard
    # error and "worst" value, in that order.
    measurements = [
        "radius", "texture", "perimeter", "area", "smoothness",
        "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
    ]
    columns = ["ID", "Diagnosis"] + [
        f"{measurement}_{suffix}"
        for suffix in ("mean", "se", "worst")
        for measurement in measurements
    ]

    raw = pd.read_csv(path, header=None, names=columns)

    target_col = "target"
    encoded = raw["Diagnosis"].map({"M": 1, "B": 0})  # 1 = Malignant, 0 = Benign

    # .map() yields NaN for labels outside the mapping; fail loudly instead of
    # silently producing a float target with missing values.
    unmapped = encoded.isna()
    if unmapped.any():
        bad_labels = sorted(raw.loc[unmapped, "Diagnosis"].astype(str).unique())
        raise ValueError(f"Unexpected Diagnosis value(s) in {path!r}: {bad_labels}")

    df = raw.drop(columns=["ID", "Diagnosis"])
    df[target_col] = encoded.astype("int64")
    feature_cols = df.drop(columns=[target_col]).columns.tolist()

    return df, feature_cols, target_col

# Load the WDBC table together with its feature / target column names.
df, features, target = load_wdbc_data()

# Stratified train / validation / test split.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training and
# 40% each for validation and test. If an 80/10/10 split was intended, the
# first call should use test_size=0.2 -- confirm before changing, since it
# alters every reported score.
train_df, temp_df = train_test_split(df, test_size=0.8, stratify=df[target], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df[target], random_state=42)

# Take explicit copies so the scaled values are written into frames owned by
# this cell (assigning into split results can raise SettingWithCopyWarning).
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Fit the scaler on the training rows only, then reuse it for val / test.
scaler = StandardScaler()

train_df[features] = scaler.fit_transform(train_df[features])
val_df[features] = scaler.transform(val_df[features])
test_df[features] = scaler.transform(test_df[features])

## Default

In [72]:

# NOTE(review): presumably a compatibility workaround needed by the config
# machinery used below -- confirm it is still required.
special_form = getattr(typing, "_SpecialForm", None)
if special_form is not None:
    special_form.__qualname__ = "typing._SpecialForm"

# Data: every column except the target is passed to the model as continuous.
continuous_cols = [name for name in df.columns if name != target]
data_config = DataConfig(
    target=[target],
    continuous_cols=continuous_cols,
    categorical_cols=[],
    normalize_continuous_features=True,
)

# Model: NODE classifier with fixed, untuned hyperparameters.
model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=1024,
    depth=6,
    learning_rate=1e-3,
    batch_norm_continuous_input=True,
)

# Trainer: fixed number of epochs; early stopping, checkpointing and
# best-checkpoint reloading are all disabled.
trainer_config = TrainerConfig(
    max_epochs=6,
    batch_size=256,
    early_stopping=None,
    checkpoints=None,
    load_best=False,
)

# Optimizer: library defaults.
optimizer_config = OptimizerConfig()

# Assemble the model from the four configs and train it.
node_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
node_model.fit(train=train_df, validation=val_df)

# Predict on the held-out test rows; true_labels is reused by later cells.
node_pred = node_model.predict(test_df)
true_labels = test_df[target]
node_labels = node_pred[f"{target}_prediction"]

# Accuracy and support-weighted F1 on the test split.
node_metrics = dict(
    Accuracy=accuracy_score(true_labels, node_labels),
    F1=f1_score(true_labels, node_labels, average="weighted"),
)

Seed set to 42



Ignoring head config because NODE has a specific head which subsets the tree outputs




'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.


Data-aware initialization is performed on less than 1000 data points. This may cause instability.To avoid potential problems, run this model on a data batch with at least 1000 data samples.You can do so manually before training. Use with torch.no_grad() for memory efficiency.



Output()

In [73]:
# Feature matrices (target column removed) for each split.
cb_train_features = train_df.drop(columns=[target])
cb_val_features = val_df.drop(columns=[target])
cb_test_features = test_df.drop(columns=[target])

# CatBoost with default hyperparameters; the validation split is used to
# pick the best iteration (use_best_model=True).
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
catboost_model.fit(
    cb_train_features,
    train_df[target],
    eval_set=(cb_val_features, val_df[target]),
    use_best_model=True,
)

# Hard labels and the probability column at index 1 for the test rows.
cb_pred = catboost_model.predict(cb_test_features)
cb_probs = catboost_model.predict_proba(cb_test_features)[:, 1]

# Same metrics as computed for the NODE model.
cb_metrics = dict(
    Accuracy=accuracy_score(true_labels, cb_pred),
    F1=f1_score(true_labels, cb_pred, average="weighted"),
)

In [74]:
# Feature matrices (target column removed) for each split.
xgb_train_features = train_df.drop(columns=[target])
xgb_val_features = val_df.drop(columns=[target])
xgb_test_features = test_df.drop(columns=[target])

# XGBoost with default hyperparameters; the validation split is only passed
# as an evaluation set, with logging silenced.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(
    xgb_train_features,
    train_df[target],
    eval_set=[(xgb_val_features, val_df[target])],
    verbose=0,
)

# Hard labels and the probability column at index 1 for the test rows.
xgb_pred = xgb_model.predict(xgb_test_features)
xgb_probs = xgb_model.predict_proba(xgb_test_features)[:, 1]

# Same metrics as computed for the other models.
xgb_metrics = dict(
    Accuracy=accuracy_score(true_labels, xgb_pred),
    F1=f1_score(true_labels, xgb_pred, average="weighted"),
)

In [77]:
import pandas as pd
import plotly.express as px

# One row per model, one column per metric.
metrics_by_model = {
    "NODE": node_metrics,
    "CatBoost": cb_metrics,
    "XGBoost": xgb_metrics,
}
results = (
    pd.DataFrame(metrics_by_model)
    .T
    .reset_index()
    .rename(columns={"index": "Model"})
)

# Long format (Model, Metric, Score) as expected by plotly express.
results_melted = results.melt(id_vars="Model", var_name="Metric", value_name="Score")

# Grouped bars: metrics on the x axis, one bar per model.
fig = px.bar(
    results_melted,
    x="Metric",
    y="Score",
    color="Model",
    barmode="group",
    text_auto=".3f",
    title="WDBC Dataset Model Comparison",
    height=400,
)
fig.update_layout(
    xaxis_title="Metric",
    yaxis_title="Score",
    legend_title="Model",
    title_x=0.5,
    template="plotly_white",
)

fig.show()

## Automatically tuned

# Iris

In [78]:
from sklearn.datasets import load_iris

def load_iris_data():
    """Load scikit-learn's Iris dataset as a DataFrame.

    Returns:
        A tuple ``(df, feature_cols, target_col)``: a copy of the dataset
        frame with the ``target`` column set from the bunch's target, the
        bunch's feature names, and the string ``"target"``.
    """
    bunch = load_iris(as_frame=True)
    label_name = "target"

    frame = bunch.frame.copy()
    frame[label_name] = bunch.target

    return frame, bunch.feature_names, label_name

# Load the Iris table together with its feature / target column names.
df, features, target = load_iris_data()

# Stratified train / validation / test split.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training and
# 40% each for validation and test. If an 80/10/10 split was intended, the
# first call should use test_size=0.2 -- confirm before changing, since it
# alters every reported score.
train_df, temp_df = train_test_split(df, test_size=0.8, stratify=df[target], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df[target], random_state=42)

# Take explicit copies so the scaled values are written into frames owned by
# this cell (assigning into split results can raise SettingWithCopyWarning).
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Fit the scaler on the training rows only, then reuse it for val / test.
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
val_df[features] = scaler.transform(val_df[features])
test_df[features] = scaler.transform(test_df[features])


## Default

In [79]:

# NOTE(review): presumably a compatibility workaround needed by the config
# machinery used below -- confirm it is still required.
special_form = getattr(typing, "_SpecialForm", None)
if special_form is not None:
    special_form.__qualname__ = "typing._SpecialForm"

# Data: every column except the target is passed to the model as continuous.
continuous_cols = [name for name in df.columns if name != target]
data_config = DataConfig(
    target=[target],
    continuous_cols=continuous_cols,
    categorical_cols=[],
    normalize_continuous_features=True,
)

# Model: NODE classifier with fixed, untuned hyperparameters.
model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=1024,
    depth=6,
    learning_rate=1e-3,
    batch_norm_continuous_input=True,
)

# Trainer: fixed number of epochs; early stopping, checkpointing and
# best-checkpoint reloading are all disabled.
trainer_config = TrainerConfig(
    max_epochs=6,
    batch_size=256,
    early_stopping=None,
    checkpoints=None,
    load_best=False,
)

# Optimizer: library defaults.
optimizer_config = OptimizerConfig()

# Assemble the model from the four configs and train it.
node_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
node_model.fit(train=train_df, validation=val_df)

# Predict on the held-out test rows; true_labels is reused by later cells.
node_pred = node_model.predict(test_df)
true_labels = test_df[target]
node_labels = node_pred[f"{target}_prediction"]

# Accuracy and support-weighted F1 on the test split.
node_metrics = dict(
    Accuracy=accuracy_score(true_labels, node_labels),
    F1=f1_score(true_labels, node_labels, average="weighted"),
)

Seed set to 42



Ignoring head config because NODE has a specific head which subsets the tree outputs




'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.


Data-aware initialization is performed on less than 1000 data points. This may cause instability.To avoid potential problems, run this model on a data batch with at least 1000 data samples.You can do so manually before training. Use with torch.no_grad() for memory efficiency.



Output()

In [80]:
# Feature matrices (target column removed) for each split.
cb_train_features = train_df.drop(columns=[target])
cb_val_features = val_df.drop(columns=[target])
cb_test_features = test_df.drop(columns=[target])

# CatBoost with default hyperparameters; the validation split is used to
# pick the best iteration (use_best_model=True).
catboost_model = CatBoostClassifier(
    verbose=0,
    random_state=42
)

catboost_model.fit(
    cb_train_features,
    train_df[target],
    eval_set=(cb_val_features, val_df[target]),
    use_best_model=True
)

# For a multiclass target CatBoost returns labels as an (n, 1) column vector;
# flatten to 1-D so the predictions line up with true_labels explicitly.
cb_pred = np.asarray(catboost_model.predict(cb_test_features)).ravel()

# NOTE(review): Iris has more than two classes, so [:, 1] keeps only the
# class-1 probability column. Kept as-is for compatibility; use the full
# predict_proba matrix if per-class probabilities are needed.
cb_probs = catboost_model.predict_proba(cb_test_features)[:, 1]

# Same metrics as computed for the NODE model.
cb_metrics = {
    'Accuracy': accuracy_score(true_labels, cb_pred),
    'F1': f1_score(true_labels, cb_pred, average='weighted')
}

In [81]:
# Feature matrices (target column removed) for each split.
xgb_train_features = train_df.drop(columns=[target])
xgb_val_features = val_df.drop(columns=[target])
xgb_test_features = test_df.drop(columns=[target])

# XGBoost with default hyperparameters; the validation split is only passed
# as an evaluation set, with logging silenced.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(
    xgb_train_features,
    train_df[target],
    eval_set=[(xgb_val_features, val_df[target])],
    verbose=0,
)

# Hard labels for the test rows, plus the probability column at index 1.
# NOTE(review): with more than two classes that column covers a single class
# only -- confirm this is what downstream code expects.
xgb_pred = xgb_model.predict(xgb_test_features)
xgb_probs = xgb_model.predict_proba(xgb_test_features)[:, 1]

# Same metrics as computed for the other models.
xgb_metrics = dict(
    Accuracy=accuracy_score(true_labels, xgb_pred),
    F1=f1_score(true_labels, xgb_pred, average="weighted"),
)

In [85]:
import pandas as pd
import plotly.express as px

# One row per model, one column per metric.
metrics_by_model = {
    "NODE": node_metrics,
    "CatBoost": cb_metrics,
    "XGBoost": xgb_metrics,
}
results = (
    pd.DataFrame(metrics_by_model)
    .T
    .reset_index()
    .rename(columns={"index": "Model"})
)

# Long format (Model, Metric, Score) as expected by plotly express.
results_melted = results.melt(id_vars="Model", var_name="Metric", value_name="Score")

# Grouped bars: metrics on the x axis, one bar per model.
fig = px.bar(
    results_melted,
    x="Metric",
    y="Score",
    color="Model",
    barmode="group",
    text_auto=".3f",
    title="Iris Dataset Model Comparison",
    height=400,
)
fig.update_layout(
    xaxis_title="Metric",
    yaxis_title="Score",
    legend_title="Model",
    title_x=0.5,
    template="plotly_white",
)

fig.show()

## Automatically tuned

# Wine

In [86]:
from sklearn.datasets import load_wine

def load_wine_data():
    """Load scikit-learn's Wine dataset as a DataFrame.

    Returns:
        A tuple ``(df, feature_cols, target_col)``: a copy of the dataset
        frame with the ``target`` column set from the bunch's target, the
        bunch's feature names, and the string ``"target"``.
    """
    bunch = load_wine(as_frame=True)
    label_name = "target"

    frame = bunch.frame.copy()
    frame[label_name] = bunch.target

    return frame, bunch.feature_names, label_name

# Load the Wine table together with its feature / target column names.
df, features, target = load_wine_data()

# Stratified train / validation / test split.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training and
# 40% each for validation and test. If an 80/10/10 split was intended, the
# first call should use test_size=0.2 -- confirm before changing, since it
# alters every reported score.
train_df, temp_df = train_test_split(df, test_size=0.8, stratify=df[target], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df[target], random_state=42)

# Take explicit copies so the scaled values are written into frames owned by
# this cell (assigning into split results can raise SettingWithCopyWarning).
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Fit the scaler on the training rows only, then reuse it for val / test.
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
val_df[features] = scaler.transform(val_df[features])
test_df[features] = scaler.transform(test_df[features])

## Default

In [87]:

# NOTE(review): presumably a compatibility workaround needed by the config
# machinery used below -- confirm it is still required.
special_form = getattr(typing, "_SpecialForm", None)
if special_form is not None:
    special_form.__qualname__ = "typing._SpecialForm"

# Data: every column except the target is passed to the model as continuous.
continuous_cols = [name for name in df.columns if name != target]
data_config = DataConfig(
    target=[target],
    continuous_cols=continuous_cols,
    categorical_cols=[],
    normalize_continuous_features=True,
)

# Model: NODE classifier with fixed, untuned hyperparameters.
model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=1024,
    depth=6,
    learning_rate=1e-3,
    batch_norm_continuous_input=True,
)

# Trainer: fixed number of epochs; early stopping, checkpointing and
# best-checkpoint reloading are all disabled.
trainer_config = TrainerConfig(
    max_epochs=6,
    batch_size=256,
    early_stopping=None,
    checkpoints=None,
    load_best=False,
)

# Optimizer: library defaults.
optimizer_config = OptimizerConfig()

# Assemble the model from the four configs and train it.
node_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
node_model.fit(train=train_df, validation=val_df)

# Predict on the held-out test rows; true_labels is reused by later cells.
node_pred = node_model.predict(test_df)
true_labels = test_df[target]
node_labels = node_pred[f"{target}_prediction"]

# Accuracy and support-weighted F1 on the test split.
node_metrics = dict(
    Accuracy=accuracy_score(true_labels, node_labels),
    F1=f1_score(true_labels, node_labels, average="weighted"),
)

Seed set to 42



Ignoring head config because NODE has a specific head which subsets the tree outputs




'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.


Data-aware initialization is performed on less than 1000 data points. This may cause instability.To avoid potential problems, run this model on a data batch with at least 1000 data samples.You can do so manually before training. Use with torch.no_grad() for memory efficiency.



Output()

In [88]:
# Feature matrices (target column removed) for each split.
cb_train_features = train_df.drop(columns=[target])
cb_val_features = val_df.drop(columns=[target])
cb_test_features = test_df.drop(columns=[target])

# CatBoost with default hyperparameters; the validation split is used to
# pick the best iteration (use_best_model=True).
catboost_model = CatBoostClassifier(
    verbose=0,
    random_state=42
)

catboost_model.fit(
    cb_train_features,
    train_df[target],
    eval_set=(cb_val_features, val_df[target]),
    use_best_model=True
)

# For a multiclass target CatBoost returns labels as an (n, 1) column vector;
# flatten to 1-D so the predictions line up with true_labels explicitly.
cb_pred = np.asarray(catboost_model.predict(cb_test_features)).ravel()

# NOTE(review): the Wine target has more than two classes, so [:, 1] keeps
# only the class-1 probability column. Kept as-is for compatibility; use the
# full predict_proba matrix if per-class probabilities are needed.
cb_probs = catboost_model.predict_proba(cb_test_features)[:, 1]

# Same metrics as computed for the NODE model.
cb_metrics = {
    'Accuracy': accuracy_score(true_labels, cb_pred),
    'F1': f1_score(true_labels, cb_pred, average='weighted')
}

In [89]:
# Feature matrices (target column removed) for each split.
xgb_train_features = train_df.drop(columns=[target])
xgb_val_features = val_df.drop(columns=[target])
xgb_test_features = test_df.drop(columns=[target])

# XGBoost with default hyperparameters; the validation split is only passed
# as an evaluation set, with logging silenced.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(
    xgb_train_features,
    train_df[target],
    eval_set=[(xgb_val_features, val_df[target])],
    verbose=0,
)

# Hard labels for the test rows, plus the probability column at index 1.
# NOTE(review): with more than two classes that column covers a single class
# only -- confirm this is what downstream code expects.
xgb_pred = xgb_model.predict(xgb_test_features)
xgb_probs = xgb_model.predict_proba(xgb_test_features)[:, 1]

# Same metrics as computed for the other models.
xgb_metrics = dict(
    Accuracy=accuracy_score(true_labels, xgb_pred),
    F1=f1_score(true_labels, xgb_pred, average="weighted"),
)

In [90]:
import pandas as pd
import plotly.express as px

# One row per model, one column per metric.
results = pd.DataFrame({
    'NODE': node_metrics,
    'CatBoost': cb_metrics,
    'XGBoost': xgb_metrics
}).T.reset_index().rename(columns={'index': 'Model'})

# Long format (Model, Metric, Score) as expected by plotly express.
results_melted = results.melt(id_vars="Model", var_name="Metric", value_name="Score")

# Grouped bars: metrics on the x axis, one bar per model.
# The title previously said "Iris" (copy-paste from the Iris section); these
# are the Wine results.
fig = px.bar(
    results_melted,
    x="Metric",
    y="Score",
    color="Model",
    barmode="group",
    text_auto=".3f",
    title="Wine Dataset Model Comparison",
    height=400
)

fig.update_layout(
    yaxis_title="Score",
    xaxis_title="Metric",
    title_x=0.5,
    template="plotly_white",
    legend_title="Model"
)

fig.show()

## Automatically tuned