# Installation and setup
#### Adapted from: https://colab.research.google.com/github/PriorLabs/TabPFN/blob/main/examples/notebooks/TabPFN_Demo_Local.ipynb

In [2]:
## Base library Installation
# Install Baselines for model comparison
!pip install catboost xgboost

# Install the datasets library for loading example data
!pip install datasets

# Install rich for better and more readable printing
!pip install rich

## TabPFN Installation optimized for Google Colab
# Install the TabPFN Client library
!pip install tabpfn-client

# Install TabPFN extensions for additional functionalities
!pip install tabpfn-extensions[all]

# Install tabpfn
!pip install tabpfn

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m97.4 MB/s[0m  [33m0:00:01[0m6m0:00:01[0m00:01[0m
[?25hDownloading plotly-6.5.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m96.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: plotly, catboost
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [catboost]1/2[0m [catboost]
[1A[2KSuccessfully installed catboost-1.2.8 plotly-6.5.0
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Using cached multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
C

In [3]:
# Standard Library Imports

# TabPFN and Extensions

# Import TabPFN and its extensions; if either package is missing, fail with
# instructions that point back to the installation cell.
try:
    from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (
        AutoTabPFNClassifier,
    )

    from tabpfn import TabPFNClassifier, TabPFNRegressor
except ImportError as import_error:
    # Chain explicitly so the traceback shows the original failure (which
    # module was missing) as the direct cause of this friendlier error.
    raise ImportError(
        "Warning: Could not import TabPFN / TabPFN extensions. Please run installation above and restart the session afterwards (Runtime > Restart Session)."
    ) from import_error

# Data Science & Visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Other ML Models
from catboost import CatBoostClassifier, CatBoostRegressor

# Notebook UI/Display
from IPython.display import Markdown, display
from rich.console import Console
from rich.panel import Panel
from rich.prompt import Prompt
from rich.rule import Rule
from sklearn.compose import make_column_selector, make_column_transformer

# Scikit-Learn: Data & Preprocessing
from sklearn.datasets import fetch_openml, load_breast_cancer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier, XGBRegressor

# Preprocessing for the baseline models: categorical columns are
# ordinal-encoded, every other column is passed through unchanged.

# Categories not seen during fit are mapped to -1 instead of raising.
baseline_cat_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Selects the columns whose dtype is "object" or "category".
baseline_cat_selector = make_column_selector(dtype_include=["object", "category"])

column_transformer = make_column_transformer(
    (baseline_cat_encoder, baseline_cat_selector),
    remainder="passthrough",
)

  import pkg_resources


# Setting up TabPFN

In [16]:
# Smoke test: confirm that the TabPFN classifier can be imported in this kernel.
from tabpfn import TabPFNClassifier

# The class is used like any other sklearn classifier; no model is
# instantiated in this cell.
import_status = "TabPFNClassifier imported successfully."
print(import_status)

TabPFNClassifier imported successfully.


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# -------------------------
# Column roles (declared once so every step below agrees on them)
# -------------------------
target_col = 'NWCG_GENERAL_CAUSE'
categorical_cols = ['Mang_Name', 'STATE']
# 'Unnamed: 0' is a *column* of this frame, not a cause label, so it has to be
# removed from the features rather than added to the list of classes to drop.
# Presumably it is the row index written out when the CSV was saved — confirm.
index_artifact_col = 'Unnamed: 0'

df = pd.read_csv("FPA-FOD_39attributes.csv", low_memory=False)

# Rows without a determined cause carry no usable label.
unknown_mask = df[target_col] == "Missing data/not specified/undetermined"
df_known = df[~unknown_mask].copy()

# -------------------------
# Drop 2 smallest classes
# -------------------------
cause_counts = df_known[target_col].value_counts()
classes_to_drop = cause_counts.nsmallest(2).index.tolist()
df_reduced = df_known[~df_known[target_col].isin(classes_to_drop)].copy()

# -------------------------
# Fill missing values (prevents NaNs in PyTorch)
# -------------------------
# NOTE(review): imputation and one-hot encoding are fitted on the full data
# before the train/test split, so test rows influence the medians and the
# category vocabulary. Fit them on the training split only if that matters.
df_reduced[categorical_cols] = df_reduced[categorical_cols].fillna('Unknown')

# Every column that is neither the target nor categorical is treated as numeric.
numeric_cols = df_reduced.drop(columns=[target_col, *categorical_cols]).columns
df_reduced[numeric_cols] = df_reduced[numeric_cols].fillna(
    df_reduced[numeric_cols].median()
)

# -------------------------
# Recreate X, y
# -------------------------
X = df_reduced.drop(columns=[target_col])
# errors='ignore' keeps the cell working on a CSV that has no such column.
X = X.drop(columns=[index_artifact_col], errors='ignore')
y = df_reduced[target_col]

# Encode y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Encode categorical columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])

# Build final X DataFrame
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_cols))
# encoded_df has a fresh 0..n-1 index, so the numeric part must be re-indexed
# the same way; otherwise concat would align on mismatched labels and
# introduce NaN rows.
X_numeric = X.drop(columns=categorical_cols).reset_index(drop=True)
X_final = pd.concat([X_numeric, encoded_df], axis=1)

# Fail here, with a clear message, rather than deep inside the model.
# A column that is entirely missing has a NaN median and survives imputation.
assert not X_final.isna().any().any(), "X_final still contains NaNs after imputation"

# -------------------------
# Train/Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [27]:
# Fit TabPFN on the training split and predict class probabilities for the
# held-out split.
# NOTE(review): TabPFN is aimed at small-to-medium tables; confirm that the
# row, feature and class counts of X_train are within the limits documented
# for the installed version — TODO confirm.
tabpfn_classifier = TabPFNClassifier(random_state=42)
tabpfn_classifier.fit(X_train, y_train)
y_pred_proba = tabpfn_classifier.predict_proba(X_test)

# Calculate the ROC AUC score.
# predict_proba returns one column per class. Taking column 1 is only valid
# for a binary target; with more than two classes roc_auc_score needs the
# full probability matrix and an explicit multi-class strategy.
if y_pred_proba.shape[1] == 2:
    roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
print(f"TabPFN ROC AUC Score: {roc_auc:.4f}")

ValueError: There should be no NaNs in the encoded x and y.Check that you do not feed NaNs or use a NaN-handling enocder.Your embedded x and y returned the following:torch.isnan(embedded_x).any()=tensor(True, device='cuda:0') | torch.isnan(embedded_y).any()=tensor(False, device='cuda:0')

In [28]:
# Number of missing values in each column of the reduced frame, largest first.
missing_per_column = df_reduced.isna().sum()
missing_per_column.sort_values(ascending=False)


Unnamed: 0              0
DISCOVERY_DOY           0
FIRE_YEAR               0
STATE                   0
FIPS_CODE               0
Annual_etr              0
Annual_precipitation    0
Annual_tempreture       0
pr                      0
tmmn                    0
vs                      0
fm100                   0
fm1000                  0
bi                      0
vpd                     0
erc                     0
Elevation_1km           0
Aspect_1km              0
erc_Percentile          0
Slope_1km               0
TPI_1km                 0
EVC                     0
Evacuation              0
SDI                     0
FRG                     0
No_FireStation_5.0km    0
Mang_Name               0
GAP_Sts                 0
GACC_PL                 0
GDP                     0
GHM                     0
NDVI-1day               0
NPL                     0
Popo_1km                0
RPL_THEMES              0
RPL_THEME1              0
RPL_THEME2              0
RPL_THEME3              0
RPL_THEME4  

In [29]:
import numpy as np

# Materialise the feature matrix once, then report whether it contains any
# NaN or infinite entries.
feature_matrix = X_final.to_numpy()
has_nan = np.isnan(feature_matrix).any()
has_inf = np.isinf(feature_matrix).any()
print("NaNs in X:", has_nan)
print("Infs in X:", has_inf)


NaNs in X: False
Infs in X: False


In [30]:
import torch

# Repeat the NaN check after converting the features to a float32 tensor.
features_f32 = torch.tensor(X_final.to_numpy(), dtype=torch.float32)
torch.isnan(features_f32).any()


tensor(False)