In [None]:
## Base library installation
# Install the baseline models (CatBoost, XGBoost) for model comparison
!uv pip install catboost xgboost

# Install the datasets library for loading example data
!uv pip install datasets

# Install rich for better and more readable printing
!uv pip install rich


## TabPFN installation, optimized for Google Colab
# Install the TabPFN Client library
!uv pip install tabpfn-client

# Install tabpfn from source
# Clone the repository (shallow clone, --depth 1, for speed)
!git clone --depth 1 https://github.com/PriorLabs/tabpfn

# Speed up installation in this notebook:
# delete every line that mentions torch from pyproject.toml, because torch is already
# installed on Colab (do not run this in your local setup)
!sed -i "/torch/d" tabpfn/pyproject.toml

# Editable install from the cloned 'tabpfn' directory
!uv pip install -e "tabpfn"

# Install TabPFN extensions for additional functionality
# NOTE(review): unlike the tabpfn clone above, this is a full (non-shallow) clone —
# confirm whether --depth 1 was intended here as well
!git clone https://github.com/PriorLabs/tabpfn-extensions

# Speed up installation in this notebook:
# delete every line that mentions torch from pyproject.toml, because torch is already
# installed on Colab (do not run this in your local setup)
!sed -i "/torch/d" tabpfn-extensions/pyproject.toml

# Editable install of the extensions with the "all" optional extras
!uv pip install -e tabpfn-extensions[all]

[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m21 packages[0m [2min 493ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[

In [None]:
# Sanity check: this import raises ImportError if tabpfn is not installed in the
# current kernel, so the success message is only printed when the import worked.
import tabpfn
print("TabPFN imported successfully!")

TabPFN imported successfully!


In [None]:
# Standard Library Imports

# TabPFN and Extensions

try:
    from tabpfn import TabPFNClassifier, TabPFNRegressor
    from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (
        AutoTabPFNClassifier,
    )
except ImportError:
    raise ImportError(
        "Warning: Could not import TabPFN / TabPFN extensions. Please run installation above and restart the session afterwards (Runtime > Restart Session)."
    )

# Data Science & Visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Other ML Models
from catboost import CatBoostClassifier, CatBoostRegressor

# Notebook UI/Display
from IPython.display import Markdown, display
from rich.console import Console
from rich.panel import Panel
from rich.prompt import Prompt
from rich.rule import Rule
from sklearn.compose import make_column_selector, make_column_transformer

# Scikit-Learn: Data & Preprocessing
from sklearn.datasets import fetch_openml, load_breast_cancer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier, XGBRegressor

# This transformer will be used to handle categorical features for the baseline models.
# Columns with dtype "object" or "category" are ordinal-encoded, and categories that
# were not seen during fit are encoded as -1. Every other column is passed through
# unchanged (remainder="passthrough").
# NOTE(review): no visible cell uses column_transformer — confirm it is still needed.
column_transformer = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        make_column_selector(dtype_include=["object", "category"]),
    ),
    remainder="passthrough",
)

In [None]:
from sklearn.utils import shuffle
from pathlib import Path

In [None]:
def split_data(X, Y, p=0.8, seed=123456):
    """Split features and targets into train and test parts.

    Args:
        X: Feature table, forwarded unchanged to ``train_test_split``.
        Y: Targets aligned with ``X``.
        p: Value passed as ``train_size`` (default 0.8).
        seed: Value passed as ``random_state`` so the split is reproducible.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # For two input arrays train_test_split returns the four parts in exactly
    # the order this function promises, so they only need to be packed.
    parts = train_test_split(X, Y, train_size=p, random_state=seed)
    return tuple(parts)

In [None]:
def performance_eval(Y_pred, Y_test):
    """Compute MAE, RMSE and symmetric MAPE between predictions and targets.

    Both inputs are converted to flat float arrays before comparison, so a
    column vector of shape (n, 1) and a flat vector of shape (n,) are compared
    element by element instead of being broadcast into an (n, n) matrix.

    Args:
        Y_pred: Predicted values (any array-like numpy can convert).
        Y_test: True values, with the same number of elements as ``Y_pred``.

    Returns:
        ``(MAE, RMSE, MAPE)`` where
        - MAE is the mean absolute error,
        - RMSE is the root mean squared error,
        - MAPE is the *symmetric* mean absolute percentage error in percent:
          ``mean(|y - y_hat| / ((|y| + |y_hat|) / 2)) * 100``.
          A pair where target and prediction are both 0 contributes 0
          (a perfect prediction) rather than a 0/0 NaN.
    """
    pred = np.asarray(Y_pred, dtype=float).ravel()
    true = np.asarray(Y_test, dtype=float).ravel()

    err = pred - true
    abs_err = np.abs(err)

    MAE = np.mean(abs_err)
    RMSE = np.sqrt(np.mean(err ** 2))

    # The denominator is zero only when both values are exactly zero; those
    # entries keep the 0.0 from `out`, so they neither warn nor produce NaN.
    denom = (np.abs(true) + np.abs(pred)) / 2
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    MAPE = np.mean(ratio) * 100

    return MAE, RMSE, MAPE

In [None]:
# Mount Google Drive at /content/drive (Colab only); the DATA_DIR paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Base

ICL4000

In [None]:
# Set up the data directory path.
# The experiment cells read dataset_*.csv from this directory and also write their
# results_*.csv / summary_statistics_*.csv files into it, until DATA_DIR is reassigned.
DATA_DIR = Path("/content/drive/My Drive/tabPFN_data/linear_exp_all_positive2")

In [None]:
from pathlib import Path

# Base experiment, full training context: fit TabPFN on the whole training split
# of each dataset and score it on the held-out split.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset. The name `df` is kept because the next cell displays it.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_base.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_base.csv")

In [None]:
# Display the results table; when cells are run in order this is the per-dataset
# metrics frame built by the previous cell.
df

Unnamed: 0,csv,mae,rmse,mape
0,/content/drive/My Drive/tabPFN_data/linear_exp...,0.116431,0.389478,2.143777
1,/content/drive/My Drive/tabPFN_data/linear_exp...,0.104303,0.130838,2.308963
2,/content/drive/My Drive/tabPFN_data/linear_exp...,0.102707,0.134133,2.119487
3,/content/drive/My Drive/tabPFN_data/linear_exp...,0.086523,0.118739,2.260998
4,/content/drive/My Drive/tabPFN_data/linear_exp...,0.088582,0.112392,3.500569
...,...,...,...,...
995,/content/drive/My Drive/tabPFN_data/linear_exp...,0.093152,0.115417,2.969915
996,/content/drive/My Drive/tabPFN_data/linear_exp...,0.093385,0.127502,2.368307
997,/content/drive/My Drive/tabPFN_data/linear_exp...,0.081944,0.107539,2.015369
998,/content/drive/My Drive/tabPFN_data/linear_exp...,0.098313,0.124965,2.268969


In [None]:
# Summarise the MAE of the first 100 datasets from a saved results file.
# NOTE: results_rowOrder_fewshot10.csv is written by the "Row Order / ICL10" cell
# further down, so on a fresh kernel that cell has to run before this one.
df = pd.read_csv(DATA_DIR / "results_rowOrder_fewshot10.csv")
first_hundred_mae = df["mae"].head(100)
first_hundred_mae.describe()

Unnamed: 0,mae
count,100.0
mean,1.111948
std,0.307509
min,0.42209
25%,0.911799
50%,1.112821
75%,1.281054
max,2.482564


ICL10

In [None]:
from pathlib import Path

# Base experiment, few-shot: fit TabPFN on only the first `n_context` training
# rows of each dataset and score it on the full held-out split.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated
n_context = 10       # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_base_fewshot10.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_base_fewshot10.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * 

ICL20

In [None]:
from pathlib import Path

# Base experiment, few-shot: fit TabPFN on only the first `n_context` training
# rows of each dataset and score it on the full held-out split.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated
n_context = 20       # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_base_fewshot20.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_base_fewshot20.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)


ICL500

In [None]:
from pathlib import Path

# Base experiment, few-shot: fit TabPFN on only the first `n_context` training
# rows of each dataset and score it on the full held-out split.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated
n_context = 500      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_base_fewshot500.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_base_fewshot500.csv")

## Row Order

ICL4000

In [None]:
from pathlib import Path

# Row-order experiment, full training context: same as the base run, except the
# training rows are re-shuffled (features and targets together) before fitting.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Permute the training rows only; the test split is left untouched.
    X_train, Y_train = shuffle(X_train, Y_train, random_state=123456)

    model_row = TabPFNRegressor(random_state=123456)
    model_row.fit(X_train, Y_train)
    y_pred = model_row.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset. The name `df` is kept because the next cell displays it.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_rowOrder.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_rowOrder.csv")

In [None]:
# Display the results table; when cells are run in order this is the per-dataset
# metrics frame built by the previous (row-order) cell.
df

Unnamed: 0,csv,mae,rmse,mape
0,/content/drive/My Drive/tabPFN_data/linear_exp...,0.116028,0.385564,2.142260
1,/content/drive/My Drive/tabPFN_data/linear_exp...,0.105003,0.132267,2.332470
2,/content/drive/My Drive/tabPFN_data/linear_exp...,0.102337,0.133577,2.115820
3,/content/drive/My Drive/tabPFN_data/linear_exp...,0.086595,0.118547,2.265478
4,/content/drive/My Drive/tabPFN_data/linear_exp...,0.088973,0.112360,3.504979
...,...,...,...,...
995,/content/drive/My Drive/tabPFN_data/linear_exp...,0.092295,0.114690,2.957135
996,/content/drive/My Drive/tabPFN_data/linear_exp...,0.093109,0.128055,2.359052
997,/content/drive/My Drive/tabPFN_data/linear_exp...,0.083323,0.108347,2.035877
998,/content/drive/My Drive/tabPFN_data/linear_exp...,0.098438,0.125333,2.276191


ICL10

In [None]:
from pathlib import Path

# Row-order experiment, few-shot: keep the first `n_context` training rows, then
# shuffle those rows (features and targets together) before fitting.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated
n_context = 10       # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]
    # Subsample first, shuffle second: only the order of the kept rows changes.
    X_train, Y_train = shuffle(X_train, Y_train, random_state=123456)

    model_row = TabPFNRegressor(random_state=123456)
    model_row.fit(X_train, Y_train)
    y_pred = model_row.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_rowOrder_fewshot10.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_rowOrder_fewshot10.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * 

ICL20

In [None]:
from pathlib import Path

# Row-order experiment, few-shot: keep the first `n_context` training rows, then
# shuffle those rows (features and targets together) before fitting.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 20      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]
    # Subsample first, shuffle second: only the order of the kept rows changes.
    X_train, Y_train = shuffle(X_train, Y_train, random_state=123456)

    model_row = TabPFNRegressor(random_state=123456)
    model_row.fit(X_train, Y_train)
    y_pred = model_row.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_rowOrder_fewshot20_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_rowOrder_fewshot20_100.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)


ICL500

In [None]:
from pathlib import Path

# Row-order experiment, few-shot: keep the first `n_context` training rows, then
# shuffle those rows (features and targets together) before fitting.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 500     # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]
    # Subsample first, shuffle second: only the order of the kept rows changes.
    X_train, Y_train = shuffle(X_train, Y_train, random_state=123456)

    model_row = TabPFNRegressor(random_state=123456)
    model_row.fit(X_train, Y_train)
    y_pred = model_row.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_rowOrder_fewshot500_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_rowOrder_fewshot500_100.csv")

## Num_Digits

ICL4000

In [None]:
# Point DATA_DIR at the "10_digits" dataset directory. Cells that run after this one
# read their dataset_*.csv files from here and write their result files here too.
DATA_DIR = Path("/content/drive/My Drive/tabPFN_data/linear_exp_all_positive2_10_digits")

In [None]:
from pathlib import Path

# Num_Digits experiment, full training context: same procedure as the base run,
# applied to whatever dataset_*.csv files the current DATA_DIR contains.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_10_digits.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_10_digits.csv")

ICL10

In [None]:
from pathlib import Path

# Num_Digits experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 10      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_10_digits_fewshot10_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_10_digits_fewshot10_100.csv")

ICL20

In [None]:
from pathlib import Path

# Num_Digits experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 20      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_10_digits_fewshot20_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_10_digits_fewshot20_100.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)


ICL500

In [None]:
from pathlib import Path

# Num_Digits experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 500     # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_10_digits_fewshot500_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_10_digits_fewshot500_100.csv")

## Column_Order

ICL4000

In [None]:
# Point DATA_DIR at the "shuffle" dataset directory. Cells that run after this one
# read their dataset_*.csv files from here and write their result files here too.
# NOTE(review): presumably these CSVs hold the column-reordered datasets for the
# Column_Order section — confirm against the data-generation code.
DATA_DIR = Path("/content/drive/My Drive/tabPFN_data/linear_exp_all_positive2_shuffle")

In [None]:
from pathlib import Path

# Column_Order experiment, full training context: same procedure as the base run.
# No columns are reordered here; any reordering has to be present already in the
# dataset_*.csv files under the current DATA_DIR.
max_datasets = 1000  # upper bound on the number of dataset_*.csv files evaluated

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_shuffle.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_shuffle.csv")

ICL10

In [None]:
from pathlib import Path

# Column_Order experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 10      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_shuffle_fewshot10_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_shuffle_fewshot10_100.csv")

ICL20

In [None]:
from pathlib import Path

# Column_Order experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 20      # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_shuffle_fewshot20_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_shuffle_fewshot20_100.csv")

  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)
  x_inv[pos] = np.expm1(np.log(x[pos] * lmbda + 1) / lmbda)


ICL500

In [None]:
from pathlib import Path

# Column_Order experiment, few-shot: fit TabPFN on only the first `n_context`
# training rows of each dataset and score it on the full held-out split.
max_datasets = 100  # upper bound on the number of dataset_*.csv files evaluated
n_context = 500     # number of training rows given to the model

files = sorted(DATA_DIR.glob("dataset_*.csv"))[:max_datasets]
if not files:
    # Fail fast: otherwise an empty results file is written and the summary step
    # dies with an unrelated KeyError on the missing metric columns.
    raise FileNotFoundError(f"No dataset_*.csv files found in {DATA_DIR}")

rows = []
for csv_path in files:
    dataset = pd.read_csv(csv_path)
    X = dataset.drop(columns=["Y"])
    Y = dataset["Y"]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # Explicit positional slicing on both objects keeps features and targets aligned.
    X_train = X_train.iloc[:n_context, :]
    Y_train = Y_train.iloc[:n_context]

    model = TabPFNRegressor(random_state=123456)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    MAE, RMSE, MAPE = performance_eval(y_pred, Y_test)

    rows.append({"csv": str(csv_path), "mae": MAE, "rmse": RMSE, "mape": MAPE})

# One row per dataset, plus describe() statistics of the three metrics.
df = pd.DataFrame(rows)
df.to_csv(DATA_DIR / "results_shuffle_fewshot500_100.csv", index=False)
df[["mae", "rmse", "mape"]].describe().to_csv(DATA_DIR / "summary_statistics_shuffle_fewshot500_100.csv")