In [None]:
from tabpfn import TabPFNClassifier
from sklearn.ensemble import RandomForestClassifier

import experiments.runners as runners
from data import load_us_perm_visas, load_credit_default

# Random-forest baseline. Seeded with the same value the notebook uses for its
# cross-validation splitter so repeated runs build the same forest.
rf_model = RandomForestClassifier(random_state=42)
# TabPFN classifier with ignore_pretraining_limits enabled.
# NOTE(review): presumably needed because the dataset exceeds the size range
# TabPFN was pre-trained for — confirm against the TabPFN documentation.
pfn_model = TabPFNClassifier(ignore_pretraining_limits=True)

In [None]:
# Load the features and target through the project's US PERM visa loader.
# NOTE(review): X is presumably a pandas DataFrame (the dtype-based column
# selectors used by the pipelines need one) and y the class labels — confirm
# against data.load_us_perm_visas.
X, y = load_us_perm_visas()

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np
import math
from models.preprocessors import DatetimeFeatureSplitter, DatetimeFeatureEncoder


In [None]:
## RF Pipeline
# Preprocessing for the random forest. Each transformer is routed to columns by
# dtype; columns matched by no selector are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # Project-local encoder applied to datetime columns.
        ('datetime', DatetimeFeatureEncoder(), make_column_selector(dtype_include='datetime64[ns]')),
        # Object-dtype columns are target-encoded.
        ('cat high c', TargetEncoder(), make_column_selector(dtype_include='object')),
        # Missing numeric values become 0; no need to scale numerical columns for RandomForest.
        ('num', SimpleImputer(strategy='constant', fill_value=0), make_column_selector(dtype_include=['int64', 'float64'])),
        # Category-dtype columns are one-hot encoded; unseen categories are ignored at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    ],
    remainder='passthrough',
)

# Create classifier pipeline. The classifier step must be the random forest:
# this cell previously wrapped pfn_model, so the "RF" pipeline was in fact a
# second TabPFN pipeline with RF-specific preprocessing.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# Extra handle for the RF pipeline: the TabPFN cell rebinds `pipeline`, which
# would otherwise make this object unreachable.
rf_pipeline = pipeline

In [None]:
## TabPFN pipeline
# Column groups, selected by dtype, so each group can get its own transformer.
datetime_cols = make_column_selector(dtype_include='datetime64[ns]')
numeric_cols = make_column_selector(dtype_include=['int64', 'float64'])
category_cols = make_column_selector(dtype_include='category')
object_cols = make_column_selector(dtype_include='object')

# TODO: having to impute values for TabPFN is not great, but the model
# otherwise had issues with missing values. TabPFN seems to expect missing
# values in a particular format; here it received an NA-like value in what it
# expected to be a str column.
# The numeric strategy (mean) differs from the one used for the random forest,
# which shows a limit of this approach: preprocessing is not shared between models.
preprocessor = ColumnTransformer(
    [
        ('datetime', DatetimeFeatureSplitter(), datetime_cols),
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), category_cols),
        ('cat high cardinality', SimpleImputer(strategy='most_frequent'), object_cols),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the TabPFN classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', pfn_model),
])

In [None]:
# Upper bound on the number of training rows handed to the model.
# NOTE(review): presumably chosen to stay within what TabPFN can handle in
# practice — confirm where this value comes from.
max_size_for_model = 28_000
# Number of folds for the stratified cross-validation.
n_splits = 5

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import LearningCurveDisplay, StratifiedKFold # Use StratifiedKFold for classification

# Define cross-validation strategy: stratified K-fold, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

n_samples = len(X)
n_splits = cv.get_n_splits(X, y)  # Read the fold count back from the cv object

# Max training size allowed by the CV split: floor(n_samples * (1 - 1/n_splits)).
# Computed in integer arithmetic so float rounding cannot shift the result.
max_size_from_cv = n_samples * (n_splits - 1) // n_splits

# Final cap: the tighter of the CV limit and the model/config limit
final_max_train_size = min(max_size_for_model, max_size_from_cv)

# Training sizes for the learning curve: num_steps points from start_size up
# to the cap computed above.
start_size = 100
num_steps = 10

if final_max_train_size < start_size:
    # Even the smallest desired size is too large: fall back to the single
    # largest size that is allowed.
    train_sizes_abs = np.array([final_max_train_size])
    print(f"Warning: Max allowed train size ({final_max_train_size}) is less than start_size ({start_size}). Using only max size.")
else:
    # Integer truncation can yield repeated sizes when the range is narrower
    # than num_steps; np.unique removes them (and keeps ascending order) so the
    # sizes printed below are exactly the ones that get evaluated.
    train_sizes_abs = np.unique(
        np.linspace(start_size, final_max_train_size, num_steps, dtype=int)
    )

print(f"Total samples: {n_samples}")
print(f"CV folds: {n_splits}")
print(f"Max training samples per fold: {max_size_from_cv}")
print(f"Model/Config limit: {max_size_for_model}")
print(f"Final max train size used in learning curve: {final_max_train_size}")
print(f"Train sizes to be tested: {train_sizes_abs}")

In [None]:
# TabPFNClassifier emits a lot of FutureWarning messages; install a filter
# that ignores that whole warning category.
import warnings

warnings.filterwarnings(action="ignore", category=FutureWarning)

In [None]:
print("Generating learning curve for TabPFN...")
# Fit and score the pipeline at every training size for every CV fold, then plot.
# NOTE: the result is bound to `display`, which hides IPython's built-in
# display() helper for the rest of the notebook; the name is kept because later
# cells read display.train_sizes / display.train_scores / display.test_scores.
display = LearningCurveDisplay.from_estimator(
    pipeline,
    X,
    y,
    cv=cv,
    train_sizes=train_sizes_abs,  # Absolute sizes, capped to suit TabPFN
    scoring="f1",  # Or "roc_auc", "neg_log_loss", etc. depending on the goal
    n_jobs=1,  # Number of CPU cores to use
    # Without shuffle=True, random_state is ignored and each subset is simply
    # the first N rows of the fold's training indices in dataset order.
    # Shuffling makes each subset a seeded random draw from the training fold.
    shuffle=True,
    random_state=42,
)

display.ax_.set_title("Learning Curve for TabPFN on Visa Data")
plt.show()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Show the raw validation scores from the learning curve: one row per training
# size, one column per CV fold.
display.test_scores

In [None]:
# 1. Pull the per-fold results off the learning-curve display object.
train_sizes_abs = display.train_sizes
train_scores_folds = display.train_scores  # indexed as [size_index, fold]
test_scores_folds = display.test_scores    # indexed as [size_index, fold]

# 2. The number of folds is the length of the score arrays' second axis.
n_folds = train_scores_folds.shape[1]

# 3. Build long-format records: for every (training size, fold) pair, one
#    'train' row followed by one 'test' row.
data_list = [
    {
        'train_size': size,
        'fold': fold,
        'score_type': score_type,
        'score': scores[i, fold],
    }
    for i, size in enumerate(train_sizes_abs)
    for fold in range(n_folds)
    for score_type, scores in (
        ('train', train_scores_folds),
        ('test', test_scores_folds),
    )
]

# 4. Assemble the records into a DataFrame.
df_learning_curve = pd.DataFrame(data_list)

# 5. Write the table to CSV in the current working directory, without the index.
csv_filename = 'learning_curve_data.csv'
df_learning_curve.to_csv(csv_filename, index=False)

# Confirm the export and preview both ends of the table.
print(f"Learning curve data exported to {csv_filename}")
print("\nDataFrame head:")
print(df_learning_curve.head())
print("\nDataFrame tail:")
print(df_learning_curve.tail())