In [1]:
from tabpfn import TabPFNClassifier
from sklearn.ensemble import RandomForestClassifier

import experiments.runners as runners
from data import load_us_perm_visas, load_credit_default

# Fixed seed so the random forest's bootstrap and feature sampling are
# repeatable across kernel restarts.
SEED = 42

# Random forest model; it is not used by the cells shown in this notebook section.
rf_model = RandomForestClassifier(random_state=SEED)
# ignore_pretraining_limits=True lets TabPFN accept data outside the range it
# was pre-trained on (see the TabPFN documentation for the exact limits).
pfn_model = TabPFNClassifier(ignore_pretraining_limits=True)

In [2]:
# Instantiate the run class that provides functions for training and evaluation.
# Per the log printed by this cell, construction loads the data, splits and
# subsamples it, and sets up the model pipeline.
new_run = runners.BasicRun(pfn_model, load_us_perm_visas)

Loading data...
Loaded 356161 samples with 14 features and 2 unique classes.
Splitting data for TabPFNClassifier...
Subsampling data: Selecting 14925 out of 356161 samples to meet max_length=10000 constraint with test_size=0.33.
Setting up pipeline for TabPFNClassifier...
TabPFN model detected.


In [3]:
# Inspect the training features held by the run object
new_run.X_train

Unnamed: 0,decision_date,employer_name,employer_city,employer_state,job_info_work_city,job_info_work_state,pw_soc_code,pw_unit_of_pay_9089,pw_source_name_9089,pw_soc_title,country_of_citizenship,class_of_admission,pw_level_9089,pw_amount_9089
107134,2014-08-12,ericsson inc.,plano,tx,plano,tx,17-2071,yr,oes,electrical engineers,india,h-1b,level ii,78832.0
266132,2016-02-10,r.o.lewis inc,richmond,va,richmond,va,49-3023,yr,oes,automotive service technicians and mechanics,pakistan,,level i,27227.0
294416,2016-04-29,"essilor of america, inc",dallas,tx,dallas,tx,11-2021,yr,other,marketing managers,china,h-1b,,77900.0
354092,2016-10-18,genuine parts company,atlanta,ga,atlanta,ga,15-1132,yr,oes,"software developers, applications",india,h-1b,level i,57325.0
217856,2015-04-30,"qualcomm, inc.",san diego,ca,san diego,ca,15-1121,yr,oes,computer systems analysts,india,h-1b,level ii,74318.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66978,2014-04-10,lpl financial llc,san diego,ca,charlotte,nc,15-1132,yr,oes,"software developers, applications",india,h-1b,level iv,105040.0
2474,2011-12-12,james alan hill,arlington,va,arlington,va,39-9011.01,yr,oes,nannies,,b-2,level iii,24690.0
163957,2014-12-03,coolsoft llc,louisville,ky,louisville,ky,15-1132,yr,oes,"software developers, applications",india,h-1b,level iii,78728.0
302928,2016-05-20,"ustream, inc.",san francisco,ca,san francisco,ca,15-1132,yr,oes,"software developers, applications",hungary,l-1,level ii,96866.0


In [4]:
# Inspect the training labels (the `case_status` column, stored as integers)
new_run.y_train

107134    1
266132    1
294416    1
354092    1
217856    1
         ..
66978     1
2474      1
163957    1
302928    1
94448     1
Name: case_status, Length: 9999, dtype: int64

In [5]:
# Inspect the pipeline that was set up when the run object was constructed
new_run.pipe

In [6]:
# Ignore FutureWarning, of which TabPFNClassifier has a lot!
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
# Fit the model on the training data. The call evaluates to the run object
# itself, which is why its repr appears as the cell output.
new_run.fit()

Fitting TabPFNClassifier on 9999 samples with 14 features ...
DONE: TabPFNClassifier fit


<experiments.runners.BasicRun at 0x105c218e0>

In [None]:
# WARNING: slow cell - TabPFN takes 50-54 min predicting both class and probability
# NOTE(review): presumably stores its results on the run object
# (`prob_predictions` is read further down) - confirm in runners.BasicRun.
new_run.predict()

In [None]:
# Evaluate the run.
# NOTE(review): presumably requires predict() to have been executed first - confirm.
new_run.score()

## Work in progress: ROC
The cells below are intended to make the ROC AUC methods work. There is currently a problem with the array dimensions.

In [None]:
# First column of the predicted probabilities.
# NOTE(review): presumably the array is (n_samples, n_classes) with columns in
# sorted label order, making column 0 the probability of the lowest label - confirm.
new_run.prob_predictions[:, 0]

In [None]:
# Peek at the first test label by position.
# y_train is a pandas Series that keeps the original, shuffled row index, and
# y_test presumably has the same form (TODO confirm). On such a Series `[0]`
# looks up the index *label* 0 and raises KeyError unless that row happens to be
# in the test split; `.iloc[0]` always returns the first element.
new_run.y_test.iloc[0]

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# The labels are integer-encoded (y_train is int64), so the positive class must
# be given as the integer 1. Passing the raw string 'certified' matches no
# sample, which leaves the true-positive rate undefined.
# NOTE(review): assumes 1 is the encoding of the "certified" outcome - confirm
# against the data loader.
POSITIVE_LABEL = 1

# Ranking score for the ROC analysis: predicted probability of the positive class.
# NOTE(review): assumes prob_predictions is (n_samples, 2) with columns in sorted
# label order, so column 1 belongs to label 1 - confirm.
positive_scores = new_run.prob_predictions[:, 1]

# Calculate ROC curve points
fpr, tpr, thresholds = roc_curve(new_run.y_test, positive_scores, pos_label=POSITIVE_LABEL)

# Calculate AUC score from the same labels and scores as the curve
auc_score = roc_auc_score(new_run.y_test, positive_scores)
print(f"ROC AUC: {auc_score}")

In [None]:
# Draw the ROC curve together with the chance-level diagonal
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (area = {auc_score:.3f})')
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")