# TabPFN Starter Notebook
In this notebook, I change small parts of C. Deotte's provided starter notebook: instead of the original model, I train a TabPFN model and ensemble it with the best public notebook. Since TabPFN performed well on its own, I'm curious to see how it performs when combined with a high-scoring starter notebook. The best public notebook on its own achieves `LB = 0.954`.

# Load Data

In [1]:
# !git clone https://github.com/PriorLabs/tabpfn-extensions
!pip install tabpfn
# !pip install -e tabpfn-extensions

Collecting tabpfn
  Downloading tabpfn-2.0.6-py3-none-any.whl.metadata (20 kB)
Downloading tabpfn-2.0.6-py3-none-any.whl (124 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tabpfn
Successfully installed tabpfn-2.0.6


In [2]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# Read the competition training split and preview its first rows.
TRAIN_CSV = "/kaggle/input/playground-series-s5e3/train.csv"
train = pd.read_csv(TRAIN_CSV)
print("Train shape", train.shape)
train.head()

Train shape (2190, 13)


Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [3]:
# Read the competition test split and preview its first rows.
TEST_CSV = "/kaggle/input/playground-series-s5e3/test.csv"
test = pd.read_csv(TEST_CSV)
print("Test shape:", test.shape)
test.head()

Test shape: (730, 12)


Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [4]:
# Load the original rainfall dataset, normalise its header (stray whitespace
# around column names) and encode the yes/no target as 1/0.
orig = (
    pd.read_csv("/kaggle/input/rainfall-prediction-using-machine-learning/Rainfall.csv")
    .rename(columns=str.strip)
    .assign(rainfall=lambda frame: frame["rainfall"].str.lower().map({'yes': 1, 'no': 0}))
)

# Stack the original rows on top of the competition rows (without `id`),
# rebuilding a fresh 0..n-1 index.
train = pd.concat(
    [orig, train.drop(columns=['id'])],
    axis=0,
    ignore_index=True,
)

# Impute missing values with each frame's own column means.
train = train.fillna(train.mean())
test = test.fillna(test.mean())

print("Train shape", train.shape)
train.head()

Train shape (2556, 12)


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72.0,49.0,1,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81.0,83.0,1,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95.0,91.0,1,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90.0,88.0,1,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95.0,81.0,1,0.0,40.0,13.7


In [5]:
# Columns that must never be fed to the model: the target and the row id.
RMV = ['rainfall', 'id']

# Every remaining training column, in its original order, is a feature.
FEATURES = [col for col in train.columns if col not in RMV]

print("Our features are:", FEATURES, sep="\n")

Our features are:
['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']


# TabPFN Model
We train a 5-fold TabPFN classification model. Within each fold, we standardize all features to mean=0, std=1 using the statistics of that fold's training rows.

In [6]:
from sklearn.model_selection import KFold
from tabpfn import TabPFNClassifier

In [7]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=777)
    
oof_pfn = np.zeros(len(train))
pred_pfn = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = train.loc[train_index,"rainfall"]    
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = train.loc[test_index,"rainfall"]
    x_test = test[FEATURES].copy()

    for c in FEATURES:
        m = x_train[c].mean()
        s = x_train[c].std()
        x_train[c] = (x_train[c]-m)/s
        x_valid[c] = (x_valid[c]-m)/s
        x_test[c] = (x_test[c]-m)/s
        x_test[c] = x_test[c].fillna(0)
        x_train[c] = x_train[c].fillna(0)

    model = TabPFNClassifier(device = "cuda", random_state = 12)
    model.fit(x_train.values, y_train.values)

    # INFER OOF
    oof_pfn[test_index] = model.predict_proba(x_valid.values)[:,1]
    # INFER TEST
    pred_pfn += model.predict_proba(x_test.values)[:,1]

# COMPUTE AVERAGE TEST PREDS
pred_pfn /= FOLDS

#########################
### Fold 1
#########################


  model, _, config_ = load_model_criterion_config(


tabpfn-v2-classifier.ckpt:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

#########################
### Fold 2
#########################
#########################
### Fold 3
#########################
#########################
### Fold 4
#########################
#########################
### Fold 5
#########################
CPU times: user 9.4 s, sys: 843 ms, total: 10.2 s
Wall time: 12 s


In [8]:
from sklearn.metrics import roc_auc_score

# Score the out-of-fold probabilities against the training labels.
true = train["rainfall"].to_numpy()
m = roc_auc_score(y_true=true, y_score=oof_pfn)
print(f"TabPFN CV Score AUC = {m:.3f}")

TabPFN CV Score AUC = 0.892


# Submission CSV Ensemble!
We load the submission of the best public notebook (from version 1 of that notebook), which achieves `LB 0.954` (from [here][1]). Then we ensemble our new TabPFN model predictions with weights `-0.065 * PFN + 1.065 * Public`. We use `scipy.stats.rankdata` to normalize predictions before ensembling.

[1]: https://www.kaggle.com/code/act18l/lb-probing

In [9]:
print("Best Public Notebook achieves LB = 0.954!")

# Read the public submission, preview it, then keep only its prediction
# column as a plain array for the ensemble step.
best_public_df = pd.read_csv("/kaggle/input/lb-915-public-notebook/submission95427.csv")
display(best_public_df.head())
best_public = best_public_df["rainfall"].to_numpy()

Best Public Notebook achieves LB = 0.954!


Unnamed: 0,id,rainfall
0,2190,2.0
1,2191,2.0
2,2192,2.0
3,2193,0.084932
4,2194,0.019863


In [10]:
from scipy.stats import rankdata

# Blend weights applied to the rank-transformed predictions. They sum to 1;
# the TabPFN weight is negative.
W_PFN = -0.065
W_PUBLIC = 1.065

print("Ensemble achieves LB = 0.956! Hooray!")
sub = pd.read_csv("/kaggle/input/playground-series-s5e3/sample_submission.csv")

# Fail early with a clear message if the prediction vectors do not line up
# with the submission template.
# NOTE(review): this only checks lengths; it assumes both vectors are in the
# same row order as sample_submission.csv - confirm.
assert len(pred_pfn) == len(best_public) == len(sub), (
    "pred_pfn, best_public and sample_submission must have the same length"
)

# Rank-transform each model's predictions so they are on a common scale,
# blend them, then rank again and divide by the row count to map the blend
# into (0, 1].
blend = W_PFN * rankdata(pred_pfn) + W_PUBLIC * rankdata(best_public)
sub["rainfall"] = rankdata(blend) / len(sub)

print(sub.shape)
sub.to_csv("submission_ensemble.csv", index=False)
sub.head()

Ensemble achieves LB = 0.956! Hooray!
(730, 2)


Unnamed: 0,id,rainfall
0,2190,0.935616
1,2191,0.934247
2,2192,0.986301
3,2193,0.094521
4,2194,0.021918
