# Ispitivanje efikasnosti TabPFN modela na tipičnim tabelarnim dataset-ovima (klasifikacija)

# Osnovno pretprocesiranje Adult Income dataset-a

1. Rukovanje nedostajućim vrijednostima

In [1]:
import pandas as pd

# Load the Adult Income data from the CSV file.
df = pd.read_csv('adult.csv')

# Report how many '?' placeholders each column holds before cleaning.
print("Broj '?' po kolonama:")
print(df.eq('?').sum())

# Columns in which '?' stands in for a missing value.
missing_value_cols = ['workclass', 'occupation', 'native.country']

# Swap the '?' placeholder for an explicit 'Unknown' category in all of
# these columns at once.
df[missing_value_cols] = df[missing_value_cols].replace('?', 'Unknown')

# Confirm that no '?' placeholders are left anywhere in the frame.
print("\nPreostale '?' vrijednosti:")
print(df.eq('?').sum())


Broj '?' po kolonama:
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

Preostale '?' vrijednosti:
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64


2. Enkodiranje kategorijskih atributa

In [2]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns (the target variable is not included here).
categorical_cols = [
    'workclass', 'marital.status', 'occupation', 'relationship',
    'race', 'sex', 'native.country',
]

# Fit one LabelEncoder per column and keep it, so the integer codes can be
# decoded back to the original labels later if needed.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Overwrite each categorical column with its integer codes.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# Sanity check: preview the encoded columns.
df[categorical_cols].head()



Unnamed: 0,workclass,marital.status,occupation,relationship,race,sex,native.country
0,7,6,14,1,4,0,38
1,3,6,3,1,4,0,38
2,7,6,14,4,2,0,38
3,3,0,6,4,4,0,38
4,3,5,9,3,4,0,38


3. Uklanjanje atributa

In [3]:
# Remove the 'education' and 'fnlwgt' columns from the frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op instead of
# raising KeyError for columns that are already gone.
df = df.drop(columns=['education', 'fnlwgt'], errors='ignore')
print(df.columns)

Index(['age', 'workclass', 'education.num', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
       'hours.per.week', 'native.country', 'income'],
      dtype='object')


4. Enkodiranje ciljne varijable *income*



In [4]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column: fit a dedicated LabelEncoder on 'income' and
# replace the column with the resulting integer codes.
le_target = LabelEncoder().fit(df['income'])
df['income'] = le_target.transform(df['income'])

# Show which integer code each original class label received.
print("Mapiranje klasnih oznaka:")
for original in le_target.classes_:
    encoded = le_target.transform([original])[0]
    print(f"{original} → {encoded}")


Mapiranje klasnih oznaka:
<=50K → 0
>50K → 1


# Ispitivanje efikasnosti TabPFN modela

---



Instalacija potrebnih biblioteka za TabPFN model

In [5]:
!pip install tabpfn torch --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Import potrebnih biblioteka

In [6]:
from sklearn.model_selection import train_test_split
import torch
from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Ispitivanje na podskupu od 10 000 instanci

Priprema podataka

In [7]:
# Draw a stratified 10,000-row sample from the full frame; the remainder of
# the split is discarded.
df_10000, _ = train_test_split(
    df, train_size=10000, stratify=df['income'], random_state=10000
)

# Separate the target from the features.
y_10000 = df_10000['income']
X_10000 = df_10000.drop(columns='income')

# Convert to NumPy arrays with explicit dtypes (float32 features, int64 labels).
X_10000_np = X_10000.to_numpy().astype('float32')
y_10000_np = y_10000.to_numpy().astype('int64')

# Stratified 80/20 train/test split of the sample.
X_train_10000, X_test_10000, y_train_10000, y_test_10000 = train_test_split(
    X_10000_np, y_10000_np, test_size=0.2, stratify=y_10000_np, random_state=42
)

Treniranje i evaluacija modela

In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Koristi se uređaj: {device}")

# 1. Build the TabPFN classifier (pretrained, per the original note) on the
#    chosen device and fit it on the training split of the 10k sample.
model = TabPFNClassifier(device=device)
model.fit(X_train_10000, y_train_10000)

# 2. Predict labels for the held-out test split.
y_pred_10000 = model.predict(X_test_10000)

# 3. Overall accuracy on the test split.
accuracy = accuracy_score(y_test_10000, y_pred_10000)
print(f"\n Tačnost (accuracy) modela na test skupu (10k uzorak): {accuracy:.4f}\n")

# Per-class precision / recall / F1 report.
print(
    "Klasifikacioni izvještaj:\n",
    classification_report(y_test_10000, y_pred_10000, target_names=['<=50K', '>50K']),
    sep='\n',
)

Koristi se uređaj: cuda


  model, _, config_ = load_model_criterion_config(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tabpfn-v2-classifier.ckpt:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]


 Tačnost (accuracy) modela na test skupu (10k uzorak): 0.8625

Klasifikacioni izvještaj:

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      1518
        >50K       0.77      0.61      0.68       482

    accuracy                           0.86      2000
   macro avg       0.83      0.78      0.80      2000
weighted avg       0.86      0.86      0.86      2000



Primjena RandomForest modela nad istim uzorkom radi poređenja

In [9]:
# Build a Random Forest with a fixed seed and fit it on the same training
# split (80% of the 10,000-row sample) that the TabPFN model used.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_10000, y_train_10000)

# Predict labels for the held-out test split.
y_pred_rf = rf_model.predict(X_test_10000)

# Overall accuracy followed by the per-class report.
accuracy_rf = accuracy_score(y_test_10000, y_pred_rf)
print(f"\nTačnost (accuracy) Random Forest modela na test skupu (10.000 uzorak): {accuracy_rf:.4f}\n")
print(
    "Klasifikacioni izvještaj za Random Forest:\n",
    classification_report(y_test_10000, y_pred_rf, target_names=['<=50K', '>50K']),
    sep='\n',
)


Tačnost (accuracy) Random Forest modela na test skupu (10.000 uzorak): 0.8485

Klasifikacioni izvještaj za Random Forest:

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      1518
        >50K       0.73      0.59      0.65       482

    accuracy                           0.85      2000
   macro avg       0.80      0.76      0.78      2000
weighted avg       0.84      0.85      0.84      2000



# Ispitivanje na podskupu od 5 000 instanci

Priprema podataka

In [10]:
# Draw a stratified 5,000-row sample from the full frame; the remainder of
# the split is discarded.
df_5000, _ = train_test_split(
    df, train_size=5000, stratify=df['income'], random_state=10000
)

# Separate the target from the features.
y_5000 = df_5000['income']
X_5000 = df_5000.drop(columns='income')

# Convert to NumPy arrays with explicit dtypes (float32 features, int64 labels).
X_5000_np = X_5000.to_numpy().astype('float32')
y_5000_np = y_5000.to_numpy().astype('int64')

# Stratified 80/20 train/test split of the sample.
X_train_5000, X_test_5000, y_train_5000, y_test_5000 = train_test_split(
    X_5000_np, y_5000_np, test_size=0.2, stratify=y_5000_np, random_state=42
)

Treniranje i evaluacija modela

In [11]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Koristi se uređaj: {device}")

# 1. Build the TabPFN classifier (pretrained, per the original note) on the
#    chosen device and fit it on the training split of the 5k sample.
model = TabPFNClassifier(device=device)
model.fit(X_train_5000, y_train_5000)

# 2. Predict labels for the held-out test split.
y_pred_5000 = model.predict(X_test_5000)

# 3. Overall accuracy on the test split.
accuracy = accuracy_score(y_test_5000, y_pred_5000)
print(f"\n Tačnost (accuracy) modela na test skupu (5k uzorak): {accuracy:.4f}\n")

# Per-class precision / recall / F1 report.
print(
    "Klasifikacioni izvještaj:\n",
    classification_report(y_test_5000, y_pred_5000, target_names=['<=50K', '>50K']),
    sep='\n',
)

Koristi se uređaj: cuda

 Tačnost (accuracy) modela na test skupu (5k uzorak): 0.8530

Klasifikacioni izvještaj:

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91       759
        >50K       0.74      0.61      0.67       241

    accuracy                           0.85      1000
   macro avg       0.81      0.77      0.79      1000
weighted avg       0.85      0.85      0.85      1000



Primjena RandomForest modela nad istim uzorkom radi poređenja

In [12]:
# Build a Random Forest with a fixed seed and fit it on the same training
# split (80% of the 5,000-row sample) that the TabPFN model used.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_5000, y_train_5000)

# Predict labels for the held-out test split.
y_pred_rf = rf_model.predict(X_test_5000)

# Overall accuracy followed by the per-class report.
accuracy_rf = accuracy_score(y_test_5000, y_pred_rf)
print(f"\nTačnost (accuracy) Random Forest modela na test skupu (5.000 uzorak): {accuracy_rf:.4f}\n")
print(
    "Klasifikacioni izvještaj za Random Forest:\n",
    classification_report(y_test_5000, y_pred_rf, target_names=['<=50K', '>50K']),
    sep='\n',
)


Tačnost (accuracy) Random Forest modela na test skupu (5.000 uzorak): 0.8330

Klasifikacioni izvještaj za Random Forest:

              precision    recall  f1-score   support

       <=50K       0.86      0.93      0.89       759
        >50K       0.70      0.53      0.61       241

    accuracy                           0.83      1000
   macro avg       0.78      0.73      0.75      1000
weighted avg       0.82      0.83      0.82      1000



# Ispitivanje na podskupu od 500 instanci

Priprema podataka

In [13]:
# Draw a stratified 500-row sample from the full frame; the remainder of
# the split is discarded.
df_500, _ = train_test_split(
    df, train_size=500, stratify=df['income'], random_state=10000
)

# Separate the target from the features.
y_500 = df_500['income']
X_500 = df_500.drop(columns='income')

# Convert to NumPy arrays with explicit dtypes (float32 features, int64 labels).
X_500_np = X_500.to_numpy().astype('float32')
y_500_np = y_500.to_numpy().astype('int64')

# Stratified 80/20 train/test split of the sample.
X_train_500, X_test_500, y_train_500, y_test_500 = train_test_split(
    X_500_np, y_500_np, test_size=0.2, stratify=y_500_np, random_state=42
)


Treniranje i evaluacija modela

In [14]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Koristi se uređaj: {device}")

# 1. Build the TabPFN classifier (pretrained, per the original note) on the
#    chosen device and fit it on the training split of the 500-row sample.
model = TabPFNClassifier(device=device)
model.fit(X_train_500, y_train_500)

# 2. Predict labels for the held-out test split.
y_pred_500 = model.predict(X_test_500)

# 3. Overall accuracy on the test split.
accuracy = accuracy_score(y_test_500, y_pred_500)
print(f"\n Tačnost (accuracy) modela na test skupu (500 uzorak): {accuracy:.4f}\n")

# Per-class precision / recall / F1 report.
print(
    "Klasifikacioni izvještaj:\n",
    classification_report(y_test_500, y_pred_500, target_names=['<=50K', '>50K']),
    sep='\n',
)

Koristi se uređaj: cuda

 Tačnost (accuracy) modela na test skupu (500 uzorak): 0.8800

Klasifikacioni izvještaj:

              precision    recall  f1-score   support

       <=50K       0.89      0.96      0.92        76
        >50K       0.83      0.62      0.71        24

    accuracy                           0.88       100
   macro avg       0.86      0.79      0.82       100
weighted avg       0.88      0.88      0.87       100



Primjena RandomForest modela nad istim uzorkom radi poređenja

In [15]:
# Build a Random Forest with a fixed seed and fit it on the same training
# split (80% of the 500-row sample) that the TabPFN model used.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_500, y_train_500)

# Predict labels for the held-out test split.
y_pred_rf = rf_model.predict(X_test_500)

# Overall accuracy followed by the per-class report.
accuracy_rf = accuracy_score(y_test_500, y_pred_rf)
print(f"\nTačnost (accuracy) Random Forest modela na test skupu (500 uzorak): {accuracy_rf:.4f}\n")
print(
    "Klasifikacioni izvještaj za Random Forest:\n",
    classification_report(y_test_500, y_pred_rf, target_names=['<=50K', '>50K']),
    sep='\n',
)


Tačnost (accuracy) Random Forest modela na test skupu (500 uzorak): 0.9000

Klasifikacioni izvještaj za Random Forest:

              precision    recall  f1-score   support

       <=50K       0.90      0.97      0.94        76
        >50K       0.89      0.67      0.76        24

    accuracy                           0.90       100
   macro avg       0.90      0.82      0.85       100
weighted avg       0.90      0.90      0.89       100

