In [None]:
!pip install gpytorch
!pip install ConfigSpace
!pip install openml

In [None]:
# NOTE(review): autogluon is not installed by any install cell in this
# notebook and is not referenced in the visible cells — confirm this
# import is still needed.
import autogluon

In [None]:
!pip install tabpfn[full]

In [None]:
!pip install scikit-learn==1.1.3


In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
# Sanity-check the PyTorch install before building a GPU-backed classifier.
import torch
print(torch.version.cuda)  # CUDA version the installed PyTorch build supports
print(torch.cuda.is_available())  # whether a GPU is usable

In [None]:
import os
import sys
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
# import openml
from matplotlib.colors import ListedColormap

from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier

# tabpfn_path = 'TabPFN'
# sys.path.insert(0, tabpfn_path)
# from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
# from tabpfn.scripts.decision_boundary import DecisionBoundaryDisplay

In [None]:
# Load the wine data and split it; test_size=0.8 means only 20% of the
# rows are used for fitting and 80% are held out for evaluation.
wine = load_wine()
X, y = wine.data, wine.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
)

In [None]:
# N_ensemble_configurations defines how many estimators are averaged; it is
# bounded by #features * #classes. More ensemble members are slower, but
# more accurate.
# Use the GPU when PyTorch can see one, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
classifier = TabPFNClassifier(device=device, N_ensemble_configurations=4)

In [None]:
# Time fitting and prediction together, then report accuracy on the test split.
start = time.time()
classifier.fit(X_train, y_train)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
elapsed = time.time() - start
test_accuracy = accuracy_score(y_test, y_eval)
print('Prediction time: ', elapsed, 'Accuracy', test_accuracy)

In [None]:
!pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with UCI repository id 2 (bound to `adult`).
adult = fetch_ucirepo(id=2)

# Features and targets come back as pandas dataframes.
X = adult.data.features
y = adult.data.targets

# Join features and target side by side, drop rows with any missing value,
# and strip the trailing period from income labels ("K." -> "K") so the
# two spellings of each class collapse into one.
df = (
    pd.concat([X, y], axis=1)
    .dropna()
    .assign(income=lambda frame: frame['income'].str.replace(r'K\.', 'K', regex=True))
)

# def stratified_sample(df, stratify_col, n_per_class, random_state=None, replace=False):
#     return df.groupby(stratify_col).sample(n=n_per_class, replace=replace, random_state=random_state)

# # 함수 사용 예
# df = stratified_sample(df, stratify_col='income', n_per_class=400, random_state=1234)


# # df = df.dropna()

# # metadata 
# print(adult.metadata) 
  
# # variable information 
# print(adult.variables) 


In [None]:
# Display the cleaned frame (features plus the `income` target).
df

In [None]:
# Count rows per income label to check class balance after cleaning.
df.income.value_counts()

In [None]:
# Columns used as model inputs; `income` is kept separately as the target.
feature_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]
X = df[feature_columns]
# Double brackets keep the target as a single-column DataFrame.
y = df[['income']]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Shared encoder instance; a later cell refits it on the categorical features.
le = LabelEncoder()

# Encode the target with its own encoder so the income <-> integer mapping
# is not overwritten when `le` is refit on feature columns.
# LabelEncoder expects a 1-D input, so flatten the single-column frame.
target_encoder = LabelEncoder()
y_label = target_encoder.fit_transform(y.astype(str).to_numpy().ravel())

print(y_label)


In [None]:
# Summarize the target frame (row count, dtype, non-null count).
y.info()

In [None]:
# String-valued columns that get an integer-coded `<name>_cat` companion.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Build a new frame with `assign` rather than writing columns into `X`
# (a slice of `df`), which avoids chained-assignment warnings and keeps the
# cell safe to re-run. Each column gets a fresh encoder so no fitted state
# is shared between columns or with the target encoding.
X = X.assign(**{
    f'{col}_cat': LabelEncoder().fit_transform(X[col].astype(str))
    for col in categorical_columns
})

In [None]:
# Display the feature frame, now including the `*_cat` encoded columns.
X

In [None]:
# Model inputs: columns taken as-is plus the integer-encoded `*_cat` columns
# (the original string columns and `education-num` are left out).
raw_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
encoded_columns = [
    'workclass_cat', 'education_cat', 'marital-status_cat',
    'occupation_cat', 'relationship_cat', 'race_cat', 'sex_cat',
    'native-country_cat',
]
X_cat = X[raw_columns + encoded_columns]

In [None]:
# Preview the first rows of the final model-input frame.
X_cat.head()

In [None]:
# Hold out 20% of the encoded rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_cat,
    y_label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Upper bound on ensemble members: number of features times number of
# distinct classes present in the training labels.
n_features = X_train.shape[1]
n_classes = len(set(y_train))
max_configurations = n_features * n_classes
print("최대 N_ensemble_configurations:", max_configurations)

In [None]:
# N_ensemble_configurations defines how many estimators are averaged; it is
# bounded by #features * #classes. More ensemble members are slower, but
# more accurate.
# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Request 32 members at most, but never more than the #features * #classes
# bound for the current training split.
n_ensemble = min(32, X_train.shape[1] * len(set(y_train)))
classifier = TabPFNClassifier(device=device, N_ensemble_configurations=n_ensemble)

In [None]:
# TabPFN 0.x refuses to fit on more than 1024 training rows unless
# `overwrite_warning=True` is passed, and its memory use grows quadratically
# with training size. Fit on a stratified subsample when the split is larger.
max_train_rows = 1024
if len(X_train) > max_train_rows:
    X_fit, _, y_fit, _ = train_test_split(
        X_train, y_train,
        train_size=max_train_rows,
        stratify=y_train,  # keep the class ratio of the full training split
        random_state=42,
    )
else:
    X_fit, y_fit = X_train, y_train

# Time fitting and prediction together, then report accuracy on the test split.
start = time.time()
classifier.fit(X_fit, y_fit)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
print('Prediction time: ', time.time() - start, 'Accuracy', accuracy_score(y_test, y_eval))

성능 개선 방법
- N_ensemble_configurations 바꾸기 (특성 * class 종류 개수)
- train/test 비율 8:2
- class 불균형 맞추기
- 여러번 샘플링 해서 결과를 투표로 결정


In [None]:
from collections import Counter

# Number of resampling rounds; each round fits on a different training sample.
n_iterations = 10
# TabPFN 0.x refuses to fit on more than 1024 rows unless
# `overwrite_warning=True` is passed, so cap every round's sample at that size.
max_train_rows = 1024
test_predictions = []  # one array of test-set predictions per round

for i in range(n_iterations):
    if len(X_train) > max_train_rows:
        # Large training split: draw a capped, class-stratified sample.
        sample_kwargs = dict(train_size=max_train_rows, stratify=y_train)
    else:
        # Small training split: keep a random 80% of it.
        sample_kwargs = dict(test_size=0.2)

    # A different seed per round gives a different training sample each time.
    X_train_sample, _, y_train_sample, _ = train_test_split(
        X_train, y_train, random_state=i, **sample_kwargs
    )

    # Fit on this round's sample and record predictions for the full test set.
    classifier.fit(X_train_sample, y_train_sample)
    test_predictions.append(classifier.predict(X_test))

# Majority vote per test row: zip(*...) groups the rounds' predictions by row.
# On a tie, Counter.most_common returns the label seen first, i.e. the one
# predicted in the earliest round.
final_predictions = [
    Counter(votes).most_common(1)[0][0]
    for votes in zip(*test_predictions)
]

# Score the voted predictions against the held-out labels.
accuracy = accuracy_score(y_test, final_predictions)
print(f"최종 다수결 예측 정확도: {accuracy}")