In [1]:
!pip install openml torch scikit-learn

Collecting openml
  Downloading openml-0.14.2.tar.gz (144 kB)
     ---------------------------------------- 0.0/144.5 kB ? eta -:--:--
     -------- ------------------------------- 30.7/144.5 kB ? eta -:--:--
     ----------------------------- -------- 112.6/144.5 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 144.5/144.5 kB 1.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting liac-arff>=2.4.0 (from openml)
  Using cached liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting xmltodict (from openml)
  Obtaining dependency information for xmltodict from https://files.pythonho

In [1]:
import torch
import random
import pathlib

from torch.utils.checkpoint import checkpoint

from tabpfn.utils import normalize_data, to_ranking_low_mem, remove_outliers
from tabpfn.utils import NOP, normalize_by_used_features_f

from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils import column_or_1d
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from tabpfn.scripts.model_builder import load_model, load_model_only_inference
import os
import pickle
import io

class CustomUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if name == 'Manager':
            from settings import Manager
            return Manager
        try:
            return self.find_class_cpu(module, name)
        except:
            return None

    def find_class_cpu(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)

def load_model_workflow(i, e, add_name, base_path, device='cpu', eval_addition='', only_inference=True):
    """
    Workflow for loading a model and setting appropriate parameters for diffable hparam tuning.

    :param i:
    :param e:
    :param eval_positions_valid:
    :param add_name:
    :param base_path:
    :param device:
    :param eval_addition:
    :return:
    """
    def get_file(e):
        """
        Returns the different paths of model_file, model_path and results_file
        """
        model_file = f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{e}.cpkt'
        model_path = os.path.join(base_path, model_file)
        # print('Evaluate ', model_path)
        results_file = os.path.join(base_path,
                                    f'models_diff/prior_diff_real_results{add_name}_n_{i}_epoch_{e}_{eval_addition}.pkl')
        return model_file, model_path, results_file

    def check_file(e):
        model_file, model_path, results_file = get_file(e)
        if not Path(model_path).is_file():  # or Path(results_file).is_file():
            print('We have to download the TabPFN, as there is no checkpoint at ', model_path)
            print('It has about 100MB, so this might take a moment.')
            import requests
            url = 'https://github.com/automl/TabPFN/raw/main/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_42.cpkt'
            r = requests.get(url, allow_redirects=True)
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            open(model_path, 'wb').write(r.content)
        return model_file, model_path, results_file

    model_file = None
    if e == -1:
        for e_ in [42] + list(range(100, -1, -1)):
            model_file_, model_path_, results_file_ = check_file(e_)
            if model_file_ is not None:
                e = e_
                model_file, model_path, results_file = model_file_, model_path_, results_file_
                break
    else:
        model_file, model_path, results_file = check_file(e)

    if model_file is None:
        model_file, model_path, results_file = get_file(e)
        raise Exception('No checkpoint found at '+str(model_path))


    #print(f'Loading {model_file}')
    if only_inference:
        # print('Loading model that can be used for inference only')
        model, c = load_model_only_inference(base_path, model_file, device)
    else:
        #until now also only capable of inference
        model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)
    #model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)

    return model, c, results_file


In [2]:
class TabPFNClassifier(BaseEstimator, ClassifierMixin):

    models_in_memory = {}

    def __init__(self, device='cpu', base_path= None, model_string='',
                 N_ensemble_configurations=3, no_preprocess_mode=False, multiclass_decoder='permutation',
                 feature_shift_decoder=True, only_inference=True, seed=0, no_grad=True, batch_size_inference=32,
                 subsample_features=False):
        """
        Initializes the classifier and loads the model. 
        Depending on the arguments, the model is either loaded from memory, from a file, or downloaded from the 
        repository if no model is found.
        
        Can also be used to compute gradients with respect to the inputs X_train and X_test. Therefore no_grad has to be 
        set to False and no_preprocessing_mode must be True. Furthermore, X_train and X_test need to be given as 
        torch.Tensors and their requires_grad parameter must be set to True.
        
        
        :param device: If the model should run on cuda or cpu.
        :param base_path: Base path of the directory, from which the folders like models_diff can be accessed.
        :param model_string: Name of the model. Used first to check if the model is already in memory, and if not, 
               tries to load a model with that name from the models_diff directory. It looks for files named as 
               follows: "prior_diff_real_checkpoint" + model_string + "_n_0_epoch_e.cpkt", where e can be a number 
               between 100 and 0, and is checked in a descending order. 
        :param N_ensemble_configurations: The number of ensemble configurations used for the prediction. Thereby the 
               accuracy, but also the running time, increases with this number. 
        :param no_preprocess_mode: Specifies whether preprocessing is to be performed.
        :param multiclass_decoder: If set to permutation, randomly shifts the classes for each ensemble configuration. 
        :param feature_shift_decoder: If set to true shifts the features for each ensemble configuration according to a 
               random permutation.
        :param only_inference: Indicates if the model should be loaded to only restore inference capabilities or also 
               training capabilities. Note that the training capabilities are currently not being fully restored.
        :param seed: Seed that is used for the prediction. Allows for a deterministic behavior of the predictions.
        :param batch_size_inference: This parameter is a trade-off between performance and memory consumption.
               The computation done with different values for batch_size_inference is the same,
               but it is split into smaller/larger batches.
        :param no_grad: If set to false, allows for the computation of gradients with respect to X_train and X_test. 
               For this to correctly function no_preprocessing_mode must be set to true.
        :param subsample_features: If set to true and the number of features in the dataset exceeds self.max_features (100),
                the features are subsampled to self.max_features.
        """

        # Model file specification (Model name, Epoch)
        i = 0
        model_key = model_string+'|'+str(device)
        if model_key in self.models_in_memory:
            model, c, results_file = self.models_in_memory[model_key]
        else:
            model, c, results_file = load_model_workflow(i, -1, add_name=model_string, base_path=base_path, device=device,
                                                         eval_addition='', only_inference=only_inference)
            self.models_in_memory[model_key] = (model, c, results_file)
            if len(self.models_in_memory) == 2:
                print('Multiple models in memory. This might lead to memory issues. Consider calling remove_models_from_memory()')
        #style, temperature = self.load_result_minimal(style_file, i, e)

        self.device = device
        self.model = model
        self.c = c
        self.style = None
        self.temperature = None
        self.N_ensemble_configurations = N_ensemble_configurations
        self.base__path = base_path
        self.base_path = base_path
        self.i = i
        self.model_string = model_string

        self.max_num_features = self.c['num_features']
        self.max_num_classes = self.c['max_num_classes']
        self.differentiable_hps_as_style = self.c['differentiable_hps_as_style']

        self.no_preprocess_mode = no_preprocess_mode
        self.feature_shift_decoder = feature_shift_decoder
        self.multiclass_decoder = multiclass_decoder
        self.only_inference = only_inference
        self.seed = seed
        self.no_grad = no_grad
        self.subsample_features = subsample_features

        assert self.no_preprocess_mode if not self.no_grad else True, \
            "If no_grad is false, no_preprocess_mode must be true, because otherwise no gradient can be computed."

        self.batch_size_inference = batch_size_inference

    def remove_models_from_memory(self):
        self.models_in_memory = {}

    def load_result_minimal(self, path, i, e):
        with open(path, 'rb') as output:
            _, _, _, style, temperature, optimization_route = CustomUnpickler(output).load()

            return style, temperature

    def _validate_targets(self, y):
        y_ = column_or_1d(y, warn=True)
        check_classification_targets(y)
        cls, y = np.unique(y_, return_inverse=True)
        if len(cls) < 2:
            raise ValueError(
                "The number of classes has to be greater than one; got %d class"
                % len(cls)
            )

        self.classes_ = cls

        return np.asarray(y, dtype=np.float64, order="C")

    def fit(self, X, y, overwrite_warning=False):
        """
        Validates the training set and stores it.

        If clf.no_grad (default is True):
        X, y should be of type np.array
        else:
        X should be of type torch.Tensors (y can be np.array or torch.Tensor)
        """
        if self.no_grad:
            # Check that X and y have correct shape
            X, y = check_X_y(X, y, force_all_finite=False)
        # Store the classes seen during fit
        y = self._validate_targets(y)
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)

        self.X_ = X
        self.y_ = y

        if (X.shape[1] > self.max_num_features):
            if self.subsample_features:
                print('WARNING: The number of features for this classifier is restricted to ', self.max_num_features, ' and will be subsampled.')
            else:
                raise ValueError("The number of features for this classifier is restricted to ", self.max_num_features)
        if len(np.unique(y)) > self.max_num_classes:
            raise ValueError("The number of classes for this classifier is restricted to ", self.max_num_classes)
        if X.shape[0] > 1024 and not overwrite_warning:
            raise ValueError("⚠️ WARNING: TabPFN is not made for datasets with a trainingsize > 1024. Prediction might take a while, be less reliable. We advise not to run datasets > 10k samples, which might lead to your machine crashing (due to quadratic memory scaling of TabPFN). Please confirm you want to run by passing overwrite_warning=True to the fit function.")


        # Return the classifier
        return self

    def predict_proba(self, X, normalize_with_test=False, return_logits=False):
        """
        Predict the probabilities for the input X depending on the training set previously passed in the method fit.

        If no_grad is true in the classifier the function takes X as a numpy.ndarray. If no_grad is false X must be a
        torch tensor and is not fully checked.
        """
        # Check is fit had been called
        check_is_fitted(self)

        # Input validation
        if self.no_grad:
            X = check_array(X, force_all_finite=False)
            X_full = np.concatenate([self.X_, X], axis=0)
            X_full = torch.tensor(X_full, device=self.device).float().unsqueeze(1)
        else:
            assert (torch.is_tensor(self.X_) & torch.is_tensor(X)), "If no_grad is false, this function expects X as " \
                                                                    "a tensor to calculate a gradient"
            X_full = torch.cat((self.X_, X), dim=0).float().unsqueeze(1).to(self.device)

            if int(torch.isnan(X_full).sum()):
                print('X contains nans and the gradient implementation is not designed to handel nans.')

        y_full = np.concatenate([self.y_, np.zeros(shape=X.shape[0])], axis=0)
        y_full = torch.tensor(y_full, device=self.device).float().unsqueeze(1)

        eval_pos = self.X_.shape[0]

        prediction = transformer_predict(self.model[2], X_full, y_full, eval_pos,
                                         device=self.device,
                                         style=self.style,
                                         inference_mode=True,
                                         preprocess_transform='none' if self.no_preprocess_mode else 'mix',
                                         normalize_with_test=normalize_with_test,
                                         N_ensemble_configurations=self.N_ensemble_configurations,
                                         softmax_temperature=self.temperature,
                                         multiclass_decoder=self.multiclass_decoder,
                                         feature_shift_decoder=self.feature_shift_decoder,
                                         differentiable_hps_as_style=self.differentiable_hps_as_style,
                                         seed=self.seed,
                                         return_logits=return_logits,
                                         no_grad=self.no_grad,
                                         batch_size_inference=self.batch_size_inference,
                                         **get_params_from_config(self.c))
        prediction_, y_ = prediction.squeeze(0), y_full.squeeze(1).long()[eval_pos:]

        return prediction_.detach().cpu().numpy() if self.no_grad else prediction_

    def predict(self, X, return_winning_probability=False, normalize_with_test=False):
        p = self.predict_proba(X, normalize_with_test=normalize_with_test)
        y = np.argmax(p, axis=-1)
        y = self.classes_.take(np.asarray(y, dtype=np.intp))
        if return_winning_probability:
            return y, p.max(axis=-1)
        return y

import time
def transformer_predict(model, eval_xs, eval_ys, eval_position,
                        device='cpu',
                        max_features=100,
                        style=None,
                        inference_mode=False,
                        num_classes=2,
                        extend_features=True,
                        normalize_with_test=False,
                        normalize_to_ranking=False,
                        softmax_temperature=0.0,
                        multiclass_decoder='permutation',
                        preprocess_transform='mix',
                        categorical_feats=[],
                        feature_shift_decoder=False,
                        N_ensemble_configurations=10,
                        batch_size_inference=16,
                        differentiable_hps_as_style=False,
                        average_logits=True,
                        fp16_inference=False,
                        normalize_with_sqrt=False,
                        seed=0,
                        no_grad=True,
                        return_logits=False,
                        **kwargs):
    """

    :param model:
    :param eval_xs:
    :param eval_ys:
    :param eval_position:
    :param rescale_features:
    :param device:
    :param max_features:
    :param style:
    :param inference_mode:
    :param num_classes:
    :param extend_features:
    :param normalize_to_ranking:
    :param softmax_temperature:
    :param multiclass_decoder:
    :param preprocess_transform:
    :param categorical_feats:
    :param feature_shift_decoder:
    :param N_ensemble_configurations:
    :param average_logits:
    :param normalize_with_sqrt:
    :param metric_used:
    :return:
    """
    num_classes = len(torch.unique(eval_ys))

    def predict(eval_xs, eval_ys, used_style, softmax_temperature, return_logits):
        # Initialize results array size S, B, Classes

        # no_grad disables inference_mode, because otherwise the gradients are lost
        inference_mode_call = torch.inference_mode() if inference_mode and no_grad else NOP()
        with inference_mode_call:
            start = time.time()
            output = model(
                    (used_style.repeat(eval_xs.shape[1], 1) if used_style is not None else None, eval_xs, eval_ys.float()),
                    single_eval_pos=eval_position)[:, :, 0:num_classes]

            output = output[:, :, 0:num_classes] / torch.exp(softmax_temperature)
            if not return_logits:
                output = torch.nn.functional.softmax(output, dim=-1)
            #else:
            #    output[:, :, 1] = model((style.repeat(eval_xs.shape[1], 1) if style is not None else None, eval_xs, eval_ys.float()),
            #               single_eval_pos=eval_position)

            #    output[:, :, 1] = torch.sigmoid(output[:, :, 1]).squeeze(-1)
            #    output[:, :, 0] = 1 - output[:, :, 1]

        #print('RESULTS', eval_ys.shape, torch.unique(eval_ys, return_counts=True), output.mean(axis=0))

        return output

    def preprocess_input(eval_xs, preprocess_transform):
        import warnings

        if eval_xs.shape[1] > 1:
            raise Exception("Transforms only allow one batch dim - TODO")

        if eval_xs.shape[2] > max_features:
            eval_xs = eval_xs[:, :, sorted(np.random.choice(eval_xs.shape[2], max_features, replace=False))]

        if preprocess_transform != 'none':
            if preprocess_transform == 'power' or preprocess_transform == 'power_all':
                pt = PowerTransformer(standardize=True)
            elif preprocess_transform == 'quantile' or preprocess_transform == 'quantile_all':
                pt = QuantileTransformer(output_distribution='normal')
            elif preprocess_transform == 'robust' or preprocess_transform == 'robust_all':
                pt = RobustScaler(unit_variance=True)

        # eval_xs, eval_ys = normalize_data(eval_xs), normalize_data(eval_ys)
        eval_xs = normalize_data(eval_xs, normalize_positions=-1 if normalize_with_test else eval_position)

        # Removing empty features
        eval_xs = eval_xs[:, 0, :]
        sel = [len(torch.unique(eval_xs[0:eval_ys.shape[0], col])) > 1 for col in range(eval_xs.shape[1])]
        eval_xs = eval_xs[:, sel]

        warnings.simplefilter('error')
        if preprocess_transform != 'none':
            eval_xs = eval_xs.cpu().numpy()
            feats = set(range(eval_xs.shape[1])) if 'all' in preprocess_transform else set(
                range(eval_xs.shape[1])) - set(categorical_feats)
            for col in feats:
                try:
                    pt.fit(eval_xs[0:eval_position, col:col + 1])
                    trans = pt.transform(eval_xs[:, col:col + 1])
                    # print(scipy.stats.spearmanr(trans[~np.isnan(eval_xs[:, col:col+1])], eval_xs[:, col:col+1][~np.isnan(eval_xs[:, col:col+1])]))
                    eval_xs[:, col:col + 1] = trans
                except:
                    pass
            eval_xs = torch.tensor(eval_xs).float()
        warnings.simplefilter('default')

        eval_xs = eval_xs.unsqueeze(1)

        # TODO: Caution there is information leakage when to_ranking is used, we should not use it
        eval_xs = remove_outliers(eval_xs, normalize_positions=-1 if normalize_with_test else eval_position) \
                if not normalize_to_ranking else normalize_data(to_ranking_low_mem(eval_xs))
        # Rescale X
        eval_xs = normalize_by_used_features_f(eval_xs, eval_xs.shape[-1], max_features,
                                               normalize_with_sqrt=normalize_with_sqrt)

        return eval_xs.to(device)

    eval_xs, eval_ys = eval_xs.to(device), eval_ys.to(device)
    eval_ys = eval_ys[:eval_position]

    model.to(device)

    model.eval()

    import itertools
    if not differentiable_hps_as_style:
        style = None

    if style is not None:
        style = style.to(device)
        style = style.unsqueeze(0) if len(style.shape) == 1 else style
        num_styles = style.shape[0]
        softmax_temperature = softmax_temperature if softmax_temperature.shape else softmax_temperature.unsqueeze(
            0).repeat(num_styles)
    else:
        num_styles = 1
        style = None
        softmax_temperature = torch.log(torch.tensor([0.8]))

    styles_configurations = range(0, num_styles)
    def get_preprocess(i):
        if i == 0:
            return 'power_all'
#            if i == 1:
#                return 'robust_all'
        if i == 1:
            return 'none'

    preprocess_transform_configurations = ['none', 'power_all'] if preprocess_transform == 'mix' else [preprocess_transform]

    if seed is not None:
        torch.manual_seed(seed)

    feature_shift_configurations = torch.randperm(eval_xs.shape[2]) if feature_shift_decoder else [0]
    class_shift_configurations = torch.randperm(len(torch.unique(eval_ys))) if multiclass_decoder == 'permutation' else [0]

    ensemble_configurations = list(itertools.product(class_shift_configurations, feature_shift_configurations))
    #default_ensemble_config = ensemble_configurations[0]

    rng = random.Random(seed)
    rng.shuffle(ensemble_configurations)
    ensemble_configurations = list(itertools.product(ensemble_configurations, preprocess_transform_configurations, styles_configurations))
    ensemble_configurations = ensemble_configurations[0:N_ensemble_configurations]
    #if N_ensemble_configurations == 1:
    #    ensemble_configurations = [default_ensemble_config]

    output = None

    eval_xs_transformed = {}
    inputs, labels = [], []
    start = time.time()
    for ensemble_configuration in ensemble_configurations:
        (class_shift_configuration, feature_shift_configuration), preprocess_transform_configuration, styles_configuration = ensemble_configuration

        style_ = style[styles_configuration:styles_configuration+1, :] if style is not None else style
        softmax_temperature_ = softmax_temperature[styles_configuration]

        eval_xs_, eval_ys_ = eval_xs.clone(), eval_ys.clone()

        if preprocess_transform_configuration in eval_xs_transformed:
            eval_xs_ = eval_xs_transformed[preprocess_transform_configuration].clone()
        else:
            eval_xs_ = preprocess_input(eval_xs_, preprocess_transform=preprocess_transform_configuration)
            if no_grad:
                eval_xs_ = eval_xs_.detach()
            eval_xs_transformed[preprocess_transform_configuration] = eval_xs_

        eval_ys_ = ((eval_ys_ + class_shift_configuration) % num_classes).float()

        eval_xs_ = torch.cat([eval_xs_[..., feature_shift_configuration:],eval_xs_[..., :feature_shift_configuration]],dim=-1)

        # Extend X
        if extend_features:
            eval_xs_ = torch.cat(
                [eval_xs_,
                 torch.zeros((eval_xs_.shape[0], eval_xs_.shape[1], max_features - eval_xs_.shape[2])).to(device)], -1)
        inputs += [eval_xs_]
        labels += [eval_ys_]

    inputs = torch.cat(inputs, 1)
    inputs = torch.split(inputs, batch_size_inference, dim=1)
    labels = torch.cat(labels, 1)
    labels = torch.split(labels, batch_size_inference, dim=1)
    #print('PREPROCESSING TIME', str(time.time() - start))
    outputs = []
    start = time.time()
    for batch_input, batch_label in zip(inputs, labels):
        #preprocess_transform_ = preprocess_transform if styles_configuration % 2 == 0 else 'none'
        import warnings
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    message="None of the inputs have requires_grad=True. Gradients will be None")
            warnings.filterwarnings("ignore",
                                    message="torch.cuda.amp.autocast only affects CUDA ops, but CUDA is not available.  Disabling.")
            if device == 'cpu':
                output_batch = checkpoint(predict, batch_input, batch_label, style_, softmax_temperature_, True)
            else:
                with torch.cuda.amp.autocast(enabled=fp16_inference):
                    output_batch = checkpoint(predict, batch_input, batch_label, style_, softmax_temperature_, True)
        outputs += [output_batch]
    #print('MODEL INFERENCE TIME ('+str(batch_input.device)+' vs '+device+', '+str(fp16_inference)+')', str(time.time()-start))

    outputs = torch.cat(outputs, 1)
    for i, ensemble_configuration in enumerate(ensemble_configurations):
        (class_shift_configuration, feature_shift_configuration), preprocess_transform_configuration, styles_configuration = ensemble_configuration
        output_ = outputs[:, i:i+1, :]
        output_ = torch.cat([output_[..., class_shift_configuration:],output_[..., :class_shift_configuration]],dim=-1)

        #output_ = predict(eval_xs, eval_ys, style_, preprocess_transform_)
        if not average_logits and not return_logits:
            # transforms every ensemble_configuration into a probability -> equal contribution of every configuration
            output_ = torch.nn.functional.softmax(output_, dim=-1)
        output = output_ if output is None else output + output_

    output = output / len(ensemble_configurations)
    if average_logits and not return_logits:
        if fp16_inference:
            output = output.float()
        output = torch.nn.functional.softmax(output, dim=-1)

    output = torch.transpose(output, 0, 1)

    return output

def get_params_from_config(c):
    return {'max_features': c['num_features']
        , 'rescale_features': c["normalize_by_used_features"]
        , 'normalize_to_ranking': c["normalize_to_ranking"]
        , 'normalize_with_sqrt': c.get("normalize_with_sqrt", False)
            }

In [10]:
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator, ClassifierMixin
from tabpfn import TabPFNClassifier as TabPFNModel  # TabPFNClassifier 임포트
import numpy as np

class FTTransformer(nn.Module):
    def __init__(self, num_features, dim=100, depth=4, heads=8, mlp_dim=256):
        super(FTTransformer, self).__init__()
        self.num_features = num_features
        self.dim = dim

        # Feature Tokenization
        self.feature_tokenizer = nn.Linear(1, dim)

        # Transformer Encoder
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=dim, 
                nhead=heads, 
                dim_feedforward=mlp_dim
            ), 
            num_layers=depth
        )

    def forward(self, x):
        x = x.unsqueeze(-1)  # Add dimension for tokenization
        tokens = self.feature_tokenizer(x)
        tokens = tokens.transpose(0, 1)  # (batch_size, num_features, dim) -> (num_features, batch_size, dim)
        tokens = self.transformer(tokens)
        tokens = tokens.mean(dim=0)  # Global average pooling
        return tokens

In [37]:
from sklearn.metrics import log_loss, accuracy_score

class FTTabPFNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, device='cpu', num_features=10, num_classes=2, dim=100, depth=4, heads=8, mlp_dim=256,
                 n_random_trials=30, n_opt_trials=1, batch_size=128, val_batch_size=256, 
                 early_stopping_rounds=20, epochs=100, logging_period=100, **kwargs):
        super().__init__()
        self.device = device
        self.max_features = 100  # TabPFN이 허용하는 최대 피처 수를 설정
        self.training_feature_count = None  # 학습 데이터의 피처 수를 저장

        # TabPFN 하이퍼파라미터 설정
        self.n_random_trials = n_random_trials
        self.n_opt_trials = n_opt_trials
        self.batch_size = batch_size
        self.val_batch_size = val_batch_size
        self.early_stopping_rounds = early_stopping_rounds
        self.epochs = epochs
        self.logging_period = logging_period

        # FTTransformer 초기화
        self.ft_transformer = FTTransformer(num_features, dim, depth, heads, mlp_dim)
        self.ft_transformer.to(self.device)

        # TabPFN 모델 초기화
        self.tabpfn = TabPFNModel(device=self.device)

        # 각 Trial의 로그로스와 정확도를 저장할 리스트
        self.trial_log_losses = []
        self.trial_accuracies = []

    def fit(self, X, y):
        self.ft_transformer.train()
        X, y = X.to(self.device), y.to(self.device)
        
        # FTTransformer 학습 (임베딩 추출)
        optimizer = torch.optim.Adam(self.ft_transformer.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(self.epochs):
            optimizer.zero_grad()
            embeddings = self.ft_transformer(X)
            loss = criterion(embeddings, y)
            loss.backward()
            optimizer.step()

            if (epoch + 1) % self.logging_period == 0:
                with torch.no_grad():
                    # 예측 수행
                    embeddings_eval = self.ft_transformer(X)
                    y_pred_proba = torch.softmax(embeddings_eval, dim=1).cpu().numpy()
                    y_pred = torch.argmax(embeddings_eval, dim=1).cpu().numpy()

                    # 손실 및 정확도 계산
                    current_loss = log_loss(y.cpu().numpy(), y_pred_proba)
                    current_accuracy = accuracy_score(y.cpu().numpy(), y_pred)
                    print(f'Epoch {epoch+1}/{self.epochs}, Loss: {loss.item():.4f}, Accuracy: {current_accuracy * 100:.2f}%')

        # FTTransformer 학습 후 전체 데이터셋에 대해 임베딩을 얻음
        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터의 피처 수 저장
        self.training_feature_count = embeddings.shape[1]

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            print(f"Subsampling features from {embeddings.shape[1]} to {self.max_features}")
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        # 하이퍼파라미터 최적화 (랜덤 시도)
        best_loss = float('inf')
        best_params = None
        for trial in range(self.n_random_trials + self.n_opt_trials):
            # 하이퍼파라미터 조합을 랜덤하게 선택
            learning_rate = np.random.uniform(1e-4, 1e-2)
            current_batch_size = np.random.choice([self.batch_size, self.val_batch_size])
            print(f"Trial {trial+1}/{self.n_random_trials + self.n_opt_trials}: Testing parameters batch_size={current_batch_size}, learning_rate={learning_rate}")

            # TabPFN 모델 학습 시 하이퍼파라미터를 적용
            self.tabpfn.learning_rate = learning_rate  # 예시: TabPFN에서 학습률을 설정하는 경우
            self.tabpfn.batch_size = current_batch_size  # 예시: TabPFN에서 배치 크기를 설정하는 경우

            # TabPFN 학습 (수동으로 조기 종료 구현)
            no_improvement_count = 0
            best_val_loss = float('inf')
            best_val_accuracy = 0.0

            for epoch in range(self.epochs):
                # 한 에포크에 대해 학습
                self.tabpfn.fit(embeddings, y.cpu().numpy())  # fit 메서드에서 에포크를 고려한 학습이 필요할 수 있음

                # 예측 수행
                y_pred_proba = self.tabpfn.predict_proba(embeddings)
                y_pred = np.argmax(y_pred_proba, axis=1)

                # 검증 손실 및 정확도 계산
                val_loss = log_loss(y.cpu().numpy(), y_pred_proba)
                val_accuracy = accuracy_score(y.cpu().numpy(), y_pred)

                # 조기 종료 체크
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_val_accuracy = val_accuracy
                    no_improvement_count = 0
                else:
                    no_improvement_count += 1

                if no_improvement_count >= self.early_stopping_rounds:
                    print(f'Early stopping at epoch {epoch+1} with validation loss: {best_val_loss:.4f}, validation accuracy: {best_val_accuracy * 100:.2f}%')
                    break

            # 각 Trial에서의 성능을 저장
            self.trial_log_losses.append(best_val_loss)
            self.trial_accuracies.append(best_val_accuracy)

            if best_val_loss < best_loss:
                best_loss = best_val_loss
                best_params = (learning_rate, current_batch_size)

            if trial >= self.n_random_trials:
                break  # 최적화 시도 후 종료

        print(f"Best parameters found: learning_rate={best_params[0]}, batch_size={best_params[1]}, with validation loss: {best_loss:.4f}")

        # 최적의 파라미터로 전체 데이터에 대해 TabPFN 재학습
        self.tabpfn.learning_rate = best_params[0]
        self.tabpfn.batch_size = best_params[1]
        self.tabpfn.fit(embeddings, y.cpu().numpy())

    def predict_proba(self, X):
        self.ft_transformer.eval()
        X = torch.tensor(X, device=self.device).float()

        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터와 동일한 피처 수를 유지
        if embeddings.shape[1] > self.training_feature_count:
            embeddings = embeddings[:, :self.training_feature_count]
        elif embeddings.shape[1] < self.training_feature_count:
            padding = np.zeros((embeddings.shape[0], self.training_feature_count - embeddings.shape[1]))
            embeddings = np.hstack([embeddings, padding])

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        # 임베딩 기반으로 TabPFN 예측 확률 계산
        return self.tabpfn.predict_proba(embeddings)

    def predict(self, X):
        # 입력을 텐서로 변환 (필요한 경우)
        if not isinstance(X, torch.Tensor):
            X = torch.tensor(X, device=self.device).float()

        # 최종 예측 단계에서 학습 데이터와 동일한 피처 수를 유지
        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터와 동일한 피처 수를 유지
        if embeddings.shape[1] > self.training_feature_count:
            embeddings = embeddings[:, :self.training_feature_count]
        elif embeddings.shape[1] < self.training_feature_count:
            padding = np.zeros((embeddings.shape[0], self.training_feature_count - embeddings.shape[1]))
            embeddings = np.hstack([embeddings, padding])

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        return self.tabpfn.predict(embeddings)


In [43]:
from sklearn.metrics import log_loss

class FTTabPFNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, device='cpu', num_features=10, num_classes=2, dim=100, depth=4, heads=8, mlp_dim=256,
                 n_random_trials=30, n_opt_trials=1, batch_size=128, val_batch_size=256, 
                 early_stopping_rounds=20, epochs=100, logging_period=100, **kwargs):
        super().__init__()
        self.device = device
        self.max_features = 100  # TabPFN이 허용하는 최대 피처 수를 설정
        self.training_feature_count = None  # 학습 데이터의 피처 수를 저장

        # TabPFN 하이퍼파라미터 설정
        self.n_random_trials = n_random_trials
        self.n_opt_trials = n_opt_trials
        self.batch_size = batch_size
        self.val_batch_size = val_batch_size
        self.early_stopping_rounds = early_stopping_rounds
        self.epochs = epochs
        self.logging_period = logging_period

        # FTTransformer 초기화
        self.ft_transformer = FTTransformer(num_features, dim, depth, heads, mlp_dim)
        self.ft_transformer.to(self.device)

        # TabPFN 모델 초기화
        self.tabpfn = TabPFNModel(device=self.device)

    def fit(self, X, y):
        self.ft_transformer.train()
        X, y = X.to(self.device), y.to(self.device)
        
        # FTTransformer 학습 (임베딩 추출)
        optimizer = torch.optim.Adam(self.ft_transformer.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(self.epochs):
            optimizer.zero_grad()
            embeddings = self.ft_transformer(X)
            loss = criterion(embeddings, y)
            loss.backward()
            optimizer.step()

            if (epoch + 1) % self.logging_period == 0:
                print(f'Epoch {epoch+1}/{self.epochs}, Loss: {loss.item():.4f}')

        # FTTransformer 학습 후 전체 데이터셋에 대해 임베딩을 얻음
        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터의 피처 수 저장
        self.training_feature_count = embeddings.shape[1]

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            print(f"Subsampling features from {embeddings.shape[1]} to {self.max_features}")
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        # 하이퍼파라미터 최적화 (랜덤 시도)
        best_loss = float('inf')
        best_params = None
        for trial in range(self.n_random_trials + self.n_opt_trials):
            # 하이퍼파라미터 조합을 랜덤하게 선택
            learning_rate = np.random.uniform(1e-4, 1e-2)
            current_batch_size = np.random.choice([self.batch_size, self.val_batch_size])
            print(f"Trial {trial+1}/{self.n_random_trials + self.n_opt_trials}: Testing parameters batch_size={current_batch_size}, learning_rate={learning_rate}")

            # TabPFN 모델 학습 시 하이퍼파라미터를 적용
            self.tabpfn.learning_rate = learning_rate  # 예시: TabPFN에서 학습률을 설정하는 경우
            self.tabpfn.batch_size = current_batch_size  # 예시: TabPFN에서 배치 크기를 설정하는 경우

            # TabPFN 학습 (수동으로 조기 종료 구현)
            no_improvement_count = 0
            best_val_loss = float('inf')

            for epoch in range(self.epochs):
                # 한 에포크에 대해 학습
                self.tabpfn.fit(embeddings, y.cpu().numpy())  # fit 메서드에서 에포크를 고려한 학습이 필요할 수 있음

                # 예측 수행
                y_pred_proba = self.tabpfn.predict_proba(embeddings)

                # 검증 손실 계산
                val_loss = log_loss(y.cpu().numpy(), y_pred_proba)

                # 조기 종료 체크
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    no_improvement_count = 0
                else:
                    no_improvement_count += 1

                if no_improvement_count >= self.early_stopping_rounds:
                    print(f'Early stopping at epoch {epoch+1} with validation loss: {best_val_loss:.4f}')
                    break

            if best_val_loss < best_loss:
                best_loss = best_val_loss
                best_params = (learning_rate, current_batch_size)

            if trial >= self.n_random_trials:
                break  # 최적화 시도 후 종료

        print(f"Best parameters found: learning_rate={best_params[0]}, batch_size={best_params[1]}, with validation loss: {best_loss:.4f}")

        # 최적의 파라미터로 전체 데이터에 대해 TabPFN 재학습
        self.tabpfn.learning_rate = best_params[0]
        self.tabpfn.batch_size = best_params[1]
        self.tabpfn.fit(embeddings, y.cpu().numpy())

    def predict_proba(self, X):
        self.ft_transformer.eval()
        X = torch.tensor(X, device=self.device).float()

        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터와 동일한 피처 수를 유지
        if embeddings.shape[1] > self.training_feature_count:
            embeddings = embeddings[:, :self.training_feature_count]
        elif embeddings.shape[1] < self.training_feature_count:
            padding = np.zeros((embeddings.shape[0], self.training_feature_count - embeddings.shape[1]))
            embeddings = np.hstack([embeddings, padding])

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        # 임베딩 기반으로 TabPFN 예측 확률 계산
        return self.tabpfn.predict_proba(embeddings)

    def predict(self, X):
        # 입력을 텐서로 변환 (필요한 경우)
        if not isinstance(X, torch.Tensor):
            X = torch.tensor(X, device=self.device).float()

        # 최종 예측 단계에서 학습 데이터와 동일한 피처 수를 유지
        with torch.no_grad():
            embeddings = self.ft_transformer(X).cpu().numpy()

        # 학습 데이터와 동일한 피처 수를 유지
        if embeddings.shape[1] > self.training_feature_count:
            embeddings = embeddings[:, :self.training_feature_count]
        elif embeddings.shape[1] < self.training_feature_count:
            padding = np.zeros((embeddings.shape[0], self.training_feature_count - embeddings.shape[1]))
            embeddings = np.hstack([embeddings, padding])

        # 피처 수가 max_features를 초과할 경우 피처 서브샘플링
        if embeddings.shape[1] > self.max_features:
            indices = np.random.choice(embeddings.shape[1], self.max_features, replace=False)
            embeddings = embeddings[:, indices]

        return self.tabpfn.predict(embeddings)


In [44]:
import openml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
import torch
import pathlib

# OpenML에서 데이터셋 로드
dataset = openml.datasets.get_dataset(29)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# 범주형 변수와 결측치 처리
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(X.median())

# 라벨 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 학습/테스트 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Numpy 배열을 PyTorch Tensor로 변환
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# FTTabPFNClassifier를 초기화
model = FTTabPFNClassifier(
    device='cpu',
    base_path=pathlib.Path.cwd().resolve(),  # base_path를 현재 작업 디렉토리로 설정
    num_features=X_train.shape[1],
    num_classes=len(label_encoder.classes_),  # 클래스 수 설정
    dim=128,
    depth=4,
    heads=8,
    mlp_dim=256,
)

# 모델을 특정 장치로 이동 (CPU 또는 GPU)
model.ft_transformer.to(model.device)

# 모델 학습
model.fit(X_train, y_train)

# 예측 수행
y_pred = model.predict(X_test)

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on the credit-approval dataset: {accuracy * 100:.2f}%')



  dataset = openml.datasets.get_dataset(29)


Epoch 100/100, Loss: 0.5004
Subsampling features from 128 to 100
Trial 1/31: Testing parameters batch_size=128, learning_rate=0.000503659879380401




Early stopping at epoch 21 with validation loss: 0.4285
Trial 2/31: Testing parameters batch_size=256, learning_rate=0.003431748123217137




Early stopping at epoch 21 with validation loss: 0.4285
Trial 3/31: Testing parameters batch_size=256, learning_rate=0.003962857454925808




Early stopping at epoch 21 with validation loss: 0.4285
Trial 4/31: Testing parameters batch_size=256, learning_rate=0.008748613403261333




Early stopping at epoch 21 with validation loss: 0.4285
Trial 5/31: Testing parameters batch_size=256, learning_rate=0.00344852569569144




Early stopping at epoch 21 with validation loss: 0.4285
Trial 6/31: Testing parameters batch_size=128, learning_rate=0.0006396422811200486




Early stopping at epoch 21 with validation loss: 0.4285
Trial 7/31: Testing parameters batch_size=128, learning_rate=0.0030205357223496146




Early stopping at epoch 21 with validation loss: 0.4285
Trial 8/31: Testing parameters batch_size=128, learning_rate=0.0016151749715515621




Early stopping at epoch 21 with validation loss: 0.4285
Trial 9/31: Testing parameters batch_size=128, learning_rate=0.005062546377630681




Early stopping at epoch 21 with validation loss: 0.4285
Trial 10/31: Testing parameters batch_size=128, learning_rate=0.006565112159117923




Early stopping at epoch 21 with validation loss: 0.4285
Trial 11/31: Testing parameters batch_size=128, learning_rate=0.006173991881909038




Early stopping at epoch 21 with validation loss: 0.4285
Trial 12/31: Testing parameters batch_size=256, learning_rate=0.0027013114660799195




Early stopping at epoch 21 with validation loss: 0.4285
Trial 13/31: Testing parameters batch_size=128, learning_rate=0.004772970504979343




Early stopping at epoch 21 with validation loss: 0.4285
Trial 14/31: Testing parameters batch_size=128, learning_rate=0.0010104265796233266




Early stopping at epoch 21 with validation loss: 0.4285
Trial 15/31: Testing parameters batch_size=256, learning_rate=0.009494027702391657




Early stopping at epoch 21 with validation loss: 0.4285
Trial 16/31: Testing parameters batch_size=128, learning_rate=0.006436726966975227




Early stopping at epoch 21 with validation loss: 0.4285
Trial 17/31: Testing parameters batch_size=256, learning_rate=0.0070229744457124465




Early stopping at epoch 21 with validation loss: 0.4285
Trial 18/31: Testing parameters batch_size=256, learning_rate=0.009762293236335642




Early stopping at epoch 21 with validation loss: 0.4285
Trial 19/31: Testing parameters batch_size=256, learning_rate=0.0009448548761065653




Early stopping at epoch 21 with validation loss: 0.4285
Trial 20/31: Testing parameters batch_size=128, learning_rate=0.007731481449594245




Early stopping at epoch 21 with validation loss: 0.4285
Trial 21/31: Testing parameters batch_size=128, learning_rate=0.0009761803639631348




Early stopping at epoch 21 with validation loss: 0.4285
Trial 22/31: Testing parameters batch_size=128, learning_rate=0.00598963887033855




Early stopping at epoch 21 with validation loss: 0.4285
Trial 23/31: Testing parameters batch_size=128, learning_rate=0.009357435714622065




Early stopping at epoch 21 with validation loss: 0.4285
Trial 24/31: Testing parameters batch_size=256, learning_rate=0.007728312474014851




Early stopping at epoch 21 with validation loss: 0.4285
Trial 25/31: Testing parameters batch_size=128, learning_rate=0.00832799839103715




Early stopping at epoch 21 with validation loss: 0.4285
Trial 26/31: Testing parameters batch_size=128, learning_rate=0.007965036391686447




Early stopping at epoch 21 with validation loss: 0.4285
Trial 27/31: Testing parameters batch_size=256, learning_rate=0.0015575946502340528




Early stopping at epoch 21 with validation loss: 0.4285
Trial 28/31: Testing parameters batch_size=128, learning_rate=0.006710697283269588




Early stopping at epoch 21 with validation loss: 0.4285
Trial 29/31: Testing parameters batch_size=256, learning_rate=0.007690518810029577




Early stopping at epoch 21 with validation loss: 0.4285
Trial 30/31: Testing parameters batch_size=128, learning_rate=0.0051915112139721345




Early stopping at epoch 21 with validation loss: 0.4285
Trial 31/31: Testing parameters batch_size=128, learning_rate=0.0026751097579319656




Early stopping at epoch 21 with validation loss: 0.4285
Best parameters found: learning_rate=0.000503659879380401, batch_size=128, with validation loss: 0.4285




Accuracy on the credit-approval dataset: 49.28%


NameError: name 'y_pred_proba' is not defined

In [25]:
!pip install tabPFN



In [1]:
import torch
import torch.nn as nn

class FeatureTokenizer(nn.Module):
    def __init__(self, continuous_dim, categorical_dims, embedding_dim=512):
        super(FeatureTokenizer, self).__init__()
        
        # 연속형 특성에 대한 임베딩 레이어
        self.continuous_tokenizer = nn.Linear(1, embedding_dim)
        
        # 범주형 특성에 대한 임베딩 레이어
        self.categorical_tokenizers = nn.ModuleList([
            nn.Embedding(categories, embedding_dim) for categories in categorical_dims
        ])
        
    def forward(self, x_cont, x_cat):
        # 연속형 특성에 대한 임베딩
        cont_tokens = [self.continuous_tokenizer(x.unsqueeze(-1)) for x in x_cont.split(1, dim=-1)]
        
        # 범주형 특성에 대한 임베딩
        cat_tokens = [self.categorical_tokenizers[i](x) for i, x in enumerate(x_cat.split(1, dim=-1))]
        
        # 모든 임베딩 벡터들을 합쳐 최종 샘플 임베딩 생성
        tokens = torch.cat(cont_tokens + cat_tokens, dim=1)
        
        return tokens

In [3]:
import torch
import torch.nn as nn

class FeatureTokenizer(nn.Module):
    def __init__(self, continuous_dim, categorical_dims, embedding_dim=512):
        super(FeatureTokenizer, self).__init__()
        
        # 연속형 특성에 대한 임베딩 레이어
        self.continuous_tokenizer = nn.Linear(1, embedding_dim)
        
        # 범주형 특성에 대한 임베딩 레이어
        self.categorical_tokenizers = nn.ModuleList([
            nn.Embedding(categories, embedding_dim) for categories in categorical_dims
        ])
        
    def forward(self, x_cont, x_cat):
        # 연속형 특성에 대한 임베딩
        cont_tokens = [self.continuous_tokenizer(x.unsqueeze(-1)) for x in x_cont.split(1, dim=-1)]
        
        # 범주형 특성에 대한 임베딩
        cat_tokens = [self.categorical_tokenizers[i](x.squeeze(-1)) for i, x in enumerate(x_cat.split(1, dim=-1))]
        
        # 모든 임베딩 벡터들을 합쳐 최종 샘플 임베딩 생성
        tokens = torch.cat(cont_tokens + cat_tokens, dim=1)
        
        return tokens

# 예제 사용
tokenizer = FeatureTokenizer(continuous_dim=4, categorical_dims=[10, 5, 8], embedding_dim=512)

# 입력 데이터 생성
x_cont = torch.rand(32, 4)  # 32 샘플, 4개의 연속형 특성
x_cat = torch.randint(0, 10, (32, 3))  # 32 샘플, 3개의 범주형 특성 (각 범주의 클래스 수는 10, 5, 8로 맞추기)

# x_cat의 범주형 데이터가 각 임베딩 레이어의 입력 크기 안에 있어야 합니다.
x_cat[:, 1] = torch.randint(0, 5, (32,))  # 두 번째 범주형 특성은 5개의 클래스만 가짐
x_cat[:, 2] = torch.randint(0, 8, (32,))  # 세 번째 범주형 특성은 8개의 클래스만 가짐

tokens = tokenizer(x_cont, x_cat)
print(tokens.shape)  # (32, 7, 512) - 7개의 특성이 512차원으로 임베딩됨


RuntimeError: Tensors must have same number of dimensions: got 3 and 2