In [1]:
import numpy as np
import src
import os
import json
import argparse
import warnings

from tabsyn.latent_utils import recover_data
from utils_train import concat_y_to_X

In [2]:
def make_dataset(
    data_path: str,
    T: src.Transformations,
    task_type,
    change_val: bool,
    concat = True,
):
    """Read the train/test arrays under ``data_path`` and return a transformed ``src.Dataset``.

    Args:
        data_path: Directory that is probed for ``X_cat_train.npy``,
            ``X_num_train.npy`` and ``y_train.npy`` and that holds ``info.json``.
        T: Transformations handed to ``src.transform_dataset``.
        task_type: ``'binclass'`` or ``'multiclass'`` selects the classification
            handling; any other value is handled as regression. It only decides
            which feature matrix receives the target when ``concat`` is true; the
            task type stored on the dataset is read from ``info.json``.
        change_val: When true, ``src.change_val`` is applied before transforming.
        concat: When true, the target is appended with ``concat_y_to_X`` to the
            categorical features (classification) or to the numerical features
            (regression).

    Returns:
        The value returned by ``src.transform_dataset(D, T, None)``.
    """

    def _slot_for(train_file):
        # A part (numerical / categorical / target) is tracked only when its
        # train file exists; otherwise it stays None and is never filled.
        return {} if os.path.exists(os.path.join(data_path, train_file)) else None

    cat_splits = _slot_for('X_cat_train.npy')
    num_splits = _slot_for('X_num_train.npy')
    y_splits = _slot_for('y_train.npy')

    # Classification appends the target to the categorical block,
    # everything else (regression) appends it to the numerical block.
    target_joins_cat = task_type in ('binclass', 'multiclass')

    for part in ('train', 'test'):
        num_part, cat_part, y_part = src.read_pure_data(data_path, part)

        if num_splits is not None:
            if concat and not target_joins_cat:
                num_part = concat_y_to_X(num_part, y_part)
            num_splits[part] = num_part

        if cat_splits is not None:
            if concat and target_joins_cat:
                cat_part = concat_y_to_X(cat_part, y_part)
            cat_splits[part] = cat_part

        if y_splits is not None:
            y_splits[part] = y_part

    meta = src.load_json(os.path.join(data_path, 'info.json'))

    assembled = src.Dataset(
        num_splits,
        cat_splits,
        y_splits,
        y_info={},
        task_type=src.TaskType(meta['task_type']),
        n_classes=meta.get('n_classes'),
    )

    if change_val:
        assembled = src.change_val(assembled)

    return src.transform_dataset(assembled, T, None)

In [3]:
# Notebook configuration: which dataset to load and how to encode categoricals.
dataname = "petfinder_tab"

# Initial task type; it is replaced by the 'task_type' entry of the dataset's
# info.json once that file is loaded.
task_type = "binclass"

# Resolve the dataset directory from a configurable data root instead of a
# machine-specific absolute path. Set the TABSYN_DATA_DIR environment variable
# to the folder containing the per-dataset directories; the default "data" is
# resolved relative to the current working directory.
DATA_DIR = os.environ.get("TABSYN_DATA_DIR", "data")
dataset_path = os.path.join(DATA_DIR, dataname)

cat_encoding = "one-hot"

In [4]:
# Load the dataset metadata stored alongside the arrays.
with open(f'{dataset_path}/info.json', 'r') as f:
    info = json.load(f)

# The task type recorded in info.json replaces any earlier value.
task_type = info['task_type']
# cat_encoding is taken from the notebook variable defined earlier rather than
# from command-line args.
# The target is concatenated to the features only for regression tasks.
concat = task_type == 'regression'

# Preprocessing options forwarded to src.Transformations.
T_dict = {
    'normalization': "quantile",
    'num_nan_policy': 'mean',
    'cat_nan_policy': None,
    'cat_min_frequency': None,
    'cat_encoding': cat_encoding,
    'y_policy': "default",
}

T = src.Transformations(**T_dict)

In [5]:
# Build the transformed dataset from the configured path and transformations;
# change_val is left off for this run.
dataset = make_dataset(dataset_path, T, task_type, change_val=False, concat=concat)

No NaNs in numerical features, skipping


In [21]:
# Shapes of the training arrays, followed by the dataset's n_classes value.
(
    dataset.X_num['train'].shape,
    dataset.X_cat['train'].shape,
    dataset.y['train'].shape,
    dataset.n_classes,
)


((9434, 11), (9434, 6), (9434, 1), None)