In [None]:
import json
import os
import shutil

import numpy as np
import pandas as pd
import toml
import tqdm

In [None]:
# Mount Google Drive; the dataset folders used below are all addressed under /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# All folders used by this notebook sit under one working directory on Drive.
base_dir = "/content/drive/MyDrive/Colab Notebooks/TempTestFolder"

# Original datasets (the N_*, C_* and y_* arrays are read from here).
ip_base_path = f"{base_dir}/Input Data"
# Per-dataset C_* arrays that get merged with the numeric features.
model_op_base_path = f"{base_dir}/3_Default_LLM_Prompt_for_each_row"
# Destination for the assembled datasets and their info.json files.
final_op_base_path = f"{base_dir}/Final Data"

In [None]:
# Build the "<dataset>_algo_<n>" folders (e.g. adult_algo_3, bank_marketing_algo_3)
# under final_op_base_path.
# For each dataset the C_train / C_val / C_test arrays read from model_op_base_path are
# appended column-wise to the original N_train / N_val / N_test arrays, so every feature
# is numeric. The result is saved as N_*.npy (float32) together with y_*.npy (int64) and
# an info.json describing the dataset.
# NOTE(review): model_op_base_path does not depend on `algo`; only the leading number of
# the algo name is used (in the output folder name). With several entries in algo_list
# every algo would read the same C_*.npy files - confirm this is intended.

algo_list = ["3_Embedding_Prompt_for_each_row"]
data_list = ["adult", "analcatdata", "bank_marketing", "credit-approval", "credit-g", "cylinder-bands", "dresses-sales", "eucalyptus", "kr-vs-kp", "nursery", "titanic", "sick"]

# Datasets for which no N_*.npy input is read; their features come from C_*.npy alone.
no_numeric_data = ("analcatdata", "nursery", "kr-vs-kp")
split_names = ("train", "val", "test")

for algo in algo_list:
  for data in tqdm.tqdm(data_list):
    new_data_name = f"{data}_algo_{algo.split('_')[0]}"
    ip_path = f"{ip_base_path}/{data}"
    model_op_path = f"{model_op_base_path}/{data}"
    final_op_path = f"{final_op_base_path}/{new_data_name}"

    os.makedirs(final_op_path, exist_ok=True)

    # Labels are taken unchanged from the original dataset folder.
    labels = {
        split: np.load(f"{ip_path}/y_{split}.npy", allow_pickle=True)
        for split in split_names
    }

    # Assemble and store the feature matrix of every split.
    features = {}
    for split in split_names:
      if data in no_numeric_data:
        split_features = np.load(f"{model_op_path}/C_{split}.npy", allow_pickle=True)
      else:
        numeric_part = np.load(f"{ip_path}/N_{split}.npy", allow_pickle=True)
        model_part = np.load(f"{model_op_path}/C_{split}.npy", allow_pickle=True)
        # Original numeric columns first, model output columns after them.
        split_features = np.concatenate((numeric_part, model_part), axis=1)
      features[split] = split_features
      np.save(f"{final_op_path}/N_{split}.npy", split_features.astype(np.float32))

    # The number of distinct training labels decides binclass vs. multiclass.
    n_classes = pd.DataFrame(labels["train"])[0].nunique()
    info_dict = {
        "name": new_data_name,
        "basename": new_data_name,
        "split": 0,
        "task_type": "binclass" if n_classes == 2 else "multiclass",
        "n_classes": n_classes,
        "n_num_features": features["train"].shape[1],
        "n_cat_features": 0,
        "train_size": features["train"].shape[0],
        "val_size": features["val"].shape[0],
        "test_size": features["test"].shape[0]
    }

    for split in split_names:
      np.save(f"{final_op_path}/y_{split}.npy", labels[split].astype(np.int64))

    print(f"For {new_data_name} =======")
    print(info_dict)
    print(f"shape of y_train = {labels['train'].shape}, y_val = {labels['val'].shape}, y_test = {labels['test'].shape} ")
    print()

    with open(f"{final_op_path}/info.json", 'w') as fp:
      json.dump(info_dict, fp)

100%|██████████| 1/1 [00:03<00:00,  3.53s/it]

{'name': 'adult_algo_3', 'basename': 'adult_algo_3', 'split': 0, 'task_type': 'binclass', 'n_classes': 2, 'n_num_features': 774, 'n_cat_features': 0, 'train_size': 39074, 'val_size': 4884, 'test_size': 4884}
shape of y_train = (39074,), y_val = (4884,), y_test = (4884,) 






In [None]:
# Now copy the default datasets into the Final Data folder.
# shutil.copytree replaces the former `!cp -r` shell call: a failed copy now raises instead
# of only printing an error, and dataset names are never interpolated into a shell command.
for data in tqdm.tqdm(data_list):
    # dirs_exist_ok=True keeps the cell re-runnable: an existing copy is overwritten in place.
    shutil.copytree(f"{ip_base_path}/{data}", f"{final_op_base_path}/{data}", dirs_exist_ok=True)

100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


In [None]:
# Write an info.json for every default dataset stored under final_op_base_path
# and re-save its label arrays as int64.

for data in tqdm.tqdm(data_list):

  y_train = np.load(f"{final_op_base_path}/{data}/y_train.npy", allow_pickle=True)
  y_test = np.load(f"{final_op_base_path}/{data}/y_test.npy", allow_pickle=True)
  y_val = np.load(f"{final_op_base_path}/{data}/y_val.npy", allow_pickle=True)

  # The number of distinct training labels decides binclass vs. multiclass.
  class_count = pd.DataFrame(y_train)[0].nunique()
  if class_count == 2:
    task = "binclass"
  else:
    task = "multiclass"

  if data in ("analcatdata", "nursery", "kr-vs-kp"):
    # No N_*.npy files are read for these datasets; sizes come from the C_* arrays.
    cat_train = np.load(f"{final_op_base_path}/{data}/C_train.npy", allow_pickle=True)
    cat_val = np.load(f"{final_op_base_path}/{data}/C_val.npy", allow_pickle=True)
    cat_test = np.load(f"{final_op_base_path}/{data}/C_test.npy", allow_pickle=True)

    num_feature_count = 0
    train_size, val_size, test_size = cat_train.shape[0], cat_val.shape[0], cat_test.shape[0]
  else:
    # Sizes and numeric width come from the N_* arrays; C_train only supplies the categorical width.
    num_train = np.load(f"{final_op_base_path}/{data}/N_train.npy", allow_pickle=True)
    num_test = np.load(f"{final_op_base_path}/{data}/N_test.npy", allow_pickle=True)
    num_val = np.load(f"{final_op_base_path}/{data}/N_val.npy", allow_pickle=True)
    cat_train = np.load(f"{final_op_base_path}/{data}/C_train.npy", allow_pickle=True)

    num_feature_count = num_train.shape[1]
    train_size, val_size, test_size = num_train.shape[0], num_val.shape[0], num_test.shape[0]

  info_dict = {
      "name": data,
      "basename": data,
      "split": 0,
      "task_type": task,
      "n_classes": class_count,
      "n_num_features": num_feature_count,
      "n_cat_features": cat_train.shape[1],
      "train_size": train_size,
      "val_size": val_size,
      "test_size": test_size
  }

  # Overwrite the label files in place with an int64 version.
  np.save(f"{final_op_base_path}/{data}/y_train.npy", y_train.astype(np.int64))
  np.save(f"{final_op_base_path}/{data}/y_test.npy", y_test.astype(np.int64))
  np.save(f"{final_op_base_path}/{data}/y_val.npy", y_val.astype(np.int64))

  print(f"For {data} =======")
  print(info_dict)
  print(f"shape of y_train = {y_train.shape}, y_val = {y_val.shape}, y_test = {y_test.shape} ")
  print()

  with open(f"{final_op_base_path}/{data}/info.json", 'w') as fp:
    json.dump(info_dict, fp)

100%|██████████| 1/1 [00:00<00:00,  6.51it/s]

{'name': 'adult', 'basename': 'adult', 'split': 0, 'task_type': 'binclass', 'n_classes': 2, 'n_num_features': 6, 'n_cat_features': 8, 'train_size': 39074, 'val_size': 4884, 'test_size': 4884}
shape of y_train = (39074,), y_val = (4884,), y_test = (4884,) 






In [None]:
# Type cast all categorical data to '<U26' and spell missing values as 'nan'.
# The cast happens BEFORE the 'None' -> 'nan' replacement: arrays loaded with
# allow_pickle=True may be object arrays holding real None objects, which never compare
# equal to the string 'None' and would otherwise survive as the text 'None' after the cast.
# Note: '<U26' keeps at most 26 characters per value; longer strings are truncated by the cast.

for data in tqdm.tqdm(data_list):
  for split in ("train", "test", "val"):
    cat_path = f"{final_op_base_path}/{data}/C_{split}.npy"

    categories = np.load(cat_path, allow_pickle=True).astype('<U26')
    categories = np.where(categories == 'None', 'nan', categories)

    # Overwrite the file in place with the normalised fixed-width string array.
    np.save(cat_path, categories)


100%|██████████| 1/1 [00:00<00:00,  2.62it/s]


In [None]:
# Re-save the numeric feature arrays of the default datasets as float32.
# Datasets listed below are skipped: no N_*.npy files are read for them.
for data in tqdm.tqdm(data_list):
  if data in ("analcatdata", "nursery", "kr-vs-kp"):
    continue

  # Load every split first, then overwrite the files in place.
  numeric_by_split = {
      split: np.load(f"{final_op_base_path}/{data}/N_{split}.npy", allow_pickle=True)
      for split in ("train", "test", "val")
  }

  for split, values in numeric_by_split.items():
    np.save(f"{final_op_base_path}/{data}/N_{split}.npy", values.astype(np.float32))

100%|██████████| 1/1 [00:00<00:00, 15.90it/s]


In [None]:
# !zip -r  "{final_op_base_path}/data.zip" "{final_op_base_path}"

In [None]:
# !cp -r "/content/drive/MyDrive/Colab Notebooks/19-21 Oct 2024-Final for publishing/data" "/content/drive/MyDrive/Colab Notebooks/TempTestFolder"