In [None]:
import pandas as pd
import numpy as np
import os
import torch
from tqdm import tqdm
import json
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
# Colab-only: mount Google Drive at /content/drive so the Drive-backed
# input/output folders configured below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Both the raw inputs and the generated embeddings live under one Drive folder.
_drive_root = "/content/drive/MyDrive/Colab Notebooks/20-30 Oct 2024- Test Data Algo1-2-3_mlp"

# Folder holding one sub-directory per dataset with the input .npy splits.
ip_base_path = _drive_root + "/Input Data"
# Folder receiving one sub-directory per dataset with the embedded .npy splits.
op_base_path = _drive_root + "/1_Default_LLM_Lookup_Embd"

In [None]:
# Create the output root up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(op_base_path, exist_ok=True)

In [None]:
def get_model():
  """Load and return the pretrained "openai-community/gpt2" GPT2LMHeadModel."""
  return GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

In [None]:
def get_tokenizer():
  """Load the "openai-community/gpt2" tokenizer and make sure it has a pad token.

  The EOS token is reused as the pad token when the tokenizer defines one;
  otherwise a new '[PAD]' special token is registered and used instead.
  """
  tok = GPT2Tokenizer.from_pretrained("openai-community/gpt2", clean_up_tokenization_spaces=True)

  if not tok.eos_token:
    # No EOS token available: register a dedicated padding token.
    # NOTE(review): a newly added token presumably needs the model's embedding
    # matrix resized before its id can be looked up - confirm if this path is hit.
    fallback_pad = '[PAD]'
    tok.add_special_tokens({'pad_token': fallback_pad})
    tok.pad_token = fallback_pad
    return tok

  tok.pad_token = tok.eos_token
  return tok

In [None]:
def create_word_dict(df):
  """Collect the distinct values of every column as embedding placeholders.

  Returns a dict keyed by column label. Each entry is a list with one
  {'value': <distinct value>, 'embd': []} record per distinct value of that
  column, in the order reported by Series.unique(). Missing values are
  stored under the string 'nan'.
  """
  def _as_key(raw):
    # Missing values are represented by the literal string 'nan'.
    return 'nan' if pd.isna(raw) else raw

  return {
      column: [{'value': _as_key(raw), 'embd': []} for raw in df[column].unique()]
      for column in df.columns
  }

In [None]:
def create_embd(text_dict, model, tokenizer, max_embd_size=1024):
  """Fill the 'embd' field of every entry in text_dict with a pooled embedding.

  For each entry the 'value' is tokenized, the token ids are looked up in the
  model's input embedding layer, and the token vectors are averaged into a
  single vector that is stored as a plain Python list.

  Args:
    text_dict: dict mapping a column key to a list of
      {'value': ..., 'embd': ...} records (as built by create_word_dict).
    model: object exposing get_input_embeddings().
    tokenizer: callable tokenizer returning an object with an `input_ids`
      tensor when called with return_tensors="pt".
    max_embd_size: passed to the tokenizer as max_length (truncation limit).

  Returns:
    The same text_dict object, updated in place.
  """
  # The lookup layer is identical for every prompt, so fetch it only once.
  embedding_layer = model.get_input_embeddings()

  # Only a forward lookup is needed; skip autograd bookkeeping entirely.
  with torch.no_grad():
    for entries in text_dict.values():
      for entry in entries:
        value = entry['value']

        # Missing values (the 'nan' placeholder or a real NaN/None) are embedded
        # as the word 'Unknown'; a real NaN must never reach the tokenizer.
        if isinstance(value, str):
          prompt_ = 'Unknown' if value == 'nan' else value
        elif pd.isna(value):
          prompt_ = 'Unknown'
        else:
          # The tokenizer only accepts text, so render other scalars as strings.
          prompt_ = str(value)

        # Tokenize the prompt
        tokenized_prompt = tokenizer(prompt_, return_tensors="pt", padding=True, truncation=True, max_length=max_embd_size)
        input_ids = tokenized_prompt.input_ids  # one row of token ids, at most max_embd_size long

        # Look the token ids up in the input embedding layer
        prompt_embeddings = embedding_layer(input_ids)  # (1, n_tokens, hidden)

        # Mean pooling across the token dimension, then drop the batch dimension
        mean_embedding = prompt_embeddings.mean(dim=1).squeeze(0)  # (hidden,)

        # Store as a plain list so the mapping stays JSON-serialisable
        entry['embd'] = mean_embedding.detach().numpy().tolist()

  return text_dict

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of `obj`.

        NumPy booleans, integers, floats and arrays are converted to bool,
        int, float and (nested) list respectively. Anything else is passed to
        the base class, which raises TypeError.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [None]:
def save_embd_mappings(file_name, text_dict):
  """Write text_dict to file_name as JSON.

  Objects the standard JSON encoder cannot serialise are delegated to
  NpEncoder. An existing file at file_name is overwritten.
  """
  with open(file_name, 'w') as sink:
    json.dump(text_dict, fp=sink, cls=NpEncoder)

In [None]:
def read_embd(file_name):
  """Load and return the JSON document stored at file_name.

  Note: JSON object keys are always strings, so a mapping that was saved with
  integer keys comes back keyed by their string form.
  """
  with open(file_name, 'r') as source:
    return json.load(source)

In [None]:
# Function to apply mapping
def map_values(col, val, embedding_dict):
  """Return the stored embedding for `val` in column `col`.

  Args:
    col: key of the column inside embedding_dict.
    val: raw cell value; missing values are looked up under the string 'nan'.
    embedding_dict: dict mapping a column key to a list of
      {'value': ..., 'embd': [...]} records.

  Returns:
    The stored embedding (a list) when the value is known. Otherwise a NumPy
    zero vector with the same length as the column's stored embeddings.

  Raises:
    KeyError: if `col` is not a key of embedding_dict.
  """
  if pd.isna(val):
    val = 'nan'

  lookup = {entry['value']: entry['embd'] for entry in embedding_dict[col]}
  if val in lookup:
    return lookup[val]

  # Unseen value: size the zero vector from the embeddings already stored for
  # this column, so it cannot drift from the real embedding width. The
  # module-level `embd_size` is used only when the column has no usable
  # embedding to measure.
  known_sizes = [len(vector) for vector in lookup.values() if len(vector)]
  size = known_sizes[0] if known_sizes else embd_size
  return np.zeros(size)

In [None]:
def create_and_save_embedded_data(df, file_path, embedding_dict):
  """Replace every cell of df by its embedding and save the result as .npy.

  Each column is mapped through map_values, and the per-cell vectors are laid
  out side by side: all components of the first column, then all components
  of the second column, and so on. The resulting 2-D float array is written
  to file_path with np.save and its shape is printed.

  Args:
    df: DataFrame of raw values; it is not modified.
    file_path: destination passed to np.save.
    embedding_dict: per-column value/embedding records, as expected by
      map_values.
  """
  n_rows = len(df)

  # One (n_rows, embedding_width) block per input column. Iterating with
  # items() avoids relying on particular column or index labels.
  column_blocks = []
  for col, series in df.items():
    vectors = [map_values(col, cell, embedding_dict) for cell in series]
    # reshape keeps the block 2-D even when the frame has no rows
    column_blocks.append(np.asarray(vectors, dtype=float).reshape(n_rows, -1))

  if column_blocks:
    embedded = np.hstack(column_blocks)
  else:
    embedded = np.empty((n_rows, 0))

  np.save(file_path, embedded) #.astype(np.float32)

  print("Shape = ", embedded.shape)

### adult

In [None]:
if __name__=='__main__':
  # Embed the "adult" dataset: build the value -> embedding lookup from the
  # training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/adult"
  op_data_path = f"{op_base_path}/adult"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

 33%|███▎      | 1/3 [01:19<02:38, 79.05s/it]

Shape =  (39074, 6144)


 67%|██████▋   | 2/3 [01:33<00:41, 41.34s/it]

Shape =  (4884, 6144)


100%|██████████| 3/3 [01:49<00:00, 36.56s/it]

Shape =  (4884, 6144)





### analcatdata

In [None]:
if __name__=='__main__':
  # Embed the "analcatdata" dataset: build the value -> embedding lookup from
  # the training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/analcatdata"
  op_data_path = f"{op_base_path}/analcatdata"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:01<00:02,  1.17s/it]

Shape =  (637, 3072)


 67%|██████▋   | 2/3 [00:01<00:00,  1.04it/s]

Shape =  (80, 3072)


100%|██████████| 3/3 [00:03<00:00,  1.00s/it]

Shape =  (80, 3072)





### credit-approval

In [None]:
if __name__=='__main__':
  # Embed the "credit-approval" dataset: build the value -> embedding lookup
  # from the training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/credit-approval"
  op_data_path = f"{op_base_path}/credit-approval"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:02<00:05,  2.66s/it]

Shape =  (552, 6912)


 67%|██████▋   | 2/3 [00:03<00:01,  1.85s/it]

Shape =  (69, 6912)


100%|██████████| 3/3 [00:06<00:00,  2.07s/it]

Shape =  (69, 6912)





### cylinder-bands

In [None]:
if __name__=='__main__':
  # Embed the "cylinder-bands" dataset: build the value -> embedding lookup
  # from the training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/cylinder-bands"
  op_data_path = f"{op_base_path}/cylinder-bands"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:03<00:06,  3.13s/it]

Shape =  (432, 10752)


 67%|██████▋   | 2/3 [00:05<00:02,  2.75s/it]

Shape =  (54, 10752)


100%|██████████| 3/3 [00:07<00:00,  2.63s/it]

Shape =  (54, 10752)





### dresses-sales

In [None]:
if __name__=='__main__':
  # Embed the "dresses-sales" dataset: build the value -> embedding lookup
  # from the training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/dresses-sales"
  op_data_path = f"{op_base_path}/dresses-sales"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:02<00:05,  2.50s/it]

Shape =  (409, 8448)


 67%|██████▋   | 2/3 [00:03<00:01,  1.79s/it]

Shape =  (46, 8448)


100%|██████████| 3/3 [00:05<00:00,  1.74s/it]

Shape =  (45, 8448)





### nursery

In [None]:
if __name__=='__main__':
  # Embed the "nursery" dataset: build the value -> embedding lookup from the
  # training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/nursery"
  op_data_path = f"{op_base_path}/nursery"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:23<00:46, 23.49s/it]

Shape =  (10368, 6144)


 67%|██████▋   | 2/3 [00:26<00:11, 11.60s/it]

Shape =  (1296, 6144)


100%|██████████| 3/3 [00:29<00:00,  9.99s/it]

Shape =  (1296, 6144)





### titanic

In [None]:
if __name__=='__main__':
  # Embed the "titanic" dataset: build the value -> embedding lookup from the
  # training split, then convert and save every split.
  import shutil  # only needed here, to copy the mapping file at the end

  ip_data_path = f"{ip_base_path}/titanic"
  op_data_path = f"{op_base_path}/titanic"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # forwarded to create_embd, which uses it as the tokenizer max_length
  embd_size = 768 #for gpt2; module-level name referenced by map_values
  mapping_file = 'text-embedding-gpt.json'

  file_names = ["C_train", "C_test", "C_val"]

  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # The lookup only contains values that occur in C_train.
  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays. shutil.copy uses
  # mapping_file directly and needs no shell quoting of the destination path.
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:03<00:07,  3.67s/it]

Shape =  (1049, 2304)


 67%|██████▋   | 2/3 [00:04<00:02,  2.07s/it]

Shape =  (130, 2304)


100%|██████████| 3/3 [00:06<00:00,  2.01s/it]

Shape =  (130, 2304)



