In [None]:
import pandas as pd
import numpy as np
import os
import torch
from tqdm import tqdm
import json
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
# Colab-only: mount Google Drive so the paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder holding one sub-folder per dataset (each read below as <dataset>/<dataset>.csv and <dataset>/C_*.npy).
ip_base_path = "/content/drive/MyDrive/Colab Notebooks/20-30 Oct 2024- Test Data Algo1-2-3_mlp/Input Data"
# Root folder under which each dataset's embedded arrays and mapping file are written.
op_base_path = "/content/drive/MyDrive/Colab Notebooks/20-30 Oct 2024- Test Data Algo1-2-3_mlp/2_Default_LLM_Prompt_for_each_word"

In [None]:
# Create the output root up front; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(op_base_path, exist_ok=True)

In [None]:
def get_model():
  """Load and return the pretrained "openai-community/gpt2" checkpoint as a GPT2LMHeadModel."""
  # Model card: https://huggingface.co/openai-community/gpt2
  return GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

In [None]:
def get_tokenizer():
  """Load the "openai-community/gpt2" tokenizer and make sure it has a padding token.

  The EOS token is reused for padding when the tokenizer defines one;
  otherwise a literal '[PAD]' special token is registered and used instead.

  Returns:
    The configured GPT2Tokenizer instance.
  """
  gpt2_tokenizer = GPT2Tokenizer.from_pretrained(
      "openai-community/gpt2", clean_up_tokenization_spaces=True
  )

  if not gpt2_tokenizer.eos_token:
    # No EOS token available: register a dedicated padding token.
    fallback_pad = '[PAD]'
    gpt2_tokenizer.add_special_tokens({'pad_token': fallback_pad})
    gpt2_tokenizer.pad_token = fallback_pad
  else:
    # Reuse EOS as the padding token.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

  return gpt2_tokenizer

In [None]:
def create_word_dict(df):
  """Build the per-column vocabulary skeleton that is later filled with embeddings.

  Args:
    df: DataFrame whose columns hold the categorical values.

  Returns:
    dict mapping each column label to a list of ``{'value': v, 'embd': []}``
    entries, one per distinct value in that column, in the order returned
    by ``Series.unique()``.
  """
  return {
      column: [{'value': distinct, 'embd': []} for distinct in df[column].unique()]
      for column in df.columns
  }

In [None]:
def get_prompt(column_value, column_index, df_meta, dataset_description, task_description):
  """Assemble the text prompt describing one (column, value) pair.

  Args:
    column_value: the value to describe; formatted into the prompt as-is.
    column_index: row label in ``df_meta`` for the column being described.
    df_meta: DataFrame with a "Variable Name" column listing the column names.
    dataset_description: free-text description of the dataset.
    task_description: free-text description of the task.

  Returns:
    The prompt string, wrapped in <|start_prompt|> / <|end_prompt|> markers.
  """
  all_columns = ", ".join(df_meta.loc[:, "Variable Name"])
  target_column = df_meta.loc[column_index, 'Variable Name']

  # NOTE(review): there is no separator between "...when row has" and "Column ...",
  # so the text reads "hasColumn". Kept verbatim because altering the prompt
  # changes every generated embedding.
  pieces = [
      "<|start_prompt|>\n",
      f"Dataset description: {dataset_description}\n",
      f"Task description: {task_description}\n",
      f"Columns: {all_columns} \n",
      "Generate embeddings when row has",
      f"Column {target_column} which contains {column_value}",
      "\n<|end_prompt|>",
  ]
  return "".join(pieces)

In [None]:
def create_embd(text_dict, model, tokenizer, df_meta, dataset_description, task_description, max_embd_size=1024):
  """Fill the 'embd' field of every entry in ``text_dict`` with a mean-pooled prompt embedding.

  For each column key and each of its entries, a prompt is built with
  ``get_prompt``, tokenized, looked up in the model's *input embedding layer*
  (the rest of the model is not run), and averaged over the token dimension.

  Args:
    text_dict: mapping column key -> list of ``{'value': ..., 'embd': ...}``
      dicts; the 'embd' fields are overwritten in place.
    model: object exposing ``get_input_embeddings()``.
    tokenizer: callable tokenizer returning an object with ``input_ids``.
    df_meta: metadata frame forwarded to ``get_prompt``.
    dataset_description: forwarded to ``get_prompt``.
    task_description: forwarded to ``get_prompt``.
    max_embd_size: maximum number of tokens kept per prompt (longer prompts
      are truncated by the tokenizer).

  Returns:
    The same ``text_dict`` object, with embeddings stored as plain Python lists.
  """
  # The embedding layer does not change between prompts; fetch it once.
  embedding_layer = model.get_input_embeddings()

  # Inference only: no autograd graph is needed for a lookup + mean.
  with torch.no_grad():
    for column_index, entries in text_dict.items():
      for item in entries:
        raw_value = item['value']

        # Missing values arrive either as a real NaN/None or as the string 'nan';
        # both are described to the model as 'Unknown'. (The previous condition
        # used `or`, which let real NaN values through unchanged.)
        is_missing = pd.isna(raw_value) or raw_value == 'nan'
        column_value = 'Unknown' if is_missing else raw_value

        prompt_ = get_prompt(column_value, column_index, df_meta, dataset_description, task_description)

        # Tokenize the single prompt; sequences longer than max_embd_size are truncated.
        tokenized_prompt = tokenizer(prompt_, return_tensors="pt", padding=True, truncation=True, max_length=max_embd_size)
        input_ids = tokenized_prompt.input_ids  # expected (1, n_tokens) for one prompt

        # Token-embedding lookup, expected (1, n_tokens, hidden_size).
        prompt_embeddings = embedding_layer(input_ids)

        # Mean-pool over tokens, then drop the batch dimension -> (hidden_size,).
        mean_embedding = prompt_embeddings.mean(dim=1).squeeze(0)

        # Store as a plain list so the mapping is JSON-serializable.
        item['embd'] = mean_embedding.detach().numpy().tolist()

  return text_dict

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in Python types."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``, or defer to the base encoder."""
        # Arrays become (nested) lists.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy floating scalars become Python floats.
        if isinstance(obj, np.floating):
            return float(obj)
        # NumPy integer scalars become Python ints.
        if isinstance(obj, np.integer):
            return int(obj)
        # Anything else: let the base class raise its usual TypeError.
        return super().default(obj)

In [None]:
def save_embd_mappings(file_name, text_dict):
  """Write ``text_dict`` to ``file_name`` as JSON, using NpEncoder for NumPy values.

  Args:
    file_name: destination path; overwritten if it exists.
    text_dict: the value/embedding mapping to serialize.
  """
  with open(file_name, 'w') as out_handle:
    json.dump(text_dict, fp=out_handle, cls=NpEncoder)

In [None]:
def read_embd(file_name):
  """Load and return the JSON content stored at ``file_name``.

  Args:
    file_name: path to a JSON file.

  Returns:
    The decoded JSON object.
  """
  with open(file_name, 'r') as in_handle:
    return json.load(in_handle)

In [None]:
# Function to apply mapping
def map_values(col, val, embedding_dict, embd_size=None):
  """Return the embedding stored for ``val`` in column ``col``.

  Args:
    col: column key into ``embedding_dict``.
    val: raw cell value; missing values (per ``pd.isna``) are looked up
      under the string key 'nan'.
    embedding_dict: mapping column key -> list of ``{'value': ..., 'embd': ...}``.
    embd_size: length of the zero vector returned for unknown values. When
      omitted, it is taken from the first embedding stored for ``col`` so the
      function no longer depends on a module-level ``embd_size`` variable.

  Returns:
    The stored embedding (as stored, typically a list) when ``val`` is known,
    otherwise a NumPy zero vector of length ``embd_size``.

  Raises:
    KeyError: if ``col`` is not present in ``embedding_dict``.
    ValueError: if ``val`` is unknown, ``embd_size`` is not given and the
      column has no stored entries to infer the length from.
  """
  if pd.isna(val):
    val = 'nan'

  column_entries = embedding_dict[col]
  lookup = {entry['value']: entry['embd'] for entry in column_entries}
  if val in lookup:
    return lookup[val]

  # Unknown value: fall back to an all-zero vector of matching length.
  if embd_size is None:
    if not column_entries:
      raise ValueError(
          f"Cannot infer embedding size for column {col!r}: no entries stored; pass embd_size explicitly."
      )
    embd_size = len(column_entries[0]['embd'])
  return np.zeros(embd_size)

In [None]:
def create_and_save_embedded_data(df, file_path, embedding_dict):
  """Replace every cell of ``df`` by its embedding vector and save the result as one 2-D array.

  Each column is expanded into a block of embedding components, and the blocks
  are laid side by side in column order (all components of the first column,
  then all components of the second, and so on). The array is written with
  ``np.save`` and its shape is printed.

  Unlike the previous version, this does not overwrite the caller's ``df``,
  does not assume that column label 0 / row label 0 exist, and stacks each
  column once instead of running one ``.apply`` pass per output component.

  Args:
    df: DataFrame of raw categorical values; column labels must be keys of
      ``embedding_dict``.
    file_path: destination passed to ``np.save``.
    embedding_dict: mapping column key -> list of ``{'value': ..., 'embd': ...}``.
  """
  row_count = len(df)

  column_blocks = []
  for col in df.columns:
    vectors = [map_values(col, value, embedding_dict) for value in df[col]]
    if vectors:
      # (row_count, embedding_length) block for this column.
      column_blocks.append(np.asarray(vectors, dtype=np.float64))

  if column_blocks:
    embedded = np.hstack(column_blocks)
  else:
    # No rows or no columns: save an empty matrix rather than failing on a lookup.
    embedded = np.empty((row_count, 0), dtype=np.float64)

  np.save(file_path, embedded) #.astype(np.float32)

  print("Shape = ", embedded.shape)

### adult

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/adult"
  op_data_path = f"{op_base_path}/adult"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/adult.csv')

  dataset_description = '''Extraction was done by Barry Becker from the 1994 Census database. The dataset contains 16 columns, including demographics and other features to describe a person. The target column, Income, is divided into two classes: <=50K and >50K.'''
  task_description = '''Generate embeddings for the categorical columns to facilitate the prediction of whether a person makes over 50K a year.'''

  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

 33%|███▎      | 1/3 [01:15<02:30, 75.37s/it]

Shape =  (39074, 6144)


 67%|██████▋   | 2/3 [01:27<00:38, 38.43s/it]

Shape =  (4884, 6144)


100%|██████████| 3/3 [01:45<00:00, 35.32s/it]

Shape =  (4884, 6144)





### analcatdata

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/analcatdata"
  op_data_path = f"{op_base_path}/analcatdata"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/analcatdata.csv')

  dataset_description = '''The dataset is sourced from the book "Analyzing Categorical Data" by Jeffrey S. Simonoff (2003), containing data on the DMFT Index (Decayed, Missing, and Filled Teeth) before and after different prevention strategies. It consists of 797 instances with 5 features, and no missing values.'''
  task_description = '''Predict the type of prevention strategy applied based on the DMFT index before and after the intervention, as well as demographic information.'''

  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:03<00:06,  3.43s/it]

Shape =  (637, 3072)


 67%|██████▋   | 2/3 [00:04<00:01,  1.91s/it]

Shape =  (80, 3072)


100%|██████████| 3/3 [00:05<00:00,  1.78s/it]

Shape =  (80, 3072)





### credit-approval

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/credit-approval"
  op_data_path = f"{op_base_path}/credit-approval"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/credit-approval.csv')

  dataset_description = '''The Credit Approval dataset consists of 690 instances with 15 features used to evaluate credit card applications. The dataset includes a mix of continuous and categorical attributes, with some having missing values. To ensure confidentiality, attribute names and values have been anonymized.'''
  task_description = '''Predict the credit approval status (target: A16) based on anonymized financial and demographic attributes using a combination of continuous and categorical data while handling any missing values appropriately.'''

  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:02<00:05,  2.64s/it]

Shape =  (552, 6912)


 67%|██████▋   | 2/3 [00:04<00:01,  1.99s/it]

Shape =  (69, 6912)


100%|██████████| 3/3 [00:05<00:00,  1.91s/it]

Shape =  (69, 6912)





### cylinder-bands

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/cylinder-bands"
  op_data_path = f"{op_base_path}/cylinder-bands"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/cylinder-bands.csv')

  dataset_description = "The Cylinder Bands dataset contains data related to process delays (cylinder banding) in rotogravure printing, with 512 instances and 39 features, including categorical, integer, and real values."
  task_description = "Predict and classify process delays (cylinder banding) in rotogravure printing."


  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:03<00:06,  3.04s/it]

Shape =  (432, 10752)


 67%|██████▋   | 2/3 [00:04<00:02,  2.39s/it]

Shape =  (54, 10752)


100%|██████████| 3/3 [00:06<00:00,  2.21s/it]

Shape =  (54, 10752)





### dresses-sales

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/dresses-sales"
  op_data_path = f"{op_base_path}/dresses-sales"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/dresses-sales.csv')

  dataset_description = "The Dresses_Attribute_Sales dataset contains attributes of dresses, such as style, price, rating, size, season, and others, to predict sales recommendations."
  task_description = "Predict dress recommendations based on the given attributes."


  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:03<00:06,  3.25s/it]

Shape =  (409, 8448)


 67%|██████▋   | 2/3 [00:05<00:02,  2.54s/it]

Shape =  (46, 8448)


100%|██████████| 3/3 [00:06<00:00,  2.23s/it]

Shape =  (45, 8448)





### nursery

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/nursery"
  op_data_path = f"{op_base_path}/nursery"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/nursery.csv')

  dataset_description = "The Nursery dataset ranks applications for nursery schools, with attributes such as parents' occupation, child's nursery, family structure, financial standing, social conditions, and health conditions."
  task_description = "Predict the evaluation outcome of nursery school applications based on the given attributes."


  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:19<00:38, 19.16s/it]

Shape =  (10368, 6144)


 67%|██████▋   | 2/3 [00:24<00:10, 10.78s/it]

Shape =  (1296, 6144)


100%|██████████| 3/3 [00:27<00:00,  9.15s/it]

Shape =  (1296, 6144)





### titanic

In [None]:
if __name__=='__main__':
  # Local import: only needed for the final copy step of this cell.
  import shutil

  # Per-dataset configuration. These names are module-level on purpose:
  # other cells/functions read them (e.g. embd_size).
  ip_data_path = f"{ip_base_path}/titanic"
  op_data_path = f"{op_base_path}/titanic"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'
  file_names = ["C_train", "C_test", "C_val"]

  df_meta = pd.read_csv(f'{ip_data_path}/titanic.csv')

  dataset_description = "The Titanic dataset includes information on passengers' class, age, sex, survival status, and other attributes, excluding crew members. It is commonly used for logistic regression analysis."
  task_description = "Predict the survival status of passengers based on the given attributes."

  # The value -> embedding vocabulary is built from the training split only.
  data = np.load(f'{ip_data_path}/C_train.npy', allow_pickle=True)
  df = pd.DataFrame(data)

  llm_model = get_model()
  tokenizer = get_tokenizer()

  text_dict = create_word_dict(df)
  text_dict = create_embd(text_dict, llm_model, tokenizer, df_meta, dataset_description, task_description, max_embd_size)

  save_embd_mappings(mapping_file, text_dict)

  # Embed every split with the training-derived mapping.
  for file_name in tqdm(file_names):
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, f'{op_data_path}/{file_name}.npy', text_dict)

  # Keep a copy of the mapping next to the embedded arrays (pure-Python
  # replacement for the IPython-only `!cp`, and driven by mapping_file).
  shutil.copy(mapping_file, op_data_path)

 33%|███▎      | 1/3 [00:01<00:02,  1.45s/it]

Shape =  (1049, 2304)


 67%|██████▋   | 2/3 [00:02<00:01,  1.23s/it]

Shape =  (130, 2304)


100%|██████████| 3/3 [00:04<00:00,  1.43s/it]

Shape =  (130, 2304)



