In [None]:
import pandas as pd
import numpy as np
import os
import torch
from tqdm import tqdm
import json
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
# Colab-only step: mount Google Drive so the "/content/drive/MyDrive/..." paths
# used by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root input folder; each dataset cell reads from a "<ip_base_path>/<dataset>" sub-folder.
ip_base_path = "/content/drive/MyDrive/Colab Notebooks/20-30 Oct 2024- Test Data Algo1-2-3_mlp/Input Data"
# Root output folder; each dataset cell writes its embeddings to "<op_base_path>/<dataset>".
op_base_path = "/content/drive/MyDrive/Colab Notebooks/20-30 Oct 2024- Test Data Algo1-2-3_mlp/3_Default_LLM_Prompt_for_each_row"

In [None]:
# Create the output root up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(op_base_path, exist_ok=True)

In [None]:
def get_model():
  """Load and return the pretrained GPT-2 checkpoint as a GPT2LMHeadModel."""
  # Model card: https://huggingface.co/openai-community/gpt2
  checkpoint = "openai-community/gpt2"
  return GPT2LMHeadModel.from_pretrained(checkpoint)

In [None]:
def get_tokenizer():
  """Load the GPT-2 tokenizer and make sure it has a padding token.

  The tokenizer is loaded from "openai-community/gpt2" with
  clean_up_tokenization_spaces=True. If the tokenizer exposes an EOS token it
  is reused as the padding token; otherwise a dedicated '[PAD]' special token
  is registered and used instead.

  Returns:
    The configured GPT2Tokenizer instance.
  """
  tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", clean_up_tokenization_spaces=True)

  eos = tokenizer.eos_token
  if not eos:
    # No EOS token available: register an explicit padding token.
    fallback_pad = '[PAD]'
    tokenizer.add_special_tokens({'pad_token': fallback_pad})
    tokenizer.pad_token = fallback_pad
    return tokenizer

  # Reuse EOS for padding so batched tokenization with padding=True works.
  tokenizer.pad_token = eos
  return tokenizer

In [None]:
def get_prompt(row_list, df_meta, dataset_description, task_description):
  """Build the text prompt describing one data row.

  Args:
    row_list: Values of a single row, in column order.
    df_meta: DataFrame with a "Variable Name" column; its i-th entry names the
      i-th value of ``row_list``. It may list more names than ``row_list`` has
      values, but not fewer.
    dataset_description: Free-text description of the dataset.
    task_description: Free-text description of the prediction task.

  Returns:
    The prompt string, wrapped in ``<|start_prompt|>`` / ``<|end_prompt|>``.

  Raises:
    ValueError: If ``row_list`` has more values than ``df_meta`` has names.
  """
  # Read the names once (positionally) instead of one .loc lookup per value.
  column_names = df_meta["Variable Name"].tolist()
  if len(row_list) > len(column_names):
    raise ValueError(
      f"Row has {len(row_list)} values but metadata only names {len(column_names)} columns"
    )

  columns_info = ", ".join(column_names)

  prompt = f'''<|start_prompt|>\nDataset description: {dataset_description}\nTask description: {task_description}\nColumns: {columns_info} \nGenerate embeddings when row has '''

  for column_name, item in zip(column_names, row_list):
    # Treat real missing values (NaN / None / pd.NA) and the literal string
    # 'nan' as unknown. pd.isna runs first so pd.NA never reaches the `==`.
    is_missing = pd.isna(item) or item == 'nan'
    column_value = 'Unknown' if is_missing else item

    prompt += f'''Column {column_name} contains {column_value}, '''

  prompt += "\n<|end_prompt|>"

  return prompt

In [None]:
def create_and_save_embedded_data(df, df_meta, model, tokenizer, file_path, dataset_description, task_description, batch_size=32, max_embd_size=1024, mask_padding=True):
  """Embed every row of ``df`` as a prompt and save the result as a .npy file.

  Rows are processed ``batch_size`` at a time. Each row is turned into a text
  prompt with ``get_prompt``, tokenized, passed through the model's input
  embedding layer, and mean-pooled over the token axis into one vector per row.
  The pooled vectors of all batches are stacked and written with ``np.save``.

  Args:
    df: DataFrame whose rows are embedded, one prompt per row.
    df_meta: Column metadata forwarded to ``get_prompt``.
    model: Model providing ``get_input_embeddings()``; it is moved to CUDA when
      available, otherwise left on CPU.
    tokenizer: Tokenizer called on the list of prompts (must support padding).
    file_path: Destination passed to ``np.save``.
    dataset_description: Dataset description forwarded to ``get_prompt``.
    task_description: Task description forwarded to ``get_prompt``.
    batch_size: Number of rows embedded per step.
    max_embd_size: Token limit per prompt (tokenizer ``max_length``).
    mask_padding: When True (default), padding positions are excluded from the
      mean so a row's embedding does not depend on the other rows in its batch.
      Pass False to average over every position, padding included.

  Returns:
    None. The array of shape [len(df), hidden_size] is written to ``file_path``.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  # Use the model that was passed in rather than a notebook-level global.
  model.to(device)
  embedding_layer = model.get_input_embeddings()
  hidden_size = embedding_layer.embedding_dim

  all_embeddings = []

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Process in batches
    for start_idx in tqdm(range(0, len(df), batch_size)):
      batch_df = df.iloc[start_idx:start_idx + batch_size]

      prompts = [get_prompt(row.tolist(), df_meta, dataset_description, task_description) for _, row in batch_df.iterrows()]

      # Tokenize prompts; sequences are padded to the longest prompt in the
      # batch and truncated to at most max_embd_size tokens.
      tokenized_prompts = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_embd_size)
      # Inputs must live on the same device as the embedding weights.
      input_ids = tokenized_prompts.input_ids.to(device)  # [batch, tokens]

      # Look up the input-embedding vector of every token.
      prompt_embeddings = embedding_layer(input_ids)  # [batch, tokens, hidden]

      if mask_padding:
        # Average over real tokens only (attention_mask is 0 on padding).
        mask = tokenized_prompts.attention_mask.to(device).unsqueeze(-1).to(prompt_embeddings.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        mean_embedding = (prompt_embeddings * mask).sum(dim=1) / token_counts
      else:
        mean_embedding = prompt_embeddings.mean(dim=1)

      # Keep the batch axis ([batch, hidden]) even for a single-row batch, and
      # copy to host memory before converting to NumPy.
      all_embeddings.append(mean_embedding.cpu().numpy())

  # Concatenate all batch embeddings and save to .npy file
  if all_embeddings:
    all_embeddings = np.concatenate(all_embeddings, axis=0)
  else:
    # Empty input: still write a well-formed [0, hidden] array.
    all_embeddings = np.empty((0, hidden_size), dtype=np.float32)
  np.save(file_path, all_embeddings)

### adult

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "adult" dataset.
  data_name = "adult"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"]

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''Extraction was done by Barry Becker from the 1994 Census database. The dataset contains 16 columns, including demographics and other features to describe a person. The target column, Income, is divided into two classes: <=50K and >50K.'''
  task_description = '''Generate embeddings for the categorical columns to facilitate the prediction of whether a person makes over 50K a year.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1222/1222 [01:13<00:00, 16.59it/s]
100%|██████████| 153/153 [00:09<00:00, 15.59it/s]
100%|██████████| 153/153 [00:08<00:00, 17.39it/s]


### analcatdata

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "analcatdata" dataset.
  data_name = "analcatdata"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''The dataset is sourced from the book "Analyzing Categorical Data" by Jeffrey S. Simonoff (2003), containing data on the DMFT Index (Decayed, Missing, and Filled Teeth) before and after different prevention strategies. It consists of 797 instances with 5 features, and no missing values.'''
  task_description = '''Predict the type of prevention strategy applied based on the DMFT index before and after the intervention, as well as demographic information.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 20/20 [00:01<00:00, 13.53it/s]
100%|██████████| 3/3 [00:00<00:00, 28.56it/s]
100%|██████████| 3/3 [00:00<00:00, 27.18it/s]


### bank_marketing

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "bank_marketing" dataset.
  data_name = "bank_marketing"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = "The Bank Marketing dataset contains data from phone-based direct marketing campaigns aimed at predicting if a client will subscribe to a term deposit. It includes 16 features with a mix of categorical and integer data types."
  task_description = "Predict if the client will subscribe to a term deposit based on the given attributes."

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

### credit-approval

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "credit-approval" dataset.
  data_name = "credit-approval"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"]

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''The Credit Approval dataset consists of 690 instances with 15 features used to evaluate credit card applications. The dataset includes a mix of continuous and categorical attributes, with some having missing values. To ensure confidentiality, attribute names and values have been anonymized.'''
  task_description = '''Predict the credit approval status (target: A16) based on anonymized financial and demographic attributes using a combination of continuous and categorical data while handling any missing values appropriately.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 18/18 [00:00<00:00, 19.01it/s]
100%|██████████| 3/3 [00:00<00:00, 19.90it/s]
100%|██████████| 3/3 [00:00<00:00, 23.16it/s]


### credit-g

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "credit-g" dataset.
  data_name = "credit-g"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''The Statlog (German Credit Data) dataset consists of 1,000 instances with 20 features, used for classifying individuals as good or bad credit risks based on their financial and personal information. The dataset is provided in both a categorical and a numeric format, suitable for different types of algorithms. There are no missing values, and a cost matrix is used to emphasize the cost of misclassification.'''
  task_description = '''Classify individuals as good or bad credit risks based on their financial and demographic attributes while considering the associated cost matrix.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

### cylinder-bands

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "cylinder-bands" dataset.
  data_name = "cylinder-bands"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"]

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = "The Cylinder Bands dataset contains data related to process delays (cylinder banding) in rotogravure printing, with 512 instances and 39 features, including categorical, integer, and real values."
  task_description = "Predict and classify process delays (cylinder banding) in rotogravure printing."

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 14/14 [00:01<00:00, 13.88it/s]
100%|██████████| 2/2 [00:00<00:00, 14.92it/s]
100%|██████████| 2/2 [00:00<00:00, 16.72it/s]


### dresses-sales

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "dresses-sales" dataset.
  data_name = "dresses-sales"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = "The Dresses_Attribute_Sales dataset contains attributes of dresses, such as style, price, rating, size, season, and others, to predict sales recommendations."
  task_description = "Predict dress recommendations based on the given attributes."

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 13/13 [00:00<00:00, 18.74it/s]
100%|██████████| 2/2 [00:00<00:00, 21.14it/s]
100%|██████████| 2/2 [00:00<00:00, 28.43it/s]


### eucalyptus

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "eucalyptus" dataset.
  data_name = "eucalyptus"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''The Eucalyptus dataset contains 736 instances with 20 features, including geographic, environmental, measurement, and form information. The data aims to determine the best eucalyptus seedlots for soil conservation in seasonally dry hill country, based on factors such as growth, survival, and form ratings.'''
  task_description = '''Predict the utility rating of eucalyptus species based on various environmental, geographical, and growth characteristics to determine the most suitable seedlots for soil conservation.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

### kr-vs-kp

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "kr-vs-kp" dataset.
  data_name = "kr-vs-kp"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''The KRKPA7 dataset contains 3,196 instances with 37 features, representing chess board positions in the King+Rook versus King+Pawn endgame scenario, where the pawn is on the a7 square. It aims to determine whether White can win based on the configuration of the pieces. All features are categorical, and no missing values are present.'''
  task_description = '''The classification task involves predicting whether White can win ("won") or cannot win ("nowin") based on the chessboard's features.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

### nursery

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "nursery" dataset.
  data_name = "nursery"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"]

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = "The Nursery dataset ranks applications for nursery schools, with attributes such as parents' occupation, child's nursery, family structure, financial standing, social conditions, and health conditions."
  task_description = "Predict the evaluation outcome of nursery school applications based on the given attributes."

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 324/324 [00:18<00:00, 17.65it/s]
100%|██████████| 41/41 [00:01<00:00, 22.61it/s]
100%|██████████| 41/41 [00:01<00:00, 22.58it/s]


### sick

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "sick" dataset.
  data_name = "sick"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"] #, "C_trainval"

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = '''This dataset contains records related to thyroid disease, extracted from the Garavan Institute and contributed by J. Ross Quinlan. It includes 30 columns with various attributes such as demographics, medical history, and lab measurements. The target column, 'sick', indicates whether a patient is classified as sick (true) or not sick (false).'''

  task_description = '''Generate embeddings for the categorical columns to facilitate the prediction of whether a patient has thyroid disease. The goal is to enhance the model's ability to learn from the categorical features, improving the classification performance.'''

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

### titanic

In [None]:
if __name__=='__main__':
  # Embed the categorical splits of the "titanic" dataset.
  data_name = "titanic"
  ip_data_path = f"{ip_base_path}/{data_name}"
  op_data_path = f"{op_base_path}/{data_name}"
  os.makedirs(op_data_path, exist_ok=True)
  max_embd_size = 1024  # token limit per prompt, passed to the embedding call below
  embd_size = 768 #for gpt2
  mapping_file = 'text-embedding-gpt.json'  # not used in this cell
  file_names = ["C_train", "C_test", "C_val"]

  # Column metadata; get_prompt reads its "Variable Name" column.
  df_meta = pd.read_csv(f'{ip_data_path}/{data_name}.csv')

  dataset_description = "The Titanic dataset includes information on passengers' class, age, sex, survival status, and other attributes, excluding crew members. It is commonly used for logistic regression analysis."
  task_description = "Predict the survival status of passengers based on the given attributes."

  llm_model = get_model()
  tokenizer = get_tokenizer()

  # Each split is loaded inside the loop, so no separate pre-load is needed.
  # allow_pickle=True can execute code embedded in the file; only load trusted .npy files.
  for file_name in file_names:
    data = np.load(f'{ip_data_path}/{file_name}.npy', allow_pickle=True)
    df = pd.DataFrame(data)
    create_and_save_embedded_data(df, df_meta, llm_model, tokenizer, f'{op_data_path}/{file_name}.npy', dataset_description, task_description, max_embd_size=max_embd_size)

100%|██████████| 33/33 [00:01<00:00, 30.46it/s]
100%|██████████| 5/5 [00:00<00:00, 35.00it/s]
100%|██████████| 5/5 [00:00<00:00, 39.67it/s]
