# Textual Processing

## 1. Import libraries

In [1]:
## Set parent directory
import sys
sys.path.append("..")

In [20]:
## Import DS libraries
import pandas as pd
import numpy as np

## Import Torch libraries
import torch

## Import other libraries
from tqdm import tqdm
import os

## Import HuggingFace libraries
from transformers import BertTokenizer, BertModel

## Import custom libraries
#from utils.processing.textual import *
from VQA_model import VocabEncoder as VE
import VQA_model.models.seq2vec as seq2vec

## 2. Set global variables

In [4]:
## Set device: prefer CUDA, then Apple MPS, and fall back to CPU so the
## notebook still runs on machines that have neither accelerator
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

In [5]:
## Set save path for embeddings
## NOTE(review): SAVE_PATH is not referenced by the other cells visible in this
## notebook; the create_batch calls pass "data/text_representations_bert/<split>"
## literals instead -- confirm which location is intended
SAVE_PATH = "../../data/text_representations_bert/"

In [6]:
## Set batch size: passed to create_batch as `batch_size`, i.e. the number of
## dataframe rows collected into each saved batch_<i>.pt file (rows are still
## embedded one at a time inside create_batch)
BATCH_SIZE = 4000

## 3. Get the data

In [7]:
## Question JSON files, one per split, listed in the order train, test, val
PATH_questions = [
    '../../data/text/USGS_split_train_questions.json',
    '../../data/text/USGS_split_test_questions.json',
    '../../data/text/USGS_split_val_questions.json',
]

## Individual names for each split, unpacked in the same order as the list
(
    PATH_questions_split_train,
    PATH_questions_split_test,
    PATH_questions_split_val,
) = PATH_questions

In [8]:
## Answer JSON files, one per split, listed in the order train, test, val
PATH_answers = [
    '../../data/text/USGS_split_train_answers.json',
    '../../data/text/USGS_split_test_answers.json',
    '../../data/text/USGS_split_val_answers.json',
]

## Individual names for each split, unpacked in the same order as the list
(
    PATH_answers_split_train,
    PATH_answers_split_test,
    PATH_answers_split_val,
) = PATH_answers

In [9]:
## Un-split JSON files: all questions first, then all answers
PATH_all = [
    '../../data/text/USGSquestions.json',
    '../../data/text/USGSanswers.json',
]

## Individual names, unpacked in the same order as the list
PATH_all_questions, PATH_all_answers = PATH_all

In [10]:
import json


def json_to_dataframe(json_file_path, delimiter):
    """
    Load one top-level entry of a JSON file into a pandas DataFrame.

    Args:
    json_file_path : str : the path to the JSON file.
    delimiter : str : the top-level key whose value is converted to a
        DataFrame (this notebook calls it with "questions" and "answers").

    Returns:
    df : DataFrame : built from ``json.load(file)[delimiter]``, or
    None : if the file is missing, is not valid JSON, does not contain the
        key, or any other error occurs (a message is printed in each case).
    """
    try:
        # JSON is UTF-8 by specification; decoding explicitly avoids depending
        # on the platform's default text encoding
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            json_content = json.load(json_file)

        # Only the entry stored under `delimiter` is turned into a DataFrame
        json_data = json_content[delimiter]
        return pd.DataFrame(json_data)

    except FileNotFoundError:
        print(f"File not found: {json_file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error occurred while decoding JSON from file: {json_file_path}")
        return None
    except KeyError:
        # Report a missing key explicitly instead of as an "unexpected" error
        print(f"Key '{delimiter}' not found in JSON file: {json_file_path}")
        return None
    except Exception as e:
        # Best-effort loader: callers expect None rather than an exception
        print(f"An unexpected error occurred: {str(e)}")
        return None

In [11]:
## Retrieve questions and answers (one DataFrame per path, in list order)
questions = [json_to_dataframe(path, "questions") for path in PATH_questions]
answers = [json_to_dataframe(path, "answers") for path in PATH_answers]

## json_to_dataframe reports failures by printing and returning None; stop here
## so a missing or invalid file does not surface later as an AttributeError on None
_failed_paths = [
    path
    for path, frame in zip(PATH_questions + PATH_answers, questions + answers)
    if frame is None
]
if _failed_paths:
    raise RuntimeError(f"Could not load the following JSON files: {_failed_paths}")

## 4. Data Manipulation

In [12]:
def remove_nan_rows(df, delimiter):
    """
    Drop every row whose value in the column named by `delimiter` is NaN.

    Parameters:
    df (DataFrame): The input DataFrame.
    delimiter (str): Name of the column that is checked for NaN values.

    Returns:
    DataFrame: The result of `df.dropna` restricted to that column.

    Raises:
    ValueError: If `df` has no column named `delimiter`.
    """
    # Guard clause: refuse to continue when the requested column is absent
    if delimiter not in df.columns:
        raise ValueError(f"No {delimiter} column found in the DataFrame")

    return df.dropna(subset=[delimiter])

In [13]:
def remove_columns(dataframe, columns_to_remove):
    """
    Return `dataframe` without the columns listed in `columns_to_remove`.

    Parameters:
    dataframe (pd.DataFrame): The original DataFrame.
    columns_to_remove (list): Names of the columns to drop.

    Returns:
    pd.DataFrame: The result of dropping the listed columns.

    Raises:
    ValueError: For the first listed column that `dataframe` does not have.
    """
    # Validate in list order so the first missing name is the one reported
    for name in columns_to_remove:
        if name in dataframe.columns:
            continue
        raise ValueError(f"Column '{name}' does not exist in the DataFrame.")

    return dataframe.drop(columns=columns_to_remove)

In [14]:
def merge_dataframes_on_column(df1, df2, common_column, how='inner'):
    """
    Join two DataFrames on a column that both of them contain.

    Parameters:
    df1 (pd.DataFrame): Left-hand DataFrame of the merge.
    df2 (pd.DataFrame): Right-hand DataFrame of the merge.
    common_column (str): Column name used as the merge key.
    how (str): Merge type ('left', 'right', 'outer', 'inner'); defaults to 'inner'.

    Returns:
    pd.DataFrame: The merged DataFrame.

    Raises:
    ValueError: If `common_column` is missing from either DataFrame.
    """
    # Both inputs must expose the key column before merging
    key_in_both = all(common_column in frame.columns for frame in (df1, df2))
    if not key_in_both:
        raise ValueError(f"The common column '{common_column}' must exist in both DataFrames.")

    return df1.merge(df2, on=common_column, how=how)

In [16]:
## Drop rows with a missing "question" / "answer" value in every loaded DataFrame
questions_nan = [
    remove_nan_rows(split_questions, "question")
    for split_questions in questions
]
answers_nan = [
    remove_nan_rows(split_answers, "answer")
    for split_answers in answers
]

In [17]:
## Drop the bookkeeping columns that are not needed downstream
questions_clean = [
    remove_columns(split_questions, ["active", "date_added", "people_id", "answers_ids"])
    for split_questions in questions_nan
]
answers_clean = [
    remove_columns(split_answers, ["active", "date_added", "people_id", "question_id"])
    for split_answers in answers_nan
]

In [18]:
## Join questions with answers on "id" to build the train, test and val sets
## (positions 0, 1, 2 follow the order of the path lists: train, test, val)
## NOTE(review): the join key is each table's own "id" column -- confirm that
## answer ids line up with question ids in the USGS files
train, test, val = (
    merge_dataframes_on_column(questions_clean[split], answers_clean[split], "id")
    for split in range(3)
)

## 5. Generate embeddings

In [21]:
## Build one VocabEncoder per un-split JSON file: questions first, then answers
encoder_questions = VE.VocabEncoder(
    PATH_all_questions,
    questions=True,
)
encoder_answers = VE.VocabEncoder(
    PATH_all_answers,
    questions=False,
    range_numbers=False,
)

In [22]:
## Create vocabulary
## getVocab() is called on each VocabEncoder; vocabulary_questions is handed to
## seq2vec.factory further down, while vocabulary_answers is not used again in
## the cells visible here
vocabulary_questions = encoder_questions.getVocab()
vocabulary_answers = encoder_answers.getVocab()

In [23]:
## Create the seq2vec object
## Import the module under its own alias: the assignment below rebinds the name
## `seq2vec` to the model instance, so re-running this cell through the original
## alias would call `.factory` on the model instead of on the module
import VQA_model.models.seq2vec as seq2vec_module

seq2vec = seq2vec_module.factory(vocabulary_questions, {'arch': 'skipthoughts', 'dir_st': 'data/skip-thoughts', 'type': 'BayesianUniSkip', 'dropout': 0.25, 'fixed_emb': False})

## Freeze every parameter: the model is used for inference only in this notebook
for param in seq2vec.parameters():
    param.requires_grad = False

## Switch to evaluation mode, then move to the selected device
## (kept as the last expression so the notebook still displays the model summary)
seq2vec.eval()
seq2vec.to(DEVICE)



BayesianUniSkip(
  (embedding): Embedding(145, 620, padding_idx=0)
  (rnn): BayesianGRU(
    (gru_cell): BayesianGRUCell(
      (weight_ir): Linear(in_features=620, out_features=2400, bias=True)
      (weight_ii): Linear(in_features=620, out_features=2400, bias=True)
      (weight_in): Linear(in_features=620, out_features=2400, bias=True)
      (weight_hr): Linear(in_features=2400, out_features=2400, bias=False)
      (weight_hi): Linear(in_features=2400, out_features=2400, bias=False)
      (weight_hn): Linear(in_features=2400, out_features=2400, bias=False)
      (drop_ir): SequentialDropout(0.2500)
      (drop_ii): SequentialDropout(0.2500)
      (drop_in): SequentialDropout(0.2500)
      (drop_hr): SequentialDropout(0.2500)
      (drop_hi): SequentialDropout(0.2500)
      (drop_hn): SequentialDropout(0.2500)
    )
  )
)

In [24]:
## Create the BERT object --> to be included later in the model
## Tokenizer and model are loaded from the same 'bert-base-uncased' checkpoint;
## both are later passed to create_batch (as `tokenizer` and `model`).
## NOTE(review): `bert` is never moved to DEVICE in the cells visible here, so
## its forward passes presumably run on CPU -- confirm this is intended

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
## Smoke test: embed the first training question with BERT
testing = train.iloc[0].question
tokenizer_testing = tokenizer.encode_plus(
    testing,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",
)
output = bert(**tokenizer_testing)
last_hidden_states = output.last_hidden_state

## Keep the first token position, drop the batch axis, detach and move to CPU
cls_embeddings = last_hidden_states[:, 0, :].squeeze(0).detach().cpu()

In [None]:
type(cls_embeddings)

In [25]:
def create_batch(dataframe, batch_size, model, tokenizer, save_path):
    """
    Embed every question of `dataframe` with `model` and save the results to
    disk in chunks of `batch_size` rows.

    For each row a dict is stored with the keys:
      - 'question': last-layer hidden state at the first token position
        (the [CLS] token for a BERT tokenizer), as a detached CPU tensor
      - 'answer': the answer encoded by the module-level `encoder_answers`,
        as a long tensor
      - 'question_type': the row's `type` value
      - 'image_id': the row's `img_id` value
    Each chunk is written with torch.save to `<save_path>/batch_<i>.pt`, with
    `i` starting at 0.

    Parameters:
    dataframe (pd.DataFrame): Rows with `question`, `answer`, `type` and
        `img_id` columns.
    batch_size (int): Number of rows per saved file; must be positive.
    model: Callable as `model(**tokenized)`, returning an object with a
        `last_hidden_state` attribute.
    tokenizer: Object providing `encode_plus`.
    save_path (str): Directory receiving the batch files; created if missing.

    Returns:
    None

    Raises:
    ValueError: If `batch_size` is not positive (the previous implementation
        looped forever in that case).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    ## Make sure the output directory exists before the first torch.save
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    total_rows = len(dataframe)

    ## Inference only: skip autograd bookkeeping for every forward pass
    with torch.no_grad():
        for save_idx, start_idx in enumerate(range(0, total_rows, batch_size)):
            stop_idx = min(start_idx + batch_size, total_rows)

            ## Batch Data Collector
            batch_data = []

            for idx in tqdm(range(start_idx, stop_idx)):

                ## Retrieve row information
                row = dataframe.iloc[idx]
                question = row.question
                answer = row.answer
                question_type = row.type
                img_id = row.img_id

                ## Remove question sign from question (is done automatically for RNN)
                ## The old test `question[:-1] == "?"` compared everything except
                ## the last character, so a trailing "?" was never stripped
                if question.endswith("?"):
                    question = question[:-1]

                ## Retrieve answer encoding
                answer_encoded = encoder_answers.encode(answer)
                answer_tensor = torch.tensor(answer_encoded, dtype=torch.long).cpu()

                ## Retrieve question embedding
                tokenized = tokenizer.encode_plus(question, add_special_tokens=True, return_attention_mask=True, return_tensors="pt")
                output = model(**tokenized)
                last_hidden_states = output.last_hidden_state
                cls_embeddings = last_hidden_states[:, 0, :].squeeze(0).detach().cpu()

                ## Append data to batch
                batch_data.append({
                    'question': cls_embeddings,
                    'answer': answer_tensor,
                    'question_type': question_type,
                    'image_id': img_id,
                })

            ## Save batch
            batch_save_path = os.path.join(save_path, f"batch_{save_idx}.pt")
            torch.save(batch_data, batch_save_path)

In [27]:
## Embed the training split and write its batches to disk
save_directory = "data/text_representations_bert/train"
create_batch(
    dataframe=train,
    batch_size=BATCH_SIZE,
    model=bert,
    tokenizer=tokenizer,
    save_path=save_directory,
)

100%|██████████| 4000/4000 [02:47<00:00, 23.87it/s]
100%|██████████| 4000/4000 [02:36<00:00, 25.50it/s]
100%|██████████| 4000/4000 [02:36<00:00, 25.48it/s]
100%|██████████| 4000/4000 [02:36<00:00, 25.54it/s]
100%|██████████| 4000/4000 [02:35<00:00, 25.65it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.87it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.91it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.82it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.89it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.82it/s]
100%|██████████| 4000/4000 [02:35<00:00, 25.78it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.83it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.87it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.89it/s]
100%|██████████| 4000/4000 [02:35<00:00, 25.78it/s]
100%|██████████| 4000/4000 [02:36<00:00, 25.53it/s]
100%|██████████| 4000/4000 [02:35<00:00, 25.71it/s]
100%|██████████| 4000/4000 [02:35<00:00, 25.75it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.86it/s]
100%|███████

In [28]:
## Embed the test split and write its batches to disk
save_directory = "data/text_representations_bert/test"
create_batch(
    dataframe=test,
    batch_size=BATCH_SIZE,
    model=bert,
    tokenizer=tokenizer,
    save_path=save_directory,
)

100%|██████████| 4000/4000 [02:33<00:00, 26.01it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.85it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.96it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.84it/s]
100%|██████████| 4000/4000 [02:33<00:00, 25.99it/s]
100%|██████████| 4000/4000 [02:33<00:00, 26.03it/s]
100%|██████████| 4000/4000 [02:33<00:00, 25.98it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.92it/s]
100%|██████████| 4000/4000 [02:33<00:00, 26.01it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.93it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.93it/s]
100%|██████████| 4000/4000 [02:33<00:00, 26.05it/s]
100%|██████████| 4000/4000 [02:33<00:00, 25.98it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.96it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.97it/s]
100%|██████████| 4000/4000 [02:43<00:00, 24.43it/s]
100%|██████████| 4000/4000 [02:33<00:00, 25.98it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.95it/s]
100%|██████████| 4000/4000 [02:34<00:00, 25.95it/s]
100%|███████