In [1]:
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    """
    Map-style dataset over a dataframe of question/answer records.

    The wrapped object must support ``len()`` and positional row access via
    ``.iloc``; each row must expose the keys "question", "answer", "type"
    and "img_id".
    """

    def __init__(self, dataframe):
        ## Keep a reference to the table; rows are only read on access
        self.dataframe = dataframe

    def __len__(self):
        ## One item per row
        return len(self.dataframe)

    def __getitem__(self, idx):
        ## Positional lookup, then unpack the four fields in a fixed order:
        ## (question, answer, question type, image id)
        row = self.dataframe.iloc[idx]
        return row["question"], row["answer"], row["type"], row["img_id"]

# Textual Processing

## 1. Import libraries

In [3]:
import os
import sys

## Put the parent of the current working directory on the import path
## (added once only, so re-running this cell does not duplicate the entry)
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
## Import DS libraries
import pandas as pd
import numpy as np

## Import Torch libraries
import torch

## Import other libraries
from tqdm import tqdm
import os

## Import HuggingFace libraries
from transformers import AutoModel, AutoTokenizer

## Import custom functions
from utils.text_preprocessing_utils import *

## Import VocabEncoder
from utils.VocabEncoder import *

## 2. Set global variables

In [5]:
## Set device: prefer CUDA, then Apple-Silicon MPS, otherwise fall back to CPU.
## (An unconditional "mps" fallback makes `model.to(DEVICE)` fail on machines
## that have neither CUDA nor MPS.)
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

In [6]:
## Set save path for embeddings: one root folder plus one sub-folder per split
SAVE_PATH = "../data/text_representations_bert_att/"
SAVE_PATH_TRAIN, SAVE_PATH_TEST, SAVE_PATH_VAL = (
    f"{SAVE_PATH}{split}/" for split in ("train", "test", "val")
)

## Check if path exists, if not create it (root first, then train, test, val)
for _save_dir in (SAVE_PATH, SAVE_PATH_TRAIN, SAVE_PATH_TEST, SAVE_PATH_VAL):
    create_path(_save_dir)

In [7]:
## Set batch size
## NOTE(review): not referenced by any cell visible in this notebook —
## create_representations writes one file per row. Confirm it is still needed.
SAVE_BATCH_SIZE = 1

## 3. Retrieve all data

In [13]:
## Main data path (folder holding the USGS question/answer JSON files)
## NOTE(review): this points two levels up ("../../data") while SAVE_PATH
## points one level up ("../data") — confirm both locations are intended.
DATA_PATH = "../../data/text/"

In [14]:
## Set question paths (train, test, val)
PATH_questions_split_train, PATH_questions_split_test, PATH_questions_split_val = (
    f"{DATA_PATH}USGS_split_{split}_questions.json" for split in ("train", "test", "val")
)

## Order matters: later cells index this list as [train, test, val]
PATH_questions = [
    PATH_questions_split_train,
    PATH_questions_split_test,
    PATH_questions_split_val,
]

In [15]:
## Set answer paths (train, test, val)
PATH_answers_split_train, PATH_answers_split_test, PATH_answers_split_val = (
    f"{DATA_PATH}USGS_split_{split}_answers.json" for split in ("train", "test", "val")
)

## Order matters: later cells index this list as [train, test, val]
PATH_answers = [
    PATH_answers_split_train,
    PATH_answers_split_test,
    PATH_answers_split_val,
]

In [16]:
## Set the paths of the un-split files (all questions, all answers)
PATH_all_questions = f"{DATA_PATH}USGSquestions.json"
PATH_all_answers = f"{DATA_PATH}USGSanswers.json"

PATH_all = [
    PATH_all_questions,
    PATH_all_answers,
]

In [17]:
## Retrieve questions and answers: one entry per split, in the order (train, test, val)
questions = [
    json_to_dataframe(split_path, "questions")
    for split_path in PATH_questions
]
answers = [
    json_to_dataframe(split_path, "answers")
    for split_path in PATH_answers
]

## 4. Data Manipulation

In [18]:
## Remove NaNs (per split) from the "question" / "answer" columns.
## Despite the `_nan` suffix, these lists hold the output of remove_nan_rows.
questions_nan = [
    remove_nan_rows(split_df, "question")
    for split_df in questions
]
answers_nan = [
    remove_nan_rows(split_df, "answer")
    for split_df in answers
]

In [19]:
## Remove unnecessary columns (per split)
questions_clean = [
    remove_columns(split_df, ["active", "date_added", "people_id", "answers_ids"])
    for split_df in questions_nan
]
answers_clean = [
    remove_columns(split_df, ["active", "date_added", "people_id", "question_id"])
    for split_df in answers_nan
]

In [20]:
## Merge questions & answers on "id" to generate the train, test and val sets
## (both lists are ordered train, test, val, so they are paired position by position)
train, test, val = (
    merge_dataframes_on_column(questions_split, answers_split, "id")
    for questions_split, answers_split in zip(questions_clean, answers_clean)
)

## 5. Generate & save embeddings

In [41]:
## Create the VocabEncoder object for the answers (built from the un-split answers file)
encoder_answers = VocabEncoder(
    PATH_all_answers,
    questions=False,
    range_numbers=False,
)

In [22]:
## Instantiate the tokenizer & BERT model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

## Discard the pooling layer: only `last_hidden_state` is read further down
model.pooler = None

## Inference mode, on the selected device
model = model.eval().to(DEVICE)

In [None]:
## Check all lengths in tokenized form (computed for train, test, val in that order)
lengths_train, lengths_test, lengths_val = (
    length_checker(split_df, tokenizer=tokenizer) for split_df in (train, test, val)
)

## Report the longest tokenized question of each split
for _label, _lengths in (("Train", lengths_train), ("Test", lengths_test), ("Val", lengths_val)):
    print(f"Max Token Length - {_label}: ", max(_lengths))

625340it [00:47, 13163.96it/s]
105647it [00:07, 13945.72it/s]
102843it [00:07, 14021.35it/s]

Max Token Length - Train:  35
Max Token Length - Test:  28
Max Token Length - Val:  29





In [44]:
def analyze_question_type_distribution(dataframe):
    """
    Summarise how often each question type occurs in ``dataframe``.

    The result is a dataframe indexed by the distinct values of the "type"
    column, with a "count" column (rows of that type) and a "percentage"
    column (count divided by the total number of rows, times 100, rounded to
    two decimals). Rows are ordered by descending percentage.
    """
    n_rows = dataframe.shape[0]
    counts_per_type = dataframe["type"].value_counts()

    ## Share of each type relative to all rows, in percent
    shares = []
    for n_of_type in counts_per_type:
        shares.append(round((n_of_type / n_rows) * 100, 2))

    summary = pd.DataFrame({"count": counts_per_type, "percentage": shares})
    return summary.sort_values(by="percentage", ascending=False)

In [45]:
## Question-type distribution of the test split
## NOTE(review): the stored output of this cell sums to 222,684 rows, but the
## test split processed elsewhere in this notebook has 105,647 rows — the
## output looks stale; re-run the cell before relying on these numbers.
analyze_question_type_distribution(test)

Unnamed: 0,count,percentage
comp,72923,32.75
presence,58545,26.29
count,58149,26.11
area,33067,14.85


In [19]:
def create_representations(dataframe, model, tokenizer, save_path, device,
                           answer_encoder=None, pad_to_multiple_of=35):
    """
    Encode every question with ``model`` and save one ``.pt`` file per row.

    For each row, a trailing "?" is stripped from the question, the question is
    tokenized and passed through ``model``, and a dict with the keys
    "question" (last hidden state, batch dimension squeezed, on CPU),
    "answer" (encoded answer as a ``torch.long`` tensor), "question_type" and
    "image_id" is written with ``torch.save`` to ``<save_path>/<row position>.pt``.

    Parameters
    ----------
    dataframe
        Table read positionally via ``.iloc``; each row must expose the keys
        "question", "answer", "type" and "img_id".
    model
        Called as ``model(**tokenizer_output)``; the result must provide
        ``last_hidden_state``.
    tokenizer
        Callable tokenizer returning an object that supports ``.to(device)``
        and ``**`` unpacking when called with ``return_tensors="pt"``.
    save_path
        Folder the ``.pt`` files are written to (created if missing).
    device
        Device the tokenized inputs are moved to; the model is expected to
        already live on it.
    answer_encoder
        Object with an ``encode(answer)`` method. Defaults to the
        notebook-level ``encoder_answers`` so existing calls keep working.
    pad_to_multiple_of
        Forwarded to the tokenizer. Each question is tokenized on its own with
        ``padding=True``, so its length is rounded up to the next multiple of
        this value; a question longer than the value therefore yields a longer
        tensor than the others — TODO confirm downstream code tolerates that.
    """
    ## Fall back to the global encoder used by the original notebook cells
    if answer_encoder is None:
        answer_encoder = encoder_answers

    ## Make sure the target folder exists before the (long) loop starts
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    ## Iterate over items
    for idx in tqdm(range(len(dataframe))):

        ## Retrieve information (single positional lookup per row)
        row = dataframe.iloc[idx]
        question = row["question"]
        question_type = row["type"]
        img_id = row["img_id"]

        ## Encode the answer as a long tensor
        answer = torch.tensor(answer_encoder.encode(row["answer"]), dtype=torch.long)

        ## Remove question sign from question (safe for empty strings as well)
        if question.endswith("?"):
            question = question[:-1]

        ## Tokenize the question and move the tensors to the target device
        encoded = tokenizer(
            question,
            pad_to_multiple_of=pad_to_multiple_of,
            add_special_tokens=True,
            return_attention_mask=True,
            padding=True,
            return_tensors="pt",
        ).to(device)

        ## Retrieve question embedding (no gradients needed for feature extraction)
        with torch.no_grad():
            output = model(**encoded)
        hidden_states = output.last_hidden_state.squeeze(0).cpu()

        ## Create data dictionary
        data = {
            'question': hidden_states,
            'answer': answer,
            'question_type': question_type,
            'image_id': img_id
        }

        ## Save item: the file name is the row position
        torch.save(data, os.path.join(save_path, f"{idx}.pt"))

In [21]:
## For Train: writes one "<row position>.pt" file per row of `train` into SAVE_PATH_TRAIN
create_representations(train, model, tokenizer, SAVE_PATH_TRAIN, DEVICE)

  0%|          | 0/625340 [00:00<?, ?it/s]

100%|██████████| 625340/625340 [1:56:30<00:00, 89.45it/s]  


In [21]:
## For Test: writes one "<row position>.pt" file per row of `test` into SAVE_PATH_TEST
create_representations(test, model, tokenizer, SAVE_PATH_TEST, DEVICE)

100%|██████████| 105647/105647 [20:17<00:00, 86.79it/s] 


In [22]:
## For Val: writes one "<row position>.pt" file per row of `val` into SAVE_PATH_VAL
create_representations(val, model, tokenizer, SAVE_PATH_VAL, DEVICE)

100%|██████████| 102843/102843 [20:32<00:00, 83.42it/s]


# DEBUGGING

In [None]:
data = torch.load("../preprocessed_data/text_representations_bert/train/batch_1.pt")

In [None]:
data

In [None]:
data[0]["question"]