# Textual Processing

## 1. Import

In [1]:
import pandas as pd 
import numpy as np
import os
from skimage import io
from tqdm import tqdm
import json
import VQA_model.VocabEncoder as VE
import VQA_model.models.seq2vec as seq2vec
from torch.autograd import Variable
import torch
import torch.nn as nn
from tqdm import tqdm

In [2]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## 2. Functions

In [3]:
def json_to_dataframe(json_file_path, delimiter):
    """
    Load the records stored under one top-level key of a JSON file into a DataFrame.

    Args:
    json_file_path : str : the path to the JSON file.
    delimiter : str : the top-level key whose value is handed to ``pd.DataFrame``
        (this notebook uses "questions" and "answers").

    Returns:
    df : DataFrame : a pandas DataFrame created from the value under ``delimiter``, or
    None : if the file is missing, cannot be decoded as JSON, does not contain
        ``delimiter``, or any other error occurs. A message is printed in each case.
    """

    try:
        # Parse the whole file first; the context manager closes it right away
        with open(json_file_path, 'r') as json_file:
            json_content = json.load(json_file)

        # Look the key up on its own so a missing key gets a dedicated message
        # instead of the generic "unexpected error" text
        try:
            json_data = json_content[delimiter]
        except KeyError:
            print(f"Key '{delimiter}' not found in JSON file: {json_file_path}")
            return None

        # Note: depending on the JSON structure, a different construction may be needed
        return pd.DataFrame(json_data)

    except FileNotFoundError:
        print(f"File not found: {json_file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error occurred while decoding JSON from file: {json_file_path}")
        return None
    except Exception as e:
        # Deliberate catch-all, kept so that this function never raises (as before)
        print(f"An unexpected error occurred: {str(e)}")
        return None

In [4]:
def remove_nan_rows(df, delimiter):
    """
    Drop every row whose value in the ``delimiter`` column is NaN.

    Parameters:
    df (DataFrame): The input DataFrame.
    delimiter (str): Name of the column that is checked for NaN values.

    Returns:
    DataFrame: A new DataFrame without the rows that hold NaN in that column.

    Raises:
    ValueError: If ``df`` has no column named ``delimiter``.
    """
    # Guard clause: fail early when the requested column is absent
    if delimiter not in df.columns:
        raise ValueError(f"No {delimiter} column found in the DataFrame")

    # Only the named column is inspected; NaN in other columns is kept
    return df.dropna(subset=[delimiter])

In [5]:
def remove_columns(dataframe, columns_to_remove):
    """
    Return a copy of a pandas DataFrame without the given columns.

    Parameters:
    dataframe (pd.DataFrame): The original DataFrame (left unmodified).
    columns_to_remove (list): Names of the columns to drop.

    Returns:
    pd.DataFrame: A new DataFrame that lacks the listed columns.

    Raises:
    ValueError: For the first listed column that is not present in ``dataframe``.
    """
    existing_columns = dataframe.columns

    # Find the first requested column that is absent; a private sentinel marks
    # "nothing missing" so that any real column label can be reported
    nothing_missing = object()
    first_missing = next(
        (name for name in columns_to_remove if name not in existing_columns),
        nothing_missing,
    )
    if first_missing is not nothing_missing:
        raise ValueError(f"Column '{first_missing}' does not exist in the DataFrame.")

    return dataframe.drop(columns=columns_to_remove)

In [6]:
def merge_dataframes_on_column(df1, df2, common_column, how='inner'):
    """
    Join two pandas DataFrames on a column that both of them contain.

    Parameters:
    df1 (pd.DataFrame): The left DataFrame.
    df2 (pd.DataFrame): The right DataFrame.
    common_column (str): Name of the key column shared by both DataFrames.
    how (str): Join type handed to ``pd.merge`` ('left', 'right', 'outer', 'inner'); default 'inner'.

    Returns:
    pd.DataFrame: The merged DataFrame.

    Raises:
    ValueError: If ``common_column`` is missing from either DataFrame.
    """
    # Both inputs must carry the key column before a merge is attempted
    for frame in (df1, df2):
        if common_column not in frame.columns:
            raise ValueError(f"The common column '{common_column}' must exist in both DataFrames.")

    return pd.merge(df1, df2, on=common_column, how=how)

## 3. Get the data

### 3.1 Paths

In [7]:
## Questions
# One question file per split (train / test / val); these paths are later
# passed to json_to_dataframe together with the "questions" key.

PATH_questions_split_train = 'data/text/USGS_split_train_questions.json'
PATH_questions_split_test = 'data/text/USGS_split_test_questions.json'
PATH_questions_split_val = 'data/text/USGS_split_val_questions.json'

In [8]:
## Answers
# One answer file per split (train / test / val); these paths are later
# passed to json_to_dataframe together with the "answers" key.

PATH_answers_split_train = 'data/text/USGS_split_train_answers.json'
PATH_answers_split_test = 'data/text/USGS_split_test_answers.json'
PATH_answers_split_val = 'data/text/USGS_split_val_answers.json'

In [9]:
## All answers / questions
# These two files are handed to VE.VocabEncoder further down to build the
# question and answer vocabularies.

PATH_all_questions = 'data/text/USGSquestions.json'
PATH_all_answers = 'data/text/USGSanswers.json'

### 3.2 Data Loading

In [10]:
## Questions

# Read the "questions" entry of every split file, in train / test / val order
questions_train, questions_test, questions_val = (
    json_to_dataframe(split_path, "questions")
    for split_path in (
        PATH_questions_split_train,
        PATH_questions_split_test,
        PATH_questions_split_val,
    )
)

In [11]:
## Answers

# Read the "answers" entry of every split file, in train / test / val order
answers_train, answers_test, answers_val = (
    json_to_dataframe(split_path, "answers")
    for split_path in (
        PATH_answers_split_train,
        PATH_answers_split_test,
        PATH_answers_split_val,
    )
)

## 4. Data Manipulation

### 4.1 Remove NaN

In [12]:
## Questions

# Keep only the rows that actually carry a question text
questions_train_nan, questions_test_nan, questions_val_nan = (
    remove_nan_rows(frame, "question")
    for frame in (questions_train, questions_test, questions_val)
)

In [13]:
## Answers

# Keep only the rows that actually carry an answer text
answers_train_nan, answers_test_nan, answers_val_nan = (
    remove_nan_rows(frame, "answer")
    for frame in (answers_train, answers_test, answers_val)
)

### 4.2 Remove unnecessary columns

In [14]:
## Questions

# Drop the same four metadata columns from every question split
questions_train_nan_clean, questions_test_nan_clean, questions_val_nan_clean = (
    remove_columns(frame, ["active", "date_added", "people_id", "answers_ids"])
    for frame in (questions_train_nan, questions_test_nan, questions_val_nan)
)

In [15]:
## Answers

# Drop the same four metadata columns (including "question_id") from every answer split
answers_train_nan_clean, answers_test_nan_clean, answers_val_nan_clean = (
    remove_columns(frame, ["active", "date_added", "people_id", "question_id"])
    for frame in (answers_train_nan, answers_test_nan, answers_val_nan)
)

### 4.3 Merging Questions & Answers

In [16]:
## Train

# Inner join of the cleaned question and answer tables on their "id" column.
# NOTE(review): both frames are joined on their own "id"; the answers'
# "question_id" column was dropped earlier — confirm the two ids coincide.
train = merge_dataframes_on_column(
    questions_train_nan_clean,
    answers_train_nan_clean,
    common_column="id",
)

In [17]:
## Test

# Inner join of the cleaned question and answer tables on their "id" column.
# NOTE(review): both frames are joined on their own "id"; the answers'
# "question_id" column was dropped earlier — confirm the two ids coincide.
test = merge_dataframes_on_column(
    questions_test_nan_clean,
    answers_test_nan_clean,
    common_column="id",
)

In [18]:
## Validation

# Inner join of the cleaned question and answer tables on their "id" column.
# NOTE(review): both frames are joined on their own "id"; the answers'
# "question_id" column was dropped earlier — confirm the two ids coincide.
val = merge_dataframes_on_column(
    questions_val_nan_clean,
    answers_val_nan_clean,
    common_column="id",
)

## 5. Generate embeddings

In [19]:
## Build one VocabEncoder per text source

# Encoder built from the all-questions file
encoder_questions = VE.VocabEncoder(PATH_all_questions, questions=True)

# Encoder built from the all-answers file
encoder_answers = VE.VocabEncoder(
    PATH_all_answers,
    questions=False,
    range_numbers=False,
)

In [20]:
## Fetch both vocabularies and display their sizes

vocabulary_questions = encoder_questions.getVocab()
vocabulary_answers = encoder_answers.getVocab()
tuple(len(vocabulary) for vocabulary in (vocabulary_questions, vocabulary_answers))

(144, 95)

In [21]:
## Create the seq2vec object

# Import the factory module under its own alias. The model created below is
# bound to the name `seq2vec`, which hides the module imported under that same
# name at the top of the notebook; without the alias, re-running this cell
# would look up `.factory` on the model instead of on the module.
import VQA_model.models.seq2vec as seq2vec_module

seq2vec = seq2vec_module.factory(
    vocabulary_questions,
    {
        'arch': 'skipthoughts',
        'dir_st': 'data/skip-thoughts',
        'type': 'BayesianUniSkip',
        'dropout': 0.25,
        'fixed_emb': False,
    },
)

# Freeze every parameter: the model is only used to compute representations
for param in seq2vec.parameters():
    param.requires_grad = False

seq2vec.to(device)
seq2vec.eval()



BayesianUniSkip(
  (embedding): Embedding(145, 620, padding_idx=0)
  (rnn): BayesianGRU(
    (gru_cell): BayesianGRUCell(
      (weight_ir): Linear(in_features=620, out_features=2400, bias=True)
      (weight_ii): Linear(in_features=620, out_features=2400, bias=True)
      (weight_in): Linear(in_features=620, out_features=2400, bias=True)
      (weight_hr): Linear(in_features=2400, out_features=2400, bias=False)
      (weight_hi): Linear(in_features=2400, out_features=2400, bias=False)
      (weight_hn): Linear(in_features=2400, out_features=2400, bias=False)
      (drop_ir): SequentialDropout(0.2500)
      (drop_ii): SequentialDropout(0.2500)
      (drop_in): SequentialDropout(0.2500)
      (drop_hr): SequentialDropout(0.2500)
      (drop_hi): SequentialDropout(0.2500)
      (drop_hn): SequentialDropout(0.2500)
    )
  )
)

In [23]:
import torch
import numpy as np
import os
from tqdm import tqdm

# Number of rows handed to create_batch per iteration of the main loop;
# process_batch also writes the accumulator to disk once it holds this many records.
BATCH_SIZE = 4000

def create_batch(dataframe, start_idx, batch_size):
    """
    Encode a contiguous run of rows into per-sample tensors.

    Rows are read positionally from ``start_idx`` up to (but excluding)
    ``start_idx + batch_size``, clipped to the length of ``dataframe``.

    Parameters:
    dataframe (pd.DataFrame): Table read through the ``id``, ``question``,
        ``answer``, ``type`` and ``img_id`` fields of each row.
    start_idx (int): Position of the first row to encode.
    batch_size (int): Maximum number of rows to encode.

    Returns:
    list: One tuple ``(id, question_tensor, answer_tensor, type, img_id)`` per row.
    """
    stop_idx = min(start_idx + batch_size, len(dataframe))

    def encode_row(position):
        # Turn a single row into its tuple of identifiers and long tensors
        sample = dataframe.iloc[position]
        question_ids = encoder_questions.encode(sample.question)
        answer_ids = encoder_answers.encode(sample.answer)

        # The extra leading dimension lets process_batch concatenate the
        # question tensors along dim 0
        question_tensor = torch.tensor(question_ids, dtype=torch.long).unsqueeze(0)
        answer_tensor = torch.tensor(answer_ids, dtype=torch.long)

        return (sample.id, question_tensor, answer_tensor, sample.type, sample.img_id)

    return [encode_row(position) for position in range(start_idx, stop_idx)]


def process_batch(batch_data, save_batch_accumulator, save_path, current_save_idx):
    """
    Compute question representations for one batch and queue the records for saving.

    Parameters:
    batch_data (list): Tuples ``(id, question_tensor, answer_tensor, type, img_id)``
        as produced by ``create_batch``.
    save_batch_accumulator (list): Shared list of pending records; it is extended
        in place and cleared each time its content is written to disk.
    save_path (str): Directory forwarded to ``save_accumulated_data``.
    current_save_idx (int): Index used for the next batch file.

    Returns:
    int: The save index, incremented once per file written by this call.
    """
    # Nothing to encode: torch.cat raises on an empty list, so return unchanged
    if not batch_data:
        return current_save_idx

    all_questions = torch.cat([item[1] for item in batch_data], dim=0).to(device)

    # Inference only: no autograd bookkeeping is needed for this forward pass
    with torch.no_grad():
        question_representations = seq2vec(all_questions)

    # The sample id (first tuple element) is not stored in the saved record
    for idx, (_data_id, _, answer_tensor, question_type, image_id) in enumerate(batch_data):
        question_representation = question_representations[idx].cpu().detach()
        answer_tensor = answer_tensor.cpu().detach()

        data = {
            'question': question_representation,
            'answer': answer_tensor,
            'question_type': question_type,
            'image_id': image_id
        }
        save_batch_accumulator.append(data)

        # Once enough records are pending, write them out and start a new file
        if len(save_batch_accumulator) >= BATCH_SIZE:
            save_accumulated_data(save_batch_accumulator, save_path, current_save_idx)
            save_batch_accumulator.clear()
            current_save_idx += 1

    return current_save_idx  # updated save index

def save_accumulated_data(data_accumulator, save_path, save_idx):
    """
    Serialize the accumulated records to ``<save_path>/batch_<save_idx>.pt``.

    Parameters:
    data_accumulator (list): Records to write with ``torch.save``.
    save_path (str): Target directory; it is created when it does not exist yet.
    save_idx (int): Number used in the file name.
    """
    # torch.save does not create missing directories, so make sure the target exists
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    batch_save_path = os.path.join(save_path, f"batch_{save_idx}.pt")
    torch.save(data_accumulator, batch_save_path)

# Output folder for the serialized batch files
save_directory = "data/text_representations/val"

# Records waiting to be written, and the number used for the next batch file
save_batch_accumulator = []
current_save_idx = 0

# Walk over the validation split in chunks of BATCH_SIZE rows
for start_idx in tqdm(range(0, len(val), BATCH_SIZE)):
    batch_data = create_batch(dataframe=val, start_idx=start_idx, batch_size=BATCH_SIZE)
    current_save_idx = process_batch(
        batch_data=batch_data,
        save_batch_accumulator=save_batch_accumulator,
        save_path=save_directory,
        current_save_idx=current_save_idx,
    )

# Whatever is still pending after the last chunk goes into one final file
if save_batch_accumulator:
    save_accumulated_data(
        data_accumulator=save_batch_accumulator,
        save_path=save_directory,
        save_idx=current_save_idx,
    )

  0%|          | 0/26 [00:00<?, ?it/s]

100%|██████████| 26/26 [00:52<00:00,  2.02s/it]


### Debugging

In [1]:
# Use the first training sample as a smoke-test input
test_question = train["question"].iloc[0]
test_answer = train["answer"].iloc[0]

test_question_encoded = encoder_questions.encode(test_question)
test_answer_encoded = encoder_answers.encode(test_answer)

test_question_encoded_numpy = np.array(test_question_encoded, dtype="int16")
# The tensor is created directly on `device`, so no further transfer is needed.
# NOTE(review): unsqueeze(1) appends a trailing dimension, whereas create_batch
# uses unsqueeze(0) — confirm which layout seq2vec expects.
question_tensor = torch.tensor(
    test_question_encoded_numpy, dtype=torch.long, device=device
).unsqueeze(1)

representation = seq2vec(question_tensor)

question_tensor.shape

# NOTE(review): linear_q is never moved to `device`, unlike question_tensor —
# confirm both live on the same device before running on a GPU.
linear_q = nn.Linear(in_features=2400, out_features=1200)
x_q = linear_q(representation)

NameError: name 'train' is not defined

In [None]:
## Load one stored visual representation
visual_representation = torch.load("/Users/kaanaydin/Documents/02 School/02 Master's degree/03 HS23/02 Deep Learning/deep-learning/data-representations/visual/0.pt")

## Visual head: 1x1 convolution that reduces the 2048 input channels
output_size = (512 / 32)**2
visual = nn.Conv2d(in_channels=2048, out_channels=int(2048/output_size), kernel_size=1)

## Flatten the convolution output into rows of 2048 values
visual_next = visual(visual_representation).view(-1, 2048)

## Project to the fusion dimension and squash with tanh
visual_linear = nn.Linear(in_features=2048, out_features=1200)
x_v = torch.tanh(visual_linear(visual_next))
x_v.shape

In [None]:
# Two-layer classifier applied to the fused 1200-dimensional features
linear_classif1 = nn.Linear(in_features=1200, out_features=256)
linear_classif2 = nn.Linear(in_features=256, out_features=95)

In [None]:
# Fuse both modalities element-wise, then apply the two classifier layers in turn
x = x_v * x_q
x = linear_classif2(linear_classif1(x))

In [7]:
import torch
import os
import torch.nn as nn

# Load the stored text record of sample 1200 and inspect its question tensor.
# NOTE(review): the save loop in this notebook writes "batch_<n>.pt" files that
# hold lists of records, while this cell reads "<n>.pt" as a single dict —
# presumably an older storage layout; confirm before reusing.
text = torch.load(os.path.join("data/text_representations/train/", str(1200) + ".pt"))
question = text["question"]
print(question.shape)

# Load the matching image representation
img = torch.load(os.path.join("data/image_representations/", str(1200) + ".pt"))
img.shape

## Visual head: 1x1 convolution that reduces the 2048 input channels
output_size = (512 / 32)**2
visual = nn.Conv2d(in_channels=2048, out_channels=int(2048/output_size), kernel_size=1)

## Flatten the convolution output into rows of 2048 values
visual_next = visual(img).view(-1, 2048)

## Project to the fusion dimension and squash with tanh
visual_linear = nn.Linear(in_features=2048, out_features=1200)
x_v = torch.tanh(visual_linear(visual_next))
x_v.shape

torch.Size([20, 2400])


torch.Size([1, 1200])

In [9]:
# Project the 2400-dimensional question representation to the fusion size
linear_q = nn.Linear(in_features=2400, out_features=1200)
x_q = torch.tanh(linear_q(question))
print(x_q.shape)

torch.Size([20, 1200])


In [10]:
# Element-wise fusion; per the shapes printed above, the single row of x_v
# ([1, 1200]) is broadcast across the rows of x_q ([20, 1200])
x = x_v * x_q
x.shape

torch.Size([20, 1200])