# Notebook Objective:
This is the final notebook for DocVQA dataset preparation. It produces every preprocessed variant of the dataset for the train, validation and test splits.
Contents:
- Basic processing, creating full image path, create dataframe from json file [On training, validation and test]
- Read each image with Pillow --> store the image in the dataset --> remove the full image path [On training, validation and test]
- Converting Pandas Dataframe to Huggingface dataset [On training, validation and test]
- Adding ground truth column which contains ground truth parsing information i.e. gt_parses [On training and Validation]
- Sample data checking whether everything processed correctly or not
- Huggingface dataset dictionary creation to store all the training, validation and test dataset in a single container
- Save the whole processed dataset in disk
- We take the first 2000 train, 400 validation and 200 test examples
- Lastly, the dataset is pushed both with and without the gt_parses content, so that each model can use the variant it requires


## Importing Necessary Dependencies

In [None]:
!pip install -q datasets
!pip install huggingface_hub

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/519.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/519.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os
import json
from google.colab import data_table
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
import json
import re
import random
from PIL import Image

In [None]:
## function to check, list of columns and info from dataframe
def colInfo(dfrm):
  """Print the column index and the `DataFrame.info()` report of `dfrm`.

  Args:
    dfrm: pandas DataFrame to describe.

  Returns:
    None. Everything is written to stdout.
  """
  print(f'List of Columns: {dfrm.columns}')
  print('************************************************')
  # DataFrame.info() writes its report itself and returns None, so it must not
  # be interpolated into an f-string: doing so printed the report first and
  # then a misleading "Details info the dataframe: None" line.
  print('Details info the dataframe:')
  dfrm.info()
  return None

## S-1: Basic Processing also creating full image path for Train, Valid and Test

In [None]:
## S-1: Custom function to create image path

def process_data(k, main_json_path, base_dir_path, ocr_dir_name=None):
  """Build a DataFrame of the first `k` annotated examples with OCR words attached.

  Args:
    k: number of leading rows of the annotation file to keep.
    main_json_path: path of the annotation JSON; its 'data' key holds the
      list of example records.
    base_dir_path: directory that the records' relative 'image' paths and the
      OCR directory are resolved against.
    ocr_dir_name: name of the OCR sub-directory inside `base_dir_path`. When
      None (default) the second entry of the sorted directory listing is used,
      which is the original behaviour and depends on the directory's contents.

  Returns:
    DataFrame without the 'questionId', 'image', 'ucsf_document_id' and
    'ucsf_document_page_no' columns, and with three added columns:
    'full_path_image', 'bounding_boxes' and 'word_list'.

  Raises:
    IndexError: if `ocr_dir_name` is None and `base_dir_path` has fewer than
      two entries.
    FileNotFoundError: if the annotation file or an OCR file is missing.
  """
  with open(main_json_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

  # Copy the slice so the column assignments below act on an independent frame.
  df = pd.DataFrame(data['data']).iloc[:k].copy()

  # Full image path = base directory + the record's relative image path.
  df['full_path_image'] = [os.path.join(base_dir_path, p) for p in df['image']]

  def read_ocr_words(ocr_path):
    """Return (bounding boxes, texts) of every word in one OCR JSON file."""
    with open(ocr_path, 'r', encoding='utf-8') as f:
      ocr_json = json.load(f)

    boxes = []
    texts = []
    # Only the first recognition result is read, as in the original code.
    for line in ocr_json['recognitionResults'][0]['lines']:
      for word in line['words']:
        boxes.append(word['boundingBox'])
        texts.append(word['text'])
    return boxes, texts

  if ocr_dir_name is None:
    # NOTE(review): positional lookup; breaks if files are added to the
    # directory. Pass `ocr_dir_name` explicitly to avoid relying on it.
    ocr_dir_name = sorted(os.listdir(base_dir_path))[1]
  ocr_base_dir = os.path.join(base_dir_path, ocr_dir_name)

  # Collect per-row results in plain lists; unlike zip(*df.apply(...)) this
  # also works when the selection is empty (e.g. k == 0).
  bounding_boxes = []
  word_lists = []
  for doc_id, page_no in zip(df['ucsf_document_id'], df['ucsf_document_page_no']):
    # OCR file name is "<document id>_<page number>.json".
    ocr_path = os.path.join(ocr_base_dir, doc_id + '_' + page_no + '.json')
    boxes, texts = read_ocr_words(ocr_path)
    bounding_boxes.append(boxes)
    word_lists.append(texts)

  # dtype=object keeps each cell as the Python list that was collected.
  df['bounding_boxes'] = pd.Series(bounding_boxes, index=df.index, dtype=object)
  df['word_list'] = pd.Series(word_lists, index=df.index, dtype=object)

  df = df.drop(columns=['questionId', 'image', 'ucsf_document_id', 'ucsf_document_page_no'])

  return df


In [None]:
## Build the processed DataFrames for the train and validation splits.
## The test split is prepared the same way but is left commented out.

# Training: annotation JSON and split base directory; keep the first 2000 rows.
train_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/train/train_v1.0.json'
base_train_path = '/content/drive/MyDrive/Datasets/docvqa_old/train'

df_train_sub_ocr = process_data(2000,train_path_main, base_train_path)


# Validation: same layout as training; keep the first 400 rows.
valid_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/val/val_v1.0.json'
base_valid_path = '/content/drive/MyDrive/Datasets/docvqa_old/val'

df_valid_sub_ocr = process_data(400, valid_path_main, base_valid_path)


# Test: enable as required (would keep the first 200 rows).
# test_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/test/test_v1.0.json'
# base_test_path = '/content/drive/MyDrive/Datasets/docvqa_old/test'

# df_test_sub_ocr = process_data(200, test_path_main, base_test_path)


In [None]:
# Print the columns and the info() summary of the processed training frame.
colInfo(df_train_sub_ocr)

List of Columns: Index(['question', 'docId', 'answers', 'data_split', 'full_path_image',
       'bounding_boxes', 'word_list'],
      dtype='object')
************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         2000 non-null   object
 1   docId            2000 non-null   int64 
 2   answers          2000 non-null   object
 3   data_split       2000 non-null   object
 4   full_path_image  2000 non-null   object
 5   bounding_boxes   2000 non-null   object
 6   word_list        2000 non-null   object
dtypes: int64(1), object(6)
memory usage: 109.5+ KB
Details info the dataframe: None


In [None]:
# Print the columns and the info() summary of the processed validation frame.
colInfo(df_valid_sub_ocr)

List of Columns: Index(['question', 'docId', 'answers', 'data_split', 'full_path_image',
       'bounding_boxes', 'word_list'],
      dtype='object')
************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         400 non-null    object
 1   docId            400 non-null    int64 
 2   answers          400 non-null    object
 3   data_split       400 non-null    object
 4   full_path_image  400 non-null    object
 5   bounding_boxes   400 non-null    object
 6   word_list        400 non-null    object
dtypes: int64(1), object(6)
memory usage: 22.0+ KB
Details info the dataframe: None


In [None]:
# colInfo(df_test_sub_ocr)

## S-2: Convert Pandas Dataframe to Huggingface Dataset

In [None]:
## Convert each processed pandas DataFrame into a Hugging Face `Dataset`.

# 1. Training split
hf_train_sub_ocr = Dataset.from_pandas(df_train_sub_ocr)

# 2. Validation split
hf_valid_sub_ocr = Dataset.from_pandas(df_valid_sub_ocr)

# 3. Test split (disabled, like the test DataFrame it would be built from)
# hf_test_sub_ocr = Dataset.from_pandas(df_test_sub_ocr)

## S-3: Removing image path and Adding raw image on Train, Valid and Test

In [None]:
## S-3: Custom function to remove image path and read in Pillow & store all the images also delete the path

# Define a function to open and save images
def process_image(hfsample):
  """Swap the image path of one example for the opened Pillow image.

  Reads `hfsample['full_path_image']`, opens that file with `Image.open`,
  stores the result under `hfsample['image_raw']`, deletes the
  `full_path_image` key and returns the same (mutated) `hfsample` object.

  NOTE(review): the example is edited in place rather than copied. The
  datasets produced with this function no longer list `full_path_image`, so
  `Dataset.map` presumably relies on the in-place `del` to drop the column --
  confirm before changing this to return a fresh dict.
  NOTE(review): the opened image is never explicitly closed here -- confirm
  whether the open file handle matters for larger subsets.
  """
  image_path = hfsample['full_path_image']
  image = Image.open(image_path)
  hfsample['image_raw'] = image
  # Remove the path so only the image itself is carried forward.
  del hfsample['full_path_image']
  return hfsample

In [None]:
# Run process_image over every example (one example per call, not batched)
# to add 'image_raw' and drop 'full_path_image'.

# 1. Training split
hf_train_sub_ocr_imgraw = hf_train_sub_ocr.map(process_image)

# 2. Validation split
hf_valid_sub_ocr_imgraw = hf_valid_sub_ocr.map(process_image)

# 3. Test split (disabled)
# hf_test_sub_ocr_imgraw = hf_test_sub_ocr.map(process_image)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
# Display the training dataset summary: 'image_raw' present, 'full_path_image' gone.
hf_train_sub_ocr_imgraw

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw'],
    num_rows: 2000
})

In [None]:
# Display the validation dataset summary: 'image_raw' present, 'full_path_image' gone.
hf_valid_sub_ocr_imgraw

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw'],
    num_rows: 400
})

In [None]:
# for index,i in enumerate(hf_train_sub_ocr_imgraw['question']):
#   print(index, i)
  # hf_train_json = json.loads(i)
  # print(index,hf_train_json)

## S-4: Add 'ground_truth' on Training and Validation


In [None]:
## S-4: Custom function to add 'ground_truth' column in the training and validation dataset

# Creating custom function for DOCVQA task
def add_ground_truth(hf_examples):
  """Add a 'ground_truth' JSON string to every example of a batch.

  For each example the string has the form
  `{"gt_parses": [{"question" : "<q>", "answer" : "<a>"}, ...]}` with one
  entry per answer of that example (an empty answer list gives
  `{"gt_parses": []}`).

  Text normalisation (unchanged from the original):
    * backslashes are removed from the question and from every answer;
    * runs of spaces in the question are collapsed to one space.

  Args:
    hf_examples: batch mapping with a 'question' list of strings and an
      'answers' list of lists of strings, as passed by
      `Dataset.map(..., batched=True)`.

  Returns:
    The same `hf_examples` object with the 'ground_truth' list added.
  """
  # The image column is not needed to build the strings, so it is not read.
  # NOTE(review): presumably this also spares Dataset.map from decoding the
  # images of every batch -- confirm.
  questions = hf_examples['question']
  answers = hf_examples['answers']

  ground_truths = []
  for question, answer in zip(questions, answers):
    # Backslashes came from a corrupt example (index 91 of the training set).
    question = re.sub(' +', ' ', question.replace("\\", ""))
    # json.dumps escapes quotes AND control characters (newline, tab, ...);
    # the previous manual '"' -> '\"' replacement left control characters raw,
    # producing strings that json.loads rejects. ensure_ascii=False keeps
    # non-ASCII text verbatim, exactly as the manual construction did.
    question_json = json.dumps(question, ensure_ascii=False)
    pairs = [
      '{"question" : ' + question_json
      + ', "answer" : ' + json.dumps(answ.replace("\\", ""), ensure_ascii=False) + '}'
      for answ in answer
    ]
    ground_truths.append('{"gt_parses": [' + ', '.join(pairs) + ']}')

  hf_examples['ground_truth'] = ground_truths

  return hf_examples

In [None]:
## Add the 'ground_truth' string column to the train and validation datasets
## (processed in batches).

# Training split
hf_train_sub_ocr_imgraw_gt = hf_train_sub_ocr_imgraw.map(add_ground_truth, batched=True)

# Validation split
hf_valid_sub_ocr_imgraw_gt = hf_valid_sub_ocr_imgraw.map(add_ground_truth, batched=True)

# Note: the test data has no answers column, so no ground truth is built for it.

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

### Train data sample check

In [None]:
# Display the training dataset summary; it should now list 'ground_truth'.
hf_train_sub_ocr_imgraw_gt

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
    num_rows: 2000
})

In [None]:
# Sanity check: every training 'ground_truth' string must parse as JSON.
# json.loads raises on the first malformed string, so a silent run means all passed.
for index,i in enumerate(hf_train_sub_ocr_imgraw_gt['ground_truth']):
  # print(index, i)
  hf_train_json = json.loads(i)
  # print(index,hf_train_json)

### Valid data sample check

In [None]:
# Display the validation dataset summary; it should now list 'ground_truth'.
hf_valid_sub_ocr_imgraw_gt

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
    num_rows: 400
})

In [None]:
# Sanity check: every validation 'ground_truth' string must parse as JSON.
# json.loads raises on the first malformed string, so a silent run means all passed.
for index,i in enumerate(hf_valid_sub_ocr_imgraw_gt['ground_truth']):
  # print(index, i)
  hf_valid_json = json.loads(i)
  # print(index,hf_valid_json)

### Test data sample check

In [None]:
# hf_test_sub_ocr_imgraw

## S-5.1: Huggingface dataset to csv transform and download

In [None]:
## Download train csv format
# hf_train_sub_ocr_imgraw_gt.to_csv("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_train_2000.csv")

In [None]:
## Download valid csv format
# hf_valid_sub_ocr_imgraw_gt.to_csv("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_valid_400.csv")

In [None]:
## Download test csv format
# hf_test_sub_ocr_imgraw.to_csv("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_test_200.csv")

## S-5.2: Huggingface dataset to json transform and download

In [None]:
## Download train json format
# hf_train_sub_ocr_imgraw_gt.to_json("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_train_2000.json")

## Download valid json format
# hf_valid_sub_ocr_imgraw_gt.to_json("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_valid_400.json")

## Download test json format
# hf_test_sub_ocr_imgraw.to_json("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_sub_2600_wo_fuzz/docvqa_sub_test_200.json")

#### Note: Limitation when using save_to_disk versus push_to_hub
- A DatasetDict can be saved to and loaded from disk even when its training, validation and test datasets have different features (columns).
- However, a DatasetDict whose datasets have different features (columns) cannot be pushed to the Hub.

## S-6: Upload and Reload sub Dataset in HuggingFace Hub

In [None]:
# Variant WITH the 'ground_truth' column.

## To push a DatasetDict to the Hub, all of its datasets must share the same features.

# Train and validation together
processed_sub_train_valid_2400_gtparse = DatasetDict(
    {"train": hf_train_sub_ocr_imgraw_gt,
     "valid": hf_valid_sub_ocr_imgraw_gt,
})

# Variant WITHOUT the 'ground_truth' column.
## To push a DatasetDict to the Hub, all of its datasets must share the same features.
# Train and validation together
processed_sub_train_valid_2400 = DatasetDict(
    {"train": hf_train_sub_ocr_imgraw,
     "valid": hf_valid_sub_ocr_imgraw,
})

# Test data gets its own DatasetDict (disabled); no ground-truth variant exists for it.
# processed_sub_test_200 = DatasetDict(
#     {"test": hf_test_sub_ocr_imgraw
# })

In [None]:
# Log in to the Hugging Face Hub; the CLI prompts interactively for an access token.
!huggingface-cli login      ## indra-inc


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
# Upload the train/valid DatasetDict that includes the 'ground_truth' column.
processed_sub_train_valid_2400_gtparse.push_to_hub("indra-inc/docvqa_en_train_valid_2400_gtparse")

# Upload the train/valid DatasetDict without the 'ground_truth' column.
processed_sub_train_valid_2400.push_to_hub("indra-inc/docvqa_en_train_valid_2400")

# Test data upload (disabled).
# processed_sub_test_200.push_to_hub("indra-inc/docvqa_en_test_200")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/864 [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
## Reload the train and valid dataset dict
# processed_trn_vld_loaded_sub_2400_gtparse = load_dataset('indra-inc/docvqa_en_train_valid_2400_gtparse')
# processed_trn_vld_loaded_sub_2400 = load_dataset('indra-inc/docvqa_en_train_valid_2400')

## Reload the test dataset dict
# processed_tst_loaded_sub_200 = load_dataset('indra-inc/docvqa_en_test_200')

In [None]:
## Checking whether reloading done successfully or not
# processed_trn_vld_loaded_sub_2400_gtparse

In [None]:
# processed_trn_vld_loaded_sub_2400

In [None]:
# processed_tst_loaded_sub_200

## End Note:
All of the steps above ran successfully, and the subset datasets were uploaded to the Hugging Face Hub.