# Notebook Objective:
This is final notebook for DOCVQA dataset preparation. This notebook contains all type of preprocessed dataset for train, valid and test.
Contents:
- Basic processing, creating full image path, create dataframe from json file [On training, validation and test]
- Read image in Pillow --> save the form --> Removing full image path [On training, validation and test]
- Converting Pandas Dataframe to Huggingface dataset [On training, validation and test]
- Adding ground truth column which contains ground truth parsing information i.e. gt_parses [On training and Validation]
- Sample data checking whether everything processed correctly or not
- Huggingface dataset dictionary creation to store all the training, validation and test dataset in a single container
- Save the whole processed dataset in disk
- Lastly I have pushed the dataset both with and without gtparse content so that we can use different dataset as per different model's requirement

## Importing necessary Dependencies

In [1]:
!pip install -q datasets
!pip install huggingface_hub

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m450.6/519.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import os
import json
from google.colab import data_table
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
import json
import re
import random
from PIL import Image

In [4]:
## function to check, list of columns and info from dataframe
def colInfo(dfrm):

  print(f'List of Columns: {dfrm.columns}')
  print('************************************************')
  print(f'Details info the dataframe: {dfrm.info()}')
  return None

## S-1: Basic Processing also creating full image path for Train, Valid and Test

In [5]:
## S-1: Custom function to create image path

def process_data(main_json_path, base_dir_path):

  # Open the JSON file
  with open(main_json_path, 'r') as file:
      # Load the contents of the file into a dictionary
      data = json.load(file)

  # Convert 'data' list into a dataframe
  df = pd.DataFrame(data['data'])

  # Define a function to join the base path with the image path
  def join_paths(image_path):
      return os.path.join(base_dir_path, image_path)

  # Apply the function to create a new column with the full image path
  df['full_path_image'] = df['image'].apply(join_paths)

  def extract_ocr_info(col, ocr_base_dir):

    # Construct the ocr_path for the col
    ocr_path = os.path.join(ocr_base_dir, col['ucsf_document_id'] + '_' + col['ucsf_document_page_no'] + '.json')

    # Open the json file and load the data
    with open(ocr_path, 'r') as f:
      ocr_json = json.load(f)

    # Extract the bounding boxes and word lists from the json data
    bbox_row = []
    word_row = []
    for i in ocr_json['recognitionResults'][0]['lines']:
      for j in i['words']:
        bbox_row.append(j['boundingBox'])
        word_row.append(j['text'])

    # Return the bounding boxes and word lists
    return bbox_row, word_row

  ocr_base_dir = os.path.join(base_dir_path, sorted(os.listdir(base_dir_path))[1])

  # Apply the extract_ocr_info function to each row in the DataFrame
  df['bounding_boxes'], df['word_list'] = zip(*df.apply(lambda col: extract_ocr_info(col, ocr_base_dir), axis=1))

  df.drop(['questionId', 'image', 'ucsf_document_id', 'ucsf_document_page_no'], axis=1, inplace=True)

  return df


In [6]:
## Basic Processing of all three Train, Validation and Test dataset

# training usage:
train_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/train/train_v1.0.json'
base_train_path = '/content/drive/MyDrive/Datasets/docvqa_old/train'

df_train_ocr = process_data(train_path_main, base_train_path)


# validation usage:
valid_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/val/val_v1.0.json'
base_valid_path = '/content/drive/MyDrive/Datasets/docvqa_old/val'

df_valid_ocr = process_data(valid_path_main, base_valid_path)


# test usage:
test_path_main = '/content/drive/MyDrive/Datasets/docvqa_old/test/test_v1.0.json'
base_test_path = '/content/drive/MyDrive/Datasets/docvqa_old/test'

df_test_ocr = process_data(test_path_main, base_test_path)


In [None]:
colInfo(df_train_ocr)

List of Columns: Index(['question', 'docId', 'answers', 'data_split', 'full_path_image',
       'bounding_boxes', 'word_list'],
      dtype='object')
************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39463 entries, 0 to 39462
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         39463 non-null  object
 1   docId            39463 non-null  int64 
 2   answers          39463 non-null  object
 3   data_split       39463 non-null  object
 4   full_path_image  39463 non-null  object
 5   bounding_boxes   39463 non-null  object
 6   word_list        39463 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.1+ MB
Details info the dataframe: None


In [None]:
colInfo(df_valid_ocr)

List of Columns: Index(['question', 'docId', 'answers', 'data_split', 'full_path_image',
       'bounding_boxes', 'word_list'],
      dtype='object')
************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5349 entries, 0 to 5348
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         5349 non-null   object
 1   docId            5349 non-null   int64 
 2   answers          5349 non-null   object
 3   data_split       5349 non-null   object
 4   full_path_image  5349 non-null   object
 5   bounding_boxes   5349 non-null   object
 6   word_list        5349 non-null   object
dtypes: int64(1), object(6)
memory usage: 292.6+ KB
Details info the dataframe: None


In [None]:
colInfo(df_test_ocr)

List of Columns: Index(['question', 'docId', 'data_split', 'full_path_image', 'bounding_boxes',
       'word_list'],
      dtype='object')
************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5188 entries, 0 to 5187
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         5188 non-null   object
 1   docId            5188 non-null   int64 
 2   data_split       5188 non-null   object
 3   full_path_image  5188 non-null   object
 4   bounding_boxes   5188 non-null   object
 5   word_list        5188 non-null   object
dtypes: int64(1), object(5)
memory usage: 243.3+ KB
Details info the dataframe: None


## S-2: Convert Pandas Dataframe to Huggingface Dataset

In [None]:
## Convert dataframe into huggingface dataset object for all training , validation and test

# 1. Convert training dataset
hf_train_ocr = Dataset.from_pandas(df_train_ocr)

# 2. Convert validation dataset
hf_valid_ocr = Dataset.from_pandas(df_valid_ocr)

# 3. Convert test dataset
hf_test_ocr = Dataset.from_pandas(df_test_ocr)

## S-3: Removing image path and Adding raw image on Train, Valid and Test

In [None]:
## S-3: Custom function to remove image path and read in Pillow & store all the images also delete the path

# Define a function to open and save images
def process_image(hfsample):
    image_path = hfsample['full_path_image']
    image = Image.open(image_path)
    hfsample['image_raw'] = image
    del hfsample['full_path_image']
    return hfsample

In [None]:
# Apply the function to each row in the dataset for train, valid and test

# 1. For train
hf_train_ocr_imgraw = hf_train_ocr.map(process_image)

# 2. For valid
hf_valid_ocr_imgraw = hf_valid_ocr.map(process_image)

# 3. For test
hf_test_ocr_imgraw = hf_test_ocr.map(process_image)

## S-4: Add 'ground_truth' on Training and Validation


In [None]:
## S-4: Custom function to add 'ground_truth' column in the training and validation dataset

# Creating custom function for DOCVQA task
def add_ground_truth(hf_examples):
  images = hf_examples['image_raw']
  questions = hf_examples['question']
  answers = hf_examples['answers']

  ground_truths = []    # Creating empty list
  for image,question, answer in zip(images,questions, answers):
    # we need to escape " characters appearing in the query and/or answer
    question = question.replace("\\", "") ## this was just one corrupt example (index 91 of training set)
    question = re.sub(' +', ' ', question)
    question = question.replace('"', '\\"')   # replacement of " from question
    # let's create the ground truth string
    ground_truth_example = '{"gt_parses": ['
    for idx, answ in enumerate(answer):
      # ans = answ.replace('"', '\"')   # replacement of " from answer
      answ = answ.replace("\\", "")
      ans = answ.replace('"', '\\"')
      ground_truth_example += '{"question" : "' + question + '", "answer" : "' + ans + '"}'

      # add comma for more than one element present in the answer list
      if idx != len(answer) - 1:   # when current index is not equal to last index
        ground_truth_example += ', '
    ground_truth_example += ']}'
    ground_truths.append(ground_truth_example)    ## appending ground_truths list for every row

  hf_examples['ground_truth'] = ground_truths

  return hf_examples

In [None]:
## Creating ground truth string for all three dataset train, valid and test dataset

# For Training
hf_train_ocr_imgraw_gt = hf_train_ocr_imgraw.map(add_ground_truth, batched=True)

# For Validation
hf_valid_ocr_imgraw_gt = hf_valid_ocr_imgraw.map(add_ground_truth, batched=True)

# *** Note : For Test data as there is no such answers column so we will not processed further
# hf_test_ocr_imgraw = hf_test_ocr_imgraw.map(add_ground_truth, batched=True)

Map:   0%|          | 0/39463 [00:00<?, ? examples/s]

Map:   0%|          | 0/5349 [00:00<?, ? examples/s]

### Train data sample check

In [None]:
hf_train_ocr_imgraw_gt

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
    num_rows: 39463
})

In [None]:
for index,i in enumerate(hf_train_ocr_imgraw_gt['ground_truth']):
  # print(index, i)
  hf_train_json = json.loads(i)
  print(index,hf_train_json)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
34463 {'gt_parses': [{'question': 'Whose name is written in "Payee Name" column of second table?', 'answer': 'James Hall'}]}
34464 {'gt_parses': [{'question': 'Which date is mentioned as \'Effective Date:" in the document?', 'answer': '4-6-2001'}]}
34465 {'gt_parses': [{'question': 'What is the quantity of "Sugar Extraction(%)" produced in the year 1970 mentioned under the heading "Production for week ending August 1 " ?', 'answer': '4.849'}]}
34466 {'gt_parses': [{'question': "What is the 'Delivery Point' mentioned?", 'answer': 'Lex KY'}, {'question': "What is the 'Delivery Point' mentioned?", 'answer': 'LEX KY'}]}
34467 {'gt_parses': [{'question': 'What is the Title of the document?', 'answer': 'Classified Material Receipt'}]}
34468 {'gt_parses': [{'question': 'What is the Title and/or number of document?', 'answer': 'EMT # 083095'}]}
34469 {'gt_parses': [{'question': 'What is the "Production for week ending JULY 31" in

### Valid data sample check

In [None]:
hf_valid_ocr_imgraw_gt

Dataset({
    features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
    num_rows: 5349
})

In [None]:
for index,i in enumerate(hf_valid_ocr_imgraw_gt['ground_truth']):
  # print(index, i)
  hf_valid_json = json.loads(i)
  print(index,hf_valid_json)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
349 {'gt_parses': [{'question': 'What is the quantity of the medication on the top right corner of the page?', 'answer': '1mg'}]}
350 {'gt_parses': [{'question': 'What is the name of the medication at the top right corner of the page?', 'answer': 'Trimegestone (1mg)'}, {'question': 'What is the name of the medication at the top right corner of the page?', 'answer': 'Trimegestone'}]}
351 {'gt_parses': [{'question': 'What is the heading of the first column of the table?', 'answer': 'project'}, {'question': 'What is the heading of the first column of the table?', 'answer': 'Project'}]}
352 {'gt_parses': [{'question': 'What is the target date for WP2(2)?', 'answer': 'mid 1999'}, {'question': 'What is the target date for WP2(2)?', 'answer': 'Mid 1999'}]}
353 {'gt_parses': [{'question': 'What is the target date for WP2(4)?', 'answer': 'late 1999/early 2000'}, {'question': 'What is the target date for WP2(4)?', 'answer': 'Late 1

### Test data sample check

In [None]:
hf_test_ocr_imgraw

Dataset({
    features: ['question', 'docId', 'data_split', 'bounding_boxes', 'word_list', 'image_raw'],
    num_rows: 5188
})

## S-5: Dataset Dictionary Creation

In [None]:
# ## Create the Dataset Dictionary for train, valid and test

# processed_dataset_full = DatasetDict(
#     {"train": hf_train_ocr_imgraw_gt,
#      "valid": hf_valid_ocr_imgraw_gt,
#      "test": hf_test_ocr_imgraw,
# })

In [None]:
# processed_dataset_full

In [None]:
# processed_dataset_full['train']

In [None]:
# processed_dataset_full['valid']

In [None]:
# processed_dataset_full['test']

## S-6: Save & Reload whole dataset in disk

Note

```
Saving a processed dataset on disk and reload it
Once you have your final dataset you can save it on your disk and reuse it later using datasets.load_from_disk. Saving a dataset creates a directory with various files:

arrow files: they contain your dataset’s data

dataset_info.json: contains the description, citations, etc. of the dataset

state.json: contains the list of the arrow files and other informations like the dataset format type, if any (torch or tensorflow for example)

encoded_dataset.save_to_disk("path/of/my/dataset/directory")
...
from datasets import load_from_disk
reloaded_encoded_dataset = load_from_disk("path/of/my/dataset/directory")

Both datasets.Dataset and datasets.DatasetDict objects can be saved on disk, by using respectively datasets.Dataset.save_to_disk() and datasets.DatasetDict.save_to_disk().

Furthermore it is also possible to save datasets.Dataset and datasets.DatasetDict to other filesystems and cloud storages such as S3 by using respectively datasets.Dataset.save_to_disk() and datasets.DatasetDict.save_to_disk() and providing a Filesystem as input fs. To learn more about saving your datasets to other filesystem take a look at FileSystems Integration for cloud storages.

```

```
Exporting a dataset to csv, or to python objects
You can save your dataset in csv format using datasets.Dataset.to_csv(), so that you can use your dataset in other applications if you want to.

To get directly python objects, you can use datasets.Dataset.to_pandas() or datasets.Dataset.to_dict() to export the dataset as a pandas DataFrame or a python dict.

```

In [None]:
# ## Saving sub dataset
# processed_dataset_full.save_to_disk("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_full_wo_fuzz")
# ## Reload sub dataset
# processed_dataset_full_loaded = load_from_disk("/content/drive/MyDrive/DOCVQA_Processed_Dataset/docvqa_full_wo_fuzz")

In [None]:
# processed_dataset_full_loaded

In [None]:
# processed_dataset_full_loaded['train']

In [None]:
# processed_dataset_full_loaded['train']['image_raw'][2]

In [None]:
# processed_dataset_full_loaded['valid']

In [None]:
# processed_dataset_full_loaded['test']

## S-7: Upload and Reload whole Dataset in HuggingFace Hub

In [None]:
# With gt_prompt

## To push the DatsetDict into hub we must have same features for all dataset

# for train and validation
processed_dataset_train_valid_gtparse = DatasetDict(
    {"train": hf_train_ocr_imgraw_gt,
     "valid": hf_valid_ocr_imgraw_gt,
})

# Without gt_prompt
## To push the DatsetDict into hub we must have same features for all dataset
# for train and validation
processed_dataset_train_valid = DatasetDict(
    {"train": hf_train_ocr_imgraw,
     "valid": hf_valid_ocr_imgraw,
})

# for test data
processed_dataset_test = DatasetDict(
    {"test": hf_test_ocr_imgraw
})

In [None]:
!huggingface-cli login      # indra-inc


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

### Uploading training, validation and test dataset

In [None]:
# with gt_prompt train and valid
processed_dataset_train_valid_gtparse.push_to_hub("indra-inc/docvqa_en_full_train_valid_processed_gtparse")

# without gt_prompt train and valid
processed_dataset_train_valid.push_to_hub("indra-inc/docvqa_en_full_train_valid_processed")

# test data
processed_dataset_test.push_to_hub("indra-inc/docvqa_en_full_test_processed")

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/13 [00:00<?, ?it/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/6 [00:00<?, ?it/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Downloading metadata:   0%|          | 0.00/889 [00:00<?, ?B/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/13 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/3035 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/6 [00:00<?, ?it/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

Downloading metadata:   0%|          | 0.00/686 [00:00<?, ?B/s]

### Reloading train, valid and test dataset

In [None]:
## Reload the train and valid dataset dict
# processed_train_valid_loaded_gtparse = load_dataset("indra-inc/docvqa_en_full_train_valid_processed_gtparse")
# processed_train_valid_loaded = load_dataset("indra-inc/docvqa_en_full_train_valid_processed")

## Reload the test dataset dict
# processed_test_loaded = load_dataset("indra-inc/docvqa_en_full_test_processed")

In [None]:
# processed_train_valid_loaded_gtparse

In [None]:
# processed_train_valid_loaded

In [None]:
# processed_test_loaded

## End Note:
We did all these steps successfully and our full dataset uploaded successfully.