# Notebook Objective: 600 training and 100 validation images in each of the 16 class sub-folders

This is the final notebook. It walks step by step through the preparation of the RVL-CDIP dataset for document classification:
- Walk through all directories for training and validation data
- Create custom features and a dataframe containing the full image path, the image label (the class, i.e. the sub-folder name) and the image id (the integer-encoded form of the class name)
- Sort the list of class names
- Delete the full image path, and read and store the images with Pillow
- After preprocessing, push the whole dataset to the Hugging Face Hub

## Importing Necessary Dependencies

In [None]:
# Mount Google Drive (Colab only) so the dataset folders under /content/drive are readable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q datasets
!pip install huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m


In [None]:
import numpy as np
import pandas as pd
import os
import json
from PIL import Image
from datasets import ClassLabel, Features, Value, Dataset, DatasetDict, load_dataset

In [None]:
## Dataset path declaration
# Root folders of the training and validation splits on the mounted Drive;
# each one is expected to hold one sub-folder per document class.
path_train = '/content/drive/MyDrive/Datasets/RVL_CDIP_M_600_100/Training'
path_valid = '/content/drive/MyDrive/Datasets/RVL_CDIP_M_600_100/Valid'

## S-1: Create a dataframe containing the image path, label and id, and convert it into a Hugging Face Dataset with custom features

In [None]:
# Build the sorted list of class names so the label <-> id mappings are stable across runs.
# Only real, non-hidden sub-folders count as classes: stray files or hidden folders
# (e.g. `.ipynb_checkpoints`) inside the split folder must not turn into labels.
# Either split path works here, as train and valid contain the same sub-folders.
labels_list = sorted(
    entry for entry in os.listdir(path_train)
    if not entry.startswith('.') and os.path.isdir(os.path.join(path_train, entry))
)
print(labels_list)
print('**********************************************************')
# label -> id mapping: the key is the class name, the value its integer id (0, 1, 2, ...)
label2id_dict = {label: idx for idx, label in enumerate(labels_list)}
print(label2id_dict)
print('**********************************************************')
# id -> label mapping: the key is the integer id, the value the class name
id2label_dict = dict(enumerate(labels_list))
print(id2label_dict)

['advertisement', 'budget', 'email', 'file_folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news_article', 'presentation', 'questionnaire', 'resume', 'scientific_publication', 'scientific_report', 'specification']
**********************************************************
{'advertisement': 0, 'budget': 1, 'email': 2, 'file_folder': 3, 'form': 4, 'handwritten': 5, 'invoice': 6, 'letter': 7, 'memo': 8, 'news_article': 9, 'presentation': 10, 'questionnaire': 11, 'resume': 12, 'scientific_publication': 13, 'scientific_report': 14, 'specification': 15}
**********************************************************
{0: 'advertisement', 1: 'budget', 2: 'email', 3: 'file_folder', 4: 'form', 5: 'handwritten', 6: 'invoice', 7: 'letter', 8: 'memo', 9: 'news_article', 10: 'presentation', 11: 'questionnaire', 12: 'resume', 13: 'scientific_publication', 14: 'scientific_report', 15: 'specification'}


In [None]:
# Number of classes found (one per sub-folder)
len(labels_list)

16

In [None]:
## Custom feature schema for the Donut model pipeline
# Path and label name are plain strings; the id column is a ClassLabel whose
# class names (and therefore integer ids) follow the order of labels_list.
custom_features = Features(
    {
        'Full_Image_Path': Value('string'),
        'Image_label': Value('string'),
        'Image_id': ClassLabel(names=labels_list),
    }
)

In [None]:
## Custom function that indexes one split folder using label2id_dict
def dataProcessing(data_folder_path):
  """Index the files of one split folder and return them as a HuggingFace Dataset.

  Walks `data_folder_path` and treats the name of each sub-folder as the class
  label of the files it contains. Files located directly in `data_folder_path`
  are ignored, and so are folders whose name is not a key of `label2id_dict`
  (a warning is emitted for those), so the path, label and id columns always
  have the same length. Folders and files are visited in sorted order, which
  makes the row order reproducible across runs and filesystems.

  Args:
    data_folder_path: Root folder of one split, holding one sub-folder per class.

  Returns:
    A `Dataset` built with `custom_features`, with the columns
    `Full_Image_Path`, `Image_label` and `Image_id`.
  """
  import warnings

  # Parallel lists: one entry per file, kept strictly in sync
  images_path_list = []
  images_labels_list = []
  images_ids_list = []

  # Iterate through all directories and sub-directories of the split
  for root, dir_names, files_name in os.walk(data_folder_path):

    # Sorting in place makes os.walk descend into the sub-folders in a fixed order
    dir_names.sort()

    # Files sitting directly in the split root belong to no class folder: skip them
    if root == data_folder_path:
      continue

    # The label is the name of the folder that directly contains the files
    label_extracted = os.path.basename(root)

    # Resolve the id once per folder; skip the whole folder when the label is unknown
    # so that no path/label is recorded without a matching id
    label_id = label2id_dict.get(label_extracted)
    if label_id is None:
      if files_name:
        warnings.warn(
            "Skipping %d file(s) in '%s': '%s' is not a known label"
            % (len(files_name), root, label_extracted)
        )
      continue

    # Record the full path, label and id of every file in this class folder
    for f in sorted(files_name):
      images_path_list.append(os.path.join(root, f))
      images_labels_list.append(label_extracted)
      images_ids_list.append(label_id)

  ## Create a Pandas DataFrame
  df = pd.DataFrame({'Full_Image_Path': images_path_list, 'Image_label': images_labels_list, 'Image_id': images_ids_list})

  # Read dataframe as HuggingFace Dataset object
  dataset_hf = Dataset.from_pandas(df,features = custom_features)

  return dataset_hf

In [None]:
# Index each split folder: training first, then validation
hf_train = dataProcessing(data_folder_path=path_train)
hf_valid = dataProcessing(data_folder_path=path_valid)

In [None]:
# Summarise both splits (column names and row counts), one per line
print(hf_train, hf_valid, sep='\n')

Dataset({
    features: ['Full_Image_Path', 'Image_label', 'Image_id'],
    num_rows: 9600
})
Dataset({
    features: ['Full_Image_Path', 'Image_label', 'Image_id'],
    num_rows: 1600
})


## S-2: Delete the full image path, store the Pillow images and create the ground truth

In [None]:
## Feature schema of the training split before the image / ground-truth mapping
hf_train.features

{'Full_Image_Path': Value(dtype='string', id=None),
 'Image_label': Value(dtype='string', id=None),
 'Image_id': ClassLabel(names=['advertisement', 'budget', 'email', 'file_folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news_article', 'presentation', 'questionnaire', 'resume', 'scientific_publication', 'scientific_report', 'specification'], id=None)}

In [None]:
## Feature schema of the validation split before the image / ground-truth mapping
hf_valid.features

{'Full_Image_Path': Value(dtype='string', id=None),
 'Image_label': Value(dtype='string', id=None),
 'Image_id': ClassLabel(names=['advertisement', 'budget', 'email', 'file_folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news_article', 'presentation', 'questionnaire', 'resume', 'scientific_publication', 'scientific_report', 'specification'], id=None)}

#### Note:
- `Image_id` has to be a `ClassLabel` feature rather than a plain `Value`; as the schemas above show, the custom features already take care of this

In [None]:
# Open each image with Pillow and build its ground-truth string

template = '{"gt_parse": {"class" : '     ## fixed prefix of every ground-truth JSON string

def process_image_gt(hfsample):
  """Turn one indexed sample into a sample holding the image and its ground truth.

  Args:
    hfsample: Dict-like row containing at least `Full_Image_Path` (path of the
      image file) and `Image_label` (class name).

  Returns:
    The same sample, with `Image_raw` (the image opened with Pillow) and
    `ground_truth` (JSON string `{"gt_parse": {"class" : "<label>"}}`) added,
    and with `Full_Image_Path` and `Image_label` removed. Other keys are kept.
  """
  ## part-1: open the image with Pillow
  image_path = hfsample['Full_Image_Path']
  hfsample['Image_raw'] = Image.open(image_path)

  ## part-2: creating ground truth
  # json.dumps quotes AND escapes the label, so a label containing `"` or `\`
  # still yields valid JSON; for ordinary labels the result is unchanged.
  hfsample['ground_truth'] = template + json.dumps(hfsample['Image_label'], ensure_ascii=False) + "}}"

  # the path and the label are now redundant (image and ground truth replace them)
  del hfsample['Full_Image_Path']
  del hfsample['Image_label']

  return hfsample

In [None]:
# Apply the per-sample transform (image + ground truth) to each split
# 1. Training split
hf_train_imgraw_gt = hf_train.map(function=process_image_gt)

# 2. Validation split
hf_valid_imgraw_gt = hf_valid.map(function=process_image_gt)

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [None]:
# Training split after mapping: path and label columns are replaced by image and ground truth
hf_train_imgraw_gt

Dataset({
    features: ['Image_id', 'Image_raw', 'ground_truth'],
    num_rows: 9600
})

In [None]:
# Feature schema of the mapped training split
hf_train_imgraw_gt.features

{'Image_id': ClassLabel(names=['advertisement', 'budget', 'email', 'file_folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news_article', 'presentation', 'questionnaire', 'resume', 'scientific_publication', 'scientific_report', 'specification'], id=None),
 'Image_raw': Image(decode=True, id=None),
 'ground_truth': Value(dtype='string', id=None)}

In [None]:
# Validation split after mapping: path and label columns are replaced by image and ground truth
hf_valid_imgraw_gt

Dataset({
    features: ['Image_id', 'Image_raw', 'ground_truth'],
    num_rows: 1600
})

In [None]:
# Feature schema of the mapped validation split
hf_valid_imgraw_gt.features

{'Image_id': ClassLabel(names=['advertisement', 'budget', 'email', 'file_folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news_article', 'presentation', 'questionnaire', 'resume', 'scientific_publication', 'scientific_report', 'specification'], id=None),
 'Image_raw': Image(decode=True, id=None),
 'ground_truth': Value(dtype='string', id=None)}

In [None]:
# Inspect a single processed training sample (id, decoded image, ground-truth string)
hf_train_imgraw_gt[7]

{'Image_id': 12,
 'Image_raw': <PIL.TiffImagePlugin.TiffImageFile image mode=L size=754x1000>,
 'ground_truth': '{"gt_parse": {"class" : "resume"}}'}

## S-3: Combine the processed training and validation datasets into a DatasetDict

In [None]:
# Bundle both processed splits under their split names
processed_dataset = DatasetDict(train=hf_train_imgraw_gt, valid=hf_valid_imgraw_gt)

In [None]:
# Overview of both splits in the combined DatasetDict
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['Image_id', 'Image_raw', 'ground_truth'],
        num_rows: 9600
    })
    valid: Dataset({
        features: ['Image_id', 'Image_raw', 'ground_truth'],
        num_rows: 1600
    })
})

## S-4: Uploading the dataset to the Hugging Face Hub

In [None]:
# Authenticate with the Hugging Face Hub (interactive prompt); pushing requires a token with write permission
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
# Upload both splits to the Hub dataset repository
processed_dataset.push_to_hub("indra-inc/rvl_cdip_train600_valid100_ground_truth")

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

In [None]:
# Reload the pushed DatasetDict (train and valid splits) from the Hub
processed_dataset_loaded = load_dataset("indra-inc/rvl_cdip_train600_valid100_ground_truth")

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/356M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/326M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/9600 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [None]:
# Overview of the splits reloaded from the Hub
processed_dataset_loaded

DatasetDict({
    train: Dataset({
        features: ['Image_id', 'Image_raw', 'ground_truth'],
        num_rows: 9600
    })
    valid: Dataset({
        features: ['Image_id', 'Image_raw', 'ground_truth'],
        num_rows: 1600
    })
})

In [None]:
# Class names stored in the reloaded ClassLabel feature (list position corresponds to Image_id)
processed_dataset_loaded['train'].features['Image_id'].names

['advertisement',
 'budget',
 'email',
 'file_folder',
 'form',
 'handwritten',
 'invoice',
 'letter',
 'memo',
 'news_article',
 'presentation',
 'questionnaire',
 'resume',
 'scientific_publication',
 'scientific_report',
 'specification']