# Mini-dataset generation

In [1]:
import os
from shutil import copyfile
from shutil import rmtree
import h5py
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

%load_ext memory_profiler
%load_ext line_profiler

##### Global parameters:

In [2]:
dataset_path = "/Volumes/Part1/datasets/rvl-cdip"
images_folder_path = os.path.join(dataset_path, "images")
project_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/document-classification"
resources_folder = os.path.join(project_folder, "resources")
output_dataset_folder = os.path.join(resources_folder, "output", "mini-dataset")
debug_folder = os.path.join(resources_folder, "debug")

# Number of images classes
n_classes = 16

# Dimensions of the dataset to be generated
mini_train_set_dimension = 1500
mini_valid_set_dimension = 300
mini_test_set_dimension =  200

#### Setup folders

In [3]:
# Delete already existing dataset output folder
rmtree(output_dataset_folder, ignore_errors=True)

os.makedirs(output_dataset_folder)
os.makedirs(debug_folder, exist_ok=True)

# Folders to manually check some processed images
debug_folder_reshaped_imgs = os.path.join(debug_folder, "reshaped_imgs")

# Folders to manually check some images read from the generated dataset
debug_folder_dataset_imgs = os.path.join(debug_folder, "dataset_imgs")

os.makedirs(debug_folder_reshaped_imgs, exist_ok=True)
os.makedirs(debug_folder_dataset_imgs, exist_ok=True)

Utility function to display dataframe info:

In [4]:
def display_info(df=None, rows=5):
    try:
        print("Info")
        df.info(verbose=True)
        print(df.shape)
        print("Head")
        display(df.head(rows))
        print("Sample")
        display(df.sample(rows))
        print("Tail")
        display(df.tail(rows))
    except:
        print("Errore nella visualizzazione")

## Data Sampling

#### Read the labels files:

In [5]:
%memit df_train_labels = pd.read_csv(os.path.join(dataset_path, "labels/train.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_valid_labels = pd.read_csv(os.path.join(dataset_path, "labels/val.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_test_labels = pd.read_csv(os.path.join(dataset_path, "labels/test.txt"), sep=" ",\
                                    names=["image", "label"], skipinitialspace=True)

# Reset column names because of strange whitespaces added to the column names after reading
df_train_labels.columns = ["image", "label"]
df_valid_labels.columns = ["image", "label"]
df_test_labels.columns = ["image", "label"]

# Display some info to verify the correctness of the data read
display_info(df_train_labels)

peak memory: 166.68 MiB, increment: 70.26 MiB
peak memory: 169.68 MiB, increment: 3.09 MiB
peak memory: 155.43 MiB, increment: 13.53 MiB
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   image   320000 non-null  object
 1   label   320000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB
(320000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


Sample


Unnamed: 0,image,label
376,imagesb/b/d/w/bdw01d00/517519427+-9428.tif,3
7530,imagese/e/v/p/evp30a00/60045146_60045151.tif,6
209622,imageso/o/n/j/onj65d00/504571339.tif,7
86883,imagesx/x/d/h/xdh21d00/515557431+-7432.tif,3
158971,imagese/e/z/z/ezz98e00/2029372143.tif,11


Tail


Unnamed: 0,image,label
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9
319999,imagesp/p/j/x/pjx11d00/518223252+-3253.tif,3


#### Random sample data preserving ratio between classes

In [6]:
train_data_per_class = mini_train_set_dimension//n_classes
valid_data_per_class = mini_valid_set_dimension//n_classes
test_data_per_class = mini_test_set_dimension//n_classes

df_sampled_train_labels = pd.DataFrame(columns = df_train_labels.columns)
df_sampled_valid_labels = pd.DataFrame(columns = df_valid_labels.columns)
df_sampled_test_labels = pd.DataFrame(columns = df_test_labels.columns)

for i in range(n_classes):
    df_sampled_train_labels = pd.concat([df_sampled_train_labels, \
                                         df_train_labels.loc[df_train_labels["label"] == i].sample(n=train_data_per_class)])
    df_sampled_valid_labels = pd.concat([df_sampled_valid_labels, \
                                         df_valid_labels.loc[df_valid_labels["label"] == i].sample(n=valid_data_per_class)])
    df_sampled_test_labels = pd.concat([df_sampled_test_labels, \
                                         df_test_labels.loc[df_test_labels["label"] == i].sample(n=test_data_per_class)])

df_sampled_train_labels = df_sampled_train_labels.sample(frac=1).reset_index(drop=True)
df_sampled_valid_labels = df_sampled_valid_labels.sample(frac=1).reset_index(drop=True)
df_sampled_test_labels = df_sampled_test_labels.sample(frac=1).reset_index(drop=True)

## Data verification and analysis

#### Display generated dataframes summaries to verify their correctness

In [7]:
display_info(df_sampled_train_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1488 entries, 0 to 1487
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   1488 non-null   object
 1   label   1488 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB
(1488, 2)
Head


Unnamed: 0,image,label
0,imagesf/f/r/k/frk54a00/92246495_6496.tif,11
1,imagesm/m/w/h/mwh93c00/2060927386.tif,1
2,imagesa/a/j/s/ajs00d00/50346296-6298.tif,14
3,imageso/o/t/s/ots77d00/2029118685.tif,1
4,imagesq/q/w/c/qwc90c00/2502401854_1859.tif,6


Sample


Unnamed: 0,image,label
975,imagesa/a/o/c/aoc48c00/2082966088_6089.tif,9
940,imagesq/q/n/j/qnj45c00/2074220090b.tif,2
367,imagesh/h/t/l/htl50c00/ti31709012.tif,9
680,imagest/t/d/m/tdm34e00/2020288833.tif,15
474,imagesd/d/r/e/dre36e00/2048961503_2048961519.tif,13


Tail


Unnamed: 0,image,label
1483,imagesh/h/y/b/hyb24e00/2028391968.tif,9
1484,imagest/t/x/z/txz12c00/2085241559_1586.tif,12
1485,imageso/o/a/u/oau57d00/2044771262.tif,9
1486,imagesd/d/u/o/duo71a00/2057448108_2057448117.tif,7
1487,imagesu/u/p/o/upo25c00/2505300863_0864.tif,5


In [8]:
display_info(df_sampled_valid_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   288 non-null    object
 1   label   288 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB
(288, 2)
Head


Unnamed: 0,image,label
0,imagesy/y/h/l/yhl59e00/2024476420.tif,5
1,imagesw/w/t/s/wts37d00/2070910437_0444.tif,12
2,imagesk/k/g/n/kgn66c00/2074588332_8370.tif,5
3,imagesd/d/m/o/dmo28e00/1000829045.tif,15
4,imageso/o/f/k/ofk83f00/0001136733.tif,11


Sample


Unnamed: 0,image,label
18,imagese/e/z/c/ezc96c00/2070713394_3395.tif,4
64,imagesi/i/h/d/ihd70d00/517613877+-3877.tif,3
74,imagesn/n/v/k/nvk54a00/92212040.tif,11
172,imagesw/w/p/x/wpx28d00/2072411151.tif,2
108,imageso/o/w/f/owf64d00/506466383_506466393.tif,7


Tail


Unnamed: 0,image,label
283,imagesc/c/x/i/cxi21d00/522856617+-6618.tif,3
284,imagesr/r/x/o/rxo96c00/50700312-0328.tif,6
285,imagesv/v/j/e/vje71a00/2057421303.tif,7
286,imagesa/a/p/m/apm49d00/501330410.tif,0
287,imagese/e/g/g/egg62e00/2041623408.tif,0


In [9]:
display_info(df_sampled_test_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imagesl/l/x/p/lxp33a00/506047348+-7349.tif,0
1,imagesw/w/c/k/wck37d00/2075874221_4650.tif,5
2,imagesp/p/q/p/pqp82e00/2053783787.tif,1
3,imagesp/p/f/d/pfd50c00/ti17180517.tif,9
4,imagesd/d/q/i/dqi25a00/529215516+-5602.tif,2


Sample


Unnamed: 0,image,label
165,imagesb/b/k/a/bka60d00/517516310+-6312.tif,3
28,imagest/t/u/w/tuw71e00/01145176.tif,11
156,imagesv/v/b/i/vbi16c00/2071325018.tif,8
45,imagesb/b/q/l/bql62f00/tob12105.71.tif,9
182,imagesc/c/f/u/cfu03e00/2049418253_2049418254.tif,10


Tail


Unnamed: 0,image,label
187,imagesf/f/m/c/fmc92a00/522497059+-7062.tif,13
188,imagesr/r/z/g/rzg21d00/515556447+-6448.tif,3
189,imagesd/d/q/u/dqu31d00/514228853+-8853.tif,1
190,imagesg/g/a/i/gai85a00/531531568+-1573.tif,2
191,imagesi/i/k/d/ikd10c00/2085134699.tif,2


## Mini-dataset generation

### Functions

**generate_minidataset**: Creates in the specified hdf5 file a dataset with name _dataset_name_ and fills the dataset with the images in the dataframe _df_ as they are, without any preprocessing operations

In [10]:
def generate_minidataset(hdf_file, df, dataset_name):
    batch_dimension = 64
    n_imgs = len(df.index)

    print("Generating dataset " + dataset_name + " - " + str(n_imgs) + " images")
    
    # Matrix containing img_path - label
    matrix_labels = np.empty((n_imgs, 2), dtype=np.object)
    
    img_counter = 0
    for row in tqdm(df.itertuples()):
        thousand = img_counter // 10**3 % 10
        hundred = img_counter // 10**2 % 10
        ten = img_counter // 10 % 10
        unit = img_counter % 10
        
        dest_folder_path = os.path.join(output_dataset_folder, dataset_name, str(thousand), str(hundred), str(ten), str(unit))
        image_name = os.path.basename(row.image)
        img_abs_path = os.path.join(dest_folder_path, image_name)
        img_relative_path = os.path.join(dataset_name, str(thousand), str(hundred), str(ten), str(unit), image_name)
        
        os.makedirs(dest_folder_path, exist_ok=True)
        copyfile(os.path.join(images_folder_path, row.image), img_abs_path)
        
        matrix_labels[i, 0] = img_relative_path
        matrix_labels[i, 1] = row.label
            
        img_counter += 1
    
    # Save images relative paths and associated labels in a hdf5 file dictionary
    dataset_labels = hdf_file.create_dataset(dataset_name, (n_imgs, 2), compression="gzip")
    dataset_labels = matrix_labels

#### Generates the mini-dataset

In [11]:
with h5py.File(os.path.join(output_dataset_folder, 'labels.h5'), 'w') as hdf_file:
    generate_minidataset(hdf_file, df_sampled_train_labels, "train")
    generate_minidataset(hdf_file, df_sampled_valid_labels, "valid")
    generate_minidataset(hdf_file, df_sampled_test_labels, "test")

0it [00:00, ?it/s]

Generating dataset train - 1488 images


1488it [02:41,  9.20it/s]
0it [00:00, ?it/s]

Generating dataset valid - 288 images


288it [00:24, 11.87it/s]
1it [00:00,  7.55it/s]

Generating dataset test - 192 images


192it [00:17, 10.67it/s]
