# Mini-dataset generation

In [26]:
import os
from shutil import copyfile
from shutil import rmtree
import h5py
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

%load_ext memory_profiler
%load_ext line_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


##### Global parameters:

In [27]:
dataset_path = "/Volumes/Part1/datasets/rvl-cdip"
images_folder_path = os.path.join(dataset_path, "images")
project_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/document-classification"
resources_folder = os.path.join(project_folder, "resources")
output_dataset_folder = os.path.join(resources_folder, "output", "mini-dataset")
debug_folder = os.path.join(resources_folder, "debug")

# Number of images classes
n_classes = 16

# Dimensions of the dataset to be generated
mini_train_set_dimension = 1500
mini_valid_set_dimension = 300
mini_test_set_dimension =  200

#### Setup folders

In [28]:
# Delete already existing dataset output folder
rmtree(output_dataset_folder, ignore_errors=True)

os.makedirs(output_dataset_folder)
os.makedirs(debug_folder, exist_ok=True)

# Folders to manually check some processed images
debug_folder_reshaped_imgs = os.path.join(debug_folder, "reshaped_imgs")

# Folders to manually check some images read from the generated dataset
debug_folder_dataset_imgs = os.path.join(debug_folder, "dataset_imgs")

os.makedirs(debug_folder_reshaped_imgs, exist_ok=True)
os.makedirs(debug_folder_dataset_imgs, exist_ok=True)

Utility function to display dataframe info:

In [29]:
def display_info(df=None, rows=5):
    try:
        print("Info")
        df.info(verbose=True)
        print(df.shape)
        print("Head")
        display(df.head(rows))
        print("Sample")
        display(df.sample(rows))
        print("Tail")
        display(df.tail(rows))
    except:
        print("Errore nella visualizzazione")

## Data Sampling

#### Read the labels files:

In [30]:
%memit df_train_labels = pd.read_csv(os.path.join(dataset_path, "labels/train.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_valid_labels = pd.read_csv(os.path.join(dataset_path, "labels/val.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_test_labels = pd.read_csv(os.path.join(dataset_path, "labels/test.txt"), sep=" ",\
                                    names=["image", "label"], skipinitialspace=True)

# Reset column names because of strange whitespaces added to the column names after reading
df_train_labels.columns = ["image", "label"]
df_valid_labels.columns = ["image", "label"]
df_test_labels.columns = ["image", "label"]

# Display some info to verify the correctness of the data read
display_info(df_train_labels)

peak memory: 349.48 MiB, increment: 68.36 MiB
peak memory: 318.63 MiB, increment: -22.58 MiB
peak memory: 317.61 MiB, increment: 3.25 MiB
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   image   320000 non-null  object
 1   label   320000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB
(320000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


Sample


Unnamed: 0,image,label
311843,imagesb/b/c/g/bcg18d00/2048869295.tif,15
114441,imagesy/y/f/b/yfb17e00/2031309271.tif,7
147296,imagesv/v/f/u/vfu46e00/2023948549.tif,8
217494,imagesz/z/c/s/zcs40d00/517235901+-5902.tif,3
186095,imagesm/m/z/g/mzg80a00/0060015768.tif,15


Tail


Unnamed: 0,image,label
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9
319999,imagesp/p/j/x/pjx11d00/518223252+-3253.tif,3


#### Random sample data preserving ratio between classes

In [31]:
train_data_per_class = mini_train_set_dimension//n_classes
valid_data_per_class = mini_valid_set_dimension//n_classes
test_data_per_class = mini_test_set_dimension//n_classes

df_sampled_train_labels = pd.DataFrame(columns = df_train_labels.columns)
df_sampled_valid_labels = pd.DataFrame(columns = df_valid_labels.columns)
df_sampled_test_labels = pd.DataFrame(columns = df_test_labels.columns)

for i in range(n_classes):
    df_sampled_train_labels = pd.concat([df_sampled_train_labels, \
                                         df_train_labels.loc[df_train_labels["label"] == i].sample(n=train_data_per_class)])
    df_sampled_valid_labels = pd.concat([df_sampled_valid_labels, \
                                         df_valid_labels.loc[df_valid_labels["label"] == i].sample(n=valid_data_per_class)])
    df_sampled_test_labels = pd.concat([df_sampled_test_labels, \
                                         df_test_labels.loc[df_test_labels["label"] == i].sample(n=test_data_per_class)])

df_sampled_train_labels = df_sampled_train_labels.sample(frac=1).reset_index(drop=True)
df_sampled_valid_labels = df_sampled_valid_labels.sample(frac=1).reset_index(drop=True)
df_sampled_test_labels = df_sampled_test_labels.sample(frac=1).reset_index(drop=True)

## Data verification and analysis

#### Display generated dataframes summaries to verify their correctness

In [32]:
display_info(df_sampled_train_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1488 entries, 0 to 1487
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   1488 non-null   object
 1   label   1488 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB
(1488, 2)
Head


Unnamed: 0,image,label
0,imagesm/m/v/y/mvy78d00/502592915.tif,4
1,imagesz/z/k/l/zkl56d00/517689505+-9507.tif,3
2,imagesx/x/d/t/xdt33e00/2048127575.tif,8
3,imagesk/k/x/r/kxr16d00/50645859-5860.tif,14
4,imagese/e/i/h/eih01a00/0060154866.tif,8


Sample


Unnamed: 0,image,label
1154,imagesa/a/f/t/aft91e00/2029135020.tif,1
1471,imagesi/i/a/f/iaf00e00/94600318.tif,1
712,imagesv/v/r/u/vru07c00/50688401-8402.tif,14
355,imagesc/c/n/x/cnx82c00/2081520849a.tif,2
1313,imagesp/p/f/y/pfy35e00/2040267305.tif,13


Tail


Unnamed: 0,image,label
1483,imagesc/c/d/a/cda33f00/0000000870.tif,14
1484,imagesw/w/n/a/wna82c00/2078738962_8963.tif,12
1485,imagesn/n/x/p/nxp78d00/502599314.tif,4
1486,imagesv/v/q/m/vqm42c00/2073092487.tif,9
1487,imagesl/l/d/p/ldp71c00/2084522006.tif,9


In [33]:
display_info(df_sampled_valid_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   288 non-null    object
 1   label   288 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB
(288, 2)
Head


Unnamed: 0,image,label
0,imagesy/y/w/a/ywa06d00/50488656-8657.tif,14
1,imagesc/c/n/v/cnv86e00/2054136546_2054136547.tif,1
2,imagesk/k/b/r/kbr00e00/87285060.tif,11
3,imagesw/w/a/e/wae59c00/PUBLICATIONS026381-6.tif,6
4,imagesu/u/s/k/usk17e00/2031318406.tif,7


Sample


Unnamed: 0,image,label
220,imagesb/b/b/m/bbm16d00/50600649-0650.tif,14
56,imagesk/k/y/e/kye04e00/2040734349.tif,13
85,imagesv/v/o/o/voo31a00/0071035888.tif,0
74,imagesj/j/r/t/jrt85d00/503609047_503609053.tif,3
241,imagesq/q/e/p/qep86d00/tcal0437220_7221.tif,10


Tail


Unnamed: 0,image,label
283,imagesz/z/j/j/zjj72c00/2078305957.tif,2
284,imagesd/d/w/o/dwo26d00/50640437-0443.tif,6
285,imagesf/f/n/v/fnv72c00/2078454494.tif,2
286,imagesf/f/j/a/fja64a00/89836328_6329.tif,13
287,imagesa/a/s/s/ass76d00/ti16680317_0322.tif,10


In [34]:
display_info(df_sampled_test_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imagesz/z/b/h/zbh42f00/10004982_10004984.tif,10
1,imagesg/g/p/q/gpq11d00/522714276+-4278.tif,3
2,imagese/e/u/r/eur92d00/515608558.tif,4
3,imagesi/i/g/j/igj56c00/2075566692_6693.tif,9
4,imagesi/i/j/d/ijd10f00/0000544685.tif,15


Sample


Unnamed: 0,image,label
14,imagesw/w/j/v/wjv8aa00/50066208_50066211.tif,14
60,imagesl/l/x/p/lxp09c00/94581460.tif,10
100,imagese/e/g/w/egw47d00/2062318390_8391.tif,1
91,imagesj/j/s/m/jsm33f00/0011862921.tif,0
162,imagesr/r/s/a/rsa63c00/2064822167.tif,9


Tail


Unnamed: 0,image,label
187,imagesl/l/n/b/lnb52c00/2074104024.tif,11
188,imagese/e/k/x/ekx14c00/2080378562_8564.tif,12
189,imagesm/m/f/p/mfp3aa00/11233923.tif,11
190,imagest/t/q/n/tqn39c00/2084390860b.tif,2
191,imagesh/h/w/q/hwq89c00/50280571-0571.tif,6


## Mini-dataset generation

### Functions

**generate_minidataset**: Creates in the specified hdf5 file a dataset with name _dataset_name_ and fills the dataset with the images in the dataframe _df_ as they are, without any preprocessing operations

In [39]:
def generate_minidataset(hdf_file, df, dataset_name):
    batch_dimension = 64
    n_imgs = len(df.index)

    print("Generating dataset " + dataset_name + " - " + str(n_imgs) + " images")
    
    # Matrix containing img_path - label
    matrix_labels = np.empty((n_imgs, 2), dtype=np.object)
    
    img_counter = 0
    for row in tqdm(df.itertuples()):
        thousand = img_counter // 10**3 % 10
        hundred = img_counter // 10**2 % 10
        ten = img_counter // 10 % 10
        unit = img_counter % 10
        
        dest_folder_path = os.path.join(output_dataset_folder, dataset_name, str(thousand), str(hundred), str(ten), str(unit))
        image_name = os.path.basename(row.image)
        img_path = os.path.join(dest_folder_path, image_name)
        
        os.makedirs(dest_folder_path, exist_ok=True)
        copyfile(os.path.join(images_folder_path, row.image), img_path)
        
        matrix_labels[i, 0] = img_path
        matrix_labels[i, 1] = row.label
            
        img_counter += 1
    
    # Save images relative paths and associated labels in a hdf5 file dictionary
    dataset_labels = hdf_file.create_dataset(dataset_name, (n_imgs, 2), compression="gzip")
    dataset_labels = matrix_labels

#### Generates the mini-dataset

In [40]:
with h5py.File(os.path.join(output_dataset_folder, 'labels.h5'), 'w') as hdf_file:
    generate_minidataset(hdf_file, df_sampled_train_labels, "train")
    generate_minidataset(hdf_file, df_sampled_valid_labels, "valid")
    generate_minidataset(hdf_file, df_sampled_test_labels, "test")

2it [00:00, 10.75it/s]

Generating dataset train - 1488 images


1488it [02:18, 10.72it/s]
3it [00:00, 23.39it/s]

Generating dataset valid - 288 images


288it [00:24, 11.56it/s]
2it [00:00, 11.18it/s]

Generating dataset test - 192 images


192it [00:16, 11.85it/s]
