# Preprocessing and mini-dataset generation

In [16]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from keras.utils import to_categorical

%load_ext memory_profiler
%load_ext line_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


##### Global parameters:

In [17]:
dataset_path = "/Volumes/Part1/datasets/rvl-cdip"
images_folder_path = os.path.join(dataset_path, "images")
project_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/document-classification"
resources_folder = os.path.join(project_folder, "resources")
output_folder = os.path.join(resources_folder, "output")
debug_folder = os.path.join(resources_folder, "debug")

# Number of images classes
n_classes = 16

# Dimensions of the dataset to be generated
mini_train_set_dimension = 1600
mini_valid_set_dimension = 200
mini_test_set_dimension =  200

# Width threshold: all images having width greater than this will be discarded
width_threshold = 875

# Shape that all images must have inside the final mini-dataset (choose basing on statistical analysis)
target_shape = (1000,750)

#### Setup folders

In [18]:
os.makedirs(output_folder, exist_ok=True)
os.makedirs(debug_folder, exist_ok=True)

# Folders to manually check some processed images
debug_folder_reshaped_imgs = os.path.join(debug_folder, "reshaped_imgs")

# Folders to manually check some images read from the generated dataset
debug_folder_dataset_imgs = os.path.join(debug_folder, "dataset_imgs")

os.makedirs(debug_folder_reshaped_imgs, exist_ok=True)
os.makedirs(debug_folder_dataset_imgs, exist_ok=True)

Utility function to display dataframe info:

In [19]:
def display_info(df=None, rows=5):
    try:
        print("Info")
        df.info(verbose=True)
        print(df.shape)
        print("Head")
        display(df.head(rows))
        print("Sample")
        display(df.sample(rows))
        print("Tail")
        display(df.tail(rows))
    except:
        print("Errore nella visualizzazione")

## Data Sampling

#### Read the labels files:

In [20]:
%memit df_train_labels = pd.read_csv(os.path.join(dataset_path, "labels/train.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_valid_labels = pd.read_csv(os.path.join(dataset_path, "labels/val.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_test_labels = pd.read_csv(os.path.join(dataset_path, "labels/test.txt"), sep=" ",\
                                    names=["image", "label"], skipinitialspace=True)

# Reset column names because of strange whitespaces added to the column names after reading
df_train_labels.columns = ["image", "label"]
df_valid_labels.columns = ["image", "label"]
df_test_labels.columns = ["image", "label"]

# Display some info to verify the correctness of the data read
display_info(df_train_labels)

peak memory: 307.91 MiB, increment: 61.28 MiB
peak memory: 312.24 MiB, increment: 4.32 MiB
peak memory: 297.32 MiB, increment: 14.41 MiB
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   image   320000 non-null  object
 1   label   320000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB
(320000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


Sample


Unnamed: 0,image,label
186163,imagesu/u/j/s/ujs53c00/98704446_4455.tif,15
167711,imagesj/j/d/i/jdi35e00/2024525715.tif,11
168088,imagesp/p/q/j/pqj93d00/508482687_508482694.tif,7
294772,imagesu/u/h/w/uhw73c00/2065165656.tif,12
267695,imagesh/h/h/z/hhz75c00/2077981711.tif,1


Tail


Unnamed: 0,image,label
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9
319999,imagesp/p/j/x/pjx11d00/518223252+-3253.tif,3


#### Random sample data preserving ratio between classes

In [21]:
train_data_per_class = mini_train_set_dimension//n_classes
valid_data_per_class = mini_valid_set_dimension//n_classes
test_data_per_class = mini_test_set_dimension//n_classes

df_sampled_train_labels = pd.DataFrame(columns = df_train_labels.columns)
df_sampled_valid_labels = pd.DataFrame(columns = df_valid_labels.columns)
df_sampled_test_labels = pd.DataFrame(columns = df_test_labels.columns)

for i in range(n_classes):
    df_sampled_train_labels = pd.concat([df_sampled_train_labels, \
                                         df_train_labels.loc[df_train_labels["label"] == i].sample(n=train_data_per_class)])
    df_sampled_valid_labels = pd.concat([df_sampled_valid_labels, \
                                         df_valid_labels.loc[df_valid_labels["label"] == i].sample(n=valid_data_per_class)])
    df_sampled_test_labels = pd.concat([df_sampled_test_labels, \
                                         df_test_labels.loc[df_test_labels["label"] == i].sample(n=test_data_per_class)])

df_sampled_train_labels = df_sampled_train_labels.sample(frac=1).reset_index(drop=True)
df_sampled_valid_labels = df_sampled_valid_labels.sample(frac=1).reset_index(drop=True)
df_sampled_test_labels = df_sampled_test_labels.sample(frac=1).reset_index(drop=True)

## Data verification and analysis

#### Display generated dataframes summaries to verify their correctness

In [22]:
display_info(df_sampled_train_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   1600 non-null   object
 1   label   1600 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB
(1600, 2)
Head


Unnamed: 0,image,label
0,imagesg/g/n/z/gnz96d00/timo0006156_6158.tif,10
1,imagesd/d/g/d/dgd92e00/2061832119.tif,11
2,imagesv/v/t/a/vta10f00/0000554960.tif,15
3,imagesl/l/y/y/lyy87e00/2060579707_2060579711.tif,5
4,imagesy/y/w/q/ywq99d00/88802621_88802632.tif,5


Sample


Unnamed: 0,image,label
97,imagesx/x/z/z/xzz98e00/2029372111_2029372112.tif,11
1427,imagesb/b/o/h/boh25f00/0060212770.tif,8
1089,imagesi/i/m/r/imr72a00/528151513+-1514.tif,2
575,imagesp/p/p/g/ppg05e00/2041374188_2041374189.tif,10
3,imagesl/l/y/y/lyy87e00/2060579707_2060579711.tif,5


Tail


Unnamed: 0,image,label
1595,imagesr/r/u/k/ruk83e00/2028956806_2028956810.tif,5
1596,imagesg/g/n/w/gnw67c00/2078801070a.tif,2
1597,imagesn/n/i/g/nig32e00/2501403915_2501403925.tif,12
1598,imagest/t/m/w/tmw52f00/tob12722.90_tob12722.97...,10
1599,imagesy/y/n/o/yno62d00/81638153.tif,9


In [23]:
display_info(df_sampled_valid_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imageso/o/b/q/obq30d00/524384863+-4864.tif,3
1,imagesf/f/d/o/fdo00e00/91694274.tif,11
2,imagesu/u/s/v/usv42f00/tob16805.45.tif,0
3,imagesv/v/b/x/vbx31e00/85240117_85240118.tif,15
4,imagesz/z/z/c/zzc27e00/2028706323.tif,1


Sample


Unnamed: 0,image,label
176,imagesr/r/s/w/rsw43e00/2062077610_2062077619.tif,5
28,imagesg/g/r/o/gro09d00/50450512-0513.tif,14
113,imagesy/y/e/n/yen26d00/50619510-9510.tif,6
166,imagesj/j/t/u/jtu64d00/98672767_2782.tif,15
32,imagesm/m/r/p/mrp65d00/504500235.tif,7


Tail


Unnamed: 0,image,label
187,imagesi/i/a/e/iae07c00/70062294-2294.tif,13
188,imagesw/w/v/o/wvo74e00/1000217672.tif,6
189,imageso/o/s/h/osh11d00/520755520+-5521.tif,3
190,imagesk/k/x/y/kxy48d00/2063753039.tif,9
191,imagesp/p/q/a/pqa64e00/2056975783.tif,11


In [24]:
display_info(df_sampled_test_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imagese/e/b/i/ebi63c00/2064938051.tif,9
1,imagesu/u/l/f/ulf01d00/518214896+-4897.tif,3
2,imagesz/z/p/h/zph63a00/507204402+-4417.tif,13
3,imagesh/h/o/q/hoq40d00/517303711+-3712.tif,3
4,imagesm/m/d/x/mdx34c00/83553602_3608.tif,11


Sample


Unnamed: 0,image,label
151,imagesm/m/o/r/mor36d00/50597614-7615.tif,14
105,imagesv/v/k/b/vkb04e00/2040956524.tif,10
70,imagesu/u/v/n/uvn59e00/2024482928.tif,8
61,imagesy/y/n/z/ynz49d00/501163645_501163648.tif,0
161,imagesu/u/l/p/ulp44c00/85282788.tif,15


Tail


Unnamed: 0,image,label
187,imagesz/z/t/x/ztx85c00/2073735615.tif,9
188,imagesf/f/g/m/fgm13f00/0000398573.tif,15
189,imagesw/w/p/p/wpp20f00/0001482830.tif,7
190,imagesx/x/n/k/xnk43f00/0000722304.tif,4
191,imagese/e/k/n/ekn71d00/502605449+-5449.tif,4


### Calculate some statistics concerning images shape

In [25]:
def calc_dataset_shape_statistics(df_dataset, stat_per_class=False):
    array_data = np.zeros((len(df_dataset.index), 4), dtype=np.int32)
    row_index = 0
    for row in tqdm(df_dataset.itertuples()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        array_data[row_index] = [image.shape[0], image.shape[1], image.shape[0]*image.shape[1], row.label]
        row_index += 1
    
    df_shapes = pd.DataFrame(data=array_data, columns=['Height', 'Width', 'Area', 'Class'])
    
    # stat_per_class = True -> Satistics are presented for each class
    if stat_per_class:
        print(df_shapes.groupby(['Class']).agg(['min', 'max', q1, q2, q3]))
    else:
        print(df_shapes.describe())

Functions for per class statistics (percentiles):

In [26]:
def q1(x):
    return x.quantile(0.25)

def q2(x):
    return x.median()

def q3(x):
    return x.quantile(0.75)

#### Training set

In [27]:
calc_dataset_shape_statistics(df_sampled_train_labels)

1600it [02:47,  9.56it/s]

       Height        Width           Area        Class
count  1600.0  1600.000000    1600.000000  1600.000000
mean   1000.0   765.546250  765546.250000     7.500000
std       0.0    20.717924   20717.924152     4.611213
min    1000.0   607.000000  607000.000000     0.000000
25%    1000.0   754.000000  754000.000000     3.750000
50%    1000.0   754.000000  754000.000000     7.500000
75%    1000.0   777.000000  777000.000000    11.250000
max    1000.0   874.000000  874000.000000    15.000000





#### Validation set

In [28]:
calc_dataset_shape_statistics(df_sampled_valid_labels)

192it [00:19,  9.87it/s]

       Height       Width           Area       Class
count   192.0  192.000000     192.000000  192.000000
mean   1000.0  765.479167  765479.166667    7.500000
std       0.0   21.647572   21647.571798    4.621824
min    1000.0  627.000000  627000.000000    0.000000
25%    1000.0  754.000000  754000.000000    3.750000
50%    1000.0  754.000000  754000.000000    7.500000
75%    1000.0  777.000000  777000.000000   11.250000
max    1000.0  868.000000  868000.000000   15.000000





#### Test set

In [29]:
calc_dataset_shape_statistics(df_sampled_test_labels)

192it [00:18, 10.14it/s]

       Height       Width           Area       Class
count   192.0  192.000000     192.000000  192.000000
mean   1000.0  763.651042  763651.041667    7.500000
std       0.0   20.481842   20481.842270    4.621824
min    1000.0  611.000000  611000.000000    0.000000
25%    1000.0  754.000000  754000.000000    3.750000
50%    1000.0  754.000000  754000.000000    7.500000
75%    1000.0  777.000000  777000.000000   11.250000
max    1000.0  830.000000  830000.000000   15.000000





#### Image shape statistics per class (training set)

In [30]:
calc_dataset_shape_statistics(df_sampled_train_labels, stat_per_class=True)

1600it [01:38, 16.32it/s]

      Height                         Width                             Area  \
         min   max    q1    q2    q3   min  max   q1     q2      q3     min   
Class                                                                         
0       1000  1000  1000  1000  1000   754  823  754  762.0  777.00  754000   
1       1000  1000  1000  1000  1000   616  848  754  754.0  777.00  616000   
2       1000  1000  1000  1000  1000   754  802  754  754.0  754.00  754000   
3       1000  1000  1000  1000  1000   754  830  754  754.0  761.00  754000   
4       1000  1000  1000  1000  1000   611  843  754  755.5  775.00  611000   
5       1000  1000  1000  1000  1000   713  847  754  754.0  777.00  713000   
6       1000  1000  1000  1000  1000   677  808  754  754.0  772.00  677000   
7       1000  1000  1000  1000  1000   754  874  762  777.0  784.00  754000   
8       1000  1000  1000  1000  1000   703  868  754  764.5  778.50  703000   
9       1000  1000  1000  1000  1000   611  847  754




## Preprocessing and mini-dataset generation

### Functions

**check_image_dimensions**: Check if the specified image respects the width threshold. If not, another random image belonging to the same class and respecting the threshold is selected. The path to this image is then returned

In [31]:
def check_image_dimensions(image, image_label, df_complete_dataset, df_sampled_dataset):
    if image.shape[1] <= width_threshold:
        return True, None
    
    # Consider all images belonging to the same class of the image to be replaced AND
    # not already in the sampled dataset
    df_filtered_by_class = df_complete_dataset[(df_complete_dataset.label == image_label)&\
                                               (~df_complete_dataset.image.isin(df_sampled_dataset.image))]
    
    # Random sample an image until I found one with an acceptable width
    replaced_img_path = None
    replaced = False
    while not replaced:
        sampled_row = df_filtered_by_class.sample(n=1).iloc[0]
        img_replaced = cv2.imread(os.path.join(images_folder_path, sampled_row.image), cv2.IMREAD_GRAYSCALE)
        if img_replaced.shape[1] <= width_threshold:
            replaced_img_path = sampled_row.image
            replaced = True
    
    assert replaced_img_path is not None
    
    return False, replaced_img_path

**replace_imgs_with_invalid_width**: check if all images in _df_sampled_dataset_ respect the width threshold. Images too large will be replaced with other images from _df_complete_dataset_ belonging to the same class

In [32]:
def replace_imgs_with_invalid_width(df_sampled_dataset, df_complete_dataset):
    for index, row in tqdm(df_sampled_dataset.iterrows()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        valid, replaced_img_path = check_image_dimensions(image, row.label, df_complete_dataset, df_sampled_dataset)
        
        if not valid:
            df_sampled_dataset.at[index , 'image'] = replaced_img_path

**reshape_image**: Resizes the specified images to the target shape, adding a black border to the image to preserve the original height/width ratio. Adding a black border instead of a white one could be a hint to the CNN to focus on the center of the image.

In [33]:
def reshape_image(image, target_shape):
    # A rescale is needed if the width is less than the target_shape width (all images have height 1000)
    if image.shape[1] > target_shape[1]:
        scale_factor = float(target_shape[1])/image.shape[1]
        scaled_height = int(image.shape[0]*scale_factor)
        scaled_width = int(image.shape[1]*scale_factor)
    
        # interpolation=cv2.INTER_AREA is preferred for image shrinking
        img_scaled = cv2.resize(image, (scaled_width, scaled_height), interpolation=cv2.INTER_AREA)
    else:
        img_scaled = image
    
    img_reshaped = np.zeros((target_shape[0], target_shape[1]), dtype=int)
    row_start = (target_shape[0] - img_scaled.shape[0])//2
    col_start = (target_shape[1] - img_scaled.shape[1])//2
    img_reshaped[row_start:row_start + img_scaled.shape[0], col_start:col_start + img_scaled.shape[1]] = img_scaled
    return img_reshaped

**normalize_image**: Normalizes all pixel values of the specified image.

In [34]:
def normalize_image(image):
    image_array = np.asarray(image, dtype=np.float32)
    return image_array / 255

**generate_minidataset**: Creates in the specified hdf5 file a dataset with name _dataset_name_ ,  preprocesses the data in the dataframe _df_ and fills the dataset with them

In [55]:
def generate_minidataset(hdf_file, df, dataset_name):
    n_debug_images = 5
    # Number of image composing the batch to be written into the dataset at the same time (details below)
    batch_dimension = 64
    n_imgs = len(df.index)

    print("Generating dataset " + dataset_name + " - " + str(n_imgs) + " images")
    
    dataset_imgs = hdf_file.create_dataset(dataset_name, (n_imgs, target_shape[0], target_shape[1]),\
                                      dtype="float32", compression="gzip")
    dataset_labels = hdf_file.create_dataset(dataset_name + "_labels", (n_imgs, n_classes),\
                                      dtype="int8", compression="gzip")
    
    # Convert labels column of the dataframe to one-hot encoding to be used by keras
    df["label"] = to_categorical(df["label"], num_classes=n_classes, dtype='int8').tolist()
    
    # Arrays to store batch data
    batch_array_imgs = np.zeros((batch_dimension, target_shape[0], target_shape[1]), dtype=np.float32)
    batch_array_labels = np.zeros((batch_dimension, n_classes))
    
    batch_index = 0
    batch_number = 0
    for row in tqdm(df.itertuples()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        reshaped = reshape_image(image, target_shape)
        normalized = normalize_image(reshaped)
        
        # Put image and label into batch arrays
        batch_array_imgs[batch_index] = normalized
        batch_array_labels[batch_index] = row.label

        # Save some reshaped images to manually check them
        if n_debug_images > 0:
            image_name = os.path.basename(row.image)
            cv2.imwrite(os.path.join(debug_folder_reshaped_imgs, dataset_name + "-" + image_name), reshaped)
            n_debug_images -= 1
        
        # Save data into database in batches to speedup the writing process and save RAM memory at the same time
        if batch_index == batch_dimension - 1:
            start_index = batch_number*batch_dimension
            end_index = start_index + batch_dimension
            dataset_imgs[start_index:end_index,:,:] = batch_array_imgs
            dataset_labels[start_index:end_index,:] = batch_array_labels
            
            batch_index = 0
            batch_number += 1
            
        batch_index += 1
    
    # Write remaining data inside the batches into the dataset
    if batch_index != 0:
        start_index = batch_number*batch_dimension
        end_index = start_index + batch_dimension
        dataset_imgs[start_index:end_index,:,:] = batch_array_imgs[:batch_index,:,:]
        dataset_labels[start_index:end_index,:] = batch_array_labels[:batch_index,:]

#### Replace all images that do not respect the width threshold

In [36]:
replace_imgs_with_invalid_width(df_sampled_train_labels, df_train_labels)
replace_imgs_with_invalid_width(df_sampled_valid_labels, df_valid_labels)
replace_imgs_with_invalid_width(df_sampled_test_labels, df_test_labels)

1600it [01:35, 16.81it/s]
192it [00:12, 15.18it/s]
192it [00:11, 16.81it/s]


#### Generates a HDF5 file containing the mini-dataset. For each set (train, validation, test) a different dataset is created inside the file

In [56]:
with h5py.File(os.path.join(output_folder, 'data.h5'), 'w') as hdf_file:
    # Instead of passing a copy of each dataframe we could directly pass the dataframe to save memory and time
    # generate_minidataset modify te dataframe so in the development phase i need to retain a clean state of
    # each dataframe
    generate_minidataset(hdf_file, df_sampled_train_labels.copy(), "train")
    generate_minidataset(hdf_file, df_sampled_valid_labels.copy(), "valid")
    generate_minidataset(hdf_file, df_sampled_test_labels.copy(), "test")

0it [00:00, ?it/s]

Generating dataset train - 1600 images


1600it [02:00, 13.24it/s]
2it [00:00, 16.64it/s]

Generating dataset valid - 192 images


192it [00:12, 15.81it/s]
2it [00:00, 18.27it/s]

Generating dataset test - 192 images


192it [00:11, 16.03it/s]


#### Read some data from the generated HFD5 file and reconvert them to images to verify the correctness of the datasets

In [57]:
n_debug_images = 5

with h5py.File(os.path.join(output_folder, 'data.h5'), 'r') as hdf_file:
    ds_train = hdf_file["train"]
    ds_train_labels = hdf_file["train_labels"]
    for i in range(n_debug_images):
        normalized_image = ds_train[i]
        print("Image " + str(i) + " - shape: " + str(normalized_image.shape) + " - label: " + str(ds_train_labels[i]))
        # Bring back pixel values to the [0,255] range
        image = np.asarray(normalized_image * 255, dtype=np.int32)
        cv2.imwrite(os.path.join(debug_folder_dataset_imgs, "img-" + str(i) + ".tif"), image)

Image 0 - shape: (1000, 750) - label: [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
Image 1 - shape: (1000, 750) - label: [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Image 2 - shape: (1000, 750) - label: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
Image 3 - shape: (1000, 750) - label: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
Image 4 - shape: (1000, 750) - label: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
