# Preprocessing and mini-dataset generation

In [8]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from keras.utils import to_categorical

%load_ext memory_profiler
%load_ext line_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


##### Global parameters:

In [27]:
dataset_path = "/Volumes/Part1/datasets/rvl-cdip"
images_folder_path = os.path.join(dataset_path, "images")
output_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/project-document-classification/output"
debug_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/project-document-classification/debug"
# Number of images classes
n_classes = 16

#### Setup folders

In [28]:
os.makedirs(output_folder, exist_ok=True)
os.makedirs(debug_folder, exist_ok=True)

# Folders to manually check some processed images
debug_folder_reshaped_imgs = os.path.join(debug_folder, "reshaped_imgs")

# Folders to manually check some images read from the generated dataset
debug_folder_dataset_imgs = os.path.join(debug_folder, "dataset_imgs")

os.makedirs(debug_folder_reshaped_imgs, exist_ok=True)
os.makedirs(debug_folder_dataset_imgs, exist_ok=True)

Utility function to display dataframe info:

In [11]:
def display_info(df=None, rows=5):
    try:
        print("Info")
        df.info(verbose=True)
        print(df.shape)
        print("Head")
        display(df.head(rows))
        print("Sample")
        display(df.sample(rows))
        print("Tail")
        display(df.tail(rows))
    except:
        print("Errore nella visualizzazione")

## Data Sampling

#### Specify the dimensions of the mini-dataset to be generated

In [12]:
mini_train_set_dimension = 1600
mini_valid_set_dimension = 200
mini_test_set_dimension = 200

#### Read the labels files:

In [13]:
%memit df_train_labels = pd.read_csv(os.path.join(dataset_path, "labels/train.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_valid_labels = pd.read_csv(os.path.join(dataset_path, "labels/val.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_test_labels = pd.read_csv(os.path.join(dataset_path, "labels/test.txt"), sep=" ",\
                                    names=["image", "label"], skipinitialspace=True)

# Reset column names because of strange whitespaces added to the column names after reading
df_train_labels.columns = ["image", "label"]
df_valid_labels.columns = ["image", "label"]
df_test_labels.columns = ["image", "label"]

# Display some info to verify the correctness of the data read
display_info(df_train_labels)

peak memory: 281.36 MiB, increment: 55.63 MiB
peak memory: 284.18 MiB, increment: 3.09 MiB
peak memory: 274.33 MiB, increment: 7.22 MiB
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   image   320000 non-null  object
 1   label   320000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB
(320000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


Sample


Unnamed: 0,image,label
44836,imagesq/q/y/k/qyk26d00/50609248-9249.tif,14
39943,imagest/t/l/e/tle72f00/tob10615.20.tif,9
16183,imagesz/z/u/m/zum13e00/2057992153_2057992154.tif,14
308304,imagesq/q/p/i/qpi25e00/2028596417.tif,15
135573,imagesg/g/y/x/gyx11a00/0071003362.tif,8


Tail


Unnamed: 0,image,label
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9
319999,imagesp/p/j/x/pjx11d00/518223252+-3253.tif,3


#### Random sample data preserving ratio between classes

In [14]:
train_data_per_class = mini_train_set_dimension//n_classes
valid_data_per_class = mini_valid_set_dimension//n_classes
test_data_per_class = mini_test_set_dimension//n_classes

df_sampled_train_labels = pd.DataFrame(columns = df_train_labels.columns)
df_sampled_valid_labels = pd.DataFrame(columns = df_valid_labels.columns)
df_sampled_test_labels = pd.DataFrame(columns = df_test_labels.columns)

for i in range(n_classes):
    df_sampled_train_labels = pd.concat([df_sampled_train_labels, \
                                         df_train_labels.loc[df_train_labels["label"] == i].sample(n=train_data_per_class)])
    df_sampled_valid_labels = pd.concat([df_sampled_valid_labels, \
                                         df_valid_labels.loc[df_valid_labels["label"] == i].sample(n=valid_data_per_class)])
    df_sampled_test_labels = pd.concat([df_sampled_test_labels, \
                                         df_test_labels.loc[df_test_labels["label"] == i].sample(n=test_data_per_class)])

df_sampled_train_labels = df_sampled_train_labels.sample(frac=1).reset_index(drop=True)
df_sampled_valid_labels = df_sampled_valid_labels.sample(frac=1).reset_index(drop=True)
df_sampled_test_labels = df_sampled_test_labels.sample(frac=1).reset_index(drop=True)

## Data verification and analysis

#### Display generated dataframes summaries to verify their correctness

In [15]:
display_info(df_sampled_train_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   1600 non-null   object
 1   label   1600 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB
(1600, 2)
Head


Unnamed: 0,image,label
0,imagesk/k/m/g/kmg10d00/515240240+-0240.tif,1
1,imagesb/b/r/e/bre12e00/2028707859.tif,11
2,imagesc/c/r/d/crd2aa00/10082581_10082587.tif,6
3,imagesn/n/y/i/nyi80c00/2041472201.tif,2
4,imagesq/q/x/l/qxl11d00/522821893+-1897.tif,3


Sample


Unnamed: 0,image,label
335,imagesv/v/v/q/vvq36c00/2072175703.tif,4
1584,imagesk/k/a/t/kat47e00/2030471844.tif,1
546,imagesv/v/r/c/vrc91d00/881672.tif,0
1580,imageso/o/m/p/omp48c00/2083778701.tif,9
1204,imagesy/y/r/u/yru22e00/2501042059.tif,9


Tail


Unnamed: 0,image,label
1595,imagesd/d/t/k/dtk57e00/2030910473_2030910475.tif,12
1596,imagesj/j/w/v/jwv98d00/50412722-2723.tif,14
1597,imagesa/a/w/z/awz61a00/2057409812.tif,7
1598,imagesx/x/d/k/xdk93a00/526664130+-4135.tif,13
1599,imagesn/n/r/d/nrd55f00/0060084691.tif,11


In [16]:
display_info(df_sampled_valid_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imagesc/c/n/j/cnj03a00/518477227+-7228.tif,11
1,imagesc/c/n/f/cnf70f00/0011986732.tif,12
2,imagesb/b/i/e/bie70e00/85866687_85866794.tif,6
3,imagesp/p/r/o/pro26d00/50634495-4495.tif,6
4,imagesw/w/m/w/wmw15e00/2026497815.tif,0


Sample


Unnamed: 0,image,label
81,imagesr/r/l/o/rlo81a00/1000370022_1000370025.tif,13
184,imagesw/w/j/k/wjk90e00/92339545_92339550.tif,6
71,imagesm/m/b/w/mbw69e00/2028939965.tif,8
104,imagesv/v/g/a/vga94d00/505949779.tif,3
72,imagesc/c/l/p/clp07e00/2057923056_2057923066.tif,6


Tail


Unnamed: 0,image,label
187,imagesd/d/v/u/dvu36e00/2023269110.tif,9
188,imagest/t/i/y/tiy20e00/92313949.tif,12
189,imagesj/j/q/l/jql70d00/522905345+-5347.tif,4
190,imagesc/c/t/h/cth70e00/92758115.tif,10
191,imagesl/l/j/s/ljs65d00/504480257.tif,0


In [17]:
display_info(df_sampled_test_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   192 non-null    object
 1   label   192 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB
(192, 2)
Head


Unnamed: 0,image,label
0,imagesy/y/w/d/ywd60e00/93765191_93765193.tif,12
1,imagesv/v/z/d/vzd10d00/50370011-0012.tif,14
2,imagesj/j/o/s/jos96c00/CTRCONTRACTS005921-5.tif,11
3,imagesf/f/q/o/fqo92e00/2045620515_2045620516.tif,13
4,imagesp/p/w/o/pwo72c00/2078347987.tif,2


Sample


Unnamed: 0,image,label
79,imagesq/q/o/k/qok22c00/2069729380.tif,7
182,imagesv/v/n/s/vns76e00/2043371020.tif,4
77,imagesj/j/w/l/jwl61a00/2051531022_2051531030.tif,7
130,imagesx/x/z/x/xzx21d00/515944137+-4142.tif,3
166,imagesv/v/m/a/vma56d00/50265921-5921.tif,0


Tail


Unnamed: 0,image,label
187,imagese/e/x/t/ext98d00/50406766-6767.tif,14
188,imagese/e/b/u/ebu26e00/2053462247.tif,8
189,imagesx/x/r/u/xru26d00/50679313-9313.tif,6
190,imagesv/v/y/w/vyw69e00/2028982028.tif,8
191,imagesb/b/i/a/bia41a00/0071043880.tif,13


### Calculate some statistics concerning images shape

In [18]:
def calc_dataset_shape_statistics(df_dataset):
    array_data = np.zeros((len(df_dataset.index), 3), dtype=np.int32)
    row_index = 0
    for row in tqdm(df_dataset.itertuples()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        array_data[row_index] = [image.shape[0], image.shape[1], image.shape[0]*image.shape[1]]
        row_index += 1
   
    df_shapes = pd.DataFrame(data=array_data, columns=['Height', 'Width', 'Area'])
    print(df_shapes.describe())

#### Training set

In [19]:
calc_dataset_shape_statistics(df_sampled_train_labels)

1600it [02:46,  9.63it/s]

       Height        Width          Area
count  1600.0  1600.000000  1.600000e+03
mean   1000.0   767.636250  7.676362e+05
std       0.0    49.844275  4.984427e+04
min    1000.0   611.000000  6.110000e+05
25%    1000.0   754.000000  7.540000e+05
50%    1000.0   761.500000  7.615000e+05
75%    1000.0   777.000000  7.770000e+05
max    1000.0  2405.000000  2.405000e+06





#### Validation set

In [20]:
calc_dataset_shape_statistics(df_sampled_valid_labels)

192it [00:18, 10.42it/s]

       Height       Width           Area
count   192.0  192.000000     192.000000
mean   1000.0  765.911458  765911.458333
std       0.0   20.952639   20952.638539
min    1000.0  611.000000  611000.000000
25%    1000.0  754.000000  754000.000000
50%    1000.0  758.000000  758000.000000
75%    1000.0  777.000000  777000.000000
max    1000.0  874.000000  874000.000000





#### Test set

In [21]:
calc_dataset_shape_statistics(df_sampled_test_labels)

192it [00:17, 10.89it/s]

       Height       Width           Area
count   192.0  192.000000     192.000000
mean   1000.0  765.817708  765817.708333
std       0.0   26.732984   26732.984488
min    1000.0  615.000000  615000.000000
25%    1000.0  754.000000  754000.000000
50%    1000.0  762.000000  762000.000000
75%    1000.0  777.000000  777000.000000
max    1000.0  870.000000  870000.000000





## Preprocessing and mini-dataset generation

### Functions

**get_target_shape_from_dataset**: Calculates the dimension to resize all mini-dataset images at. This dimension have the width of the wider image in the specified the dataset and the height of the taller image. A black border will be added to smaller images.

In [22]:
def get_target_shape_from_dataset(df_dataset):
    image = cv2.imread(os.path.join(images_folder_path, df_dataset.iloc[0].image), cv2.IMREAD_GRAYSCALE)
    target_shape = image.shape

    for row in tqdm(df_dataset.itertuples()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        if image.shape[0] > target_shape[0]:
            target_shape = (image.shape[0], target_shape[1])
        if image.shape[1] > target_shape[1]:
            target_shape = (target_shape[0], image.shape[1])
            
    return target_shape

**reshape_image**: Resizes the specified images to the target shape, adding a black border to the image. We observed that all images have the same height, so a rescale is not necessary. Adding a black border instead of a white one could be a hint to the CNN to focus on the center of the image.

In [23]:
def reshape_image(image, target_shape):
    reshaped = np.zeros((target_shape[0], target_shape[1]), dtype=int)
    row_start = (target_shape[0] - image.shape[0])//2
    col_start = (target_shape[1] - image.shape[1])//2
    reshaped[row_start:row_start + image.shape[0], col_start:col_start + image.shape[1]] = image
    return reshaped

**normalize_image**: Normalizes all pixel values of the specified image.

In [24]:
def normalize_image(image):
    image_array = np.asarray(image, dtype=np.float32)
    return image_array / 255

**generate_minidataset**: Creates in the specified hdf5 file a dataset with name _dataset_name_ ,  preprocesses the data in the dataframe _df_ and fills the dataset with them

In [25]:
def generate_minidataset(hdf_file, df, dataset_name):
    n_debug_images = 5
    # Number of image composing the batch to be written into the dataset at the same time (details below)
    batch_dimension = 64
    n_imgs = len(df.index)

    print("Generating dataset " + dataset_name + " - " + str(n_imgs) + " images")
    
    dataset_imgs = hdf_file.create_dataset(dataset_name, (n_imgs, target_shape[0], target_shape[1]),\
                                      dtype="float32", compression="gzip")
    dataset_labels = hdf_file.create_dataset(dataset_name + "_labels", (n_imgs, n_classes),\
                                      dtype="float32", compression="gzip")
    
    # Convert labels column of the dataframe to one-hot encoding to be used by keras
    df["label"] = to_categorical(df["label"], num_classes=n_classes)
    
    # Arrays to store batch data
    batch_array_imgs = np.zeros((batch_dimension, target_shape[0], target_shape[1]), dtype=np.float32)
    batch_array_labels = np.zeros((batch_dimension, n_classes))
    
    batch_index = 0
    batch_number = 0
    for row in tqdm(df.itertuples()):
        image = cv2.imread(os.path.join(images_folder_path, row.image), cv2.IMREAD_GRAYSCALE)
        reshaped = reshape_image(image, target_shape)
        normalized = normalize_image(reshaped)
        
        # Put image and label into batch arrays
        batch_array_imgs[batch_index] = normalized
        batch_array_labels[batch_index] = row.label

        # Save some reshaped images to manually check them
        if n_debug_images > 0:
            image_name = os.path.basename(row.image)
            cv2.imwrite(os.path.join(debug_folder_reshaped_imgs, dataset_name + "-" + image_name), reshaped)
            n_debug_images -= 1
        
        # Save data into database in batches to speedup the writing process and save RAM memory at the same time
        if batch_index == batch_dimension - 1:
            start_index = batch_number*batch_dimension
            end_index = start_index + batch_dimension
            dataset_imgs[start_index:end_index,:,:] = batch_array_imgs
            dataset_labels[start_index:end_index,:] = batch_array_labels
            
            batch_index = 0
            batch_number += 1
            
        batch_index += 1
    
    # Write remaining data inside the batches into the dataset
    if batch_index != 0:
        start_index = batch_number*batch_dimension
        end_index = start_index + batch_dimension
        dataset_imgs[start_index:end_index,:,:] = batch_array_imgs[:batch_index,:,:]
        dataset_labels[start_index:end_index,:] = batch_array_labels[:batch_index,:]

#### Calculate the dimension to resize all mini-dataset images at

In [29]:
print("Iterating through training set")
target_shape_train = get_target_shape_from_dataset(df_sampled_train_labels)
print("Iterating through validation set")
target_shape_valid = get_target_shape_from_dataset(df_sampled_valid_labels)
print("Iterating through test set")
target_shape_test = get_target_shape_from_dataset(df_sampled_test_labels)

# The target image shape is the smaller among all datasets
target_shape = (np.max([target_shape_train[0], target_shape_valid[0], target_shape_test[0]]),\
                np.max([target_shape_train[1], target_shape_valid[1], target_shape_test[1]]))

print("Target shape: " + str(target_shape))

4it [00:00, 39.02it/s]

Iterating through training set


1600it [01:08, 23.37it/s]
4it [00:00, 35.37it/s]

Iterating through validation set


192it [00:08, 23.41it/s]
5it [00:00, 41.35it/s]

Iterating through test set


192it [00:08, 23.98it/s]

Target shape: (1000, 2405)





#### Generates a HDF5 file containing the mini-dataset. For each set (train, validation, test) a different dataset is created inside the file

In [30]:
with h5py.File(os.path.join(output_folder, 'data.h5'), 'w') as hdf_file:
    # Instead of passing a copy of each dataframe we could directly pass the dataframe to save memory and time
    # generate_minidataset modify te dataframe so in the development phase i need to retain a clean state of
    # each dataframe
    %lprun -f generate_minidataset generate_minidataset(hdf_file, df_sampled_train_labels.copy(), "train")
    generate_minidataset(hdf_file, df_sampled_valid_labels.copy(), "valid")
    generate_minidataset(hdf_file, df_sampled_test_labels.copy(), "test")

2it [00:00, 16.01it/s]

Generating dataset train - 1600 images


15it [00:00, 26.52it/s]
2it [00:00, 19.68it/s]

*** KeyboardInterrupt exception caught in code being profiled.Generating dataset valid - 192 images


126it [00:09, 12.61it/s]


KeyboardInterrupt: 

#### Read some data from the generated HFD5 file and reconvert them to images to verify the correctness of the datasets

In [None]:
n_debug_images = 5

with h5py.File(os.path.join(output_folder, 'data.h5'), 'r') as hdf_file:
    ds_train = hdf_file["train"]
    for i in range(n_debug_images):
        normalized_image = ds_train[i]
        print("Image " + str(i) + " - shape: " + str(normalized_image.shape))
        # Bring back pixel values to the [0,255] range
        image = np.asarray(normalized_image * 255, dtype=np.int32)
        cv2.imwrite(os.path.join(debug_folder_dataset_imgs, "img-" + str(i) + ".tif"), image)

In [None]:
lprun?