# Mini-dataset generation

In [13]:
import os
from shutil import copyfile
from shutil import rmtree
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

%load_ext memory_profiler
%load_ext line_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


##### Global parameters:

In [14]:
dataset_path = "/Volumes/Part1/datasets/rvl-cdip"
images_folder_path = os.path.join(dataset_path, "images")
project_folder = "/Users/jasonravagli/Desktop/Scuola/Università-Magistrale/data-and-document-mining/document-classification"
resources_folder = os.path.join(project_folder, "resources")
output_dataset_folder = os.path.join(resources_folder, "output", "mini-dataset")
debug_folder = os.path.join(resources_folder, "debug")

# Number of images classes
n_classes = 16

# Dimensions of the dataset to be generated
mini_train_set_dimension = 32000
mini_valid_set_dimension = 4000
mini_test_set_dimension =  4000

#### Setup folders

In [15]:
# Delete already existing dataset output folder
rmtree(output_dataset_folder, ignore_errors=True)

os.makedirs(output_dataset_folder)
os.makedirs(debug_folder, exist_ok=True)

# Folders to manually check some processed images
debug_folder_reshaped_imgs = os.path.join(debug_folder, "reshaped_imgs")

# Folders to manually check some images read from the generated dataset
debug_folder_dataset_imgs = os.path.join(debug_folder, "dataset_imgs")

os.makedirs(debug_folder_reshaped_imgs, exist_ok=True)
os.makedirs(debug_folder_dataset_imgs, exist_ok=True)

Utility function to display dataframe info:

In [16]:
def display_info(df=None, rows=5):
    try:
        print("Info")
        df.info(verbose=True)
        print(df.shape)
        print("Head")
        display(df.head(rows))
        print("Sample")
        display(df.sample(rows))
        print("Tail")
        display(df.tail(rows))
    except:
        print("Errore nella visualizzazione")

## Data Sampling

#### Read the labels files:

In [17]:
%memit df_train_labels = pd.read_csv(os.path.join(dataset_path, "labels/train.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_valid_labels = pd.read_csv(os.path.join(dataset_path, "labels/val.txt"), sep=" ",\
                                     names=["image", "label"], skipinitialspace=True)
%memit df_test_labels = pd.read_csv(os.path.join(dataset_path, "labels/test.txt"), sep=" ",\
                                    names=["image", "label"], skipinitialspace=True)

# Reset column names because of strange whitespaces added to the column names after reading
df_train_labels.columns = ["image", "label"]
df_valid_labels.columns = ["image", "label"]
df_test_labels.columns = ["image", "label"]

# Display some info to verify the correctness of the data read
display_info(df_train_labels)

peak memory: 210.02 MiB, increment: 70.78 MiB
peak memory: 190.66 MiB, increment: -16.45 MiB
peak memory: 171.94 MiB, increment: 1.33 MiB
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   image   320000 non-null  object
 1   label   320000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB
(320000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


Sample


Unnamed: 0,image,label
246531,imageso/o/p/z/opz01e00/87726883.tif,1
132047,imagesc/c/k/a/cka08d00/2072098344_8347.tif,12
244433,imagese/e/v/c/evc88e00/2020158028_2020158030.tif,12
242141,imagesl/l/x/h/lxh76c00/2077261791.tif,9
144032,imagesv/v/i/k/vik78e00/2010015407.tif,12


Tail


Unnamed: 0,image,label
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9
319999,imagesp/p/j/x/pjx11d00/518223252+-3253.tif,3


#### Random sample data preserving ratio between classes

In [18]:
train_data_per_class = mini_train_set_dimension//n_classes
valid_data_per_class = mini_valid_set_dimension//n_classes
test_data_per_class = mini_test_set_dimension//n_classes

df_sampled_train_labels = pd.DataFrame(columns = df_train_labels.columns)
df_sampled_valid_labels = pd.DataFrame(columns = df_valid_labels.columns)
df_sampled_test_labels = pd.DataFrame(columns = df_test_labels.columns)

for i in range(n_classes):
    df_sampled_train_labels = pd.concat([df_sampled_train_labels, \
                                         df_train_labels.loc[df_train_labels["label"] == i].sample(n=train_data_per_class)])
    df_sampled_valid_labels = pd.concat([df_sampled_valid_labels, \
                                         df_valid_labels.loc[df_valid_labels["label"] == i].sample(n=valid_data_per_class)])
    df_sampled_test_labels = pd.concat([df_sampled_test_labels, \
                                         df_test_labels.loc[df_test_labels["label"] == i].sample(n=test_data_per_class)])

df_sampled_train_labels = df_sampled_train_labels.sample(frac=1).reset_index(drop=True)
df_sampled_valid_labels = df_sampled_valid_labels.sample(frac=1).reset_index(drop=True)
df_sampled_test_labels = df_sampled_test_labels.sample(frac=1).reset_index(drop=True)

## Data verification and analysis

#### Display generated dataframes summaries to verify their correctness

In [19]:
display_info(df_sampled_train_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   32000 non-null  object
 1   label   32000 non-null  object
dtypes: object(2)
memory usage: 500.1+ KB
(32000, 2)
Head


Unnamed: 0,image,label
0,imagesc/c/x/z/cxz73f00/0001144246.tif,11
1,imagesl/l/k/i/lki86c00/2081876012.tif,8
2,imagesf/f/e/e/fee19e00/2058030748.tif,7
3,imagesw/w/f/q/wfq40d00/517303977+-3978.tif,3
4,imagesk/k/x/l/kxl22f00/11007258.tif,10


Sample


Unnamed: 0,image,label
18724,imagesg/g/l/o/glo14d00/507793386.tif,4
2776,imagesf/f/c/r/fcr09d00/50454550-4551.tif,14
5580,imagesc/c/p/r/cpr54c00/01838895_8899.tif,13
21425,imagesx/x/r/v/xrv75e00/2048691557.tif,10
19019,imagesd/d/w/h/dwh16e00/2048740161_2048740167.tif,13


Tail


Unnamed: 0,image,label
31995,imagesc/c/a/b/cab05d00/506063154_506063156.tif,4
31996,imagesy/y/s/g/ysg21d00/515624196+-4198.tif,3
31997,imagesp/p/k/w/pkw29e00/2501382214.tif,8
31998,imagesc/c/q/l/cql21e00/87803225.tif,11
31999,imagesl/l/w/w/lww01d00/517511850+-1854.tif,3


In [20]:
display_info(df_sampled_valid_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   4000 non-null   object
 1   label   4000 non-null   object
dtypes: object(2)
memory usage: 62.6+ KB
(4000, 2)
Head


Unnamed: 0,image,label
0,imagesm/m/y/g/myg08c00/527828471+-8471.tif,2
1,imagesp/p/i/p/pip68e00/2023407789.tif,8
2,imagesg/g/h/r/ghr32c00/2071483923_3929.tif,5
3,imagesx/x/f/a/xfa42d00/2063559025_9027.tif,9
4,imagesb/b/q/i/bqi53c00/94253115_3118.tif,1


Sample


Unnamed: 0,image,label
3701,imagesp/p/f/p/pfp3aa00/11233928_11233929.tif,11
640,imagese/e/u/q/euq55d00/504785521_504785526.tif,13
2905,imagesb/b/f/p/bfp22d00/2062199378_9380.tif,12
2423,imagesi/i/c/r/icr94e00/1003175509_1003175519.tif,6
1926,imagesm/m/w/h/mwh16e00/2048739938_2048739956.tif,13


Tail


Unnamed: 0,image,label
3995,imagesv/v/k/n/vkn69d00/500713769.tif,4
3996,imagesy/y/x/v/yxv53f00/0001218722.tif,0
3997,imagesq/q/f/t/qft62c00/2077774819.tif,8
3998,imagesq/q/o/f/qof71d00/517500869+-0869.tif,4
3999,imagesd/d/p/u/dpu26c00/2071382966.tif,11


In [21]:
display_info(df_sampled_test_labels)

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   4000 non-null   object
 1   label   4000 non-null   object
dtypes: object(2)
memory usage: 62.6+ KB
(4000, 2)
Head


Unnamed: 0,image,label
0,imagesq/q/i/e/qie40e00/87636525_87636526.tif,5
1,imagesl/l/q/u/lqu71a00/2057467166_2057467167.tif,7
2,imagesg/g/w/m/gwm20c00/2085792115a.tif,2
3,imagesg/g/u/v/guv45c00/2076334356.tif,8
4,imagesw/w/n/l/wnl01d00/522196788+-6788.tif,1


Sample


Unnamed: 0,image,label
3781,imagese/e/j/f/ejf02c00/2085011720_1738.tif,13
1111,imagesd/d/q/y/dqy60c00/2078195527b_5529.tif,9
1920,imagesm/m/h/k/mhk35d00/505019415_505019417.tif,3
2396,imagesq/q/s/n/qsn53c00/95602469.tif,11
2771,imagesg/g/m/o/gmo69c00/50171367-1367.tif,15


Tail


Unnamed: 0,image,label
3995,imagesx/x/p/f/xpf80f00/0011920196.tif,7
3996,imagesr/r/b/t/rbt67e00/2063576176.tif,11
3997,imagesq/q/h/q/qhq84e00/1000291086.tif,13
3998,imagesh/h/d/y/hdy08d00/2070124004_4008.tif,13
3999,imagesb/b/r/i/bri35d00/505027324_505027325.tif,0


## Mini-dataset generation

### Functions

**generate_minidataset**: Creates in the specified hdf5 file a dataset with name _dataset_name_ and fills the dataset with the images in the dataframe _df_ as they are, without any preprocessing operations

In [22]:
def generate_minidataset(df, dataset_name):
    batch_dimension = 64
    n_imgs = len(df.index)

    print("Generating dataset " + dataset_name + " - " + str(n_imgs) + " images")
    
    # Matrix containing img_path - label
    matrix_labels = np.empty((n_imgs, 2), dtype=np.object)
    
    img_counter = 0
    max_path_length = 0
    for row in tqdm(df.itertuples()):
        ten_thousand = img_counter // 10**4 % 10
        thousand = img_counter // 10**3 % 10
        hundred = img_counter // 10**2 % 10
        ten = img_counter // 10 % 10
        unit = img_counter % 10
        
        dest_folder_path = os.path.join(output_dataset_folder, dataset_name, str(ten_thousand), str(thousand), str(hundred), str(ten), str(unit))
        image_name = os.path.basename(row.image)
        img_abs_path = os.path.join(dest_folder_path, image_name)
        img_relative_path = os.path.join(dataset_name, str(ten_thousand), str(thousand), str(hundred), str(ten), str(unit), image_name)
        
        os.makedirs(dest_folder_path, exist_ok=True)
        copyfile(os.path.join(images_folder_path, row.image), img_abs_path)
        
        matrix_labels[img_counter, 0] = img_relative_path
        matrix_labels[img_counter, 1] = str(row.label)
        
        if len(img_relative_path) > max_path_length:
            max_path_length = len(img_relative_path)
            
        img_counter += 1
    # Save images relative paths and associated labels in a csv file    
    np.savetxt(os.path.join(output_dataset_folder, dataset_name + '-labels.csv'), matrix_labels, delimiter=',', fmt='%s')

#### Generates the mini-dataset

In [23]:
generate_minidataset(df_sampled_train_labels, "train")
generate_minidataset(df_sampled_valid_labels, "valid")
generate_minidataset(df_sampled_test_labels, "test")

0it [00:00, ?it/s]

Generating dataset train - 32000 images


32000it [44:47, 11.91it/s]
2it [00:00, 17.58it/s]

Generating dataset valid - 4000 images


4000it [05:26, 12.23it/s]
2it [00:00, 13.35it/s]

Generating dataset test - 4000 images


4000it [05:36, 11.90it/s]
