# Image preparation

The original image size is 55x90 pixels with a color depth of 3 (RGB).
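The code below resizes the images in an input directory (`Input_dir`) to the target size (`target_size_x` x `target_size_y`, set to 32x32 pixels below) and saves them to an output directory (`Output_dir`). Note that the code reads the pictures directly from `Input_dir` (`*.jpg`), not from per-label subdirectories; in the file names shown in the output below, the label appears as the file-name prefix (e.g. `0.7_....jpg`).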
Any other image converter can be used as well.

### Prerequisite
Pillow library (PIL) installed within Python (`pip install pillow`); the code below imports `PIL.Image` and does not use OpenCV.

In [1]:
import glob
import os
from PIL import Image 

# Directory the original JPEGs are read from, and directory the resized
# copies are written to.
Input_dir, Output_dir = 'data_raw_all', 'data_resize_all'

# Width (x) and height (y), in pixels, that every image is resized to.
target_size_x = target_size_y = 32

In [2]:
# Start from a clean slate: delete every JPEG that an earlier run left in the
# output directory, then report how many files were removed.
files = glob.glob(Output_dir + '/*.jpg')
for stale_path in files:
    os.remove(stale_path)
print(f"{len(files)} files have been deleted.")

231 files have been deleted.


In [3]:
import hashlib

# Image.save() does not create missing directories, so make sure the
# destination exists before the first file is written.
os.makedirs(Output_dir, exist_ok=True)

files = glob.glob(Input_dir + '/*.jpg')

# Maps the SHA-256 digest of an image's decoded pixel data to the list of all
# input files that share exactly that pixel data. Groups with more than one
# entry are duplicates; this dict is consumed by the duplicate-removal cell.
hashes = {}

for i, aktfile in enumerate(files):
    if i % 500 == 0:
        # Progress indicator for long runs.
        print(i, aktfile)

    # Open inside a context manager so the underlying file handle is released
    # right away instead of accumulating over thousands of images.
    with Image.open(aktfile) as test_image:
        # Hash the decoded pixels (not the file bytes) so that files with
        # identical image content are grouped regardless of their file name.
        pixel_hash = hashlib.sha256(test_image.tobytes()).hexdigest()
        # Image.NEAREST is kept (rather than Image.Resampling.NEAREST) so the
        # cell also runs on Pillow versions that predate the Resampling enum.
        resized = test_image.resize((target_size_x, target_size_y), Image.NEAREST)

    hashes.setdefault(pixel_hash, []).append(aktfile)

    # The resized copy keeps the original file name in the output directory.
    save_name = Output_dir + '/' + os.path.basename(aktfile)
    resized.save(save_name, "JPEG")

0 data_raw_all\0.0_0.0.jpg


  test_image = test_image.resize((target_size_x, target_size_y), Image.NEAREST)


500 data_raw_all\0.7_cf3528b9b2433f66e17535a84aabe8b1.jpg
1000 data_raw_all\1.4_0584_zeiger3_2019-06-04T201009.jpg
1500 data_raw_all\2.0_0940_zeiger4_2019-06-04T054009.jpg
2000 data_raw_all\2.6_697ad8d9b31ecfa8ee5d8f2eb67bac87.jpg
2500 data_raw_all\3.3_1628_zeiger3_2019-06-01T184019.jpg
3000 data_raw_all\4.0_1971_zeiger2_2020-04-29_11-00-02.jpg
3500 data_raw_all\4.6_2329_zeiger3_2019-06-02T040013.jpg
4000 data_raw_all\5.3_2585_zeiger1_2019-11-19_00-57-03.jpg
4500 data_raw_all\6.0_2923_zeiger2_2019-06-04T102009.jpg
5000 data_raw_all\6.6_3305_zeiger4_2019-06-05T072009.jpg
5500 data_raw_all\7.3_3749_zeiger4_2019-11-19_01-22-03.jpg
6000 data_raw_all\8.0_4053_zeiger4_2019-11-19_07-52-03.jpg
6500 data_raw_all\8.8_afcb1a4e5ca440e0a9b91f4ab1ac6a51.jpg
7000 data_raw_all\9.4_4735_zeiger2_2019-06-04T141009.jpg


# Removing duplicate files

In [4]:
# Duplicate files are a risk to the metrics: they pollute the validation dataset.
# `hashes` maps a pixel-content hash to every input file with that content.
for same_content in hashes.values():
    if len(same_content) > 1:
        print(same_content)
        # Keep the first file of each group and remove all the others.
        for duplicate in same_content[1:]:
            # A resized copy of every input file has already been written to
            # Output_dir under the same file name. Remove it together with the
            # raw file, otherwise the duplicate stays in the resized dataset.
            resized_copy = os.path.join(Output_dir, os.path.basename(duplicate))
            for path in (duplicate, resized_copy):
                # Skip files that are already gone so this cell can be re-run
                # without raising FileNotFoundError.
                if os.path.exists(path):
                    os.remove(path)

['data_raw_all\\0.3_f38f2eb5263c3f8159ae2d2a7168d5d5.jpg', 'data_raw_all\\0.4_f38f2eb5263c3f8159ae2d2a7168d5d5.jpg']
['data_raw_all\\1.1_e7f7d50ef31f40cf0287bf392636fff4.jpg', 'data_raw_all\\1.2_e7f7d50ef31f40cf0287bf392636fff4.jpg']
['data_raw_all\\3.4_bb2454a9079fb538291cb22e4a1294ad.jpg', 'data_raw_all\\3.5_dc83d3e47e3e8bdf992e23fa31ff4093.jpg']
['data_raw_all\\5.3_e046159c358ff9fdc11dc68ec08f8d42.jpg', 'data_raw_all\\5.4_96345da8a67b01f07d974c488425c3e5.jpg']
